diff --git a/fla/layers/__pycache__/delta_net.cpython-311.pyc b/fla/layers/__pycache__/delta_net.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..15db4e95e6d96d3e7e2107957b7ca45cabda9711 Binary files /dev/null and b/fla/layers/__pycache__/delta_net.cpython-311.pyc differ diff --git a/fla/models/__init__.py b/fla/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c940d620212a0201aa4667ffa4a94e2130f77aab --- /dev/null +++ b/fla/models/__init__.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +from fla.models.abc import ABCConfig, ABCForCausalLM, ABCModel +from fla.models.bitnet import BitNetConfig, BitNetForCausalLM, BitNetModel +from fla.models.delta_net import DeltaNetConfig, DeltaNetForCausalLM, DeltaNetModel +from fla.models.forgetting_transformer import ( + ForgettingTransformerConfig, + ForgettingTransformerForCausalLM, + ForgettingTransformerModel +) +from fla.models.gated_deltanet import GatedDeltaNetConfig, GatedDeltaNetForCausalLM, GatedDeltaNetModel +from fla.models.gated_deltaproduct import GatedDeltaProductConfig, GatedDeltaProductForCausalLM, GatedDeltaProductModel +from fla.models.gla import GLAConfig, GLAForCausalLM, GLAModel +from fla.models.gsa import GSAConfig, GSAForCausalLM, GSAModel +from fla.models.hgrn import HGRNConfig, HGRNForCausalLM, HGRNModel +from fla.models.hgrn2 import HGRN2Config, HGRN2ForCausalLM, HGRN2Model +from fla.models.lightnet import LightNetConfig, LightNetForCausalLM, LightNetModel +from fla.models.linear_attn import LinearAttentionConfig, LinearAttentionForCausalLM, LinearAttentionModel +from fla.models.mamba import MambaConfig, MambaForCausalLM, MambaModel +from fla.models.mamba2 import Mamba2Config, Mamba2ForCausalLM, Mamba2Model +from fla.models.nsa import NSAConfig, NSAForCausalLM, NSAModel +from fla.models.retnet import RetNetConfig, RetNetForCausalLM, RetNetModel +from fla.models.rwkv6 import RWKV6Config, RWKV6ForCausalLM, RWKV6Model +from 
fla.models.rwkv7 import RWKV7Config, RWKV7ForCausalLM, RWKV7Model +from fla.models.samba import SambaConfig, SambaForCausalLM, SambaModel +from fla.models.transformer import TransformerConfig, TransformerForCausalLM, TransformerModel +from fla.models.transformer_mtp import MTPTransformerConfig, MTPTransformerForCausalLM, MTPTransformerModel + +__all__ = [ + 'ABCConfig', 'ABCForCausalLM', 'ABCModel', + 'BitNetConfig', 'BitNetForCausalLM', 'BitNetModel', + 'DeltaNetConfig', 'DeltaNetForCausalLM', 'DeltaNetModel', + 'ForgettingTransformerConfig', 'ForgettingTransformerForCausalLM', 'ForgettingTransformerModel', + 'GatedDeltaNetConfig', 'GatedDeltaNetForCausalLM', 'GatedDeltaNetModel', + 'GLAConfig', 'GLAForCausalLM', 'GLAModel', + 'GSAConfig', 'GSAForCausalLM', 'GSAModel', + 'HGRNConfig', 'HGRNForCausalLM', 'HGRNModel', + 'HGRN2Config', 'HGRN2ForCausalLM', 'HGRN2Model', + 'LightNetConfig', 'LightNetForCausalLM', 'LightNetModel', + 'LinearAttentionConfig', 'LinearAttentionForCausalLM', 'LinearAttentionModel', + 'MambaConfig', 'MambaForCausalLM', 'MambaModel', + 'Mamba2Config', 'Mamba2ForCausalLM', 'Mamba2Model', + 'NSAConfig', 'NSAForCausalLM', 'NSAModel', + 'RetNetConfig', 'RetNetForCausalLM', 'RetNetModel', + 'RWKV6Config', 'RWKV6ForCausalLM', 'RWKV6Model', + 'RWKV7Config', 'RWKV7ForCausalLM', 'RWKV7Model', + 'SambaConfig', 'SambaForCausalLM', 'SambaModel', + 'TransformerConfig', 'TransformerForCausalLM', 'TransformerModel', + 'MTPTransformerConfig', 'MTPTransformerForCausalLM', 'MTPTransformerModel', + 'GatedDeltaProductConfig', 'GatedDeltaProductForCausalLM', 'GatedDeltaProductModel', +] diff --git a/fla/models/__pycache__/__init__.cpython-311.pyc b/fla/models/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bad8f97c6a3a141e98c1e7d6bfc56ee704355ae5 Binary files /dev/null and b/fla/models/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/models/__pycache__/utils.cpython-311.pyc 
b/fla/models/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8d04a955195d4fe75184d39dc1f6b12871862aae Binary files /dev/null and b/fla/models/__pycache__/utils.cpython-311.pyc differ diff --git a/fla/models/lightnet/__pycache__/__init__.cpython-311.pyc b/fla/models/lightnet/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0a2b43f6a58c73bd0971c0313c1f5cbedc90ad4 Binary files /dev/null and b/fla/models/lightnet/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/models/linear_attn/__pycache__/configuration_linear_attn.cpython-311.pyc b/fla/models/linear_attn/__pycache__/configuration_linear_attn.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..590fc3886c64a026905a48111b51291f7d80ebbc Binary files /dev/null and b/fla/models/linear_attn/__pycache__/configuration_linear_attn.cpython-311.pyc differ diff --git a/fla/models/linear_attn/modeling_linear_attn.py b/fla/models/linear_attn/modeling_linear_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..73f8454cd285f75fcf56797954df318dedb3e5c8 --- /dev/null +++ b/fla/models/linear_attn/modeling_linear_attn.py @@ -0,0 +1,406 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.utils.deprecation import deprecate_kwarg + +from fla.layers.attn import Attention +from fla.layers.linear_attn import LinearAttention +from fla.models.linear_attn.configuration_linear_attn import LinearAttentionConfig +from fla.models.utils import Cache +from 
fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss +from fla.modules import GatedMLP as LinearAttentionMLP +from fla.modules import RMSNorm + +logger = logging.get_logger(__name__) + + +class LinearAttentionBlock(nn.Module): + def __init__(self, config: LinearAttentionConfig, layer_idx: int): + super().__init__() + + self.config = config + self.layer_idx = layer_idx + + self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + if config.attn is not None and layer_idx in config.attn['layers']: + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.attn['num_heads'], + num_kv_heads=config.attn['num_kv_heads'], + qkv_bias=config.attn['qkv_bias'], + window_size=config.attn['window_size'], + rope_theta=config.attn['rope_theta'], + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + else: + self.attn = LinearAttention( + mode=config.attn_mode, + hidden_size=config.hidden_size, + expand_k=config.expand_k, + expand_v=config.expand_v, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + feature_map=config.feature_map, + tie_feature_map_qk=config.tie_feature_map_qk, + norm_q=config.norm_q, + norm_k=config.norm_k, + do_feature_map_norm=config.norm_feature_map, + elementwise_affine=config.elementwise_affine, + norm_eps=config.norm_eps, + layer_idx=layer_idx + ) + self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.mlp = LinearAttentionMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + fuse_swiglu=config.fuse_swiglu + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + 
**kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + # currently not supported + attentions, past_key_values = None, None + hidden_states = self.attn_norm(hidden_states) + hidden_states = self.attn(hidden_states=hidden_states, **kwargs) + if self.config.fuse_norm: + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + else: + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.mlp_norm(hidden_states) + hidden_states = self.mlp(hidden_states, **kwargs) + hidden_states = residual + hidden_states + + outputs = (hidden_states, attentions, past_key_values) + + return outputs + + +class LinearAttentionPreTrainedModel(PreTrainedModel): + + config_class = LinearAttentionConfig + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['LinearAttentionBlock'] + _supports_cache_class = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + prenorm_residual_strategy: Optional[str] = 'rescale', + num_residuals_per_layer: int = 2, + ): + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + elif hasattr(module, 'reset_parameters'): + module.reset_parameters() + + if prenorm_residual_strategy is not None: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. 
Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + p = None + if hasattr(module, 'o_proj'): + p = module.o_proj.weight + elif hasattr(module, 'down_proj'): + p = module.down_proj.weight + if p is not None: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + if prenorm_residual_strategy == 'rescale': + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers) + elif prenorm_residual_strategy == 'zero': + nn.init.zeros_(p) + else: + raise ValueError(f"Invalid prenorm_residual_strategy: {prenorm_residual_strategy}") + + +class LinearAttentionModel(LinearAttentionPreTrainedModel): + + def __init__(self, config: LinearAttentionConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([LinearAttentionBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) + self.norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, # noqa + inputs_embeds: Optional[torch.FloatTensor] = None, + 
past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None + ) -> Union[Tuple, BaseModelOutputWithPast]: + if output_attentions: + warnings.warn( + "`LinearAttentionModel` does not support output attention weights now, " + "so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + hidden_states = inputs_embeds + + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = Cache.from_legacy_cache(past_key_values) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`...") + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + hidden_states, attentions, past_key_values = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + past_key_values, + use_cache, + output_attentions, + ) + else: + hidden_states, attentions, past_key_values = layer( + hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions + ) + + if output_attentions: + all_attns += (attentions,) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(i for i in [hidden_states, past_key_values, all_hidden_states, all_attns] if i is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_attns + ) + + +class LinearAttentionForCausalLM(LinearAttentionPreTrainedModel, GenerationMixin): + + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = LinearAttentionModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.criterion = None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embeddings + + def set_input_embeddings(self, value): + self.model.embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def 
set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def generate(self, *args, **kwargs): + try: + return super().generate(*args, **kwargs) + except AttributeError as exception: + if 'past_key_values' in str(exception): + raise AttributeError( + f"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`, " + f"which is not supported for {self.__class__.__name__}. " + f"Try another generation strategy instead. " + f"For the available generation strategies, check this doc: " + f"https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies" + ) + else: + raise exception + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: bool = True, + logits_to_keep: Optional[int] = None, + **kwargs + ): + # only last token for `inputs_ids` if the `past_key_values` is not empty. + if past_key_values is not None and len(past_key_values) > 0: + input_ids = input_ids[:, -1:] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(past_key_values) == 0: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. + # Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. 
+ model_inputs = {'input_ids': input_ids.contiguous()} + + if logits_to_keep is not None: + model_inputs['logits_to_keep'] = logits_to_keep + + model_inputs.update({ + 'past_key_values': past_key_values, + 'use_cache': use_cache, + 'attention_mask': attention_mask, + }) + return model_inputs + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Optional[int] = 0 + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict + ) + + hidden_states = outputs[0] + fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training + + loss, logits = None, None + if not fuse_linear_and_cross_entropy or labels is None: + logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:]) + if labels is not None: + if getattr(self, 'criterion', None) is None: + if fuse_linear_and_cross_entropy: + criterion = FusedLinearCrossEntropyLoss() + elif 
self.config.fuse_cross_entropy: + criterion = FusedCrossEntropyLoss(inplace_backward=True) + else: + criterion = nn.CrossEntropyLoss() + else: + criterion = self.criterion + labels = labels.to(hidden_states.device) + labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1) + if fuse_linear_and_cross_entropy: + loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias) + else: + loss = criterion(logits.view(labels.numel(), -1), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fla/models/mamba/__pycache__/configuration_mamba.cpython-311.pyc b/fla/models/mamba/__pycache__/configuration_mamba.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b34d1751323cff7a0cf89306fce9189358ca99bc Binary files /dev/null and b/fla/models/mamba/__pycache__/configuration_mamba.cpython-311.pyc differ diff --git a/fla/models/mamba/__pycache__/modeling_mamba.cpython-311.pyc b/fla/models/mamba/__pycache__/modeling_mamba.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7c2b6a649020e642f06c07457e4e115412ee5697 Binary files /dev/null and b/fla/models/mamba/__pycache__/modeling_mamba.cpython-311.pyc differ diff --git a/fla/models/mamba2/__pycache__/configuration_mamba2.cpython-311.pyc b/fla/models/mamba2/__pycache__/configuration_mamba2.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cf92ce0021546d75b6e64567de23baa3c4e84783 Binary files /dev/null and b/fla/models/mamba2/__pycache__/configuration_mamba2.cpython-311.pyc differ diff --git a/fla/models/mamba2/__pycache__/modeling_mamba2.cpython-311.pyc 
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MAMBA2 configuration"""

import math
from typing import Union

from transformers.configuration_utils import PretrainedConfig


class Mamba2Config(PretrainedConfig):
    """
    This is the configuration class to store the configuration of a [`Mamba2Model`]. It is used to instantiate a MAMBA2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the MAMBA2
    [state-spaces/mamba2-2.8b](https://huggingface.co/state-spaces/mamba2-2.8b) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        num_heads (`int`, *optional*, defaults to 64):
            Number of heads for the evolution matrices of mamba 2.
        head_dim (`int`, *optional*, defaults to 64):
            Dimension of each head.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the MAMBA2 model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Mamba2Model`].
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        state_size (`int`, *optional*, defaults to 128): shape of the state space latents.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the model.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
            The epsilon to use in the layer normalization layers.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the beginning of sentence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the end of sentence token in the vocabulary.
        expand (`int`, *optional*, defaults to 2): Expanding factor used to determine the intermediate size.
        conv_kernel (`int`, *optional*, defaults to 4): Size of the convolution kernel.
        n_groups (`int`, *optional*, defaults to 1):
            Number of groups for the evolution matrices of mamba 2.
        use_bias (`bool`, *optional*, defaults to `False`):
            Whether or not to use bias in ["in_proj", "out_proj"] of the mixer block
        use_conv_bias (`bool`, *optional*, defaults to `True`):
            Whether or not to use bias in the convolution layer of the mixer block.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        initializer_range (`float`, *optional*, defaults to 0.1):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        residual_in_fp32 (`bool`, *optional*, defaults to `True`):
            Whether or not residuals should be in `float32`.
            If set to `False` residuals will keep the same `dtype` as the rest of the model
        time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
            Rank of the discretization projection matrix.
            `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
        time_step_min (`float`, *optional*, defaults to 0.001):
            Minimum `time_step` used to bound `dt_proj.bias`.
        time_step_max (`float`, *optional*, defaults to 0.1):
            Maximum `time_step` used to bound `dt_proj.bias`.
        time_step_floor (`float`, *optional*, defaults to 0.0001):
            Minimum clamping value of the `dt_proj.bias` layer initialization.
        time_step_limit (`tuple`, *optional*, defaults to `(0.0, inf)`):
            Accepted range of time step values.
        rescale_prenorm_residual (`bool`, *optional*, defaults to `True`):
            Whether or not to rescale `out_proj` weights when initializing.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the cache should be used.
        rms_norm (`bool`, *optional*, defaults to `True`):
            Whether to use RMS norm or not.
        chunk_size (`int`, *optional*, defaults to 256):
            Size of the chunks that will comprise the sequence.
        fuse_norm (`bool`, *optional*, defaults to `True`):
            Stored on the config unchanged. Presumably selects a fused normalization kernel in the modeling
            code -- not visible from this file, confirm against the model implementation.
        fuse_cross_entropy (`bool`, *optional*, defaults to `True`):
            Stored on the config unchanged. Presumably selects a fused cross-entropy loss in the modeling
            code -- not visible from this file, confirm against the model implementation.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie word embeddings or not.
    """

    model_type = "mamba2"

    def __init__(
        self,
        num_heads: int = 64,
        head_dim: int = 64,
        vocab_size: int = 32000,
        hidden_size: int = 2048,
        state_size: int = 128,
        num_hidden_layers: int = 48,
        layer_norm_epsilon: float = 1e-5,
        pad_token_id: int = 0,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        expand: int = 2,
        conv_kernel: int = 4,
        n_groups: int = 1,
        use_bias: bool = False,
        use_conv_bias: bool = True,
        hidden_act: str = "silu",
        initializer_range: float = 0.1,
        residual_in_fp32: bool = True,
        time_step_rank: Union[int, str] = "auto",
        time_step_min: float = 0.001,
        time_step_max: float = 0.1,
        time_step_floor: float = 1e-4,
        time_step_limit=(0.0, float("inf")),
        rescale_prenorm_residual: bool = True,
        use_cache: bool = True,
        rms_norm: bool = True,
        chunk_size: int = 256,
        fuse_norm: bool = True,
        fuse_cross_entropy: bool = True,
        tie_word_embeddings: bool = False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.state_size = state_size
        self.num_hidden_layers = num_hidden_layers
        self.layer_norm_epsilon = layer_norm_epsilon
        self.conv_kernel = conv_kernel
        self.expand = expand

        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        self.use_bias = use_bias
        self.use_conv_bias = use_conv_bias
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        # "auto" derives the rank from the hidden size; any other value is stored as given.
        self.time_step_rank = (
            math.ceil(self.hidden_size / 16)
            if time_step_rank == "auto"
            else time_step_rank
        )
        self.time_step_min = time_step_min
        self.time_step_max = time_step_max
        self.time_step_floor = time_step_floor
        self.rescale_prenorm_residual = rescale_prenorm_residual
        self.residual_in_fp32 = residual_in_fp32
        self.use_cache = use_cache
        self.n_groups = n_groups
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.rms_norm = rms_norm
        self.chunk_size = chunk_size
        self.time_step_limit = time_step_limit
        self.fuse_norm = fuse_norm
        self.fuse_cross_entropy = fuse_cross_entropy
        self.tie_word_embeddings = tie_word_embeddings

        # Token ids and embedding tying are also handed to the base class, which consumes
        # the remaining `**kwargs`.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
from typing import Optional

from transformers.configuration_utils import PretrainedConfig


class NSAConfig(PretrainedConfig):
    """Configuration container for the `nsa` model type.

    Every constructor argument except the token ids and `tie_word_embeddings` is stored on
    the instance under its own name. The token ids, `tie_word_embeddings` and any extra
    `**kwargs` are forwarded to `PretrainedConfig.__init__`.
    """

    model_type = 'nsa'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        hidden_size: int = 2048,
        num_hidden_layers: int = 24,
        num_heads: int = 64,
        num_kv_heads: int = 4,
        head_dim: int = 32,
        qkv_bias: bool = False,
        block_size: int = 64,
        block_counts: Optional[int] = 16,
        window_size: Optional[int] = 512,
        rope_theta: Optional[float] = 10000.,
        max_position_embeddings: int = 2048,
        hidden_ratio: Optional[int] = 4,
        intermediate_size: Optional[int] = None,
        hidden_act: str = "swish",
        initializer_range: float = 0.006,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        use_cache: bool = True,
        pad_token_id: int = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        **kwargs,
    ):
        # Model width / depth and vocabulary.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers

        # Attention settings.
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        self.qkv_bias = qkv_bias
        self.block_size = block_size
        self.block_counts = block_counts
        self.window_size = window_size
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        # Feed-forward settings.
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act

        # Initialisation, normalisation and caching.
        self.initializer_range = initializer_range
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.use_cache = use_cache

        # Fused-implementation switches (stored only; consumed elsewhere).
        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy

        # The base class takes the special-token ids, the tying flag and leftover kwargs.
        base_kwargs = dict(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
        )
        super().__init__(**base_kwargs, **kwargs)
class RetNetConfig(PretrainedConfig):
    """Hyper-parameter container for RetNet models.

    Constructor arguments are stored as attributes of the same name. The
    special-token ids, ``tie_word_embeddings`` and any extra keyword arguments
    are forwarded to ``PretrainedConfig.__init__``.

    ``attn`` optionally describes hybrid attention layers. When given it must
    be a dict containing at least ``'layers'`` and ``'num_heads'``; the keys
    ``'num_kv_heads'`` (defaults to ``attn['num_heads']``), ``'qkv_bias'``
    (``False``), ``'window_size'`` (``None``) and ``'rope_theta'`` (``10000.``)
    are filled in when absent. The defaults are written to a copy, so the
    caller's dict is left untouched.

    Raises:
        ValueError: if ``attn`` is not a dict or lacks a required key.
    """

    model_type = 'retnet'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        attn_mode: str = "chunk",
        hidden_size: int = 2048,
        expand_k: int = 1,
        expand_v: int = 2,
        hidden_ratio: Optional[int] = 2,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 8,
        num_kv_heads: Optional[int] = None,
        feature_map: Optional[str] = None,
        hidden_act: str = "swish",
        use_short_conv: bool = False,
        conv_size: int = 4,
        use_output_gate: bool = True,
        max_position_embeddings: int = 2048,
        elementwise_affine: Optional[bool] = True,
        norm_eps: float = 1e-6,
        attn: Optional[Dict] = None,
        use_cache: bool = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        initializer_range: float = 0.006,
        fuse_norm: bool = True,
        fuse_swiglu: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        **kwargs
    ) -> None:
        if attn is not None:
            if not isinstance(attn, dict):
                raise ValueError("attn must be a dictionary")
            if 'layers' not in attn:
                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
            if 'num_heads' not in attn:
                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
            # Work on a shallow copy so the defaults below never leak into the
            # dict object owned by the caller.
            attn = dict(attn)
            attn.setdefault('num_kv_heads', attn['num_heads'])
            attn.setdefault('qkv_bias', False)
            attn.setdefault('window_size', None)
            attn.setdefault('rope_theta', 10000.)

        self.attn_mode = attn_mode
        self.hidden_size = hidden_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.feature_map = feature_map
        self.hidden_act = hidden_act
        self.use_short_conv = use_short_conv
        self.conv_size = conv_size
        self.use_output_gate = use_output_gate
        self.max_position_embeddings = max_position_embeddings
        self.elementwise_affine = elementwise_affine
        self.norm_eps = norm_eps
        self.attn = attn
        self.use_cache = use_cache
        self.initializer_range = initializer_range

        self.fuse_norm = fuse_norm
        self.fuse_swiglu = fuse_swiglu
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
layer_idx + + self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + if config.attn is not None and layer_idx in config.attn['layers']: + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.attn['num_heads'], + num_kv_heads=config.attn['num_kv_heads'], + qkv_bias=config.attn['qkv_bias'], + window_size=config.attn['window_size'], + rope_theta=config.attn['rope_theta'], + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + else: + self.attn = MultiScaleRetention( + mode=config.attn_mode, + hidden_size=config.hidden_size, + expand_k=config.expand_k, + expand_v=config.expand_v, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + feature_map=config.feature_map, + use_output_gate=config.use_output_gate, + gate_fn=config.hidden_act, + elementwise_affine=config.elementwise_affine, + norm_eps=config.norm_eps, + fuse_norm=config.fuse_norm, + layer_idx=layer_idx + ) + self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.mlp = RetNetMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + fuse_swiglu=config.fuse_swiglu + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + **kwargs: Unpack[Dict] + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + + hidden_states = self.attn_norm(hidden_states) + hidden_states, attentions, past_key_values = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs + ) + if 
class RetNetPreTrainedModel(PreTrainedModel):
    """Shared base for the RetNet model classes: class-level settings and weight init."""

    config_class = RetNetConfig
    base_model_prefix = 'model'
    supports_gradient_checkpointing = True
    _no_split_modules = ['RetNetBlock']
    _supports_cache_class = True

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

    def _init_weights(
        self,
        module: nn.Module,
        prenorm_residual_strategy: Optional[str] = 'rescale',
        num_residuals_per_layer: int = 2,
    ):
        """Initialise ``module`` and optionally re-initialise its residual output projection.

        Linear, Conv1d and Embedding weights are drawn from a normal
        distribution with std ``config.initializer_range`` (Linear/Conv1d
        biases are zeroed); any other module exposing ``reset_parameters``
        has that method called.

        When ``prenorm_residual_strategy`` is not ``None`` and the module has
        an ``o_proj`` (checked first) or ``down_proj`` child, that child's
        weight is either re-drawn and scaled down (``'rescale'``) or zeroed
        (``'zero'``).

        Raises:
            ValueError: for any other strategy name, but only when such a
                projection weight was found.
        """
        is_projection = isinstance(module, (nn.Linear, nn.Conv1d))
        if is_projection or isinstance(module, nn.Embedding):
            # Slightly different from the TF version, which uses truncated_normal
            # cf https://github.com/pytorch/pytorch/pull/5617
            nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if is_projection and module.bias is not None:
                nn.init.zeros_(module.bias)
        elif hasattr(module, 'reset_parameters'):
            module.reset_parameters()

        if prenorm_residual_strategy is None:
            return

        # GPT-2 style initialisation: weights feeding the residual stream are
        # scaled by 1/sqrt(N), N being the number of residual layers, to
        # account for accumulation along the residual path with depth.
        #   -- GPT-2 :: https://openai.com/blog/better-language-models/
        # Reference (Megatron-LM):
        #   https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
        if hasattr(module, 'o_proj'):
            weight = module.o_proj.weight
        elif hasattr(module, 'down_proj'):
            weight = module.down_proj.weight
        else:
            return
        if weight is None:
            return

        if prenorm_residual_strategy == 'rescale':
            # Re-draw before scaling: this hook may run more than once, and
            # scaling the existing values would shrink them a little more on
            # every call.
            nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
            with torch.no_grad():
                weight.div_(math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers))
        elif prenorm_residual_strategy == 'zero':
            nn.init.zeros_(weight)
        else:
            raise ValueError(f"Invalid prenorm_residual_strategy: {prenorm_residual_strategy}")
output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Unpack[Dict] + ) -> Union[Tuple, BaseModelOutputWithPast]: + if output_attentions: + warnings.warn( + "`RetNetModel` does not support output attention weights now, so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + hidden_states = inputs_embeds + + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = Cache.from_legacy_cache(past_key_values) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. 
class RetNetForCausalLM(RetNetPreTrainedModel, GenerationMixin):
    """RetNet backbone (``RetNetModel``) with a bias-free linear language-model head."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = RetNetModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Optional loss override; while it stays None, forward() picks a loss
        # based on the config and the training flag.
        self.criterion = None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embeddings

    def set_input_embeddings(self, value):
        self.model.embeddings = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    def generate(self, *args, **kwargs):
        """Delegate to the parent ``generate`` and clarify cache-related attribute errors.

        Raises:
            AttributeError: re-raised with an explanatory message when the
                original message mentions ``past_key_values``; any other
                ``AttributeError`` propagates unchanged.
        """
        try:
            return super().generate(*args, **kwargs)
        except AttributeError as exception:
            # Expected exception: "AttributeError: '(object name)' object has no attribute 'past_key_values'"
            if 'past_key_values' in str(exception):
                raise AttributeError(
                    f"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`, "
                    f"which is not supported for {self.__class__.__name__}. "
                    f"Try another generation strategy instead. "
                    f"For the available generation strategies, check this doc: "
                    f"https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies"
                ) from exception
            raise

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor = None,
        past_key_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = True,
        logits_to_keep: Optional[int] = None,
        **kwargs: Unpack[Dict]
    ):
        """Build the keyword arguments for one generation step.

        Returns a dict with ``past_key_values``, ``use_cache``,
        ``attention_mask`` and either ``inputs_embeds`` (only while no state
        has been cached yet) or ``input_ids``; ``logits_to_keep`` is included
        when it is not ``None``.
        """
        # A missing cache and an empty cache both mean "first step". The
        # former used to crash in `len(None)`; the latter used to drop all but
        # the last prompt token.
        has_past = past_key_values is not None and len(past_key_values) > 0

        # Only the newest token is needed once earlier state has been cached.
        if has_past:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and not has_past:
            model_inputs = {'inputs_embeds': inputs_embeds}
        else:
            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
            # recompiles graphs as the stride of the inputs is a guard.
            # Ref: https://github.com/huggingface/transformers/pull/29114
            # TODO: use `next_tokens` directly instead.
            model_inputs = {'input_ids': input_ids.contiguous()}

        if logits_to_keep is not None:
            model_inputs['logits_to_keep'] = logits_to_keep

        model_inputs.update({
            'past_key_values': past_key_values,
            'use_cache': use_cache,
            'attention_mask': attention_mask,
        })
        return model_inputs

    @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Optional[int] = 0,
        **kwargs: Unpack[Dict]
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the backbone, then compute logits and/or the next-token loss.

        When ``config.fuse_cross_entropy`` is set, the module is in training
        mode and ``labels`` are given, the head projection and the loss are
        computed together and ``logits`` stays ``None``. Otherwise logits are
        computed for the last ``logits_to_keep`` positions; with the default
        ``0`` the slice ``[-0:]`` keeps every position.

        Labels are shifted left by one inside this method and the final
        position is filled with the criterion's ``ignore_index``.

        Returns:
            ``CausalLMOutputWithPast`` when ``return_dict`` resolves to true,
            otherwise a tuple ``(loss?, logits, *backbone_outputs[1:])`` with
            ``loss`` present only when it was computed.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs
        )

        hidden_states = outputs[0]
        fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training

        loss, logits = None, None
        if not fuse_linear_and_cross_entropy or labels is None:
            logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:])
        if labels is not None:
            if getattr(self, 'criterion', None) is None:
                if fuse_linear_and_cross_entropy:
                    criterion = FusedLinearCrossEntropyLoss()
                elif self.config.fuse_cross_entropy:
                    criterion = FusedCrossEntropyLoss(inplace_backward=True)
                else:
                    criterion = nn.CrossEntropyLoss()
            else:
                criterion = self.criterion
            labels = labels.to(hidden_states.device)
            # Shift so position t is scored against token t + 1; the last
            # position has no target and is masked out via ignore_index.
            labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1)
            if fuse_linear_and_cross_entropy:
                loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias)
            else:
                loss = criterion(logits.view(labels.numel(), -1), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
class RWKV6Config(PretrainedConfig):
    """Hyper-parameter container for RWKV6 models.

    Constructor arguments are stored as attributes of the same name. The
    special-token ids, ``tie_word_embeddings`` and any extra keyword arguments
    are forwarded to ``PretrainedConfig.__init__``.

    ``attn`` optionally describes hybrid attention layers. When given it must
    be a dict containing at least ``'layers'`` and ``'num_heads'``; the keys
    ``'num_kv_heads'`` (defaults to ``attn['num_heads']``), ``'qkv_bias'``
    (``False``), ``'window_size'`` (``None``) and ``'rope_theta'`` (``10000.``)
    are filled in when absent. The defaults are written to a copy, so the
    caller's dict is left untouched.

    Raises:
        ValueError: if ``attn`` is not a dict or lacks a required key.
    """

    model_type = 'rwkv6'
    keys_to_ignore_at_inference = ['past_key_values']

    def __init__(
        self,
        attn_mode: str = "chunk",
        hidden_size: int = 2048,
        # The defaults of expand_k and hidden_ratio are fractional, so they
        # are annotated as float rather than int.
        expand_k: float = 0.5,
        expand_v: int = 1,
        hidden_ratio: Optional[float] = 3.5,
        intermediate_size: Optional[int] = None,
        num_hidden_layers: int = 24,
        num_heads: int = 4,
        proj_low_rank_dim: int = 32,
        gate_low_rank_dim: int = 64,
        hidden_act: str = "sqrelu",
        max_position_embeddings: int = 2048,
        norm_first: bool = True,
        norm_bias: bool = True,
        norm_eps: float = 1e-5,
        attn: Optional[Dict] = None,
        use_cache: bool = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        tie_word_embeddings: bool = False,
        initializer_range: float = 0.006,
        fuse_norm: bool = True,
        fuse_cross_entropy: bool = True,
        vocab_size: int = 32000,
        **kwargs
    ):
        if attn is not None:
            if not isinstance(attn, dict):
                raise ValueError("attn must be a dictionary")
            if 'layers' not in attn:
                raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
            if 'num_heads' not in attn:
                raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
            # Work on a shallow copy so the defaults below never leak into the
            # dict object owned by the caller.
            attn = dict(attn)
            attn.setdefault('num_kv_heads', attn['num_heads'])
            attn.setdefault('qkv_bias', False)
            attn.setdefault('window_size', None)
            attn.setdefault('rope_theta', 10000.)

        self.attn_mode = attn_mode
        self.hidden_size = hidden_size
        self.expand_k = expand_k
        self.expand_v = expand_v
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size
        self.norm_first = norm_first
        self.num_hidden_layers = num_hidden_layers
        self.num_heads = num_heads
        self.proj_low_rank_dim = proj_low_rank_dim
        self.gate_low_rank_dim = gate_low_rank_dim
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.norm_bias = norm_bias
        self.norm_eps = norm_eps
        self.attn = attn
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.fuse_norm = fuse_norm
        self.fuse_cross_entropy = fuse_cross_entropy
        self.vocab_size = vocab_size

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
0000000000000000000000000000000000000000..9d7fdb3e7180162fa4dcf84f1fe5bdad3b0ac1b3 Binary files /dev/null and b/fla/models/rwkv7/__pycache__/configuration_rwkv7.cpython-311.pyc differ diff --git a/fla/models/rwkv7/__pycache__/modeling_rwkv7.cpython-311.pyc b/fla/models/rwkv7/__pycache__/modeling_rwkv7.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b8ffcd36a3c8fb0d7b4b1d4480b0a43ef5716e3 Binary files /dev/null and b/fla/models/rwkv7/__pycache__/modeling_rwkv7.cpython-311.pyc differ diff --git a/fla/models/rwkv7/configuration_rwkv7.py b/fla/models/rwkv7/configuration_rwkv7.py new file mode 100644 index 0000000000000000000000000000000000000000..2def0393fdedcd84c415c908c280d3550fd3d942 --- /dev/null +++ b/fla/models/rwkv7/configuration_rwkv7.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +from typing import Dict, List, Optional, Union + +from transformers.configuration_utils import PretrainedConfig + + +class RWKV7Config(PretrainedConfig): + + model_type = 'rwkv7' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + attn_mode: str = "chunk", + hidden_size: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + num_hidden_layers: int = 24, + head_dim: Optional[int] = 64, + num_heads: Optional[int] = None, + decay_low_rank_dim: int = 64, + gate_low_rank_dim: int = 128, + a_low_rank_dim: int = 64, + v_low_rank_dim: int = 16, + hidden_act: str = "sqrelu", + max_position_embeddings: int = 2048, + norm_first: bool = True, + norm_bias: bool = True, + norm_eps: float = 1e-5, + attn: Optional[Dict] = None, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + initializer_range: float = 0.006, + fuse_norm: bool = True, + fuse_cross_entropy: bool = True, + vocab_size: int = 32000, + value_dim: Optional[Union[int, List[int]]] = None, + **kwargs + ): + self.attn_mode = attn_mode + 
self.hidden_size = hidden_size + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.norm_first = norm_first + self.num_hidden_layers = num_hidden_layers + + if head_dim is None and num_heads is not None: + head_dim = int(hidden_size // num_heads) + elif head_dim is not None and num_heads is None: + num_heads = int(hidden_size // head_dim) + + if value_dim is None: + value_dim = [hidden_size] * num_hidden_layers + elif isinstance(value_dim, int): + assert value_dim >= hidden_size, "value_dim must be greater than hidden_size" + assert value_dim % hidden_size == 0, "value_dim must be divisible by hidden_size" + value_dim = [value_dim] * num_hidden_layers + else: + assert len(value_dim) == num_hidden_layers, "value_dim must have the same length as num_hidden_layers" + for v in value_dim: + assert v >= hidden_size, "value_dim must be greater than hidden_size" + assert v % hidden_size == 0, "value_dim must be divisible by hidden_size" + + self.head_dim = head_dim + self.num_heads = num_heads + self.value_dim = value_dim + + self.decay_low_rank_dim = decay_low_rank_dim + self.gate_low_rank_dim = gate_low_rank_dim + self.a_low_rank_dim = a_low_rank_dim + self.v_low_rank_dim = v_low_rank_dim + self.hidden_act = hidden_act + self.max_position_embeddings = max_position_embeddings + self.norm_bias = norm_bias + self.norm_eps = norm_eps + self.attn = attn + self.use_cache = use_cache + self.initializer_range = initializer_range + self.fuse_norm = fuse_norm + self.fuse_cross_entropy = fuse_cross_entropy + self.vocab_size = vocab_size + + if attn is not None: + if not isinstance(attn, Dict): + raise ValueError("attn must be a dictionary") + if 'layers' not in attn: + raise ValueError("Layer indices must be provided to initialize hybrid attention layers") + if 'num_heads' not in attn: + raise ValueError("Number of heads must be provided to initialize hybrid attention layers") + attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads']) + 
class RWKV7FeedForward(nn.Module):
    """Token-shifted feed-forward layer used by the RWKV7 blocks.

    Each position is mixed with the position before it (weighted by the
    learned vector ``x_k``) and then passed through
    ``key -> activation -> value``.
    """

    def __init__(
        self,
        hidden_size: int,
        hidden_ratio: Optional[int] = None,
        intermediate_size: Optional[int] = None,
        hidden_act: str = 'sqrelu',
        layer_idx: Optional[int] = None
    ) -> None:
        """Create the projections.

        Args:
            hidden_size: width of the input and output features.
            hidden_ratio: expansion factor used to derive
                ``intermediate_size``; ``None`` means 4.
            intermediate_size: inner width. When ``None`` it is
                ``hidden_size * hidden_ratio`` rounded up to a multiple of 32.
            hidden_act: key into ``ACT2FN`` selecting the activation.
            layer_idx: index used to address this layer's entry in the cache.
        """
        super().__init__()

        self.hidden_size = hidden_size
        if hidden_ratio is None:
            hidden_ratio = 4
        if intermediate_size is None:
            intermediate_size = int(hidden_size * hidden_ratio)
            # Round up to the next multiple of 32.
            intermediate_size = 32 * ((intermediate_size + 32 - 1) // 32)
        self.hidden_ratio = hidden_ratio
        self.intermediate_size = intermediate_size

        # Pads one zero row before the sequence axis and crops one row at its
        # end, i.e. shifts the sequence by one position.
        self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))

        self.x_k = nn.Parameter(torch.zeros(hidden_size))

        self.key = nn.Linear(hidden_size, intermediate_size, bias=False)
        self.value = nn.Linear(intermediate_size, hidden_size, bias=False)
        self.act_fn = ACT2FN[hidden_act]

        self.layer_idx = layer_idx

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        state: Optional["Cache"] = None
    ) -> Tuple[torch.Tensor, Optional["Cache"]]:
        """Apply the token-shifted feed-forward transform.

        Args:
            x: input features; indexed as ``(batch, seq_len, hidden_size)``.
            attention_mask: optional mask; its last ``seq_len`` columns are
                multiplied into ``x``.
            state: optional cache. ``state[layer_idx]['ffn_state']`` supplies
                the last token of the previous call, and the last token of
                ``x`` is written back through ``state.update``.

        Returns:
            ``(output, state)`` with ``state`` being the object passed in.
        """
        if attention_mask is not None:
            x = x.mul(attention_mask[:, -x.shape[-2]:, None])

        prev_token = state[self.layer_idx]['ffn_state'] if state is not None else None
        if x.shape[1] == 1 and prev_token is not None:
            # Single-token step: the predecessor is exactly the cached token.
            shifted = prev_token.unsqueeze(1)
        else:
            shifted = self.time_shift(x)
            if prev_token is not None:
                # Seed position 0 of every sequence with that sequence's own
                # cached token. The cache holds `x[:, -1]`, one row per batch
                # entry, so the whole tensor is assigned here: taking `[-1]`
                # would pick the last batch row and broadcast it to all
                # sequences.
                shifted[:, 0] = prev_token
        if state is not None:
            # no need to update the offset twice
            state.update(ffn_state=x[:, -1], layer_idx=self.layer_idx, offset=0)
        return self.value(self.act_fn(self.key(x.addcmul(shifted - x, self.x_k)))), state
window_size=config.attn['window_size'], + rope_theta=config.attn['rope_theta'], + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + else: + self.attn = RWKV7Attention( + mode=config.attn_mode, + hidden_size=config.hidden_size, + head_dim=config.head_dim, + num_heads=config.num_heads, + decay_low_rank_dim=config.decay_low_rank_dim, + gate_low_rank_dim=config.gate_low_rank_dim, + a_low_rank_dim=config.a_low_rank_dim, + v_low_rank_dim=config.v_low_rank_dim, + norm_eps=config.norm_eps, + fuse_norm=config.fuse_norm, + layer_idx=layer_idx, + value_dim=config.value_dim[layer_idx] + ) + self.ffn_norm = (LayerNorm if config.fuse_norm else nn.LayerNorm)( + config.hidden_size, + bias=config.norm_bias, + eps=config.norm_eps + ) + self.ffn = RWKV7FeedForward( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + layer_idx=layer_idx + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = False, + output_attentions: Optional[bool] = False, + v_first: torch.Tensor = None, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = self.pre_norm(hidden_states) if hasattr(self, 'pre_norm') else hidden_states + hidden_states = self.attn_norm(residual) + hidden_states, attentions, past_key_values, v_first = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + v_first=v_first, + **kwargs + ) + if self.config.fuse_norm: + hidden_states, residual = self.ffn_norm(hidden_states, residual, True) + else: + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states, past_key_values = 
self.ffn(hidden_states, attention_mask, past_key_values) + hidden_states = residual + hidden_states + + outputs = (hidden_states, attentions, past_key_values, v_first) + + return outputs + + +class RWKV7PreTrainedModel(PreTrainedModel): + + config_class = RWKV7Config + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['RWKV7Block'] + _supports_cache_class = True + _skip_keys_device_placement = ["past_key_values"] + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + rescale_prenorm_residual: bool = True, + num_residuals_per_layer: int = 2, + ): + warnings.warn( + "RWKV-7 employs a carefully designed initialization strategy tailored to its architecture. " + "The detailed initialization scheme is currently not implemented here but can be found in the " + "official code repository. We emphasize that using the recommended initialization is essential " + "for replicating the results in RWKV-7 paper. Deviations from the prescribed initialization " + "may lead to performance degradation.\n" + "Alternatively, please generate initial weights from the official RWKV code repository, and " + "convert the PyTorch checkpoint into FLA supported format." 
+ ) + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Parameter): + nn.init.normal_(module, mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + elif hasattr(module, 'reset_parameters'): + module.reset_parameters() + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + p = None + if hasattr(module, 'o_proj'): + p = module.o_proj.weight + elif hasattr(module, 'down_proj'): + p = module.down_proj.weight + if p is not None: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers) + + +class RWKV7Model(RWKV7PreTrainedModel): + + def __init__(self, config: RWKV7Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, 
self.padding_idx) + self.layers = nn.ModuleList([RWKV7Block(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) + self.norm = (LayerNorm if config.fuse_norm else nn.LayerNorm)( + config.hidden_size, + bias=config.norm_bias, + eps=config.norm_eps + ) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, # noqa + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Cache] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Unpack[Dict] + ) -> Union[Tuple, BaseModelOutputWithPast]: + if output_attentions: + warnings.warn("`RWKV7Model` does not `output_attentions` now, setting it to `False`.") + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + hidden_states = inputs_embeds + + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = 
Cache.from_legacy_cache(past_key_values) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...") + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + + v_first = torch.zeros_like(hidden_states) + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + hidden_states, attentions, past_key_values, v_first = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + past_key_values, + use_cache, + output_attentions, + v_first, + **kwargs + ) + else: + hidden_states, attentions, past_key_values, v_first = layer( + hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + v_first=v_first, + **kwargs + ) + + if output_attentions: + all_attns += (attentions,) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(i for i in [hidden_states, past_key_values, all_hidden_states, all_attns] if i is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + hidden_states=all_hidden_states, + attentions=all_attns + ) + + +class RWKV7ForCausalLM(RWKV7PreTrainedModel, GenerationMixin): + + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = RWKV7Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.criterion = None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + 
return self.model.embeddings + + def set_input_embeddings(self, value): + self.model.embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def generate(self, *args, **kwargs): + try: + return super().generate(*args, **kwargs) + except AttributeError as exception: + if 'past_key_values' in str(exception): + raise AttributeError( + f"You tried to call `generate` with a decoding strategy that manipulates `past_key_values`, " + f"which is not supported for {self.__class__.__name__}. " + f"Try another generation strategy instead. " + f"For the available generation strategies, check this doc: " + f"https://huggingface.co/docs/transformers/en/generation_strategies#decoding-strategies" + ) + else: + raise exception + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor = None, + past_key_values: Optional[Cache] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: bool = True, + logits_to_keep: Optional[int] = None, + **kwargs + ): + # only last token for `inputs_ids` if the `past_key_values` is not empty. + if past_key_values is not None and len(past_key_values) > 0: + input_ids = input_ids[:, -1:] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(past_key_values) == 0: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. + # Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. 
+ model_inputs = {'input_ids': input_ids.contiguous()} + + if logits_to_keep is not None: + model_inputs['logits_to_keep'] = logits_to_keep + + model_inputs.update({ + 'past_key_values': past_key_values, + 'use_cache': use_cache, + 'attention_mask': attention_mask, + }) + return model_inputs + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values: Optional[Cache] = None, + labels: Optional[torch.LongTensor] = None, + shift_labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Optional[int] = 0, + **kwargs: Unpack[Dict] + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs + ) + + hidden_states = outputs[0] + fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training + + loss, logits = None, None + has_labels = (labels is not None) or (shift_labels is not None) + if not (fuse_linear_and_cross_entropy and has_labels): + logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:]) + if has_labels: + if getattr(self, 'criterion', None) is 
None: + if fuse_linear_and_cross_entropy: + criterion = FusedLinearCrossEntropyLoss() + elif self.config.fuse_cross_entropy: + criterion = FusedCrossEntropyLoss(inplace_backward=True) + else: + criterion = nn.CrossEntropyLoss() + else: + criterion = self.criterion + + # shift_labels: See https://github.com/huggingface/transformers/pull/36607/files. + if shift_labels is None: + shift_labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1) + shift_labels = shift_labels.to(hidden_states.device) + + if fuse_linear_and_cross_entropy: + loss = criterion(hidden_states, shift_labels, self.lm_head.weight, self.lm_head.bias) + else: + loss = criterion(logits.view(shift_labels.numel(), -1), shift_labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fla/models/samba/__init__.py b/fla/models/samba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a27a4b4cac782eb4a3e6c35216405d320e2c6507 --- /dev/null +++ b/fla/models/samba/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- + +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM + +from fla.models.samba.configuration_samba import SambaConfig +from fla.models.samba.modeling_samba import SambaBlock, SambaForCausalLM, SambaModel + +AutoConfig.register(SambaConfig.model_type, SambaConfig, True) +AutoModel.register(SambaConfig, SambaModel, True) +AutoModelForCausalLM.register(SambaConfig, SambaForCausalLM, True) + + +__all__ = ['SambaConfig', 'SambaForCausalLM', 'SambaModel', 'SambaBlock'] diff --git a/fla/models/samba/__pycache__/configuration_samba.cpython-311.pyc b/fla/models/samba/__pycache__/configuration_samba.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..b3bbeb3360d07e0669be0c5f16c132bc16a56be2 Binary files /dev/null and b/fla/models/samba/__pycache__/configuration_samba.cpython-311.pyc differ diff --git a/fla/models/samba/__pycache__/modeling_samba.cpython-311.pyc b/fla/models/samba/__pycache__/modeling_samba.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2994b6f16bab796109b40b6e36cafd32a991d662 Binary files /dev/null and b/fla/models/samba/__pycache__/modeling_samba.cpython-311.pyc differ diff --git a/fla/models/samba/configuration_samba.py b/fla/models/samba/configuration_samba.py new file mode 100644 index 0000000000000000000000000000000000000000..27311f06a81f0132a409b9dab10b63fc9e19333a --- /dev/null +++ b/fla/models/samba/configuration_samba.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- + +import math +from typing import Dict, Optional + +from transformers.configuration_utils import PretrainedConfig + + +class SambaConfig(PretrainedConfig): + + model_type = "samba" + + def __init__( + self, + hidden_size: int = 2304, + state_size: int = 16, + num_hidden_layers: int = 18, + norm_eps=1e-5, + pad_token_id: int = 0, + bos_token_id: int = 1, + eos_token_id: int = 2, + expand: int = 2, + conv_kernel: int = 4, + use_bias: bool = False, + use_conv_bias: bool = True, + hidden_act: str = "swish", + initializer_range: str = 0.02, + residual_in_fp32: bool = False, + time_step_rank: str = "auto", + time_step_scale: float = 1.0, + time_step_min: float = 0.001, + time_step_max: float = 0.1, + time_step_init_scheme: str = "random", + time_step_floor: float = 1e-4, + max_position_embeddings: int = 2048, + attn: Optional[Dict] = { + 'layers': (1, 3, 5, 7, 9, 11, 13, 15, 17), + 'num_heads': 18, + 'num_kv_heads': 18, + 'qkv_bias': False, + 'window_size': 2048, + 'rope_theta': 10000. 
+ }, + hidden_ratio: Optional[int] = 4, + rescale_prenorm_residual: bool = False, + use_cache: bool = True, + fuse_norm: bool = True, + fuse_swiglu: bool = True, + fuse_cross_entropy: bool = True, + vocab_size: int = 32000, + tie_word_embeddings: bool = False, + **kwargs, + ): + self.hidden_size = hidden_size + self.state_size = state_size + self.num_hidden_layers = num_hidden_layers + self.norm_eps = norm_eps + self.conv_kernel = conv_kernel + self.expand = expand + self.intermediate_size = int(expand * self.hidden_size) + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.pad_token_id = pad_token_id + self.use_bias = use_bias + self.use_conv_bias = use_conv_bias + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.time_step_rank = math.ceil(self.hidden_size / 16) if time_step_rank == "auto" else time_step_rank + self.time_step_scale = time_step_scale + self.time_step_min = time_step_min + self.time_step_max = time_step_max + self.time_step_init_scheme = time_step_init_scheme + self.time_step_floor = time_step_floor + self.max_position_embeddings = max_position_embeddings + self.attn = attn + self.hidden_ratio = hidden_ratio + self.rescale_prenorm_residual = rescale_prenorm_residual + self.residual_in_fp32 = residual_in_fp32 + self.use_cache = use_cache + + self.fuse_norm = fuse_norm + self.fuse_swiglu = fuse_swiglu + self.fuse_cross_entropy = fuse_cross_entropy + self.vocab_size = vocab_size + + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs + ) diff --git a/fla/models/samba/modeling_samba.py b/fla/models/samba/modeling_samba.py new file mode 100644 index 0000000000000000000000000000000000000000..0da2cfa64f3bb89e51e2f799194625e4448138a9 --- /dev/null +++ b/fla/models/samba/modeling_samba.py @@ -0,0 +1,413 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +from 
dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from transformers.generation import GenerationMixin +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ModelOutput, logging +from transformers.utils.deprecation import deprecate_kwarg + +from fla.layers.attn import Attention +from fla.models.mamba.modeling_mamba import MambaCache, MambaMixer +from fla.models.samba.configuration_samba import SambaConfig +from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss +from fla.modules import GatedMLP as SambaMLP +from fla.modules import RMSNorm + +if TYPE_CHECKING: + from transformers.processing_utils import Unpack + +logger = logging.get_logger(__name__) + + +class SambaBlock(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + + self.config = config + self.layer_idx = layer_idx + + self.mixer_norm = RMSNorm(hidden_size=config.hidden_size, eps=config.norm_eps) + if config.attn is not None and layer_idx in config.attn['layers']: + self.mixer = Attention( + hidden_size=config.hidden_size, + num_heads=config.attn['num_heads'], + num_kv_heads=config.attn['num_kv_heads'], + qkv_bias=config.attn['qkv_bias'], + window_size=config.attn['window_size'], + rope_theta=config.attn['rope_theta'], + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + else: + self.mixer = MambaMixer(config, layer_idx=layer_idx) + self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.mlp = SambaMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + hidden_act=config.hidden_act, + fuse_swiglu=config.fuse_swiglu + ) + + def forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[Tuple[torch.Tensor]] = None, + **kwargs: Unpack[Dict] + ) -> Tuple[torch.FloatTensor, 
Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + hidden_states = self.mixer_norm(hidden_states) + if isinstance(self.mixer, MambaMixer): + hidden_states = self.mixer(hidden_states, cache_params=cache_params, **kwargs) + else: + hidden_states, _, cache_params = self.mixer(hidden_states=hidden_states, past_key_values=cache_params, **kwargs) + if self.config.fuse_norm: + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + else: + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.mlp_norm(hidden_states) + hidden_states = self.mlp(hidden_states, **kwargs) + hidden_states = residual + hidden_states + return hidden_states + + +class SambaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SambaConfig + base_model_prefix = "backbone" + _no_split_modules = ["SambaBlock"] + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + if not getattr(module.bias, "_no_reinit", False): + nn.init.zeros_(module.bias) + elif isinstance(module, MambaMixer): + module.A_log._no_weight_decay = True + module.D._no_weight_decay = True + + dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale + if self.config.time_step_init_scheme == "constant": + nn.init.constant_(module.dt_proj.weight, dt_init_std) + elif self.config.time_step_init_scheme == "random": + nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std) + + dt = torch.exp( + torch.rand(self.config.intermediate_size) + * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min)) + + math.log(self.config.time_step_min) + 
).clamp(min=self.config.time_step_floor) + # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759 + inv_dt = dt + torch.log(-torch.expm1(-dt)) + with torch.no_grad(): + module.dt_proj.bias.data = nn.Parameter(inv_dt.to(module.dt_proj.bias.device)) + module.dt_proj.bias._no_reinit = True + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, std=self.config.initializer_range) + elif hasattr(module, 'reset_parameters'): + module.reset_parameters() + + if self.config.rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + for name, p in module.named_parameters(): + if name in ["out_proj.weight"]: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(self.config.num_layers) + + +@dataclass +class SambaOutput(ModelOutput): + """ + Class for the Samba model outputs. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + cache_params (`MambaCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. 
+ + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(torch.FloatTensor)`, *optional*, + returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + cache_params: Optional[MambaCache] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class SambaCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + cache_params (`MambaCache`): + The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to + avoid providing the old `input_ids`. + + Includes both the State space model state matrices after the selective scan, and the Convolutional states + hidden_states (`tuple(torch.FloatTensor)`, *optional*, + returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + cache_params: Optional[MambaCache] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +class SambaModel(SambaPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size) + self.layers = nn.ModuleList([SambaBlock(config, layer_idx=idx) for idx in range(config.num_hidden_layers)]) + + self.gradient_checkpointing = False + self.norm_f = RMSNorm(config.hidden_size, eps=config.norm_eps) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings = new_embeddings + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + cache_params: Optional[MambaCache] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Unpack[Dict] + ) -> Union[Tuple, SambaOutput]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): # ^ is python for xor + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + if self.gradient_checkpointing and self.training and use_cache: + use_cache = False + + if cache_params is None and use_cache: + cache_params = MambaCache( + self.config, inputs_embeds.size(0), device=inputs_embeds.device, 
dtype=inputs_embeds.dtype + ) + + hidden_states = inputs_embeds + all_hidden_states = () if output_hidden_states else None + for mixer_block in self.layers: + if self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + mixer_block.__call__, + hidden_states, + cache_params, + **kwargs + ) + else: + hidden_states = mixer_block( + hidden_states, + cache_params=cache_params, + **kwargs + ) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if use_cache: + cache_params.seqlen_offset += inputs_embeds.shape[1] + + hidden_states = self.norm_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, cache_params, all_hidden_states] if v is not None) + + return SambaOutput( + last_hidden_state=hidden_states, + cache_params=cache_params if use_cache else None, + hidden_states=all_hidden_states, + ) + + +class SambaForCausalLM(SambaPreTrainedModel, GenerationMixin): + + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.backbone = SambaModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.criterion = None + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def get_input_embeddings(self): + return self.backbone.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + return self.backbone.set_input_embeddings(new_embeddings) + + def _update_model_kwargs_for_generation( + self, outputs: ModelOutput, model_kwargs: Dict[str, Any], **kwargs + ) -> Dict[str, Any]: + model_kwargs["cache_params"] = outputs.get("cache_params", None) + return model_kwargs + + @deprecate_kwarg("num_logits_to_keep", 
version="4.50", new_name="logits_to_keep") + def prepare_inputs_for_generation( + self, + input_ids, + cache_params: + Optional[MambaCache] = None, + inputs_embeds=None, + attention_mask=None, + use_cache: Optional[bool] = True, + logits_to_keep: Optional[int] = None, + **kwargs: Unpack[Dict] + ): + # only last token for inputs_ids if the state is passed along. + if cache_params is not None: + input_ids = input_ids[:, -1].unsqueeze(-1) + + if inputs_embeds is not None and cache_params is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + if logits_to_keep is not None: + model_inputs['logits_to_keep'] = logits_to_keep + + model_inputs.update({ + 'cache_params': cache_params, + 'use_cache': use_cache, + 'attention_mask': attention_mask, + 'logits_to_keep': logits_to_keep, + }) + return model_inputs + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, # noqa + inputs_embeds: Optional[torch.FloatTensor] = None, + cache_params: Optional[MambaCache] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + logits_to_keep: Optional[int] = 0, + **kwargs: Unpack[Dict] + ) -> Union[Tuple, SambaCausalLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.backbone( + input_ids, + cache_params=cache_params, + inputs_embeds=inputs_embeds, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=use_cache, + **kwargs + ) + hidden_states = outputs[0] + fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training + + loss, logits = None, None + if not fuse_linear_and_cross_entropy or labels is None: + logits = self.lm_head(hidden_states if logits_to_keep is None else hidden_states[:, -logits_to_keep:]) + if labels is not None: + if getattr(self, 'criterion', None) is None: + if fuse_linear_and_cross_entropy: + criterion = FusedLinearCrossEntropyLoss() + elif self.config.fuse_cross_entropy: + criterion = FusedCrossEntropyLoss(inplace_backward=True) + else: + criterion = nn.CrossEntropyLoss() + else: + criterion = self.criterion + labels = labels.to(hidden_states.device) + labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1) + if fuse_linear_and_cross_entropy: + loss = criterion(hidden_states, labels, self.lm_head.weight, self.lm_head.bias) + else: + loss = criterion(logits.view(labels.numel(), -1), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return SambaCausalLMOutput( + loss=loss, + logits=logits, + cache_params=outputs.cache_params, + hidden_states=outputs.hidden_states, + ) diff --git a/fla/models/transformer/__init__.py b/fla/models/transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3c1a82c4ffb298bda4baf05000ca057c3b5a458f --- /dev/null +++ 
b/fla/models/transformer/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- + +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM + +from fla.models.transformer.configuration_transformer import TransformerConfig +from fla.models.transformer.modeling_transformer import TransformerForCausalLM, TransformerModel + +AutoConfig.register(TransformerConfig.model_type, TransformerConfig) +AutoModel.register(TransformerConfig, TransformerModel) +AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM) + + +__all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel'] diff --git a/fla/models/transformer/__pycache__/__init__.cpython-311.pyc b/fla/models/transformer/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8de1c4e6f90e9ccfaf837d117d6efb1da1d0981 Binary files /dev/null and b/fla/models/transformer/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/models/transformer/__pycache__/configuration_transformer.cpython-311.pyc b/fla/models/transformer/__pycache__/configuration_transformer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81441a38c0acf28bd302d835c8953ae0862fe393 Binary files /dev/null and b/fla/models/transformer/__pycache__/configuration_transformer.cpython-311.pyc differ diff --git a/fla/models/transformer/__pycache__/modeling_transformer.cpython-311.pyc b/fla/models/transformer/__pycache__/modeling_transformer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45ca2dab41f2dc6da4e6baf233ac752dd76cc9c6 Binary files /dev/null and b/fla/models/transformer/__pycache__/modeling_transformer.cpython-311.pyc differ diff --git a/fla/models/transformer/configuration_transformer.py b/fla/models/transformer/configuration_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..4f38961b024c4d1e52de3cab4985fd3e31cf71cc --- /dev/null +++ 
b/fla/models/transformer/configuration_transformer.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig + + +class TransformerConfig(PretrainedConfig): + + model_type = 'transformer' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + hidden_size: int = 2048, + num_hidden_layers: int = 24, + num_heads: int = 32, + num_kv_heads: int = None, + qkv_bias: bool = False, + qk_norm: bool = False, + window_size: Optional[int] = None, + rope_theta: Optional[float] = 10000., + max_position_embeddings: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + hidden_act: str = "swish", + initializer_range: float = 0.006, + elementwise_affine: Optional[bool] = True, + norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + fuse_norm: bool = True, + fuse_swiglu: bool = True, + fuse_cross_entropy: bool = True, + vocab_size: int = 32000, + use_myopic_loss: bool = False, + **kwargs, + ): + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.qkv_bias = qkv_bias + self.qk_norm = qk_norm + self.window_size = window_size + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + + self.initializer_range = initializer_range + self.elementwise_affine = elementwise_affine + self.norm_eps = norm_eps + self.use_cache = use_cache + + self.fuse_norm = fuse_norm + self.fuse_swiglu = fuse_swiglu + self.fuse_cross_entropy = fuse_cross_entropy + self.vocab_size = vocab_size + + self.use_myopic_loss = use_myopic_loss + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + 
eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/fla/models/transformer/modeling_transformer.py b/fla/models/transformer/modeling_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e8f683c6ebc06efeadb28d350849c47bf5823b91 --- /dev/null +++ b/fla/models/transformer/modeling_transformer.py @@ -0,0 +1,437 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +from dataclasses import dataclass +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.utils.deprecation import deprecate_kwarg + +import triton +import triton.language as tl + +from fla.layers.attn import Attention +from fla.models.transformer.configuration_transformer import TransformerConfig +from fla.models.utils import Cache +from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss, FusedLinearListNetLoss +from fla.modules import GatedMLP as TransformerMLP +from fla.modules import RMSNorm +from fla.modules.seq_to_myopic import seq_to_myopic + +if TYPE_CHECKING: + from transformers.processing_utils import Unpack + + +logger = logging.get_logger(__name__) + +@dataclass +class TOPLMOutputWithPast(CausalLMOutputWithPast): + ntp_loss: Optional[torch.FloatTensor] = None + top_loss: Optional[torch.FloatTensor] = None + +class TransformerBlock(nn.Module): + + def __init__(self, config: TransformerConfig, layer_idx: int): + super().__init__() + + self.config = config + self.layer_idx = layer_idx + + self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + 
self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + qkv_bias=config.qkv_bias, + qk_norm=config.qk_norm, + window_size=config.window_size, + rope_theta=config.rope_theta, + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + + self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.mlp = TransformerMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + fuse_swiglu=config.fuse_swiglu + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs: Unpack[Any] + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + hidden_states = self.attn_norm(hidden_states) + hidden_states, attentions, past_key_values = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs + ) + if self.config.fuse_norm: + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + else: + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.mlp_norm(hidden_states) + hidden_states = self.mlp(hidden_states, **kwargs) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attentions,) + + if use_cache: + outputs += (past_key_values,) + + return outputs + + +class TransformerPreTrainedModel(PreTrainedModel): + + config_class = TransformerConfig + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['TransformerBlock'] + 
_supports_cache_class = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + rescale_prenorm_residual: bool = False, + num_residuals_per_layer: int = 2, + ): + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + elif hasattr(module, 'reset_parameters'): + module.reset_parameters() + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. 
+ # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + p = None + if hasattr(module, 'o_proj'): + p = module.o_proj.weight + elif hasattr(module, 'down_proj'): + p = module.down_proj.weight + if p is not None: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers) + + +class TransformerModel(TransformerPreTrainedModel): + + def __init__( + self, + config: TransformerConfig + ) -> TransformerModel: + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([TransformerBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) + self.norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Unpack[Any] + ) -> Union[Tuple, CausalLMOutputWithPast]: + if output_attentions: + 
warnings.warn( + "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = Cache.from_legacy_cache(past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + next_cache = None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + past_key_values, + output_attentions, + use_cache, + **kwargs + ) + else: + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attns] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_attns + ) + + +class TransformerForCausalLM(TransformerPreTrainedModel, GenerationMixin): + + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = TransformerModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + if config.use_myopic_loss: + self.myopic_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.myopic_criterion = FusedLinearListNetLoss() + self.criterion = None + self.pad_token_id = config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return 
self.model.embeddings + + def set_input_embeddings(self, value): + self.model.embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: bool = True, + logits_to_keep: Optional[int] = None, + **kwargs + ): + # only last token for `inputs_ids` if the `past_key_values` is not empty. + if past_key_values is not None and len(past_key_values) > 0: + input_ids = input_ids[:, -1:] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step (no cache yet: `None` or empty) + if inputs_embeds is not None and (past_key_values is None or len(past_key_values) == 0): + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. + # Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead.
+ model_inputs = {'input_ids': input_ids.contiguous()} + + if logits_to_keep is not None: + model_inputs['logits_to_keep'] = logits_to_keep + + model_inputs.update({ + 'past_key_values': past_key_values, + 'use_cache': use_cache, + 'attention_mask': attention_mask, + }) + return model_inputs + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Optional[int] = 0, + **kwargs: Unpack[Any] + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs + ) + + hidden_states = outputs[0] + fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training + logits = None if fuse_linear_and_cross_entropy else self.lm_head(hidden_states[:, -logits_to_keep:]) + + loss = None + ntp_loss = None + myopic_loss = None + if labels is not None: + if getattr(self, 'criterion', None) is None: + if fuse_linear_and_cross_entropy: + criterion = FusedLinearCrossEntropyLoss() + elif 
self.config.fuse_cross_entropy: + criterion = FusedCrossEntropyLoss(inplace_backward=True) + else: + criterion = nn.CrossEntropyLoss() + else: + criterion = self.criterion + # Enable model parallelism + labels = labels.to(hidden_states.device) + labels = torch.cat((labels[..., 1:], torch.full_like(labels[:, :1], criterion.ignore_index)), 1) + ntp_labels = labels[..., :hidden_states.shape[1]].contiguous() + if fuse_linear_and_cross_entropy: + ntp_loss = criterion(hidden_states, ntp_labels, self.lm_head.weight, self.lm_head.bias) + else: + ntp_loss = criterion(logits.view(ntp_labels.numel(), -1), ntp_labels.reshape(-1)) + + if self.config.use_myopic_loss: + myopic_labels = seq_to_myopic(labels, self.vocab_size, hidden_states.shape[1], pad_token_id=self.pad_token_id).contiguous() + myopic_loss = self.myopic_criterion(hidden_states, myopic_labels, self.myopic_head.weight, self.myopic_head.bias) + # print(f"NTP Loss: {ntp_loss.item()}, Myopic Loss: {myopic_loss.item()}") + # For debugging, get the index where the myopic label is the highest and print the corresponding logits + # idx_max = torch.argmax(myopic_labels.view(-1, self.vocab_size), dim=1) + # # Print the labels and logits at that index + # print(f"Labels: {myopic_labels.view(-1, self.vocab_size)[0, idx_max[0]-3:idx_max[0]+3]}") + # print(f"Logits: {F.sigmoid(myopic_logits).view(-1, self.vocab_size)[0, idx_max[0]-3:idx_max[0]+3]}") + loss = ntp_loss + myopic_loss + else: + loss = ntp_loss + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return TOPLMOutputWithPast( + loss=loss, + ntp_loss=ntp_loss, + top_loss=myopic_loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fla/models/transformer_mtp/__init__.py b/fla/models/transformer_mtp/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..f906f6b2b76d2b2c25ba12df2403682195607fc8 --- /dev/null +++ b/fla/models/transformer_mtp/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- + +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM + +from fla.models.transformer_mtp.configuration_transformer import MTPTransformerConfig +from fla.models.transformer_mtp.modeling_transformer import MTPTransformerForCausalLM, MTPTransformerModel + +AutoConfig.register(MTPTransformerConfig.model_type, MTPTransformerConfig) +AutoModel.register(MTPTransformerConfig, MTPTransformerModel) +AutoModelForCausalLM.register(MTPTransformerConfig, MTPTransformerForCausalLM) + + +__all__ = ['MTPTransformerConfig', 'MTPTransformerForCausalLM', 'MTPTransformerModel'] diff --git a/fla/models/transformer_mtp/__pycache__/__init__.cpython-311.pyc b/fla/models/transformer_mtp/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b713260dace05cb339ae4611dd9b3a0bfaea716f Binary files /dev/null and b/fla/models/transformer_mtp/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-311.pyc b/fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d532a329146894b01e235b8851488905eed5d70 Binary files /dev/null and b/fla/models/transformer_mtp/__pycache__/configuration_transformer.cpython-311.pyc differ diff --git a/fla/models/transformer_mtp/__pycache__/modeling_transformer.cpython-311.pyc b/fla/models/transformer_mtp/__pycache__/modeling_transformer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2f1ff6a267a9bf00354c23d208a49f203b86503 Binary files /dev/null and b/fla/models/transformer_mtp/__pycache__/modeling_transformer.cpython-311.pyc differ diff --git a/fla/models/transformer_mtp/configuration_transformer.py
b/fla/models/transformer_mtp/configuration_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..b3f8be05fe72d42e7f10476d719acb2944c9c4c0 --- /dev/null +++ b/fla/models/transformer_mtp/configuration_transformer.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig + + +class MTPTransformerConfig(PretrainedConfig): + + model_type = 'mtp_transformer' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + hidden_size: int = 2048, + num_hidden_layers: int = 24, + num_heads: int = 32, + num_kv_heads: int = None, + qkv_bias: bool = False, + qk_norm: bool = False, + window_size: Optional[int] = None, + rope_theta: Optional[float] = 10000., + max_position_embeddings: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + hidden_act: str = "swish", + initializer_range: float = 0.006, + elementwise_affine: Optional[bool] = True, + norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + fuse_norm: bool = True, + fuse_swiglu: bool = True, + fuse_cross_entropy: bool = True, + vocab_size: int = 32000, + n_future_tokens: int = 1, + use_custom_backward: Optional[bool] = False, + **kwargs, + ): + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.qkv_bias = qkv_bias + self.qk_norm = qk_norm + self.window_size = window_size + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + + self.initializer_range = initializer_range + self.elementwise_affine = elementwise_affine + self.norm_eps = norm_eps + self.use_cache = use_cache + + self.fuse_norm = fuse_norm + 
self.fuse_swiglu = fuse_swiglu + self.fuse_cross_entropy = fuse_cross_entropy + self.vocab_size = vocab_size + + self.n_future_tokens = n_future_tokens + self.use_custom_backward = use_custom_backward + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/fla/models/transformer_mtp/modeling_transformer.py b/fla/models/transformer_mtp/modeling_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..bc1b68ade418cb2c15ce8d62faf6e30487226ca1 --- /dev/null +++ b/fla/models/transformer_mtp/modeling_transformer.py @@ -0,0 +1,601 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +from dataclasses import dataclass +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.utils.deprecation import deprecate_kwarg + +import triton +import triton.language as tl + +from fla.layers.attn import Attention +from fla.models.transformer_mtp.configuration_transformer import MTPTransformerConfig +from fla.models.utils import Cache +from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss +from fla.modules import GatedMLP as TransformerMLP +from fla.modules import RMSNorm + +if TYPE_CHECKING: + from transformers.processing_utils import Unpack + + +logger = logging.get_logger(__name__) + +class SequentialHeadsCustomBackward(torch.autograd.Function): + @staticmethod + def forward(ctx, trunk_output, lm_head, norm_layer, logits_to_keep, *prediction_heads): + # We now need the norm layer in the forward pass 
calculation + ctx.prediction_heads = prediction_heads + ctx.lm_head = lm_head + ctx.norm_layer = norm_layer + ctx.logits_to_keep = logits_to_keep + ctx.save_for_backward(trunk_output) + + latents = [] + for head in prediction_heads: + # Assuming head forward signature is `head(hidden_states)` + latent = head(trunk_output)[0] + latents.append(latent) + + latents_stacked = torch.stack(latents, dim=-2) + # Apply the final norm before the lm_head + normalized_latents = norm_layer(latents_stacked) + all_logits = lm_head(normalized_latents[:, -logits_to_keep:]) + return all_logits + + @staticmethod + def backward(ctx, grad_output): + trunk_output, = ctx.saved_tensors + prediction_heads = ctx.prediction_heads + lm_head = ctx.lm_head + norm_layer = ctx.norm_layer + logits_to_keep = ctx.logits_to_keep + + d = trunk_output.detach().requires_grad_(True) + grad_output_per_head = grad_output.unbind(dim=2) + + # Replay the heads one at a time so that only one head's activations and + # logits are alive at once. Each nested backward accumulates the trunk + # gradient into `d.grad` and the head / norm / lm_head parameter gradients + # into their own `.grad` fields. + # + # The earlier version multiplied by `lm_head.weight.grad` (unset or stale + # here, and not the Jacobian of the projection) and read `.grad` from the + # non-leaf `latents_stacked`, which autograd does not populate. + # + # NOTE(review): normalising each head's latent on its own assumes that + # `norm_layer` only normalises over the last (hidden) dimension, so that it + # matches the stacked normalisation in `forward` -- confirm for custom norms. + + # Now, backpropagate through each head individually.
+ for head, grad_logits in zip(prediction_heads, grad_output_per_head): + with torch.enable_grad(): + head_latent = head(d)[0] + # Mirror the tail of `forward` for this head only + normalized_latent = norm_layer(head_latent) + head_logits = lm_head(normalized_latent[:, -logits_to_keep:]) + # Backpropagate using the gradient for this specific head's logits + head_logits.backward(gradient=grad_logits) + + # `forward` takes lm_head, norm_layer, logits_to_keep and one argument per + # prediction head besides trunk_output. Autograd expects one return value + # per forward input, so the count below must include logits_to_keep; + # none of these inputs is a tensor, hence their gradients are None. + num_nones = 3 + len(prediction_heads) + return (d.grad,) + (None,) * num_nones + +def seq_to_mtp( + long_input_ids: torch.Tensor, + model_seq_len: int, + n_future_tokens: int +) -> torch.Tensor: + """ + Generates a tensor of future targets on the fly from a long input sequence. + + This version assumes `long_input_ids` contains both the tokens for the model's + input AND the future tokens needed for the labels. + It extracts the correct targets without adding artificial padding. + + Args: + long_input_ids (torch.Tensor): The input sequences from the dataloader, + shape (B, T + n_future_tokens). + model_seq_len (int): The sequence length `T` that the model processes. + n_future_tokens (int): The number of future tokens to predict for each time step. + + Returns: + torch.Tensor: The target tensor of shape (B, n_future_tokens, T). + y[b, k, t] corresponds to the (k+1)-th token after input_ids[b, t]. + """ + B, total_len = long_input_ids.shape + assert total_len >= model_seq_len + n_future_tokens, \ + "long_input_ids must be at least model_seq_len + n_future_tokens long." + + # 1. Create sliding windows (views) over the long tensor. + # .unfold() is a highly efficient way to create sliding windows. + # We create windows of size `n_future_tokens + 1`. For each time step `t`, + # the window will contain the input token and its `n_future_tokens` targets. + # Example (n=3, window_size=4): + # For t=0, window is [t0, t1, t2, t3] + # For t=1, window is [t1, t2, t3, t4] + # Shape of windows: (B, total_len - n_future_tokens, n_future_tokens + 1) + windows = long_input_ids.unfold(dimension=1, size=n_future_tokens + 1, step=1) + + # 2. Slice the windows to get only the targets.
+ # We slice off the first element of each window (the input token itself) + # to keep only the future tokens. + # Example window [t0, t1, t2, t3] -> becomes targets [t1, t2, t3] + all_targets = windows[:, :, 1:] + + # 3. Trim the result to match the model's output sequence length. + # We only need the targets for the first `model_seq_len` positions. + output_targets = all_targets[:, :model_seq_len, :] + + return output_targets.transpose(1, 2) + + +@dataclass +class MTPLMOutputWithPast(CausalLMOutputWithPast): + ntp_loss: Optional[torch.FloatTensor] = None + mtp_loss: Optional[torch.FloatTensor] = None + +class MTPTransformerBlock(nn.Module): + + def __init__(self, config: MTPTransformerConfig, layer_idx: int): + super().__init__() + + self.config = config + self.layer_idx = layer_idx + + self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + qkv_bias=config.qkv_bias, + qk_norm=config.qk_norm, + window_size=config.window_size, + rope_theta=config.rope_theta, + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + + self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.mlp = TransformerMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + fuse_swiglu=config.fuse_swiglu + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs: Unpack[Any] + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + hidden_states = self.attn_norm(hidden_states) + 
hidden_states, attentions, past_key_values = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs + ) + if self.config.fuse_norm: + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + else: + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.mlp_norm(hidden_states) + hidden_states = self.mlp(hidden_states, **kwargs) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attentions,) + + if use_cache: + outputs += (past_key_values,) + + return outputs + + +class MTPTransformerPreTrainedModel(PreTrainedModel): + + config_class = MTPTransformerConfig + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['MTPTransformerBlock'] + _supports_cache_class = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + rescale_prenorm_residual: bool = False, + num_residuals_per_layer: int = 2, + ): + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + elif hasattr(module, 'reset_parameters'): + module.reset_parameters() + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. 
Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + p = None + if hasattr(module, 'o_proj'): + p = module.o_proj.weight + elif hasattr(module, 'down_proj'): + p = module.down_proj.weight + if p is not None: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers) + + +class MTPTransformerModel(MTPTransformerPreTrainedModel): + + def __init__( + self, + config: MTPTransformerConfig + ) -> MTPTransformerModel: + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([MTPTransformerBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers - config.n_future_tokens)]) + self.extra_heads = nn.ModuleList([MTPTransformerBlock(config, layer_idx) for layer_idx in range(config.n_future_tokens)]) + self.norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: 
Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_all_heads: bool = False, # if Training, this is True + **kwargs: Unpack[Any] + ) -> Union[Tuple, CausalLMOutputWithPast]: + if output_attentions: + warnings.warn( + "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + use_custom_backward = self.config.use_custom_backward and self.training + if self.training and return_all_heads is False: + logger.warning_once( + "`return_all_heads=False` is incompatible with training. Setting `return_all_heads=True`..." + ) + return_all_heads = True + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = Cache.from_legacy_cache(past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + next_cache = None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + past_key_values, + output_attentions, + use_cache, + **kwargs + ) + else: + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_attns += (layer_outputs[1],) + + trunk = hidden_states + + n_heads_to_use = self.config.n_future_tokens if return_all_heads else 1 + prediction_heads = self.extra_heads + + if use_custom_backward and self.training: + # all_logits = SequentialHeadsCustomBackward.apply(trunk, self.lm_head, *prediction_heads) + hidden_states = trunk # return hidden states and apply custom backward on the MTPTransformersLM + else: + latents = [] + for i, layer in enumerate(prediction_heads): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + trunk, # Use trunk instead of hidden states + attention_mask, + past_key_values, + output_attentions, + use_cache, + **kwargs + ) + else: + layer_outputs = layer( + trunk, # Use trunk instead of hidden states + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs + ) + hidden_states = layer_outputs[0] + latents.append(hidden_states) + + if use_cache: + next_cache = layer_outputs[2 if output_attentions else 1] + + if 
output_attentions: + all_attns += (layer_outputs[1],) + + hidden_states = torch.stack(latents, dim=-2) # (B, T, n_heads_to_use, D) + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states and not self.custom_backward: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attns] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_attns + ) + + +class MTPTransformerForCausalLM(MTPTransformerPreTrainedModel, GenerationMixin): + + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MTPTransformerModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.criterion = None + self.pad_token_id = config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embeddings + + def set_input_embeddings(self, value): + self.model.embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: bool = True, + logits_to_keep: Optional[int] = None, + **kwargs + ): + # only last token for `inputs_ids` if the `past_key_values` is not empty. 
+ if past_key_values is not None and len(past_key_values) > 0: + input_ids = input_ids[:, -1:] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(past_key_values) == 0: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. + # Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. + model_inputs = {'input_ids': input_ids.contiguous()} + + if logits_to_keep is not None: + model_inputs['logits_to_keep'] = logits_to_keep + + model_inputs.update({ + 'past_key_values': past_key_values, + 'use_cache': use_cache, + 'attention_mask': attention_mask, + }) + return model_inputs + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Optional[int] = 0, + **kwargs: Unpack[Any] + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + 
use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs + ) + + hidden_states = outputs[0] # (B, T, n_heads_to_use, D) + fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training + + use_custom_backward = self.config.use_custom_backward and self.training + if use_custom_backward and self.training: + all_logits = SequentialHeadsCustomBackward.apply( + hidden_states, self.lm_head, self.model.norm, logits_to_keep, *self.model.extra_heads + ) + else: + all_logits = None if fuse_linear_and_cross_entropy else self.lm_head(hidden_states[:, -logits_to_keep:]) + + loss = None + if labels is not None: + B, T, n_heads_prediction, D = hidden_states.shape + loss = torch.zeros(1, device=hidden_states.device) + ntp_loss = torch.zeros(1, device=hidden_states.device) + mtp_loss = torch.zeros(1, device=hidden_states.device) + if getattr(self, 'criterion', None) is None: + if fuse_linear_and_cross_entropy: + criterion = FusedLinearCrossEntropyLoss() + elif self.config.fuse_cross_entropy: + criterion = FusedCrossEntropyLoss(inplace_backward=True) + else: + criterion = nn.CrossEntropyLoss() + else: + criterion = self.criterion + # Enable model parallelism + labels = labels.to(hidden_states.device) + all_labels = seq_to_mtp(labels, n_future_tokens=n_heads_prediction, model_seq_len=T) + # Loop across prediction heads + for i in range(n_heads_prediction): + # labels in the shape of (B, n_heads_prediction, T) + labels = all_labels[:, i, :] + if fuse_linear_and_cross_entropy: + current_loss = criterion(hidden_states[:, :, i, :], labels.contiguous(), self.lm_head.weight, self.lm_head.bias) + else: + logits = all_logits[:, :, i, :] + current_loss = criterion(logits.view(labels.numel(), -1), labels.reshape(-1)) + if i == 0: # NTP + ntp_loss = current_loss + else: + mtp_loss += current_loss + loss += current_loss + + if not return_dict: + output = (all_logits,) + outputs[1:] + return 
(loss,) + output if loss is not None else output + + return MTPLMOutputWithPast( + loss=loss, + ntp_loss=ntp_loss if loss is not None else None, + mtp_loss=mtp_loss if loss is not None else None, + logits=all_logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fla/models/transformer_vanilla/__init__.py b/fla/models/transformer_vanilla/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3c1a82c4ffb298bda4baf05000ca057c3b5a458f --- /dev/null +++ b/fla/models/transformer_vanilla/__init__.py @@ -0,0 +1,13 @@ +# -*- coding: utf-8 -*- + +from transformers import AutoConfig, AutoModel, AutoModelForCausalLM + +from fla.models.transformer.configuration_transformer import TransformerConfig +from fla.models.transformer.modeling_transformer import TransformerForCausalLM, TransformerModel + +AutoConfig.register(TransformerConfig.model_type, TransformerConfig) +AutoModel.register(TransformerConfig, TransformerModel) +AutoModelForCausalLM.register(TransformerConfig, TransformerForCausalLM) + + +__all__ = ['TransformerConfig', 'TransformerForCausalLM', 'TransformerModel'] diff --git a/fla/models/transformer_vanilla/configuration_transformer.py b/fla/models/transformer_vanilla/configuration_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..427e6e663003c6935ebcb670f9b75cfdf7ccd95b --- /dev/null +++ b/fla/models/transformer_vanilla/configuration_transformer.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +from transformers.configuration_utils import PretrainedConfig + + +class MTPTransformerConfig(PretrainedConfig): + + model_type = 'transformer' + keys_to_ignore_at_inference = ['past_key_values'] + + def __init__( + self, + hidden_size: int = 2048, + num_hidden_layers: int = 24, + num_heads: int = 32, + num_kv_heads: int = None, + qkv_bias: bool = False, + qk_norm: bool = False, + window_size: 
Optional[int] = None, + rope_theta: Optional[float] = 10000., + max_position_embeddings: int = 2048, + hidden_ratio: Optional[int] = 4, + intermediate_size: Optional[int] = None, + hidden_act: str = "swish", + initializer_range: float = 0.006, + elementwise_affine: Optional[bool] = True, + norm_eps: float = 1e-6, + use_cache: bool = True, + pad_token_id: int = None, + bos_token_id: int = 1, + eos_token_id: int = 2, + tie_word_embeddings: bool = False, + fuse_norm: bool = True, + fuse_swiglu: bool = True, + fuse_cross_entropy: bool = True, + vocab_size: int = 32000, + **kwargs, + ): + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.qkv_bias = qkv_bias + self.qk_norm = qk_norm + self.window_size = window_size + self.rope_theta = rope_theta + self.max_position_embeddings = max_position_embeddings + + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + + self.initializer_range = initializer_range + self.elementwise_affine = elementwise_affine + self.norm_eps = norm_eps + self.use_cache = use_cache + + self.fuse_norm = fuse_norm + self.fuse_swiglu = fuse_swiglu + self.fuse_cross_entropy = fuse_cross_entropy + self.vocab_size = vocab_size + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) diff --git a/fla/models/transformer_vanilla/modeling_transformer.py b/fla/models/transformer_vanilla/modeling_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..db424e50d29fcce5ab6e171af7b37edf53540612 --- /dev/null +++ b/fla/models/transformer_vanilla/modeling_transformer.py @@ -0,0 +1,415 @@ +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import math +import warnings +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union + +import torch +import 
torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint +from dataclasses import dataclass +from transformers.generation import GenerationMixin +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging +from transformers.utils.deprecation import deprecate_kwarg + +import triton +import triton.language as tl + +from fla.layers.attn import Attention +from fla.models.transformer_mtp.configuration_transformer import MTPTransformerConfig +from fla.models.utils import Cache +from fla.modules import FusedCrossEntropyLoss, FusedLinearCrossEntropyLoss +from fla.modules import GatedMLP as TransformerMLP +from fla.modules import RMSNorm +from fla.modules.seq_to_myopic import seq_to_myopic + +if TYPE_CHECKING: + from transformers.processing_utils import Unpack + + +logger = logging.get_logger(__name__) + +@dataclass +class MTPLMOutputWithPast(CausalLMOutputWithPast): + pass + +class MTPTransformerBlock(nn.Module): + + def __init__(self, config: MTPTransformerConfig, layer_idx: int): + super().__init__() + + self.config = config + self.layer_idx = layer_idx + + self.attn_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.attn = Attention( + hidden_size=config.hidden_size, + num_heads=config.num_heads, + num_kv_heads=config.num_kv_heads, + qkv_bias=config.qkv_bias, + qk_norm=config.qk_norm, + window_size=config.window_size, + rope_theta=config.rope_theta, + max_position_embeddings=config.max_position_embeddings, + layer_idx=layer_idx + ) + + self.mlp_norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + self.mlp = TransformerMLP( + hidden_size=config.hidden_size, + hidden_ratio=config.hidden_ratio, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + fuse_swiglu=config.fuse_swiglu + ) + + def forward( + self, + 
hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs: Unpack[Any] + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + + residual = hidden_states + hidden_states = self.attn_norm(hidden_states) + hidden_states, attentions, past_key_values = self.attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + **kwargs + ) + if self.config.fuse_norm: + hidden_states, residual = self.mlp_norm(hidden_states, residual, True) + else: + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.mlp_norm(hidden_states) + hidden_states = self.mlp(hidden_states, **kwargs) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attentions,) + + if use_cache: + outputs += (past_key_values,) + + return outputs + + +class MTPTransformerPreTrainedModel(PreTrainedModel): + + config_class = MTPTransformerConfig + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['MTPTransformerBlock'] + _supports_cache_class = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights( + self, + module: nn.Module, + rescale_prenorm_residual: bool = False, + num_residuals_per_layer: int = 2, + ): + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + nn.init.normal_(module.weight, mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Embedding): + nn.init.normal_(module.weight, mean=0.0, 
std=self.config.initializer_range) + elif hasattr(module, 'reset_parameters'): + module.reset_parameters() + + if rescale_prenorm_residual: + # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme: + # > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale + # > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers. + # > -- GPT-2 :: https://openai.com/blog/better-language-models/ + # + # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py + p = None + if hasattr(module, 'o_proj'): + p = module.o_proj.weight + elif hasattr(module, 'down_proj'): + p = module.down_proj.weight + if p is not None: + # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block + # Following Pytorch init, except scale by 1/sqrt(2 * n_layer) + # We need to reinit p since this code could be called multiple times + # Having just p *= scale would repeatedly scale it down + nn.init.kaiming_uniform_(p, a=math.sqrt(5)) + with torch.no_grad(): + p /= math.sqrt(num_residuals_per_layer * self.config.num_hidden_layers) + + +class MTPTransformerModel(MTPTransformerPreTrainedModel): + + def __init__( + self, + config: MTPTransformerConfig + ) -> MTPTransformerModel: + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([MTPTransformerBlock(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]) + self.norm = (RMSNorm if config.fuse_norm else nn.RMSNorm)(config.hidden_size, eps=config.norm_eps) + + self.gradient_checkpointing = False + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + input_ids: 
Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs: Unpack[Any] + ) -> Union[Tuple, CausalLMOutputWithPast]: + if output_attentions: + warnings.warn( + "`TransformerModel` does not support output attention weights now, so `output_attentions` is set to `False`." + ) + output_attentions = False + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + use_cache = use_cache if use_cache is not None else (self.config.use_cache if not self.training else False) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if use_cache and not isinstance(past_key_values, Cache): + past_key_values = Cache.from_legacy_cache(past_key_values) + + if inputs_embeds is None: + inputs_embeds = self.embeddings(input_ids) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + next_cache = None + + for layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer.__call__, + hidden_states, + attention_mask, + past_key_values, + output_attentions, + use_cache, + **kwargs + ) + else: + layer_outputs = layer( + hidden_states, + attention_mask=attention_mask, + past_key_values=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_attns] if v is not None) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_attns + ) + + +class MTPTransformerForCausalLM(MTPTransformerPreTrainedModel, GenerationMixin): + + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MTPTransformerModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + self.criterion = None + self.pad_token_id = config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embeddings + + def set_input_embeddings(self, value): + self.model.embeddings = value + + def get_output_embeddings(self): + return self.lm_head + + def 
set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: bool = True, + logits_to_keep: Optional[int] = None, + **kwargs + ): + # only last token for `inputs_ids` if the `past_key_values` is not empty. + if past_key_values is not None and len(past_key_values) > 0: + input_ids = input_ids[:, -1:] + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(past_key_values) == 0: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise + # recompiles graphs as the stride of the inputs is a guard. + # Ref: https://github.com/huggingface/transformers/pull/29114 + # TODO: use `next_tokens` directly instead. 
+ model_inputs = {'input_ids': input_ids.contiguous()} + + if logits_to_keep is not None: + model_inputs['logits_to_keep'] = logits_to_keep + + model_inputs.update({ + 'past_key_values': past_key_values, + 'use_cache': use_cache, + 'attention_mask': attention_mask, + }) + return model_inputs + + @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep") + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + logits_to_keep: Optional[int] = 0, + **kwargs: Unpack[Any] + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs + ) + + hidden_states = outputs[0] + fuse_linear_and_cross_entropy = self.config.fuse_cross_entropy and self.training + logits = None if fuse_linear_and_cross_entropy else self.lm_head(hidden_states[:, -logits_to_keep:]) + + loss = None + if labels is not None: + if getattr(self, 'criterion', None) is None: + if fuse_linear_and_cross_entropy: + criterion = FusedLinearCrossEntropyLoss() + elif self.config.fuse_cross_entropy: + criterion = 
class Cache(transformers.cache_utils.Cache):
    """
    A cache used for storing hidden states produced by flash linear attention models.

    It stores the states of each layer as the tensor of shape `[batch_size, key_dim, value_dim]`.

    Each entry of `self.states` is a dict with the keys `recurrent_state`, `attn_state`,
    `conv_state` and `ffn_state`; entries are appended in layer order by `update`.
    """

    is_compileable = True

    def __init__(
        self,
        seen_tokens: int = 0
    ) -> None:
        super().__init__()

        self.states: List[Dict[str, Any]] = []

        self._seen_tokens = seen_tokens  # Used in `generate` to keep tally of how many tokens the cache has seen

    def __getitem__(self, layer_idx: int) -> Dict[str, Any]:
        if layer_idx < len(self):
            return self.states[layer_idx]
        else:
            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")

    def __iter__(self):
        for state in self.states:
            yield state

    def __len__(self):
        return len(self.states)

    def update(
        self,
        recurrent_state: torch.Tensor = None,
        attn_state: Tuple[torch.Tensor, torch.Tensor] = None,
        conv_state: Tuple[torch.Tensor] = None,
        ffn_state: torch.Tensor = None,
        layer_idx: int = 0,
        offset: Optional[int] = 1,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """
        Updates the cache with the new `recurrent_state`/`attn_state`/`conv_state`/`ffn_state`
        for the layer `layer_idx`.

        Args:
            recurrent_state (`torch.Tensor`, `optional`):
                The new recurrent state to cache.
            attn_state (`Tuple[torch.Tensor, torch.Tensor]`, `optional`):
                The new attention key/value states to cache.
            conv_state (`Tuple[torch.Tensor]`, `optional`):
                The new convolution state to cache.
            ffn_state (`torch.Tensor`, `optional`):
                Stored verbatim under the `ffn_state` key of the layer state.
            layer_idx (`int`, defaults to 0):
                The index of the layer to cache the states for.
            offset (`int`, `optional`, defaults to 1):
                The number of new tokens being processed.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. Only `window_size` is read here;
                when given, the attention key/value states are kept to at most that many positions
                along dim `-2`.

        Return:
            Dictionary of the updated state.

        Raises:
            ValueError: If `attn_state` is given but is not a tuple of exactly two tensors.
        """

        # Update the number of seen tokens
        if layer_idx == 0:
            self._seen_tokens += offset

        input_size = None
        window_size = None
        if attn_state is not None:
            # Validate *before* indexing so malformed input gets the intended error message
            if not isinstance(attn_state, tuple) or len(attn_state) != 2:
                raise ValueError("`attn_state` must be a tuple of two tensors for key/value states")
            input_size = attn_state[0].shape[-2]
            # `cache_kwargs` defaults to None; treat that as "no extra options"
            window_size = (cache_kwargs or {}).get('window_size', None)

        if len(self.states) <= layer_idx:
            # First time this layer is seen: create its state entry
            if attn_state is not None:
                if window_size is not None and input_size > window_size:
                    # Keep only the most recent `window_size` positions
                    attn_state = (attn_state[0][..., -window_size:, :].contiguous(),
                                  attn_state[1][..., -window_size:, :].contiguous())
            state = dict(
                recurrent_state=recurrent_state,
                attn_state=attn_state,
                conv_state=conv_state,
                ffn_state=ffn_state
            )
            self.states.append(state)
        else:
            state = self.states[layer_idx]
            if recurrent_state is not None:
                state['recurrent_state'] = recurrent_state
            if attn_state is not None:
                key_state, value_state = state['attn_state']
                if window_size is not None and key_state.shape[-2] == window_size:
                    # The window is full: keep its length fixed instead of growing it.
                    # Roll the key/value states to the left by `input_size` ...
                    key_state = key_state.roll(-input_size, -2)
                    value_state = value_state.roll(-input_size, -2)
                    # ... and replace the last `input_size` tokens with the new key/value states
                    key_state[..., -input_size:, :] = attn_state[0]
                    value_state[..., -input_size:, :] = attn_state[1]
                    attn_state = (key_state, value_state)
                else:
                    attn_state = (torch.cat([key_state, attn_state[0]], -2),
                                  torch.cat([value_state, attn_state[1]], -2),)
                state['attn_state'] = attn_state
            if conv_state is not None:
                state['conv_state'] = conv_state
            if ffn_state is not None:
                state['ffn_state'] = ffn_state

        return state

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # The signature allows None; fall back to the first layer in that case
        if layer_idx is None:
            layer_idx = 0
        if len(self.states) <= layer_idx:
            return 0
        return self._seen_tokens

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. Cache does not have a maximum length."""
        return None

    def to_legacy_cache(self) -> Tuple:
        """Returns the per-layer state dicts as a tuple (the legacy cache format of this class)."""
        return tuple(self.states)

    @classmethod
    @torch.compiler.disable
    def from_legacy_cache(
        cls,
        past_key_values: Optional[Tuple] = None,
        seen_tokens: int = 0
    ) -> Cache:
        """Converts a cache in the legacy cache format into an equivalent `Cache`.

        Accepts both a list and a tuple of per-layer states, so the output of
        `to_legacy_cache` (a tuple) round-trips. Any other value yields an empty cache.
        """

        cache = cls(seen_tokens)
        if isinstance(past_key_values, (list, tuple)):
            cache.states.extend(past_key_values)
        return cache
'FusedCrossEntropyLoss', 'FusedLinearCrossEntropyLoss', 'FusedKLDivLoss', + 'GroupNorm', 'GroupNormLinear', 'LayerNorm', 'LayerNormLinear', 'RMSNorm', 'RMSNormLinear', + 'FusedLayerNormGated', 'FusedLayerNormSwishGate', 'FusedLayerNormSwishGateLinear', + 'FusedRMSNormGated', 'FusedRMSNormSwishGate', 'FusedRMSNormSwishGateLinear', + 'GatedMLP', + 'RotaryEmbedding' +] diff --git a/fla/modules/__pycache__/__init__.cpython-311.pyc b/fla/modules/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..47b584c4a16c30fd3a2c12650660187bac90112f Binary files /dev/null and b/fla/modules/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/activations.cpython-311.pyc b/fla/modules/__pycache__/activations.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bf159608f3e948d7af58f7236931d583dcc1362 Binary files /dev/null and b/fla/modules/__pycache__/activations.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/convolution.cpython-311.pyc b/fla/modules/__pycache__/convolution.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..621507d28651c5895553433041b860e0a1839218 Binary files /dev/null and b/fla/modules/__pycache__/convolution.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/feature_map.cpython-311.pyc b/fla/modules/__pycache__/feature_map.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1741eb4c3e2498a38f56a9c03fb76d122c279dc3 Binary files /dev/null and b/fla/modules/__pycache__/feature_map.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/fused_bitlinear.cpython-311.pyc b/fla/modules/__pycache__/fused_bitlinear.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..86a3a0b91f0b50ab48b18fe18c21e92281c8a7ac Binary files /dev/null and b/fla/modules/__pycache__/fused_bitlinear.cpython-311.pyc differ diff --git 
a/fla/modules/__pycache__/fused_cross_entropy.cpython-311.pyc b/fla/modules/__pycache__/fused_cross_entropy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d024f0700d0f940f1e13ecad17650e558b8e1675 Binary files /dev/null and b/fla/modules/__pycache__/fused_cross_entropy.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/fused_kl_div.cpython-311.pyc b/fla/modules/__pycache__/fused_kl_div.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..33a08d7b561a0ffeb1797caf1d51463b55aa37d0 Binary files /dev/null and b/fla/modules/__pycache__/fused_kl_div.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/fused_linear_cross_entropy.cpython-311.pyc b/fla/modules/__pycache__/fused_linear_cross_entropy.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..65de0ee9c2c89ae048dfb1a30894c4bcd781250c Binary files /dev/null and b/fla/modules/__pycache__/fused_linear_cross_entropy.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/fused_linear_listnet_loss.cpython-311.pyc b/fla/modules/__pycache__/fused_linear_listnet_loss.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b6a291af9e25c39b1f7f5dcdc7cc012c166f456c Binary files /dev/null and b/fla/modules/__pycache__/fused_linear_listnet_loss.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/fused_norm_gate.cpython-311.pyc b/fla/modules/__pycache__/fused_norm_gate.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..741ef1c1e99415638fa001ed772c3321f6a10d57 Binary files /dev/null and b/fla/modules/__pycache__/fused_norm_gate.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/l2norm.cpython-311.pyc b/fla/modules/__pycache__/l2norm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b304e867ac19bd68e945b8fd1a23e224c5621d7 Binary files /dev/null and b/fla/modules/__pycache__/l2norm.cpython-311.pyc 
differ diff --git a/fla/modules/__pycache__/layernorm.cpython-311.pyc b/fla/modules/__pycache__/layernorm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88aeddc5b7ecb2f0509c44d2b3f424ea6168afd4 Binary files /dev/null and b/fla/modules/__pycache__/layernorm.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/layernorm_gated.cpython-311.pyc b/fla/modules/__pycache__/layernorm_gated.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a51cefabf7ae77665a5468667d1e9722fa36df13 Binary files /dev/null and b/fla/modules/__pycache__/layernorm_gated.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/mlp.cpython-311.pyc b/fla/modules/__pycache__/mlp.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..97a1f119aa4cd36da58ae34343271a4b61eeb8c9 Binary files /dev/null and b/fla/modules/__pycache__/mlp.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/rotary.cpython-311.pyc b/fla/modules/__pycache__/rotary.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b26239477c8c5c9968e7c34c7b67e1d36ce46e36 Binary files /dev/null and b/fla/modules/__pycache__/rotary.cpython-311.pyc differ diff --git a/fla/modules/__pycache__/seq_to_myopic.cpython-311.pyc b/fla/modules/__pycache__/seq_to_myopic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2c9bf6b7bcc4965a92a4aeab248fd4c7e220bbb Binary files /dev/null and b/fla/modules/__pycache__/seq_to_myopic.cpython-311.pyc differ diff --git a/fla/modules/activations.py b/fla/modules/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..92f4212581aa5a27b52113deed9f3f685d4346af --- /dev/null +++ b/fla/modules/activations.py @@ -0,0 +1,471 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Tri Dao, Yu Zhang, Songlin Yang. 
class SigmoidFunction(torch.autograd.Function):
    """Autograd wrapper pairing `sigmoid_fwd` with its hand-written gradient `sigmoid_bwd`.

    Only the input is saved; the backward pass hands it to `sigmoid_bwd`
    together with the upstream gradient.
    """

    @staticmethod
    def forward(ctx, x):
        out = sigmoid_fwd(x)
        ctx.save_for_backward(x)
        return out

    @staticmethod
    def backward(ctx, dout):
        (saved_x,) = ctx.saved_tensors
        grad_x = sigmoid_bwd(saved_x, dout)
        return grad_x
def logsigmoid_fwd(x: torch.Tensor, temperature: float = 1.) -> torch.Tensor:
    """Launch `logsigmoid_fwd_kernel` over all elements of `x`.

    Args:
        x: Input tensor; it is addressed as a flat buffer of `x.numel()` elements.
        temperature: Divisor passed through to the kernel.

    Returns:
        A tensor shaped like `x` holding the kernel output. An empty `x` yields an empty
        result without launching the kernel.
    """
    T, D = x.numel(), x.shape[-1]
    y = torch.empty_like(x)
    # With no elements the block size below would be 0 and `triton.cdiv(T, B)` would divide by zero
    if T == 0:
        return y
    # One power-of-two sized block per multiprocessor (as reported by `get_multiprocessor_count`)
    B = triton.next_power_of_2(triton.cdiv(T, get_multiprocessor_count(x.device.index)))
    logsigmoid_fwd_kernel[(triton.cdiv(T, B),)](
        x=x,
        y=y,
        temperature=temperature,
        T=T,
        D=D,
        B=B
    )
    return y


def logsigmoid_bwd(x: torch.Tensor, dy: torch.Tensor, temperature: float = 1.) -> torch.Tensor:
    """Launch `logsigmoid_bwd_kernel` to get the gradient w.r.t. `x`.

    Args:
        x: The tensor that was given to the forward pass.
        dy: Upstream gradient, read with the same flat offsets as `x`.
        temperature: Divisor passed through to the kernel.

    Returns:
        A tensor shaped like `x` holding the gradient. An empty `x` yields an empty
        result without launching the kernel.
    """
    T, D = x.numel(), x.shape[-1]
    dx = torch.empty_like(x)
    # Same empty-input guard as the forward wrapper
    if T == 0:
        return dx
    B = triton.next_power_of_2(triton.cdiv(T, get_multiprocessor_count(x.device.index)))
    logsigmoid_bwd_kernel[(triton.cdiv(T, B),)](
        x=x,
        dx=dx,
        dy=dy,
        temperature=temperature,
        T=T,
        D=D,
        B=B
    )
    return dx
-> torch.Tensor: + return LogSigmoidFunction.apply(x, temperature) + + +swish_fwd_codestring = """ +template T swish_fwd(T x) { + float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); + return float(x) * x_sigmoid; +} +""" +swish_bwd_codestring = """ +template T swish_bwd(T x, T g) { + float x_sigmoid = 1.0f / (1.0f + ::exp(-float(x))); + return float(g) * x_sigmoid * (1.0f - float(x) * x_sigmoid + float(x)); +} +""" + +swish_fwd_jit_fn = torch.cuda.jiterator._create_jit_fn(swish_fwd_codestring) +swish_bwd_jit_fn = torch.cuda.jiterator._create_jit_fn(swish_bwd_codestring) + + +@torch.compiler.disable +def swish_fwd(x): + return swish_fwd_jit_fn(x) + + +@torch.compiler.disable +def swish_bwd(x, g): + return swish_bwd_jit_fn(x, g) + + +class SwishFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return swish_fwd(x) + + @staticmethod + def backward(ctx, dout): + x, = ctx.saved_tensors + return swish_bwd(x, dout) + + +swish = SwishFunction.apply + +# 1/sqrt(2*pi)-> 0.3989423 +# 1/sqrt(2) -> 0.70710678 +# sqrt(2/pi) -> 0.79788456 + + +# this function is tanh approximation of gelu +# actual gelu is: +# x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) +@torch.compile +def bias_gelu(y, bias): + x = bias + y + return (x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))).to(dtype=y.dtype) + + +# gradient of tanh approximation of gelu +# gradient of actual gelu is: +# 0.5 * (1. 
class GeLUFunction(torch.autograd.Function):
    """Autograd wrapper for the fused bias-add + tanh-approximated GeLU.

    `forward(input, bias)` returns `bias_gelu(input, bias)`. Both tensors are required:
    `bias_gelu` adds them unconditionally.
    """

    @staticmethod
    def forward(ctx, input, bias):
        ctx.save_for_backward(input, bias)
        return bias_gelu(input, bias)

    @staticmethod
    def backward(ctx, grad_output):
        input, bias = ctx.saved_tensors
        # `bias_gelu_bwd` already returns the pair (grad wrt input, grad wrt bias);
        # autograd expects exactly one gradient per forward argument, in order.
        grad_input, grad_bias = bias_gelu_bwd(grad_output, input, bias)
        return grad_input, grad_bias
class FastGeLUFunction(torch.autograd.Function):
    """Autograd wrapper pairing `gelu_fwd` with its hand-written gradient `gelu_bwd`.

    Takes a single input tensor; there is no bias argument.
    """

    @staticmethod
    def forward(ctx, input):
        result = gelu_fwd(input)
        ctx.save_for_backward(input)
        return result

    @staticmethod
    def backward(ctx, grad_output):
        (saved_input,) = ctx.saved_tensors
        return gelu_bwd(grad_output, saved_input)
float(g) * float(y); + dy = x_swish * float(g); + z = x_swish * float(y); +} +""" + + +swiglu_fwd_jit_fn = torch.cuda.jiterator._create_jit_fn(swiglu_fwd_codestring) +swiglu_bwd_jit_fn = torch.cuda.jiterator._create_multi_output_jit_fn(swiglu_bwd_codestring, num_outputs=2) +swiglu_fwdbwd_jit_fn = torch.cuda.jiterator._create_multi_output_jit_fn(swiglu_fwdbwd_codestring, num_outputs=3) + + +@torch.compiler.disable +def swiglu_fwd(x, y): + return swiglu_fwd_jit_fn(x, y) + + +@torch.compiler.disable +def swiglu_bwd(x, y, g): + return swiglu_bwd_jit_fn(x, y, g) + + +@torch.compiler.disable +def swiglu_fwdbwd(x, y, g): + return swiglu_fwdbwd_jit_fn(x, y, g) + + +@torch.compile +def swiglu_fwd_torch(x, y): + return (F.silu(x.float()) * y).to(x.dtype) + + +@torch.compile +def swiglu_bwd_torch(x, y, g): + dtype = x.dtype + x, y, g = x.float(), y.float(), g.float() + x_sigmoid = x.sigmoid() + x_swish = x * x_sigmoid + dx = x_sigmoid * (1 + x * (1.0 - x_sigmoid)) * g * y + dy = x_swish * g + return dx.to(dtype), dy.to(dtype) + + +@torch.compile +def swiglu_fwdbwd_torch(x, y, g): + dtype = x.dtype + x, y, g = x.float(), y.float(), g.float() + x_sigmoid = x.sigmoid() + x_swish = x * x_sigmoid + dx = x_sigmoid * (1 + x * (1.0 - x_sigmoid)) * g * y + dy = x_swish * g + z = x_swish * y + return dx.to(dtype), dy.to(dtype), z.to(dtype) + + +class SwiGLUFunction(torch.autograd.Function): + r""" + Swish-Gated Linear Unit (SwiGLU) function. + + .. 
math:: + \text{SwiGLU}(x, y) = swish(x) * y = \frac{x}{1 + \exp(-x)} * y + """ + + @staticmethod + def forward(ctx, x, y): + ctx.save_for_backward(x, y) + if torch.compiler.is_compiling() or isinstance(x, torch.distributed.tensor.DTensor): + return swiglu_fwd_torch(x, y) + else: + return swiglu_fwd(x, y) + + @staticmethod + def backward(ctx, dout): + x, y = ctx.saved_tensors + if torch.compiler.is_compiling() or isinstance(x, torch.distributed.tensor.DTensor): + return swiglu_bwd_torch(x, y, dout) + else: + return swiglu_bwd(x, y, dout) + + +class SwiGLULinearFunction(torch.autograd.Function): + r""" + Swish-Gated Linear Unit (SwiGLU) function followed by a linear transformation. + + .. math:: + \text{SwiGLULinear}(x, y, W, b) = (swish(x) * y) W + b + + This simple wrap discards the intermediate results of SwiGLU(x, y) to save memory. + """ + + @staticmethod + @autocast_custom_fwd + def forward(ctx, x, y, weight, bias): + with torch.no_grad(): + if torch.compiler.is_compiling() or isinstance(x, torch.distributed.tensor.DTensor): + z = swiglu_fwd_torch(x, y) + else: + z = swiglu_fwd(x, y) + out = F.linear(z, weight, bias) + # We don't store z, will be recomputed in the backward pass to save memory + ctx.save_for_backward(x, y, weight) + ctx.linear_bias_is_none = bias is None + return out + + @staticmethod + @autocast_custom_bwd + def backward(ctx, dout, *args): + x, y, weight = ctx.saved_tensors + dout = dout.reshape(-1, dout.shape[-1]) + dz = F.linear(dout, weight.t()).view_as(x) + with torch.no_grad(): + if torch.compiler.is_compiling() or isinstance(x, torch.distributed.tensor.DTensor): + dx, dy, z = swiglu_fwdbwd_torch(x, y, dz) + else: + dx, dy, z = swiglu_fwdbwd(x, y, dz) + dlinear_weight = torch.einsum("bo,bi->oi", dout, z.reshape(-1, z.shape[-1])) + dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) + return dx, dy, dlinear_weight, dlinear_bias + + +swiglu = SwiGLUFunction.apply + + +swiglu_linear = SwiGLULinearFunction.apply + + +ACT2FN = { + 
def is_power_of_2(n):
    """Return True when `n` is a non-zero integer with exactly one bit set (1, 2, 4, ...)."""
    # Clearing the lowest set bit leaves something behind -> more than one bit set
    if n & (n - 1):
        return False
    # Zero also passes the bit test above, so rule it out explicitly
    return n != 0
class DPFPFeatureMap(nn.Module):

    r"""
    Deterministic Parameter-Free Projection (DPFP) feature map in
    `Linear Transformers Are Secretly Fast Weight Programmers `_

    The input is expanded to `[relu(x), relu(-x)]` (twice the last dimension) and multiplied
    elementwise with `nu` rolled copies of itself, so the output has `2 * nu` times the
    input's last dimension and is non-negative everywhere.
    """

    def __init__(
        self,
        head_dim: int,
        nu: int = 4
    ) -> None:
        super().__init__()
        # `head_dim` is accepted for signature parity with the other feature maps; it is not used here
        self.nu = nu

    def forward(self, x: torch.Tensor):
        # NOTE: `-x.relu()` would be `-(relu(x))`; the projection needs relu of the negated input
        x = torch.cat([x.relu(), (-x).relu()], dim=-1)
        x_rolled = torch.cat([x.roll(shifts=j, dims=-1) for j in range(1, self.nu+1)], dim=-1)
        x_repeat = torch.cat([x] * self.nu, dim=-1)
        return x_repeat * x_rolled
class LearnablePolySketchNonNegativeFeatureMap(nn.Module):
    """Learnable polynomial-sketch feature map whose final step is a self outer product.

    The input is layer-normalised, passed through `log2(degree) - 1` rounds of
    "product of two linear sketches", and finally expanded with
    `flatten_diag_outer_product(x, x)`.

    Args:
        head_dim: Size of the last input dimension.
        sketch_size: Output size of every sketch layer; defaults to `head_dim` when `None`.
        degree: Polynomial degree; must be a power of 2 and at least 2.
    """

    def __init__(
        self,
        head_dim: int,
        sketch_size: Optional[int] = None,
        degree: Optional[int] = 2
    ) -> None:
        super().__init__()

        assert is_power_of_2(degree) and degree >= 2, f"The degree {degree} must be a power of 2"

        self.head_dim = head_dim
        self.sketch_size = sketch_size if sketch_size is not None else head_dim
        self.degree = degree

        self.gamma = nn.Parameter(torch.ones(head_dim))
        self.beta = nn.Parameter(torch.zeros(head_dim))
        # Number of sketch-to-sketch layers after the first head_dim-to-sketch layer
        # (negative for degree == 2, which simply yields none)
        num_inner = int(math.log2(self.degree)) - 2
        # NOTE: the sketch layers defined here are quite different from the original paper
        # currently we simply use linear layers without any non-linear activations
        # Use the resolved `self.sketch_size`: the raw argument may be None
        self.sketches1 = nn.ModuleList([
            nn.Linear(head_dim, self.sketch_size, bias=False),
            *[nn.Linear(self.sketch_size, self.sketch_size, bias=False) for _ in range(num_inner)]
        ])
        self.sketches2 = nn.ModuleList([
            nn.Linear(head_dim, self.sketch_size, bias=False),
            *[nn.Linear(self.sketch_size, self.sketch_size, bias=False) for _ in range(num_inner)]
        ])

    def forward(self, x: torch.Tensor):
        # Section 2.1
        x = layer_norm(x, self.gamma, self.beta)
        # first map the input to sketch size with learnable parameters
        x = self.sketches1[0](x) * self.sketches2[0](x) * self.head_dim ** -0.5
        for i in range(1, int(math.log2(self.degree)) - 1):
            x = self.sketches1[i](x) * self.sketches2[i](x) * self.head_dim ** -0.5
        # do sketch mapping for log2(p) - 1 times in total
        # do p=2 mapping to ensure non-negativity
        return flatten_diag_outer_product(x, x)
class RebasedFeatureMap(nn.Module):
    """Quadratic feature map with an optional learnable affine/normalisation step in front.

    Args:
        head_dim: Size of the last input dimension.
        use_gamma: Create a learnable scale `gamma` (initialised to ones).
        use_beta: Create a learnable shift `beta` (initialised to zeros).
        normalize: Apply layer normalisation instead of a plain affine transform.
    """

    def __init__(
        self,
        head_dim: int,
        use_gamma: Optional[bool] = True,
        use_beta: Optional[bool] = True,
        normalize: Optional[bool] = True
    ) -> RebasedFeatureMap:
        super().__init__()

        self.head_dim = head_dim
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.normalize = normalize

        self.gamma = nn.Parameter(torch.ones(head_dim)) if use_gamma else None
        self.beta = nn.Parameter(torch.zeros(head_dim)) if use_beta else None

    def forward(self, x: torch.Tensor, flatten: Optional[bool] = True):
        """Transform `x`; with `flatten=False` stop after the affine/normalisation step."""
        has_gamma, has_beta = self.use_gamma, self.use_beta
        if self.normalize:
            if has_beta and has_gamma:
                # both parameters present: project-level layer norm
                x = layer_norm(x, self.gamma, self.beta)
            else:
                # at least one parameter is None: fall back to the functional version
                x = F.layer_norm(x, (self.head_dim,), self.gamma, self.beta)
        elif has_gamma and has_beta:
            x = torch.addcmul(self.beta, x, self.gamma)
        elif has_gamma:
            x = x.mul(self.gamma)
        else:
            raise RuntimeError(f"Not supported combination of `use_gamma`, `use_beta` and `normalize`, "
                               f"which is currentlt set as (`{self.use_gamma}`, `{self.use_beta}`, `{self.normalize}`)")

        if flatten:
            off_diagonal, diagonal = flatten_diag_outer_product_off1(x, x)
            diagonal_scale = self.head_dim ** -0.5
            off_diagonal_scale = (2 / self.head_dim) ** 0.5
            # rebased use learnable parameters to approximate any quadratic function
            return torch.cat([diagonal * diagonal_scale, off_diagonal * off_diagonal_scale], dim=-1)
        return x
def activation_quant(x):
    """
    Fake-quantize activations to 8 bits, one scale per slice along the last dimension.

    Args:
        x: An activation tensor with shape [n, d].

    Returns:
        A tensor with the same shape as `x` whose values were scaled, rounded,
        clamped to [-128, 127] and scaled back.
    """
    # Largest magnitude along the last dim, floored at 1e-5 to avoid dividing by zero
    abs_max = x.abs().max(dim=-1, keepdim=True).values
    scale = 127.0 / abs_max.clamp_(min=1e-5)
    # Quantize ...
    quantized = (x * scale).round().clamp_(-128, 127)
    # ... and de-quantize
    return quantized / scale
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=1),
        triton.Config({}, num_warps=2),
        triton.Config({}, num_warps=4),
        triton.Config({}, num_warps=8),
        triton.Config({}, num_warps=16),
        triton.Config({}, num_warps=32),
    ],
    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
)
@triton.jit
def layer_norm_fwd_kernel_quant(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    RESIDUAL,  # pointer to the residual
    RESIDUAL_OUT,  # pointer to the residual output (input + residual)
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_res_row,
    stride_res_out_row,
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_RESIDUAL: tl.constexpr,
    STORE_RESIDUAL_OUT: tl.constexpr,
    HAS_WEIGHT: tl.constexpr,
    HAS_BIAS: tl.constexpr
):
    # One program normalizes one row: (optional residual add) -> LayerNorm/RMSNorm
    # -> optional affine -> per-row 8-bit fake quantization of the result.
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    X += row * stride_x_row
    Y += row * stride_y_row
    if HAS_RESIDUAL:
        RESIDUAL += row * stride_res_row
    if STORE_RESIDUAL_OUT:
        RESIDUAL_OUT += row * stride_res_out_row
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    x = tl.load(X + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_RESIDUAL:
        residual = tl.load(RESIDUAL + cols, mask=mask, other=0.0).to(tl.float32)
        x += residual
    if STORE_RESIDUAL_OUT:
        tl.store(RESIDUAL_OUT + cols, x, mask=mask)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        xbar = tl.where(mask, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        xbar = tl.where(mask, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation.
    # `other=0.0` keeps the padded lanes (cols >= N) well defined; a masked load
    # without `other` leaves them unspecified.
    if HAS_WEIGHT:
        w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd

    y = x_hat * w if HAS_WEIGHT else x_hat
    if HAS_BIAS:
        y = y + b

    # Padded lanes must not take part in the abs-max below: for LayerNorm they
    # hold (0 - mean) * rstd, which is generally non-zero and would otherwise
    # distort the quantization scale of the row.
    y = tl.where(mask, y, 0.0)

    # Apply quantization to the output
    scale = 127.0 / tl.maximum(tl.max(tl.abs(y), 0), 1e-5)
    # Quantize and then de-quantize the tensor
    y = tl.extra.cuda.libdevice.round(y * scale)
    y = tl.maximum(tl.minimum(y, 127), -128) / scale

    # Write output
    tl.store(Y + cols, y, mask=mask)
@triton.heuristics({
    "RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=1),
        triton.Config({}, num_warps=2),
        triton.Config({}, num_warps=4),
        triton.Config({}, num_warps=8),
        triton.Config({}, num_warps=16),
        triton.Config({}, num_warps=32),
    ],
    key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS"],
)
@triton.jit
def layer_norm_bwd_kernel(
    X,  # pointer to the input
    W,  # pointer to the weights
    B,  # pointer to the biases
    Y,  # pointer to the output to be recomputed
    DY,  # pointer to the output gradient
    DX,  # pointer to the input gradient
    DW,  # pointer to the partial sum of weights gradient
    DB,  # pointer to the partial sum of biases gradient
    DRESIDUAL,
    DRESIDUAL_IN,
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    stride_x_row,  # how much to increase the pointer when moving by 1 row
    stride_y_row,
    stride_dy_row,
    stride_dx_row,
    stride_dres_row,
    stride_dres_in_row,
    M,  # number of rows in X
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    rows_per_program,
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_DRESIDUAL: tl.constexpr,
    STORE_DRESIDUAL: tl.constexpr,
    HAS_WEIGHT: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    RECOMPUTE_OUTPUT: tl.constexpr,
):
    # Each program handles `rows_per_program` consecutive rows, accumulating its
    # own partial dW / dB which are written to row `program_id` of DW / DB.
    # Map the program id to the elements of X, DX, and DY it should compute.
    row_block_id = tl.program_id(0)
    row_start = row_block_id * rows_per_program
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    X += row_start * stride_x_row
    if HAS_DRESIDUAL:
        DRESIDUAL += row_start * stride_dres_row
    if STORE_DRESIDUAL:
        DRESIDUAL_IN += row_start * stride_dres_in_row
    DY += row_start * stride_dy_row
    DX += row_start * stride_dx_row
    if RECOMPUTE_OUTPUT:
        Y += row_start * stride_y_row
    if HAS_WEIGHT:
        # `other=0.0`: padded lanes of `w` feed `wdy`, the row reductions c1/c2
        # and the recomputed quantization scale, so they must be well defined.
        w = tl.load(W + cols, mask=mask, other=0.0).to(tl.float32)
        dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if RECOMPUTE_OUTPUT and HAS_BIAS:
        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
    row_end = min((row_block_id + 1) * rows_per_program, M)
    for row in range(row_start, row_end):
        # Load data to SRAM
        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
        if not IS_RMS_NORM:
            mean = tl.load(Mean + row)
        rstd = tl.load(Rstd + row)
        # Compute dx
        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
        xhat = tl.where(mask, xhat, 0.0)
        if RECOMPUTE_OUTPUT:
            y = xhat * w if HAS_WEIGHT else xhat
            if HAS_BIAS:
                y = y + b

            # Apply quantization to the output
            scale = 127.0 / tl.maximum(tl.max(tl.abs(y), 0), 1e-5)
            # Quantize and then de-quantize the tensor
            y = tl.extra.cuda.libdevice.round(y * scale)
            y = tl.maximum(tl.minimum(y, 127), -128) / scale

            tl.store(Y + cols, y, mask=mask)
        wdy = dy
        if HAS_WEIGHT:
            wdy = dy * w
            dw += dy * xhat
        if HAS_BIAS:
            db += dy
        if not IS_RMS_NORM:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            c2 = tl.sum(wdy, axis=0) / N
            dx = (wdy - (xhat * c1 + c2)) * rstd
        else:
            c1 = tl.sum(xhat * wdy, axis=0) / N
            dx = (wdy - xhat * c1) * rstd
        if HAS_DRESIDUAL:
            dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32)
            dx += dres
        # Write dx
        if STORE_DRESIDUAL:
            tl.store(DRESIDUAL_IN + cols, dx, mask=mask)
        tl.store(DX + cols, dx, mask=mask)

        # Advance every row pointer to the next row of this program's slice.
        X += stride_x_row
        if HAS_DRESIDUAL:
            DRESIDUAL += stride_dres_row
        if STORE_DRESIDUAL:
            DRESIDUAL_IN += stride_dres_in_row
        if RECOMPUTE_OUTPUT:
            Y += stride_y_row
        DY += stride_dy_row
        DX += stride_dx_row
    if HAS_WEIGHT:
        tl.store(DW + row_block_id * N + cols, dw, mask=mask)
    if HAS_BIAS:
        tl.store(DB + row_block_id * N + cols, db, mask=mask)
device=bias.device) if bias is not None else None + rows_per_program = math.ceil(M / sm_count) + grid = (sm_count,) + layer_norm_bwd_kernel[grid]( + x, + weight, + bias, + y, + dy, + dx, + _dw, + _db, + dresidual, + dresidual_in, + mean, + rstd, + x.stride(0), + 0 if not recompute_output else y.stride(0), + dy.stride(0), + dx.stride(0), + dresidual.stride(0) if dresidual is not None else 0, + dresidual_in.stride(0) if dresidual_in is not None else 0, + M, + N, + eps, + rows_per_program, + is_rms_norm, + BLOCK_N, + dresidual is not None, + dresidual_in is not None, + weight is not None, + bias is not None, + ) + dw = _dw.sum(0).to(weight.dtype) if weight is not None else None + db = _db.sum(0).to(bias.dtype) if bias is not None else None + # Don't need to compute dresidual_in separately in this case + if has_residual and dx.dtype == x.dtype: + dresidual_in = dx + return (dx, dw, db, dresidual_in) if not recompute_output else (dx, dw, db, dresidual_in, y) + + +class LayerNormLinearQuantFn(torch.autograd.Function): + + @staticmethod + @input_guard + def forward( + ctx, + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-6, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + ): + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + residual_dtype = residual.dtype if residual is not None else (torch.float32 if residual_in_fp32 else None) + y, mean, rstd, residual_out = layer_norm_fwd_quant( + x, + norm_weight, + norm_bias, + eps, + residual, + out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(), + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + ) + y = y.reshape(x_shape_og) + dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype + linear_weight = weight_quant(linear_weight).to(dtype) + 
def layer_norm_linear_quant_fn(
    x,
    norm_weight,
    norm_bias,
    linear_weight,
    linear_bias,
    residual=None,
    eps=1e-6,
    prenorm=False,
    residual_in_fp32=False,
    is_rms_norm=False,
):
    """
    Functional entry point for `LayerNormLinearQuantFn`.

    All arguments are forwarded unchanged, positionally and in the order of
    `LayerNormLinearQuantFn.forward`'s parameters (after `ctx`).

    Returns:
        Whatever `LayerNormLinearQuantFn.apply` returns for these arguments.
    """
    forward_args = (
        x,
        norm_weight,
        norm_bias,
        linear_weight,
        linear_bias,
        residual,
        eps,
        prenorm,
        residual_in_fp32,
        is_rms_norm,
    )
    return LayerNormLinearQuantFn.apply(*forward_args)
class BitLinear(nn.Linear):
    """
    A custom linear layer that applies quantization on both activations and weights.
    This is primarily for training; kernel optimization is needed for efficiency in deployment.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
        norm_eps: float = 1e-8
    ):
        """
        Initializes the BitLinear layer.

        Args:
            in_features: Size of each input sample.
            out_features: Size of each output sample.
            bias: If set to False, the layer will not learn an additive bias. Default: False.
            norm_eps: Epsilon passed to the input `RMSNorm`. Default: 1e-8.
        """
        # Initialize the superclass nn.Linear with the given parameters
        super(BitLinear, self).__init__(in_features, out_features, bias=bias)

        self.norm = RMSNorm(in_features, eps=norm_eps)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}({super().extra_repr()}, norm_eps={self.norm.eps})"

    def forward(self, x):
        """
        Overrides the forward pass to include quantization.

        Args:
            x: An input tensor with shape [n, in_features].

        Returns:
            An output tensor with shape [n, out_features].
        """
        # Weight tensor
        w = self.weight

        # Apply RMS normalization to the input
        x_norm = self.norm(x)

        # Apply quantization to both activations and weights
        # Uses Straight-Through Estimator (STE) trick with .detach() for gradient flow
        x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
        w_quant = w + (weight_quant(w) - w).detach()
        # Perform linear operation with quantized values. `self.bias` is None
        # when the layer was built with `bias=False`, so the default behaviour
        # is unchanged; with `bias=True` the learned bias is now actually
        # applied, matching what FusedBitLinear does.
        y = F.linear(x_quant, w_quant, self.bias)

        return y
+ """ + # Initialize the superclass nn.Linear with the given parameters + super(FusedBitLinear, self).__init__(in_features, out_features, bias=bias) + + def forward(self, x): + return layer_norm_linear_quant_fn( + x, + self.norm.weight, + self.norm.bias, + self.weight, + self.bias, + is_rms_norm=True + ) diff --git a/fla/modules/fused_cross_entropy.py b/fla/modules/fused_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..f85091f66fe5539d4d6c68ca801b3b51ac8b94e4 --- /dev/null +++ b/fla/modules/fused_cross_entropy.py @@ -0,0 +1,419 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2023, Tri Dao. + +from typing import Any, Tuple + +import torch +import torch.nn as nn +import triton +import triton.language as tl + +from fla.ops.utils.op import exp, log +from fla.utils import input_guard + +# `all_gather_into_tensor` and `reduce_scatter_tensor` are new placeholders for +# `_all_gather_base` and `_reduce_scatter_base`. They require the most recent +# version of PyTorch. The following 2 lines are for backward compatibility with +# older PyTorch. +if "all_gather_into_tensor" not in dir(torch.distributed): + torch.distributed.all_gather_into_tensor = torch.distributed._all_gather_base + + +@triton.heuristics({ + "HAS_SMOOTHING": lambda args: args["label_smoothing"] > 0.0, +}) +@triton.jit +def cross_entropy_fwd_kernel( + loss_ptr, # data ptrs + lse_ptr, + z_loss_ptr, + logits_ptr, + labels_ptr, + label_smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, # Useful for tensor parallel when each rank only has a subset of classes + n_cols, # shapes + n_rows, + logits_row_stride, # strides + BLOCK_SIZE: tl.constexpr, + HAS_SMOOTHING: tl.constexpr, + # if SPLIT (e.g. 
tensor parallel), don't include the LSE in the loss since it's not the final LSE + SPLIT: tl.constexpr, +): + row_idx = tl.program_id(0) + col_block_idx = tl.program_id(1) + logits_ptr = logits_ptr + row_idx * logits_row_stride.to(tl.int64) + col_offsets = col_block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + label_idx = tl.load(labels_ptr + row_idx) + logits = tl.load(logits_ptr + col_offsets, mask=col_offsets < n_cols, other=-float("inf")) + logits = logits.to(tl.float32) * logit_scale + max_logits = tl.max(logits, 0) + if HAS_SMOOTHING: + sum_logits = tl.sum(tl.where(col_offsets < n_cols, logits, 0.0), 0) + lse = log(tl.sum(exp(logits - max_logits), 0)) + max_logits + tl.store(lse_ptr + col_block_idx * n_rows + row_idx, lse) + if label_idx == ignore_index: + loss = 0.0 + z_loss = 0.0 + else: + label_idx -= class_start_idx + if label_idx >= col_block_idx * BLOCK_SIZE and label_idx < min( + n_cols, (col_block_idx + 1) * BLOCK_SIZE + ): + logits_label = tl.load(logits_ptr + label_idx) * logit_scale + if HAS_SMOOTHING: + loss = ( + (lse if not SPLIT else 0.0) + - label_smoothing * sum_logits / total_classes + - (1 - label_smoothing) * logits_label + ) + else: + loss = (lse if not SPLIT else 0.0) - logits_label + else: + # If label is out of bounds, we set the CE loss to 0.0. 
But we still want the label_smoothing loss + if HAS_SMOOTHING: + loss = label_smoothing * ((lse if not SPLIT else 0.0) - sum_logits / total_classes) + else: + loss = 0.0 + if not SPLIT: + z_loss = lse_square_scale * lse * lse + loss += z_loss + else: + z_loss = 0.0 + tl.store(loss_ptr + col_block_idx * n_rows + row_idx, loss) + if not SPLIT: + tl.store(z_loss_ptr + col_block_idx * n_rows + row_idx, z_loss) + + +@triton.heuristics({ + "HAS_SMOOTHING": lambda args: args["label_smoothing"] > 0.0, +}) +@triton.jit +def cross_entropy_bwd_kernel( + dlogits_ptr, # data ptrs + dloss_ptr, + logits_ptr, + lse_ptr, + labels_ptr, + label_smoothing, + logit_scale, + lse_square_scale, + ignore_index, + total_classes, + class_start_idx, # Useful for tensor parallel when each rank only has a subset of classes + n_cols, # shapes + logits_row_stride, # strides + dlogits_row_stride, + dloss_row_stride, + BLOCK_SIZE: tl.constexpr, + HAS_SMOOTHING: tl.constexpr, +): + row_idx = tl.program_id(0) + col_block_idx = tl.program_id(1) + logits_ptr = logits_ptr + row_idx * logits_row_stride.to(tl.int64) + dlogits_ptr = dlogits_ptr + row_idx * dlogits_row_stride.to(tl.int64) + col_offsets = col_block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + label_idx = tl.load(labels_ptr + row_idx) + if label_idx != ignore_index: + dloss = tl.load(dloss_ptr + row_idx * dloss_row_stride) + else: + dloss = 0.0 + logits = tl.load(logits_ptr + col_offsets, mask=col_offsets < n_cols, other=-float("inf")).to( + tl.float32 + ) * logit_scale + lse = tl.load(lse_ptr + row_idx) + probs = exp(logits - lse) + probs += 2.0 * lse_square_scale * lse * probs + label_idx -= class_start_idx + if HAS_SMOOTHING: + smooth_negative = label_smoothing / total_classes + probs = tl.where(col_offsets == label_idx, probs - (1 - label_smoothing), probs) - smooth_negative + else: + probs = tl.where(col_offsets == label_idx, probs - 1.0, probs) + tl.store(dlogits_ptr + col_offsets, (dloss * logit_scale) * probs, mask=col_offsets < 
def fused_cross_entropy_forward(
    logits: torch.Tensor,
    target: torch.Tensor,
    label_smoothing: float = 0.0,
    logit_scale: float = 1.0,
    lse_square_scale: float = 0.0,
    ignore_index: int = -100,
    process_group=None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int]:
    """
    Launch `cross_entropy_fwd_kernel` and finish the reduction on the host.

    Args:
        logits: 2D tensor `[n_rows, n_cols]`; made contiguous if its last stride is not 1.
        target: 1D tensor `[n_rows]` of class indices.
        label_smoothing: Label-smoothing factor forwarded to the kernel.
        logit_scale: Factor the kernel multiplies the logits by.
        lse_square_scale: Weight of the `lse ** 2` ("z-loss") term.
        ignore_index: Rows whose target equals this value get loss 0.
        process_group: If not None, `n_cols` is treated as this rank's share of
            `world_size * n_cols` classes and results are combined across ranks.

    Returns:
        `(losses, z_losses, lse, total_classes, class_start_idx)`, where
        `class_start_idx` is `rank * n_cols`.
    """
    n_rows, n_cols = logits.shape
    assert target.shape == (n_rows,)
    world_size = 1 if process_group is None else torch.distributed.get_world_size(process_group)
    total_classes = world_size * n_cols
    rank = 0 if process_group is None else torch.distributed.get_rank(process_group)
    # Offset of this rank's first class inside the global vocabulary.
    class_start_idx = rank * n_cols

    if logits.stride(-1) != 1:
        logits = logits.contiguous()
    # Set these similar to https://github.com/openai/triton/blob/main/python/tutorials/02-fused-softmax.py
    MAX_BLOCK_SIZE = 64 * 1024
    BLOCK_SIZE = min(triton.next_power_of_2(n_cols), MAX_BLOCK_SIZE)
    # NOTE: BLOCK_SIZE is capped at 64 * 1024 above, so the final `else 32`
    # branch can never be selected.
    num_warps = (
        4
        if BLOCK_SIZE < 2048
        else (8 if BLOCK_SIZE < 8192 else (16 if BLOCK_SIZE < 128 * 1024 else 32))
    )
    # We may split the lse computation across multiple blocks, then do a reduction
    # lse(local_lse) to get the final LSE. This is faster for large n_cols (e.g., > 64k)
    # where having just one thread block processing more than 64k elements is slow.
    split = world_size > 1 or n_cols > MAX_BLOCK_SIZE
    n_splits = (n_cols + BLOCK_SIZE - 1) // BLOCK_SIZE
    # One partial result per column block when the row is split, otherwise one per row.
    loss_shape = (n_splits, n_rows) if n_splits > 1 else (n_rows,)
    losses = torch.empty(*loss_shape, dtype=torch.float, device=logits.device)
    lse = torch.empty(*loss_shape, dtype=torch.float, device=logits.device)
    z_losses = torch.empty(*loss_shape, dtype=torch.float, device=logits.device)

    # Grid: one program per (row, column block).
    cross_entropy_fwd_kernel[(n_rows, n_splits)](
        losses,  # data ptrs
        lse,
        z_losses,
        logits,
        target,
        label_smoothing,
        logit_scale,
        lse_square_scale,
        ignore_index,
        total_classes,
        class_start_idx,
        n_cols,  # shapes
        n_rows,
        logits.stride(0),  # strides
        BLOCK_SIZE=BLOCK_SIZE,  # constants
        num_warps=num_warps,
        SPLIT=split
    )

    if split:
        # If there's no label_smoothing, if target are in the vocab of this partition, losses contains
        # - predicted logit, and 0 otherwise.
        # If there's label_smoothing=0.1, for target in the vocab of this partition, losses contains
        # -0.9 * predicted logit - 0.1 * sum logit / total_classes.
        # For target not in the vocab of this partition, losses contains
        # -0.1 * sum logit / total_classes.
        if n_splits > 1:
            # Combine the per-block partial results of each row.
            lse = torch.logsumexp(lse, dim=0)
            losses = losses.sum(dim=0)
        if world_size > 1:
            lse_allgather = torch.empty(world_size, n_rows, dtype=lse.dtype, device=lse.device)
            torch.distributed.all_gather_into_tensor(lse_allgather, lse, group=process_group)
            # The all-reduce of the losses is started asynchronously and only
            # waited on after the local logsumexp over the gathered lse.
            handle_losses = torch.distributed.all_reduce(
                losses, op=torch.distributed.ReduceOp.SUM, group=process_group, async_op=True
            )
            lse = torch.logsumexp(lse_allgather, dim=0)
            handle_losses.wait()
        # After the allreduce, if there's no label_smoothing, the total losses are - predicted_logit,
        # we just have to add the (global) lse.
        # If there's label_smoothing=0.1, the total losses are
        # -0.9 * predicted_logit - 0.1 * sum logit / total_classes.
        # Again, we just have to add the (global) lse.
        losses += lse
        if lse_square_scale != 0.0:
            # In split mode the kernel skips the z-loss, so it is computed here
            # from the final lse.
            z_losses = lse_square_scale * lse.square()
            z_losses.masked_fill_(target == ignore_index, 0.0)
            losses += z_losses
        else:
            z_losses = torch.zeros_like(losses)
        # `lse` was added to every row above, including ignored ones; zero those.
        losses.masked_fill_(target == ignore_index, 0.0)

    return losses, z_losses, lse, total_classes, class_start_idx
+ + logits, lse, target = ctx.saved_tensors + dlogits = logits if ctx.inplace_backward else torch.empty_like(logits) + n_rows, n_cols = logits.shape + BLOCK_SIZE = min(triton.next_power_of_2(n_cols), 4 * 1024) + num_warps = 4 if BLOCK_SIZE < 2048 else (8 if BLOCK_SIZE < 8192 else 16) + def grid(META): return (n_rows, triton.cdiv(n_cols, META["BLOCK_SIZE"])) # noqa + cross_entropy_bwd_kernel[grid]( + dlogits, # data ptrs + grad_losses, + logits, + lse, + target, + ctx.label_smoothing, + ctx.logit_scale, + ctx.lse_square_scale, + ctx.ignore_index, + ctx.total_classes, + ctx.class_start_idx, + n_cols, # shapes + logits.stride(0), # strides + dlogits.stride(0), + grad_losses.stride(0), + BLOCK_SIZE=BLOCK_SIZE, # constants + num_warps=num_warps, + ) + return dlogits, None, None, None, None, None, None, None, None + + +def cross_entropy_loss( + logits: torch.Tensor, + target: torch.Tensor, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + lse_square_scale: float = 0.0, + ignore_index=-100, + inplace_backward: bool = False, + process_group=None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Arguments: + logits: [batch, vocab_size] + target: [batch,] + label_smoothing: float + logit_scale: float. + Multiply logits by this scale before calculating the loss. + lse_square_scale: float. + If > 0, we add lse_square_scale * lse(logits) ^ 2 to the loss. + This is also referred to as "z-loss". + ignore_index: int. + If target == ignore_index, the loss is set to 0.0. + inplace_backward: bool. + If True, we do the backward pass in-place by modifying the logits. + This saves memory. + process_group: + if not None, we're doing Tensor Parallel: each process is responsible for + one part of the vocab. The loss will be aggregated across processes. 
class FusedCrossEntropyLoss(nn.Module):
    """Module wrapper around `cross_entropy_loss` that adds a reduction step."""

    def __init__(
        self,
        ignore_index: int = -100,
        reduction: str = "mean",
        label_smoothing: float = 0.0,
        logit_scale: float = 1.0,
        lse_square_scale: float = 0.0,
        inplace_backward: bool = False,
        process_group: Any = None,
        return_z_loss: bool = False,
    ):
        """
        Arguments:
            ignore_index: int. Targets equal to this value contribute a loss of 0.0.
            reduction: str. One of 'mean', 'none' or 'sum'.
            label_smoothing: float
            logit_scale: float. Forwarded to `cross_entropy_loss`.
            lse_square_scale: float. If > 0, lse_square_scale * lse(logits) ^ 2 is added
                to the loss (the "z-loss").
            inplace_backward: bool. If True, the backward pass overwrites the logits
                to save memory.
            process_group: if not None, Tensor Parallel is assumed: each process holds
                one part of the vocab and the loss is aggregated across processes.
            return_z_loss: bool. If True, `forward` also returns the z-loss component.
                It is meant for logging only and does not support backprop.
        """
        super().__init__()
        supported_reductions = ("mean", "none", "sum")
        if reduction not in supported_reductions:
            raise NotImplementedError("Only support reduction = 'mean' or 'none' or 'sum'")
        self.ignore_index = ignore_index
        self.reduction = reduction
        self.label_smoothing = label_smoothing
        self.logit_scale = logit_scale
        self.lse_square_scale = lse_square_scale
        self.inplace_backward = inplace_backward
        self.process_group = process_group
        self.return_z_loss = return_z_loss

    def forward(self, input, target):
        """
        Arguments:
            input: (batch, vocab_size)
            target: (batch,)
        Returns:
            losses: (batch,) if reduction is 'none', else a reduced value, dtype float
            z_loss: same layout as `losses` (only if self.return_z_loss)
        """
        assert input.is_cuda and target.is_cuda, "Only support CUDA tensors"
        per_row_loss, per_row_z_loss = cross_entropy_loss(
            input,
            target,
            label_smoothing=self.label_smoothing,
            logit_scale=self.logit_scale,
            lse_square_scale=self.lse_square_scale,
            ignore_index=self.ignore_index,
            inplace_backward=self.inplace_backward,
            process_group=self.process_group,
        )

        def reduce(values):
            # 'mean' averages over the rows whose target is not ignored.
            if self.reduction == "mean":
                return values.sum() / (target != self.ignore_index).sum()
            if self.reduction == "sum":
                return values.sum()
            return values

        loss = reduce(per_row_loss)
        if not self.return_z_loss:
            return loss
        return loss, reduce(per_row_z_loss)
@triton.jit
def kl_div_kernel(
    logits,
    target_logits,
    loss,
    s_logits,
    s_loss,
    reduction: tl.constexpr,
    N: tl.constexpr,
    V: tl.constexpr,
    BV: tl.constexpr
):
    """
    Per-row KL divergence between softmax(target_logits) and softmax(logits).

    One program handles one row of length V in blocks of BV. It writes the
    row's loss to `loss` and overwrites the row of `logits` in place with
    `softmax(logits) - softmax(target_logits)`; both are divided by N when
    `reduction == 'batchmean'`.
    """
    # https://github.com/triton-lang/triton/issues/1058
    # If N*V is too large, i_n * stride will overflow out of int32, so we convert to int64
    i_n = tl.program_id(0).to(tl.int64)

    logits += i_n * s_logits
    # NOTE(review): the target rows are advanced with the same stride as
    # `logits` -- assumes both tensors share a row stride; confirm at the caller.
    target_logits += i_n * s_logits

    # m is the max value. use the notation from the paper
    sm = float('-inf')
    tm = float('-inf')
    # d is the sum. use the notation from the paper
    sd, td = 0.0, 0.0

    # Pass 1: running max and rescaled sum of exponentials over the row, for
    # the student (s*) and teacher (t*) logits.
    NV = tl.cdiv(V, BV)
    for iv in range(0, NV):
        o_x = iv * BV + tl.arange(0, BV)
        # for student
        b_sl = tl.load(logits + o_x, mask=o_x < V, other=float('-inf'))
        b_sm = tl.max(b_sl)
        m_new = tl.maximum(sm, b_sm)
        # Rescale the sum accumulated so far to the new maximum before adding this block.
        sd = sd * exp(sm - m_new) + tl.sum(exp(b_sl - m_new))
        sm = m_new
        # for teacher
        b_tl = tl.load(target_logits + o_x, mask=o_x < V, other=float('-inf'))
        b_tm = tl.max(b_tl)
        m_new = tl.maximum(tm, b_tm)
        td = td * exp(tm - m_new) + tl.sum(exp(b_tl - m_new))
        tm = m_new

    b_loss = 0.
    # Pass 2: with log-probabilities log p_t and log p_s,
    # KL(p_t || p_s) = sum p_t * (log p_t - log p_s)
    for iv in range(0, NV):
        o_x = iv * BV + tl.arange(0, BV)
        b_sl = tl.load(logits + o_x, mask=o_x < V, other=float('-inf'))
        b_tl = tl.load(target_logits + o_x, mask=o_x < V, other=float('-inf'))
        # log-softmax of each side using the statistics from pass 1
        b_sp_log = b_sl - sm - log(sd)
        b_tp_log = b_tl - tm - log(td)
        b_sp = exp(b_sp_log)
        b_tp = exp(b_tp_log)
        # Padded lanes hold -inf on both sides; the where() replaces their
        # (-inf) - (-inf) result with 0 before it enters the sum.
        b_kl = tl.where(o_x < V, b_tp * (b_tp_log - b_sp_log), 0)
        # Value stored back into `logits` below: student minus teacher probabilities.
        b_dl = -b_tp + b_sp
        b_loss += tl.sum(b_kl)
        if reduction == 'batchmean':
            b_dl = b_dl / N
        tl.store(logits + o_x, b_dl, mask=o_x < V)

    # Normalize the loss by the number of elements if reduction is 'batchmean'
    if reduction == 'batchmean':
        b_loss = b_loss / N

    tl.store(loss + i_n * s_loss, b_loss)
+ """ + + # Get the program ID and convert it to int64 to avoid overflow + i_x = tl.program_id(0).to(tl.int64) + o_x = i_x * B + tl.arange(0, B) + + # Load the gradient output value + b_g = tl.load(g) + b_x = tl.load(x + o_x, mask=o_x < N) + tl.store(x + o_x, b_x * b_g, mask=o_x < N) + + +def fused_kl_div_forward( + x: torch.Tensor, + target_x: torch.Tensor, + weight: torch.Tensor, + target_weight: torch.Tensor, + reduction: str = 'batchmean' +): + device = x.device + + # ideally, we would like to achieve the same memory consumption as [N, H], + # so the expected chunk size should be: + # NC = ceil(V / H) + # C = ceil(N / NC) + # for ex: N = 4096*4, V = 32000, H = 4096 ==> NC = 8, C = ceil(N / NC) = 2048 + N, H, V = *x.shape, weight.shape[0] + BV = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + # TODO: in real cases, we may need to limit the number of chunks NC to + # ensure the precisions of accumulated gradients + NC = min(8, triton.cdiv(V, H)) + C = triton.next_power_of_2(triton.cdiv(N, NC)) + NC = triton.cdiv(N, C) + + dx = torch.zeros_like(x, device=device) + dw = torch.zeros_like(weight, device=device) if weight is not None else None + # we use fp32 for loss accumulator + loss = torch.zeros(N, dtype=torch.float32, device=device) + + for ic in range(NC): + start, end = ic * C, min((ic + 1) * C, N) + # [C, N] + c_sx = x[start:end] + c_tx = target_x[start:end] + # when doing matmul, use the original precision + # [C, V] + c_sl = F.linear(c_sx, weight) + c_tl = F.linear(c_tx, target_weight) + + # unreduced loss + c_loss = loss[start:end] + + # Here we calculate the gradient of c_sx in place so we can save memory. 
+ kl_div_kernel[(c_sx.shape[0],)]( + logits=c_sl, + target_logits=c_tl, + loss=c_loss, + s_logits=c_sl.stride(-2), + s_loss=c_loss.stride(-1), + reduction=reduction, + N=N, + V=V, + BV=BV, + num_warps=32 + ) + + # gradient of logits is computed in-place by the above triton kernel and is of shape: C x V + # thus dx[start: end] should be of shape: C x H + # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only + # on `n_non_ignore` tokens. However, the gradient of the input should be calculated for all tokens. + # Thus, we need an additional scaling factor of (n_non_ignore/total) to scale the gradients. + # [C, H] + + dx[start:end] = torch.mm(c_sl, weight) + + if weight is not None: + torch.addmm(input=dw, mat1=c_sl.t(), mat2=c_sx, out=dw) + + loss = loss.sum() + return loss, dx, dw + + +def fused_kl_div_backward( + do: torch.Tensor, + dx: torch.Tensor, + dw: torch.Tensor +): + # If cross entropy is the last layer, do is 1.0. Skip the mul to save time + if torch.ne(do, torch.tensor(1.0, device=do.device)): + # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place + # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton. 
+ N, H = dx.shape + B = min(MAX_FUSED_SIZE, triton.next_power_of_2(H)) + + elementwise_mul_kernel[(triton.cdiv(N * H, B),)]( + x=dx, + g=do, + N=N*H, + B=B, + num_warps=32, + ) + + # handle dw + if dw is not None: + V, H = dw.shape + elementwise_mul_kernel[(triton.cdiv(V * H, B),)]( + x=dw, + g=do, + N=V*H, + B=B, + num_warps=32, + ) + + return dx, dw + + +class FusedKLDivLossFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward( + ctx, + x: torch.Tensor, + target_x: torch.Tensor, + weight: torch.Tensor, + target_weight: torch.Tensor, + reduction: str + ): + loss, dx, dw = fused_kl_div_forward( + x=x, + target_x=target_x, + weight=weight, + target_weight=target_weight, + reduction=reduction + ) + ctx.save_for_backward(dx, dw) + return loss + + @staticmethod + @input_guard + def backward(ctx, do): + dx, dw = ctx.saved_tensors + dx, dw = fused_kl_div_backward(do, dx, dw) + return dx, None, dw, None, None + + +def fused_kl_div_loss( + x: torch.Tensor, + target_x: torch.Tensor, + weight: torch.Tensor, + target_weight: torch.Tensor, + reduction: str = 'batchmean' +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x (torch.Tensor): [batch_size * seq_len, hidden_size] + target_x (torch.Tensor): [batch_size * seq_len, hidden_size] + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + target_weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + reduction: + Specifies the reduction to apply to the output: 'batchmean'. Default: 'batchmean'. + Returns: + loss + """ + return FusedKLDivLossFunction.apply( + x, + target_x, + weight, + target_weight, + reduction + ) + + +class FusedKLDivLoss(nn.Module): + + def __init__( + self, + reduction: str = 'batchmean' + ): + """ + Args: + reduction: + Specifies the reduction to apply to the output: 'batchmean'. Default: 'batchmean'. 
+ """ + super().__init__() + + assert reduction in ['batchmean'], f"reduction: {reduction} is not supported" + + self.reduction = reduction + + def forward( + self, + x: torch.Tensor, + target_x: torch.Tensor, + weight: torch.Tensor, + target_weight: torch.Tensor + ): + """ + Args: + x (torch.Tensor): [batch_size * seq_len, hidden_size] + target_x (torch.Tensor): [batch_size * seq_len, hidden_size] + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + target_weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + Returns: + loss + """ + loss = fused_kl_div_loss( + x=x, + target_x=target_x, + weight=weight, + target_weight=target_weight, + reduction=self.reduction + ) + return loss diff --git a/fla/modules/fused_linear_cross_entropy.py b/fla/modules/fused_linear_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..a18566fdbffccdd0d2b3bbe3586fdb38f18720e4 --- /dev/null +++ b/fla/modules/fused_linear_cross_entropy.py @@ -0,0 +1,570 @@ +# -*- coding: utf-8 -*- + +# Code adapted from +# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/fused_linear_cross_entropy.py + +from functools import partial +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl +from torch.distributed import DeviceMesh +from torch.distributed.tensor import DTensor, Replicate, Shard, distribute_module +from torch.distributed.tensor.parallel import ParallelStyle + +from fla.ops.utils import logsumexp_fwd +from fla.ops.utils.op import exp +from fla.utils import input_guard + +# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 +# https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19 +# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling +# The 
optimal maximum block size depends on your hardware, your kernel, and your dtype +MAX_FUSED_SIZE = 65536 // 2 + + +@triton.jit +def cross_entropy_kernel( + logits, + lse, + target, + loss, + total, + ignore_index, + label_smoothing: tl.constexpr, + logit_scale: tl.constexpr, + reduction: tl.constexpr, + V: tl.constexpr, + BV: tl.constexpr +): + """ + This kernel computes both cross entropy loss and the gradient of the input. + We only consider hard label + mean reduction for now. + Please refer to https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for the math. + + Args: + logits: + Pointer to logits tensor. + lse: + Pointer to logsumexp tensor. + target: Pointer to target tensor. + loss: + Pointer to tensor to store the loss. + V (int): + The number of columns in the input tensor. + total (int): + The number of non-ignored classes. + ignore_index (int): + The index to ignore in the target. + label_smoothing (float): + The amount of smoothing when computing the loss, where 0.0 means no smoothing. + reduction (str): + The string for the reduction to apply + BV (int): + The block size for vocab. + """ + + # https://github.com/triton-lang/triton/issues/1058 + # If B*T*V is too large, i_n * stride will overflow out of int32, so we convert to int64 + i_n = tl.program_id(0).to(tl.int64) + NV = tl.cdiv(V, BV) + + # 1. Load target first because if the target is ignore_index, we can return right away + b_y = tl.load(target + i_n) + + # 2. locate the start index + logits += i_n * V + + if b_y == ignore_index: + # set all x as 0 + for i in range(0, V, BV): + o_v = i + tl.arange(0, BV) + tl.store(logits + o_v, 0.0, mask=o_v < V) + return + + # Online softmax: 2 loads + 1 store (compared with 3 loads + 1 store for the safe softmax) + # Refer to Algorithm 3 in the paper: https://arxiv.org/pdf/1805.02867 + + # 3. 
[Online softmax] first pass: compute logsumexp + # we did this in anouter kernel + b_l = tl.load(logits + b_y) * logit_scale + b_lse = tl.load(lse + i_n) + + # 4. Calculate the loss + # loss = lse - logits_l + b_loss = b_lse - b_l + + # Label smoothing is a general case of normal cross entropy + # See the full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issue-2503665310 + b_z = 0.0 + eps = label_smoothing / V + + # We need tl.debug_barrier() as mentioned in + # https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/ops/cross_entropy.py#L34 + tl.debug_barrier() + + # 5. [Online Softmax] Second pass: compute gradients + # For 'mean' reduction, gradients are normalized by number of non-ignored elements + # dx_y = (softmax(x_y) - 1) / N + # dx_i = softmax(x_i) / N, i != y + # For label smoothing: + # dx_i = (softmax(x_y) - label_smoothing / V) / N, i != y + # dx_y = (softmax(x_y) - label_smoothing / V - (1 - label_smoothing)) / N + # = dx_i - (1 - label_smoothing) / N + for iv in range(0, NV): + o_v = iv * BV + tl.arange(0, BV) + b_logits = tl.load(logits + o_v, mask=o_v < V, other=float('-inf')) * logit_scale + if label_smoothing > 0: + # scale X beforehand to avoid overflow + b_z += tl.sum(tl.where(o_v < V, -eps * b_logits, 0.0)) + b_p = (exp(b_logits - b_lse) - eps) * logit_scale + if reduction == "mean": + b_p = b_p / total + tl.store(logits + o_v, b_p, mask=o_v < V) + + tl.debug_barrier() + + # Orginal loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps + # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p) + # = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i)) + # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as: + # = (1 - label_smoothing) * H(q, p) + (-sum(x_i * eps) + label_smoothing * (m + logd)) + # Refer to H(q', p) in section 7 of the paper: + # https://arxiv.org/pdf/1512.00567 + # pytorch: + # 
https://github.com/pytorch/pytorch/blob/2981534f54d49fa3a9755c9b0855e7929c2527f0/aten/src/ATen/native/LossNLL.cpp#L516 + # See full derivation at https://github.com/linkedin/Liger-Kernel/pull/198#issuecomment-2333753087 + if label_smoothing > 0: + b_loss = b_loss * (1 - label_smoothing) + (b_z + label_smoothing * b_lse) + + # 6. Specially handle the i==y case where `dx_y = (softmax(x_y) - (1 - label_smoothing) / N` + b_l = tl.load(logits + b_y) + + # Normalize the loss by the number of non-ignored elements if reduction is "mean" + if reduction == 'mean': + b_loss = b_loss / total + b_l += (label_smoothing - 1) / total * logit_scale + else: + b_l += (label_smoothing - 1) * logit_scale + + tl.store(loss + i_n, b_loss) + tl.store(logits + b_y, b_l) + + +@triton.jit +def elementwise_mul_kernel( + x, + g, + N: tl.constexpr, + B: tl.constexpr +): + """ + This function multiplies each element of the tensor pointed by x with the value pointed by g. + The multiplication is performed in-place on the tensor pointed by x. + + Parameters: + x: + Pointer to the input tensor. + g: + Pointer to the gradient output value. + N (int): + The number of columns in the input tensor. + B (int): + The block size for Triton operations. 
+ """ + + # Get the program ID and convert it to int64 to avoid overflow + i_x = tl.program_id(0).to(tl.int64) + o_x = i_x * B + tl.arange(0, B) + + # Load the gradient output value + b_g = tl.load(g) + b_x = tl.load(x + o_x, mask=o_x < N) + tl.store(x + o_x, b_x * b_g, mask=o_x < N) + + +def fused_linear_cross_entropy_forward( + x: torch.Tensor, + target: torch.LongTensor, + weight: torch.Tensor, + bias: torch.Tensor = None, + ignore_index: int = -100, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" +): + device = x.device + # inputs have shape: [N, H] + # materialized activations will have shape: [N, V] + # the increase in memory = [N, V] + # reduction can be achieved by partitioning the number of tokens N into smaller chunks. + + # ideally, we would like to achieve the same memory consumption as [N, H], + # so the expected chunk size should be: + # NC = ceil(V / H) + # C = ceil(N / NC) + # for ex: N = 4096*4, V = 32000, H = 4096 ==> NC = 8, C = ceil(N / NC) = 2048 + N, H, V = *x.shape, weight.shape[0] + BV = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + # TODO: in real cases, we may need to limit the number of chunks NC to + # ensure the precisions of accumulated gradients + NC = min(num_chunks, triton.cdiv(V, H)) + C = triton.next_power_of_2(triton.cdiv(N, NC)) + NC = triton.cdiv(N, C) + + # [N, H] + dx = torch.zeros_like(x, device=device) + # [V, H] + dw = torch.zeros_like(weight, device=device, dtype=torch.float) if weight is not None else None + # [V] + db = torch.zeros_like(bias, device=device, dtype=torch.float) if bias is not None else None + # [N] + loss = torch.zeros(N, device=device, dtype=torch.float) + + total = target.ne(ignore_index).sum().item() + + for ic in range(NC): + start, end = ic * C, min((ic + 1) * C, N) + # [C, N] + c_x = x[start:end] + # when doing matmul, use the original precision + # [C, V] + c_logits = F.linear(c_x, weight, bias) + c_target = target[start:end] + # [C] + 
# keep lse in fp32 to maintain precision + c_lse = logsumexp_fwd(c_logits, scale=logit_scale, dtype=torch.float) + + # unreduced loss + c_loss = loss[start:end] + + # Here we calculate the gradient of c_logits in place so we can save memory. + cross_entropy_kernel[(c_logits.shape[0],)]( + logits=c_logits, + lse=c_lse, + target=c_target, + loss=c_loss, + total=total, + ignore_index=ignore_index, + label_smoothing=label_smoothing, + logit_scale=logit_scale, + reduction=reduction, + V=V, + BV=BV, + num_warps=32 + ) + + # gradient of logits is computed in-place by the above triton kernel and is of shape: C x V + # thus dx should be of shape: C x H + dx[start:end] = torch.mm(c_logits, weight) + + # keep dw in fp32 to maintain precision + if weight is not None: + dw += c_logits.t() @ c_x + + if bias is not None: + torch.add(input=db, other=c_logits.sum(0), out=db) + + loss = loss.sum() + if dw is not None: + dw = dw.to(weight) + if db is not None: + db = db.to(bias) + return loss, dx, dw, db + + +def fused_linear_cross_entropy_backward( + do: torch.Tensor, + dx: torch.Tensor, + dw: torch.Tensor, + db: torch.Tensor +): + # If cross entropy is the last layer, do is 1.0. Skip the mul to save time + if torch.ne(do, torch.tensor(1.0, device=do.device)): + # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place + # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton. 
+ N, H = dx.shape + B = min(MAX_FUSED_SIZE, triton.next_power_of_2(H)) + + elementwise_mul_kernel[(triton.cdiv(N * H, B),)]( + x=dx, + g=do, + N=N*H, + B=B, + num_warps=32, + ) + + # handle dw + if dw is not None: + V, H = dw.shape + elementwise_mul_kernel[(triton.cdiv(V * H, B),)]( + x=dw, + g=do, + N=V*H, + B=B, + num_warps=32, + ) + + if db is not None: + V = db.shape[0] + elementwise_mul_kernel[(triton.cdiv(V, B),)]( + x=db, + g=do, + N=V, + B=B, + num_warps=32, + ) + return dx, dw, db + + +class FusedLinearCrossEntropyFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward( + ctx, + x: torch.Tensor, + target: torch.LongTensor, + weight: torch.Tensor, + bias: torch.Tensor = None, + ignore_index: int = -100, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" + ): + """ + Fusing the last linear layer with cross-entropy loss + Reference: https://github.com/mgmalek/efficient_cross_entropy + + Handle the forward and backward pass of the final linear layer via cross-entropy loss by avoiding + the materialization of the large logits tensor. Since Cross Entropy Loss is the last layer, we can + compute the gradient at the forward pass. By doing so, we don't have to store the x and target + for the backward pass. + + x (torch.Tensor): [batch_size * seq_len, hidden_size] + target (torch.LongTensor): [batch_size * seq_len] + where each value is in [0, vocab_size). + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + bias (Optional[torch.Tensor]): [vocab_size] + where `vocab_size` is the number of classes. + ignore_index: + the index to ignore in the target. + label_smoothing: + the amount of smoothing when computing the loss, where 0.0 means no smoothing. + logit_scale: float = 1.0, + A scaling factor applied to the logits. Default: 1.0 + num_chunks: int + The number of chunks to split the input tensor into for processing. 
+ This can help optimize memory usage and computation speed. + Default: 8 + reduction: + Specifies the reduction to apply to the output: 'mean' | 'sum'. + 'mean': the weighted mean of the output is taken, + 'sum': the output will be summed. + Default: 'mean'. + """ + loss, dx, dw, db = fused_linear_cross_entropy_forward( + x, + target, + weight, + bias, + ignore_index, + label_smoothing, + logit_scale, + num_chunks, + reduction + ) + # downcast to dtype and store for backward + ctx.save_for_backward( + dx.detach(), + dw.detach() if weight is not None else None, + db.detach() if bias is not None else None, + ) + return loss + + @staticmethod + @input_guard + def backward(ctx, do): + dx, dw, db = ctx.saved_tensors + dx, dw, db = fused_linear_cross_entropy_backward(do, dx, dw, db) + return dx, None, dw, db, None, None, None, None, None + + +def fused_linear_cross_entropy_loss( + x: torch.Tensor, + target: torch.LongTensor, + weight: torch.Tensor, + bias: torch.Tensor = None, + ignore_index: int = -100, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + x (torch.Tensor): [batch_size * seq_len, hidden_size] + target (torch.LongTensor): [batch_size * seq_len] + where each value is in [0, vocab_size). + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + bias (Optional[torch.Tensor]): [vocab_size] + where `vocab_size` is the number of classes. + ignore_index: int. + If target == ignore_index, the loss is set to 0.0. + label_smoothing: float + logit_scale: float + A scaling factor applied to the logits. Default: 1.0 + num_chunks: int + The number of chunks to split the input tensor into for processing. + This can help optimize memory usage and computation speed. + Default: 8 + reduction: + Specifies the reduction to apply to the output: 'mean' | 'sum'. 
+ 'mean': the weighted mean of the output is taken, + 'sum': the output will be summed. + Default: 'mean'. + Returns: + losses: [batch,], float + """ + return FusedLinearCrossEntropyFunction.apply( + x, + target, + weight, + bias, + ignore_index, + label_smoothing, + logit_scale, + num_chunks, + reduction + ) + + +class FusedLinearCrossEntropyLoss(nn.Module): + + def __init__( + self, + ignore_index: int = -100, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" + ): + """ + Args: + ignore_index: int. + If target == ignore_index, the loss is set to 0.0. + label_smoothing: float + logit_scale: float + A scaling factor applied to the logits. Default: 1.0 + num_chunks: int + The number of chunks to split the input tensor into for processing. + This can help optimize memory usage and computation speed. + Default: 8 + reduction: + Specifies the reduction to apply to the output: 'mean' | 'sum'. + 'mean': the weighted mean of the output is taken, + 'sum': the output will be summed. + Default: 'mean'. + """ + super().__init__() + + assert reduction in ["mean", "sum"], f"reduction: {reduction} is not supported" + + self.ignore_index = ignore_index + self.label_smoothing = label_smoothing + self.logit_scale = logit_scale + self.num_chunks = num_chunks + self.reduction = reduction + + @torch.compiler.disable + def forward( + self, + x: torch.Tensor, + target: torch.LongTensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None + ): + """ + Args: + x (torch.Tensor): [batch_size, seq_len, hidden_size] + target (torch.LongTensor): [batch_size, seq_len] + where each value is in [0, V). + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + bias (Optional[torch.Tensor]): [vocab_size] + where `vocab_size` is the number of classes. 
+ Returns: + loss + """ + loss = fused_linear_cross_entropy_loss( + x.view(-1, x.shape[-1]), + target.view(-1), + weight=weight, + bias=bias, + ignore_index=self.ignore_index, + label_smoothing=self.label_smoothing, + logit_scale=self.logit_scale, + num_chunks=self.num_chunks, + reduction=self.reduction + ) + return loss + + +class LinearLossParallel(ParallelStyle): + def __init__( + self, + *, + sequence_dim: int = 1, + use_local_output: bool = False, + ): + super().__init__() + + self.sequence_sharding = (Shard(sequence_dim),) + self.use_local_output = use_local_output + + @staticmethod + def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): + x, target, weight, bias = inputs + + if not isinstance(x, DTensor): + # assume the input passed in already sharded on the sequence dim and create the DTensor + x = DTensor.from_local(x, device_mesh, sequence_sharding) + if x.placements != sequence_sharding: + x = x.redistribute(placements=sequence_sharding, async_op=True) + if not isinstance(target, DTensor): + target = DTensor.from_local(target, device_mesh, [Replicate()]) + if target.placements != sequence_sharding: + target = target.redistribute(placements=sequence_sharding, async_op=True) + + if not isinstance(weight, DTensor): + weight = DTensor.from_local(weight, device_mesh, [Replicate()]) + if weight.placements != [Replicate()]: + # we replicate the weight/bias in FLCE + weight = weight.redistribute(placements=[Replicate()], async_op=True) + + if bias is not None and not isinstance(bias, DTensor): + bias = DTensor.from_local(bias, device_mesh, [Replicate()]) + if bias is not None and bias.placements != [Replicate()]: + bias = bias.redistribute(placements=[Replicate()], async_op=True) + + return x.to_local(), target.to_local(), weight.to_local(), bias.to_local() if bias is not None else bias + + @staticmethod + def _prepare_output_fn(use_local_output, mod, outputs, device_mesh): + return outputs.to_local() if use_local_output else outputs + + def 
_apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + return distribute_module( + module, + device_mesh, + partition_fn=None, + input_fn=partial(self._prepare_input_fn, self.sequence_sharding), + output_fn=partial(self._prepare_output_fn, self.use_local_output) + ) diff --git a/fla/modules/fused_linear_listnet_loss.py b/fla/modules/fused_linear_listnet_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..3b1dcc820f06ab925e6f596904dd0b147209eca6 --- /dev/null +++ b/fla/modules/fused_linear_listnet_loss.py @@ -0,0 +1,427 @@ +# -*- coding: utf-8 -*- + +# Code adapted from +# https://github.com/linkedin/Liger-Kernel/blob/main/src/liger_kernel/ops/fused_linear_cross_entropy.py + +from functools import partial +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl +from torch.distributed import DeviceMesh +from torch.distributed.tensor import DTensor, Replicate, Shard, distribute_module +from torch.distributed.tensor.parallel import ParallelStyle + +from fla.ops.utils import logsumexp_fwd +from fla.ops.utils.op import exp +from fla.utils import input_guard + +# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 +# https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19 +# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling +# The optimal maximum block size depends on your hardware, your kernel, and your dtype +MAX_FUSED_SIZE = 65536 // 2 + +@triton.jit +def listnet_kernel( + logits, + targets, # Now full target distributions + lse_logits, + lse_targets, + loss, + total, + ignore_index, + logit_scale: tl.constexpr, + reduction: tl.constexpr, + V: tl.constexpr, + BV: tl.constexpr +): + i_n = tl.program_id(0).to(tl.int64) + NV = tl.cdiv(V, BV) + + # Pointers to current token's data + logits_ptr = logits + i_n * V + 
targets_ptr = targets + i_n * V + loss_ptr = loss + i_n + + # Compute prediction softmax + b_lse_logits = tl.load(lse_logits + i_n) + b_lse_targets = tl.load(lse_targets + i_n) + b_loss = 0.0 + + # Compute gradient: softmax(pred) - softmax(target) + for iv in range(0, NV): + o_v = iv * BV + tl.arange(0, BV) + mask = o_v < V + + # Load target and compute softmax + t_val = tl.load(targets_ptr + o_v, mask=mask, other=0.0) + p_target = tl.exp(t_val - b_lse_targets) + + # Load logits and compute softmax + l_val = tl.load(logits_ptr + o_v, mask=mask, other=0.0) * logit_scale + l_val_minus_lse = l_val - b_lse_logits + p_pred = tl.exp(l_val_minus_lse) + + # Gradient calculation + grad_val = p_pred - p_target + if reduction == "mean": + grad_val = grad_val / total + grad_val = tl.where(b_lse_targets == float('inf'), 0.0, grad_val) + tl.store(logits_ptr + o_v, grad_val, mask=mask) + + # Cross-entropy loss + # instead of: b_loss -= tl.sum(p_target * tl.log(p_pred), axis=0) + b_loss -= tl.sum(p_target * l_val_minus_lse, axis=0) + + tl.store(loss_ptr, b_loss) + +@triton.jit +def elementwise_mul_kernel( + x, + g, + N: tl.constexpr, + B: tl.constexpr +): + """ + This function multiplies each element of the tensor pointed by x with the value pointed by g. + The multiplication is performed in-place on the tensor pointed by x. + + Parameters: + x: + Pointer to the input tensor. + g: + Pointer to the gradient output value. + N (int): + The number of columns in the input tensor. + B (int): + The block size for Triton operations. 
+ """ + + # Get the program ID and convert it to int64 to avoid overflow + i_x = tl.program_id(0).to(tl.int64) + o_x = i_x * B + tl.arange(0, B) + + # Load the gradient output value + b_g = tl.load(g) + b_x = tl.load(x + o_x, mask=o_x < N) + tl.store(x + o_x, b_x * b_g, mask=o_x < N) + + +def fused_linear_listnet_forward( + x: torch.Tensor, + targets: torch.Tensor, # Float tensor [N, V] + weight: torch.Tensor, + bias: torch.Tensor = None, + ignore_index: int = -100, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" +): + N, H, V = *x.shape, weight.shape[0] + BV = min(MAX_FUSED_SIZE, triton.next_power_of_2(V)) + NC = min(num_chunks, triton.cdiv(V, H)) + C = triton.next_power_of_2(triton.cdiv(N, NC)) + NC = triton.cdiv(N, C) + + # Initialize outputs + dx = torch.zeros_like(x) + dw = torch.zeros_like(weight, dtype=torch.float) if weight is not None else None + db = torch.zeros_like(bias, dtype=torch.float) if bias is not None else None + loss = torch.zeros(N, device=x.device, dtype=torch.float) + total = N # All tokens considered + + for ic in range(NC): + start, end = ic * C, min((ic + 1) * C, N) + c_x = x[start:end] + c_logits = F.linear(c_x, weight, bias) + c_targets = targets[start:end] + c_lse_logits = logsumexp_fwd(c_logits, scale=logit_scale, dtype=torch.float) + c_lse_targets = logsumexp_fwd(c_targets, dtype=torch.float).nan_to_num(nan=float("inf")) + c_loss = loss[start:end] + + # Call ListNet kernel + listnet_kernel[(c_logits.shape[0],)]( + logits=c_logits, + targets=c_targets, # Full target distributions + lse_logits=c_lse_logits, + lse_targets=c_lse_targets, + loss=c_loss, + total=total, + ignore_index=ignore_index, + logit_scale=logit_scale, + reduction=reduction, + V=V, + BV=BV, + num_warps=32 + ) + + # Backward through linear layer + dx[start:end] = torch.mm(c_logits, weight) + if weight is not None: + dw += c_logits.t() @ c_x + if bias is not None: + db += c_logits.sum(0) + + loss = loss.sum() + if reduction == "mean": + loss 
= loss / total + + return loss, dx, dw, db + + +def fused_linear_listnet_backward( + do: torch.Tensor, + dx: torch.Tensor, + dw: torch.Tensor, + db: torch.Tensor +): + # If cross entropy is the last layer, do is 1.0. Skip the mul to save time + if torch.ne(do, torch.tensor(1.0, device=do.device)): + # We use a Triton kernel instead of a PyTorch operation because modifying inputs in-place + # for gradient storage and backward multiple times causes anomalies with PyTorch but not with Triton. + N, H = dx.shape + B = min(MAX_FUSED_SIZE, triton.next_power_of_2(H)) + + elementwise_mul_kernel[(triton.cdiv(N * H, B),)]( + x=dx, + g=do, + N=N*H, + B=B, + num_warps=32, + ) + + # handle dw + if dw is not None: + V, H = dw.shape + elementwise_mul_kernel[(triton.cdiv(V * H, B),)]( + x=dw, + g=do, + N=V*H, + B=B, + num_warps=32, + ) + + if db is not None: + V = db.shape[0] + elementwise_mul_kernel[(triton.cdiv(V, B),)]( + x=db, + g=do, + N=V, + B=B, + num_warps=32, + ) + return dx, dw, db + + +class FusedLinearListNetFunction(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x: torch.Tensor, + targets: torch.Tensor, # Float targets + weight: torch.Tensor, + bias: torch.Tensor = None, + ignore_index: int = -100, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" + ): + loss, dx, dw, db = fused_linear_listnet_forward( + x, targets, weight, bias, ignore_index, + logit_scale, num_chunks, reduction + ) + ctx.save_for_backward(dx, dw, db) + return loss + + @staticmethod + def backward(ctx, do): + dx, dw, db = ctx.saved_tensors + dx, dw, db = fused_linear_listnet_backward(do, dx, dw, db) + return dx, None, dw, db, None, None, None, None + + +def fused_linear_listnet_loss( + x: torch.Tensor, + target: torch.LongTensor, + weight: torch.Tensor, + bias: torch.Tensor = None, + ignore_index: int = -100, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" +) -> Tuple[torch.Tensor, torch.Tensor]: 
+ """ + Args: + x (torch.Tensor): [batch_size * seq_len, hidden_size] + target (torch.LongTensor): [batch_size * seq_len] + where each value is in [0, vocab_size). + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + bias (Optional[torch.Tensor]): [vocab_size] + where `vocab_size` is the number of classes. + ignore_index: int. + If target == ignore_index, the loss is set to 0.0. + label_smoothing: float + logit_scale: float + A scaling factor applied to the logits. Default: 1.0 + num_chunks: int + The number of chunks to split the input tensor into for processing. + This can help optimize memory usage and computation speed. + Default: 8 + reduction: + Specifies the reduction to apply to the output: 'mean' | 'sum'. + 'mean': the weighted mean of the output is taken, + 'sum': the output will be summed. + Default: 'mean'. + Returns: + losses: [batch,], float + """ + return FusedLinearListNetFunction.apply( + x, + target, + weight, + bias, + ignore_index, + logit_scale, + num_chunks, + reduction + ) + + +class FusedLinearListNetLoss(nn.Module): + + def __init__( + self, + ignore_index: int = -100, + label_smoothing: float = 0.0, + logit_scale: float = 1.0, + num_chunks: int = 8, + reduction: str = "mean" + ): + """ + Args: + ignore_index: int. + If target == ignore_index, the loss is set to 0.0. + label_smoothing: float + logit_scale: float + A scaling factor applied to the logits. Default: 1.0 + num_chunks: int + The number of chunks to split the input tensor into for processing. + This can help optimize memory usage and computation speed. + Default: 8 + reduction: + Specifies the reduction to apply to the output: 'mean' | 'sum'. + 'mean': the weighted mean of the output is taken, + 'sum': the output will be summed. + Default: 'mean'. 
+ """ + super().__init__() + + assert reduction in ["mean", "sum"], f"reduction: {reduction} is not supported" + + self.ignore_index = ignore_index + self.label_smoothing = label_smoothing + self.logit_scale = logit_scale + self.num_chunks = num_chunks + self.reduction = reduction + + @torch.compiler.disable + def forward( + self, + x: torch.Tensor, + target: torch.LongTensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None + ): + """ + Args: + x (torch.Tensor): [batch_size, seq_len, hidden_size] + target (torch.LongTensor): [batch_size, seq_len] + where each value is in [0, V). + weight (torch.Tensor): [vocab_size, hidden_size] + where `vocab_size` is the number of classes. + bias (Optional[torch.Tensor]): [vocab_size] + where `vocab_size` is the number of classes. + Returns: + loss + """ + loss = fused_linear_listnet_loss( + x.view(-1, x.shape[-1]), + target.view(-1, target.shape[-1]), + weight=weight, + bias=bias, + ignore_index=self.ignore_index, + label_smoothing=self.label_smoothing, + logit_scale=self.logit_scale, + num_chunks=self.num_chunks, + reduction=self.reduction + ) + return loss + + +class LinearLossParallel(ParallelStyle): + def __init__( + self, + *, + sequence_dim: int = 1, + use_local_output: bool = False, + ): + super().__init__() + + self.sequence_sharding = (Shard(sequence_dim),) + self.use_local_output = use_local_output + + @staticmethod + def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): + x, target, weight, bias = inputs + + if not isinstance(x, DTensor): + # assume the input passed in already sharded on the sequence dim and create the DTensor + x = DTensor.from_local(x, device_mesh, sequence_sharding) + if x.placements != sequence_sharding: + x = x.redistribute(placements=sequence_sharding, async_op=True) + if not isinstance(target, DTensor): + target = DTensor.from_local(target, device_mesh, [Replicate()]) + if target.placements != sequence_sharding: + target = 
target.redistribute(placements=sequence_sharding, async_op=True) + + if not isinstance(weight, DTensor): + weight = DTensor.from_local(weight, device_mesh, [Replicate()]) + if weight.placements != [Replicate()]: + # we replicate the weight/bias in FLCE + weight = weight.redistribute(placements=[Replicate()], async_op=True) + + if bias is not None and not isinstance(bias, DTensor): + bias = DTensor.from_local(bias, device_mesh, [Replicate()]) + if bias is not None and bias.placements != [Replicate()]: + bias = bias.redistribute(placements=[Replicate()], async_op=True) + + return x.to_local(), target.to_local(), weight.to_local(), bias.to_local() if bias is not None else bias + + @staticmethod + def _prepare_output_fn(use_local_output, mod, outputs, device_mesh): + return outputs.to_local() if use_local_output else outputs + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + return distribute_module( + module, + device_mesh, + partition_fn=None, + input_fn=partial(self._prepare_input_fn, self.sequence_sharding), + output_fn=partial(self._prepare_output_fn, self.use_local_output) + ) + +# Naive ListNet loss function implementation +def list_net_loss(y_pred, y_true): + """ + ListNet loss introduced in "Learning to Rank: From Pairwise Approach to Listwise Approach". 
+ :param y_pred: predictions from the model, shape [*, slate_length] + :param y_true: ground truth labels, shape [*, slate_length] + :return: loss value, a torch.Tensor + """ + return torch.mean(-torch.sum(F.softmax(y_true, dim=-1).nan_to_num(nan=0) * F.log_softmax(y_pred, dim=-1), dim=-1)) \ No newline at end of file diff --git a/fla/modules/fused_norm_gate.py b/fla/modules/fused_norm_gate.py new file mode 100644 index 0000000000000000000000000000000000000000..13807390f8138c998ba6b5d0eca226e74f7efde3 --- /dev/null +++ b/fla/modules/fused_norm_gate.py @@ -0,0 +1,995 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from __future__ import annotations + +import math +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + +from fla.utils import get_multiprocessor_count, input_guard + + +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8, 16, 32] + for num_stages in [2, 3, 4] + ], + key=['N', 'HAS_RESIDUAL', 'STORE_RESIDUAL_OUT', 'IS_RMS_NORM', 'HAS_BIAS'], +) +@triton.jit +def layer_norm_gated_fwd_kernel( + X, # pointer to the input + G, # pointer to the gate + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + RESIDUAL, # pointer to the residual + RESIDUAL_OUT, # pointer to the residual + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + N, # number of columns in X + eps, # epsilon to avoid division by zero + ACTIVATION: tl.constexpr, + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_RESIDUAL: tl.constexpr, + STORE_RESIDUAL_OUT: tl.constexpr, + HAS_WEIGHT: tl.constexpr, + HAS_BIAS: tl.constexpr +): + # Map the program id to the row of X and Y it should compute. 
+ row = tl.program_id(0) + X += row * N + Y += row * N + G += row * N + if HAS_RESIDUAL: + RESIDUAL += row * N + if STORE_RESIDUAL_OUT: + RESIDUAL_OUT += row * N + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32) + if HAS_RESIDUAL: + residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32) + x += residual + if STORE_RESIDUAL_OUT: + tl.store(RESIDUAL_OUT + cols, x, mask=cols < N) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.0) + var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + if HAS_WEIGHT: + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * w if HAS_WEIGHT else x_hat + if HAS_BIAS: + y = y + b + + # Swish output gate + g = tl.load(G + cols, mask=cols < N, other=0.0).to(tl.float32) + if ACTIVATION == 'swish': + y = y * g * tl.sigmoid(g) + elif ACTIVATION == 'silu': + y = y * g * tl.sigmoid(g) + elif ACTIVATION == 'sigmoid': + y = y * tl.sigmoid(g) + + # Write output + tl.store(Y + cols, y, mask=mask) + + +def layer_norm_gated_fwd( + x: torch.Tensor, + g: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + activation: str = 'swish', + eps: float = 1e-5, + residual: torch.Tensor = None, + out_dtype: torch.dtype = None, + residual_dtype: torch.dtype = None, + is_rms_norm: bool = False +): + if residual is not None: + residual_dtype = residual.dtype + M, N = x.shape + if residual is not None: + assert residual.shape == (M, N) + if weight is not None: + assert weight.shape == (N,) + if bias is not None: + assert bias.shape == (N,) + # allocate output + y = 
torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype) + if residual is not None or (residual_dtype is not None and residual_dtype != x.dtype): + residual_out = torch.empty(M, N, device=x.device, dtype=residual_dtype) + else: + residual_out = None + mean = torch.empty((M,), dtype=torch.float, device=x.device) if not is_rms_norm else None + rstd = torch.empty((M,), dtype=torch.float, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + + layer_norm_gated_fwd_kernel[(M,)]( + x, + g, + y, + weight, + bias, + residual, + residual_out, + mean, + rstd, + N, + eps, + ACTIVATION=activation, + IS_RMS_NORM=is_rms_norm, + BLOCK_N=BLOCK_N, + HAS_RESIDUAL=residual is not None, + STORE_RESIDUAL_OUT=residual_out is not None, + HAS_WEIGHT=weight is not None, + HAS_BIAS=bias is not None, + ) + # residual_out is None if residual is None and residual_dtype == input_dtype + return y, mean, rstd, residual_out if residual_out is not None else x + + +@triton.heuristics({ + 'RECOMPUTE_OUTPUT': lambda args: args["Y"] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4, 8, 16, 32] + for num_stages in [2, 3, 4] + ], + key=['N', 'HAS_DRESIDUAL', 'STORE_DRESIDUAL', 'IS_RMS_NORM', 'HAS_BIAS'], +) +@triton.jit +def layer_norm_gated_bwd_kernel( + X, # pointer to the input + G, # pointer to the gate + W, # pointer to the weights + B, # pointer to the biases + Y, # pointer to the output to be recomputed + DY, # pointer to the output gradient + DX, # pointer to the input gradient + DG, # pointer to the gate gradient + DW, # pointer to the partial sum of weights gradient + DB, # pointer to the partial sum of biases gradient + DRESIDUAL, + 
DRESIDUAL_IN, + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + rows_per_program, + ACTIVATION: tl.constexpr, + IS_RMS_NORM: tl.constexpr, + BLOCK_N: tl.constexpr, + HAS_DRESIDUAL: tl.constexpr, + STORE_DRESIDUAL: tl.constexpr, + HAS_WEIGHT: tl.constexpr, + HAS_BIAS: tl.constexpr, + RECOMPUTE_OUTPUT: tl.constexpr, +): + # Map the program id to the elements of X, DX, and DY it should compute. + row_block_id = tl.program_id(0) + row_start = row_block_id * rows_per_program + cols = tl.arange(0, BLOCK_N) + mask = cols < N + X += row_start * N + G += row_start * N + if HAS_DRESIDUAL: + DRESIDUAL += row_start * N + if STORE_DRESIDUAL: + DRESIDUAL_IN += row_start * N + DY += row_start * N + DX += row_start * N + DG += row_start * N + if RECOMPUTE_OUTPUT: + Y += row_start * N + if HAS_WEIGHT: + w = tl.load(W + cols, mask=mask).to(tl.float32) + dw = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32) + if HAS_BIAS: + db = tl.zeros((BLOCK_N,), dtype=tl.float32) + + row_end = min((row_block_id + 1) * rows_per_program, M) + for row in range(row_start, row_end): + # Load data to SRAM + x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) + g = tl.load(G + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) + + if not IS_RMS_NORM: + mean = tl.load(Mean + row) + rstd = tl.load(Rstd + row) + # Compute dx + xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + xhat = tl.where(mask, xhat, 0.0) + + y = xhat * w if HAS_WEIGHT else xhat + if HAS_BIAS: + y = y + b + if RECOMPUTE_OUTPUT: + tl.store(Y + cols, y, mask=mask) + + sigmoid_g = tl.sigmoid(g) + if ACTIVATION == 'swish': + dg = dy * y * (sigmoid_g + g * sigmoid_g * (1 - sigmoid_g)) + dy = dy * g * sigmoid_g + elif ACTIVATION == 'silu': + dg = dy * y * (sigmoid_g + g * sigmoid_g * (1 - sigmoid_g)) + 
dy = dy * g * sigmoid_g + elif ACTIVATION == 'sigmoid': + dg = dy * y * sigmoid_g * (1 - sigmoid_g) + dy = dy * sigmoid_g + wdy = dy + if HAS_WEIGHT: + wdy = dy * w + dw += dy * xhat + if HAS_BIAS: + db += dy + if not IS_RMS_NORM: + c1 = tl.sum(xhat * wdy, axis=0) / N + c2 = tl.sum(wdy, axis=0) / N + dx = (wdy - (xhat * c1 + c2)) * rstd + else: + c1 = tl.sum(xhat * wdy, axis=0) / N + dx = (wdy - xhat * c1) * rstd + if HAS_DRESIDUAL: + dres = tl.load(DRESIDUAL + cols, mask=mask, other=0).to(tl.float32) + dx += dres + # Write dx + if STORE_DRESIDUAL: + tl.store(DRESIDUAL_IN + cols, dx, mask=mask) + tl.store(DX + cols, dx, mask=mask) + tl.store(DG + cols, dg, mask=mask) + + X += N + G += N + if HAS_DRESIDUAL: + DRESIDUAL += N + if STORE_DRESIDUAL: + DRESIDUAL_IN += N + if RECOMPUTE_OUTPUT: + Y += N + DY += N + DX += N + DG += N + if HAS_WEIGHT: + tl.store(DW + row_block_id * N + cols, dw, mask=mask) + if HAS_BIAS: + tl.store(DB + row_block_id * N + cols, db, mask=mask) + + +def layer_norm_gated_bwd( + dy: torch.Tensor, + x: torch.Tensor, + g: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + activation: str = 'swish', + eps: float = 1e-5, + mean: torch.Tensor = None, + rstd: torch.Tensor = None, + dresidual: torch.Tensor = None, + has_residual: bool = False, + is_rms_norm: bool = False, + x_dtype: torch.dtype = None, + recompute_output: bool = False, +): + M, N = x.shape + assert dy.shape == (M, N) + if dresidual is not None: + assert dresidual.shape == (M, N) + if weight is not None: + assert weight.shape == (N,) + if bias is not None: + assert bias.shape == (N,) + # allocate output + dx = torch.empty_like(x) if x_dtype is None else torch.empty(M, N, dtype=x_dtype, device=x.device) + dg = torch.empty_like(g) if x_dtype is None else torch.empty(M, N, dtype=x_dtype, device=x.device) + dresidual_in = torch.empty_like(x) if has_residual and dx.dtype != x.dtype else None + y = torch.empty(M, N, dtype=dy.dtype, device=dy.device) if recompute_output else None + 
+ # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) + if N > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + sm_count = get_multiprocessor_count(x.device.index) + dw = torch.empty((sm_count, N), dtype=torch.float, device=weight.device) if weight is not None else None + db = torch.empty((sm_count, N), dtype=torch.float, device=bias.device) if bias is not None else None + rows_per_program = math.ceil(M / sm_count) + grid = (sm_count,) + layer_norm_gated_bwd_kernel[grid]( + x, + g, + weight, + bias, + y, + dy, + dx, + dg, + dw, + db, + dresidual, + dresidual_in, + mean, + rstd, + M, + N, + eps, + rows_per_program, + ACTIVATION=activation, + IS_RMS_NORM=is_rms_norm, + BLOCK_N=BLOCK_N, + HAS_DRESIDUAL=dresidual is not None, + STORE_DRESIDUAL=dresidual_in is not None, + HAS_WEIGHT=weight is not None, + HAS_BIAS=bias is not None, + ) + dw = dw.sum(0).to(weight.dtype) if weight is not None else None + db = db.sum(0).to(bias.dtype) if bias is not None else None + # Don't need to compute dresidual_in separately in this case + if has_residual and dx.dtype == x.dtype: + dresidual_in = dx + return (dx, dg, dw, db, dresidual_in) if not recompute_output else (dx, dg, dw, db, dresidual_in, y) + + +class LayerNormGatedFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward( + ctx, + x: torch.Tensor, + g: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + activation: str, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, + prenorm: bool = False, + residual_in_fp32: bool = False, + is_rms_norm: bool = False, + ): + x_shape_og = x.shape + g_shape_og = g.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + g = g.reshape(-1, g.shape[-1]) + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + 
residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float if residual_in_fp32 else None) + ) + y, mean, rstd, residual_out = layer_norm_gated_fwd( + x=x, + g=g, + weight=weight, + bias=bias, + activation=activation, + eps=eps, + residual=residual, + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm + ) + ctx.save_for_backward(residual_out, g, weight, bias, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.g_shape_og = g_shape_og + ctx.activation = activation + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + y = y.reshape(x_shape_og) + return y if not prenorm else (y, residual_out.reshape(x_shape_og)) + + @staticmethod + @input_guard + def backward(ctx, dy, *args): + x, g, weight, bias, mean, rstd = ctx.saved_tensors + dy = dy.reshape(-1, dy.shape[-1]) + assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dg, dw, db, dresidual_in = layer_norm_gated_bwd( + dy=dy, + x=x, + g=g, + weight=weight, + bias=bias, + activation=ctx.activation, + eps=ctx.eps, + mean=mean, + rstd=rstd, + dresidual=dresidual, + has_residual=ctx.has_residual, + is_rms_norm=ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + ) + return ( + dx.reshape(ctx.x_shape_og), + dg.reshape(ctx.g_shape_og), + dw, + db, + None, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + ) + + +class LayerNormGatedLinearFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward( + ctx, + x: torch.Tensor, + g: torch.Tensor, + norm_weight: torch.Tensor, + norm_bias: torch.Tensor, + linear_weight: torch.Tensor, + linear_bias: torch.Tensor, + residual: Optional[torch.Tensor] = None, + eps: float = 1e-6, + prenorm: bool = False, + residual_in_fp32: bool = False, + is_rms_norm: bool = False, + ): + 
x_shape_og = x.shape + g_shape_og = g.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + g = g.reshape(-1, g.shape[-1]) + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape(-1, residual.shape[-1]) + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float if residual_in_fp32 else None) + ) + y, mean, rstd, residual_out = layer_norm_gated_fwd( + x=x, + g=g, + weight=norm_weight, + bias=norm_bias, + eps=eps, + residual=residual, + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm + ) + y = y.reshape(x_shape_og) + dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype + linear_weight = linear_weight.to(dtype) + linear_bias = linear_bias.to(dtype) if linear_bias is not None else None + out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias) + # We don't store y, will be recomputed in the backward pass to save memory + ctx.save_for_backward(residual_out, g, norm_weight, norm_bias, linear_weight, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.g_shape_og = g_shape_og + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.has_residual = residual is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + ctx.linear_bias_is_none = linear_bias is None + return out if not prenorm else (out, residual_out.reshape(x_shape_og)) + + @staticmethod + @input_guard + def backward(ctx, dout, *args): + x, g, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors + dout = dout.reshape(-1, dout.shape[-1]) + dy = F.linear(dout, linear_weight.t()) + dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) + assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, dresidual.shape[-1]) + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dg, dnorm_weight, dnorm_bias, dresidual_in, y = layer_norm_gated_bwd( + dy=dy, + x=x, + g=g, + norm_weight=norm_weight, + 
norm_bias=norm_bias, + eps=ctx.eps, + mean=mean, + rstd=rstd, + dresidual=dresidual, + has_residual=ctx.has_residual, + is_rms_norm=ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + recompute_output=True, + ) + dlinear_weight = torch.einsum("bo,bi->oi", dout, y) + return ( + dx.reshape(ctx.x_shape_og), + dg.reshape(ctx.g_shape_og), + dnorm_weight, + dnorm_bias, + dlinear_weight, + dlinear_bias, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + ) + + +def layer_norm_gated( + x: torch.Tensor, + g: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + activation: str = 'swish', + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False, + eps: float = 1e-6 +): + return LayerNormGatedFunction.apply( + x, + g, + weight, + bias, + activation, + residual, + eps, + prenorm, + residual_in_fp32, + False + ) + + +def rms_norm_gated( + x: torch.Tensor, + g: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + activation: str = 'swish', + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False, + eps: float = 1e-6 +): + return LayerNormGatedFunction.apply( + x, + g, + weight, + bias, + activation, + residual, + eps, + prenorm, + residual_in_fp32, + True + ) + + +def layer_norm_swish_gate_linear( + x: torch.Tensor, + g: torch.Tensor, + norm_weight: torch.Tensor, + norm_bias: torch.Tensor, + linear_weight: torch.Tensor, + linear_bias: torch.Tensor, + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False, + eps: float = 1e-6 +): + return LayerNormGatedLinearFunction.apply( + x, + g, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual, + eps, + prenorm, + residual_in_fp32, + False + ) + + +def rms_norm_swish_gate_linear( + x, + g: torch.Tensor, + norm_weight: torch.Tensor, + norm_bias: torch.Tensor, + linear_weight: torch.Tensor, + linear_bias: torch.Tensor, + residual: 
Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False, + eps: float = 1e-6 +): + return LayerNormGatedLinearFunction.apply( + x, + g, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual, + eps, + prenorm, + residual_in_fp32, + True + ) + + +class FusedLayerNormGated(nn.Module): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + bias: bool = False, + activation: str = 'swish', + eps: float = 1e-5, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedLayerNormGated: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + self.activation = activation + + if self.activation not in ['swish', 'silu', 'sigmoid']: + raise ValueError(f"Unsupported activation: {self.activation}") + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += f", activation={self.activation}" + s += ")" + return s + + def forward( + self, + x: torch.Tensor, + g: torch.Tensor, + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False + ) -> torch.Tensor: + return layer_norm_gated( + x, + g, + self.weight, + self.bias, + self.activation, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32 + ) + + 
+class FusedRMSNormGated(nn.Module): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + activation: str = 'swish', + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedRMSNormGated: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + self.activation = activation + + if self.activation not in ['swish', 'silu', 'sigmoid']: + raise ValueError(f"Unsupported activation: {self.activation}") + + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += f", activation={self.activation}" + s += ")" + return s + + def forward( + self, + x: torch.Tensor, + g: torch.Tensor, + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False + ) -> torch.Tensor: + return rms_norm_gated( + x, + g, + self.weight, + self.bias, + self.activation, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32 + ) + + +class FusedLayerNormSwishGate(FusedLayerNormGated): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedLayerNormSwishGate: + super().__init__( + hidden_size=hidden_size, + elementwise_affine=elementwise_affine, + bias=bias, + eps=eps, + device=device, + dtype=dtype + ) + + +class 
FusedRMSNormSwishGate(FusedRMSNormGated): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedRMSNormSwishGate: + super().__init__( + hidden_size=hidden_size, + elementwise_affine=elementwise_affine, + eps=eps, + device=device, + dtype=dtype + ) + + +class FusedLayerNormGatedLinear(nn.Module): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedLayerNormGatedLinear: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + else: + self.register_parameter("weight", None) + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward( + self, + x: torch.Tensor, + g: torch.Tensor, + weight: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False + ) -> torch.Tensor: + return layer_norm_swish_gate_linear( + x, + g, + self.weight, + self.bias, + weight, + bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32 + ) + + +class FusedLayerNormSwishGateLinear(FusedLayerNormGatedLinear): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + device: 
Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedLayerNormSwishGateLinear: + super().__init__( + hidden_size=hidden_size, + elementwise_affine=elementwise_affine, + eps=eps, + device=device, + dtype=dtype + ) + + +class FusedRMSNormGatedLinear(nn.Module): + + def __init__( + self, + hidden_size, + elementwise_affine: bool = True, + eps: float = 1e-5, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedRMSNormGatedLinear: + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward( + self, + x: torch.Tensor, + g: torch.Tensor, + weight: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + residual: Optional[torch.Tensor] = None, + prenorm: bool = False, + residual_in_fp32: bool = False + ) -> torch.Tensor: + return rms_norm_swish_gate_linear( + x, + g, + self.weight, + self.bias, + weight, + bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32 + ) + + +class FusedRMSNormSwishGateLinear(FusedRMSNormGatedLinear): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + eps: float = 1e-5, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ) -> FusedRMSNormSwishGateLinear: + super().__init__( + hidden_size=hidden_size, + 
elementwise_affine=elementwise_affine, + eps=eps, + device=device, + dtype=dtype + ) diff --git a/fla/modules/grpo.py b/fla/modules/grpo.py new file mode 100644 index 0000000000000000000000000000000000000000..dd1155557fd45321311258f75b1cbf8810f79c99 --- /dev/null +++ b/fla/modules/grpo.py @@ -0,0 +1,396 @@ +# -*- coding: utf-8 -*- + +# https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py +""" +# Get the per-token log probabilities for the completions for the model and the reference model + def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep): + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits + logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred + + input_ids = input_ids[:, -logits_to_keep:] + # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves. 
+ # See https://github.com/huggingface/trl/issues/2770 + logits = logits[:, -logits_to_keep:] + return selective_log_softmax(logits, input_ids) # compute logprobs for the input tokens + + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + if return_outputs: + raise ValueError("The GRPOTrainer does not support returning outputs") + # Compute the per-token log probabilities for the model + + prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"] + completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"] + input_ids = torch.cat([prompt_ids, completion_ids], dim=1) + attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) + logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens + + per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep) + + # Compute the KL divergence between the model and the reference model + ref_per_token_logps = inputs["ref_per_token_logps"] + per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + + # x - x.detach() allows for preserving gradients from x + advantages = inputs["advantages"] + per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1) + per_token_loss = -(per_token_loss - self.beta * per_token_kl) + loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + + # Log the metrics + completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item() + self._metrics["completion_length"].append(completion_length) + + mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean() + self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item()) + + return loss +""" + + +import torch +import triton +import triton.language as tl + +from fla.ops.utils.op 
# Forward pass of the fused GRPO per-token loss.
#
# One program handles one (sequence, completion position) pair: the launch
# grid used by GrpoLoss.forward is (M,) with M = B * L.  For that position the
# kernel computes the log-sum-exp of the logits row over the N vocabulary
# entries, the log-probability of the target token, the KL estimator
# exp(d) - d - 1 with d = ref_logp - logp, and finally
# loss = beta * kl - advantage.
#
# Pointer layout as established by the caller in this file:
#   logits_ptr          -> logits of shape (B, L + 1, N)
#   ref_logp_ptr        -> (B, L) reference log-probs
#   input_ids_ptr       -> (B, start_idx + L) prompt + completion token ids
#   advantages_ptr      -> (B,)
#   completion_mask_ptr -> (B, L)
#   loss_ptr            -> (B, L), or (2B, L) when save_kl (second half = KL)
#   lse_ptr             -> (B, L) log-sum-exp saved for the backward pass
@triton.autotune(
    [triton.Config({'BLOCK_SIZE': BLOCK_SIZE}, num_warps=NUM_WARPS, num_stages=NUM_STAGES)
     for BLOCK_SIZE in [1024, 2048, 4096, 8192]
     for NUM_WARPS in [8, 16, 32]
     for NUM_STAGES in [1, 2, 4]
     ], key=['B', 'N']
)
@triton.jit
def grpo_fwd_kernel(
    logits_ptr,
    ref_logp_ptr,
    input_ids_ptr,
    advantages_ptr,
    completion_mask_ptr,
    loss_ptr,
    lse_ptr,
    beta,
    save_kl: tl.constexpr,
    B,
    M,
    N,
    L,
    start_idx,
    BLOCK_SIZE: tl.constexpr
):
    # Flat index over the B * L completion positions.
    row_idx = tl.program_id(0)

    # Batch index of this position.
    off_b = row_idx // L
    # Widen N so that the N * row offset below is computed in 64-bit.
    N = tl.cast(N, tl.int64)

    loss_ptr += row_idx

    completion_mask_ptr += row_idx
    not_skip = tl.load(completion_mask_ptr).to(tl.int1)
    if not_skip == 1:
        ref_logp_ptr += row_idx
        lse_ptr += row_idx
        # A single advantage value is shared by the whole sequence.
        advantages_ptr += off_b
        # Each sequence owns L + 1 logits rows, hence the extra `off_b` rows.
        logits_ptr += N * (row_idx + off_b)
        # Skip the `start_idx` leading (prompt) tokens of every sequence.
        input_ids_ptr += row_idx + (off_b+1) * start_idx
        base_cols = tl.arange(0, BLOCK_SIZE)

        # Online log-sum-exp: running maximum m_i and running sum l_i of
        # exp(logit - m_i), rescaled whenever the maximum grows.
        m_i = -float("inf")
        l_i = 0.0
        for start_n in tl.range(0, N, BLOCK_SIZE):
            cols = start_n + base_cols
            mask = cols < N
            # Out-of-range lanes read -inf so they contribute exp(-inf) = 0.
            logits = tl.load(logits_ptr+cols, mask=mask, other=-float('inf')).to(tl.float32)
            m_ij = tl.max(logits)
            new_m_i = tl.maximum(m_i, m_ij)
            l_i = l_i * exp(m_i - new_m_i) + tl.sum(exp(logits - new_m_i))
            m_i = new_m_i
        lse = log(l_i) + m_i

        # Log-probability of the target token: logit[idx] - logsumexp.
        idx = tl.load(input_ids_ptr)
        x = tl.load(logits_ptr+idx).to(tl.float32)
        advantage = tl.load(advantages_ptr).to(tl.float32)
        ref_logp = tl.load(ref_logp_ptr)
        logp = x - lse
        diff = ref_logp - logp
        kl = exp(diff) - diff - 1
        # Forward value of -(1 * advantage - beta * kl).
        loss = kl * beta - advantage

        tl.store(loss_ptr, loss.to(loss_ptr.dtype.element_ty))
        tl.store(lse_ptr, lse.to(lse_ptr.dtype.element_ty))
        if save_kl:
            # The KL values live M elements after the matching loss slot.
            tl.store(loss_ptr+M, kl.to(loss_ptr.dtype.element_ty))
    else:
        # Masked position: write zeros so the output buffer is fully defined.
        tl.store(loss_ptr, 0.0)
        if save_kl:
            tl.store(loss_ptr+M, 0.0)
class GrpoLoss(torch.autograd.Function):
    """Autograd wrapper around the fused GRPO forward/backward Triton kernels.

    ``forward`` returns the per-token loss of shape ``(B, L)``; when
    ``save_kl`` is true the result has shape ``(2B, L)`` and the second half
    holds the per-token KL estimate.  Only ``logits`` receives a gradient.
    """

    # NOTE: ``@staticmethod`` must be the outermost decorator so that
    # ``input_guard`` wraps the plain function (the same order used by the
    # other autograd functions in this package) instead of a ``staticmethod``
    # object, which is not callable on Python < 3.10.
    @staticmethod
    @input_guard
    def forward(ctx, logits, ref_logp, input_ids, advantages, beta, completion_mask, save_kl):
        """Launch the forward kernel over the ``B * L`` completion positions.

        Args:
            logits: ``(B, L + 1, N)`` raw model outputs (last position unused).
            ref_logp: ``(B, L)`` reference-policy log-probabilities.
            input_ids: ``(B, K + L)`` prompt + completion ids; the trailing
                ``L`` ids are the targets.
            advantages: ``(B,)`` one advantage per sequence.
            beta: weight of the KL term.
            completion_mask: optional ``(B, L)`` mask; ``None`` means all ones.
            save_kl: also return the per-token KL in the second half of the
                output buffer.
        """
        ctx.input_shape = logits.shape
        B, L_ADD_1, N = ctx.input_shape
        L = L_ADD_1 - 1
        M = B * L
        # Number of leading (prompt) tokens to skip in every row of input_ids.
        input_ids_start_index = input_ids.size(1) - L

        # The kernel writes every slot (zeros for masked positions), so the
        # buffers can be left uninitialised here.
        loss = torch.empty(B * 2 if save_kl else B, L, device=logits.device, dtype=torch.float32)
        lse = torch.empty(B, L, device=logits.device, dtype=torch.float32)

        if completion_mask is None:
            completion_mask = torch.ones(B, L, device=logits.device, dtype=torch.int32)

        grpo_fwd_kernel[(M,)](
            logits_ptr=logits,
            ref_logp_ptr=ref_logp,
            input_ids_ptr=input_ids,
            advantages_ptr=advantages,
            completion_mask_ptr=completion_mask,
            loss_ptr=loss,
            lse_ptr=lse,
            beta=beta,
            save_kl=save_kl,
            B=B, M=M, N=N, L=L,
            start_idx=input_ids_start_index,
        )
        ctx.beta = beta
        # Keep every tensor needed by backward in saved_tensors rather than as
        # a bare attribute on ctx.
        ctx.save_for_backward(lse, logits, input_ids, advantages, completion_mask, ref_logp)
        return loss

    @staticmethod
    @input_guard
    def backward(ctx, dloss):
        """Return the gradient w.r.t. ``logits``; all other inputs get ``None``."""
        # The grad of logits comes from two parts, the reward part and the kl part
        lse, logits, input_ids, advantages, completion_mask, ref_logp = ctx.saved_tensors
        B, L_ADD_1, N = ctx.input_shape
        L = L_ADD_1 - 1
        M = B * L

        input_ids_start_index = input_ids.size(1) - L

        dlogits = torch.empty_like(logits)  # B, L_ADD_1, N

        grpo_bwd_kernel[(M,)](
            dloss_ptr=dloss,
            dlogits_ptr=dlogits,
            logits_ptr=logits,
            ref_logp_ptr=ref_logp,
            input_ids_ptr=input_ids,
            advantages_ptr=advantages,
            completion_mask_ptr=completion_mask,
            lse_ptr=lse,
            beta=ctx.beta,
            B=B, N=N, L=L,
            start_idx=input_ids_start_index,
        )
        # The last token in the completion is not used in the loss computation
        # and therefore its gradient should be set to 0
        dlogits[:, -1, :].fill_(0.0)
        return dlogits.view(*ctx.input_shape), None, None, None, None, None, None
fast speed(6X for A800) + + Args: + logtits: Tensor, [B, L+1, vocab_size], the origin output of model, it's not logits[:, :-1] + ref_logp: Tensor, [B, L], the origin output of model, it's not ref_logits[:, :-1] + input_ids: Tensor, [B, K+L], it's prompt_completion_id, it contains the prompt ids and output ids + advantages: Tensor, [B], the advantages of each prompt + beta: float, the weight of kl loss + completion_mask: Tensor, loss mask + save_kl: bool, if true will save kl + + Retutn: + loss: Tensor, [B, L], the loss of grpo, it contains the advantage part and kl part + + NOTE: logits(ref_logits) is computed by these steps + logits_to_keep = completion_ids.size(1) + + def get_per_token_logits(model, input_ids, attention_mask, logits_to_keep): + # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded + logits = model( + input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1 + ).logits + return logits + + logits = get_per_token_logits(model, prompt_completion_ids, attention_mask, logits_to_keep) + ''' + out = GrpoLoss.apply(logits, ref_logp, input_ids, advantages, beta, completion_mask, save_kl) + if not save_kl: + return out + else: + return out.chunk(2, axis=0) + + +def grpo_loss_torch(logits, ref_logp, input_ids, advantages, beta=0.1, completion_mask=None, save_kl=False): + def get_log_probs(logits, input_ids): + per_token_logps = [] + for logits_row, input_ids_row in zip(logits, input_ids[:, -logits.size(1):]): + log_probs = logits_row.log_softmax(dim=-1) + token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1) + per_token_logps.append(token_log_prob) + return torch.stack(per_token_logps) + + logits = logits[:, :-1] + per_token_logps = get_log_probs(logits, input_ids) + ref_per_token_logps = ref_logp + per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1 + + per_token_loss = torch.exp(per_token_logps - 
@torch.compile(fullgraph=True)
def grpo_loss_with_old_logps(
    logps: torch.Tensor,
    ref_logps: torch.Tensor,
    old_logps: torch.Tensor,
    pad_mask: torch.Tensor,
    logits_to_keep: int,
    rewards: torch.Tensor,
    beta: float = 0.2,
    epsilon: float = 0.2
):
    """
    Compute the clipped GRPO (Group Relative Policy Optimization) loss.

    The value returned is the quantity to *minimise*: the negated clipped
    surrogate objective plus ``beta`` times the KL penalty, averaged over the
    positions selected by ``pad_mask``.

    Args:
        logps (torch.Tensor): [Batch, Token_length] Log probabilities of the current policy.
        ref_logps (torch.Tensor): [Batch, Token_length] Log probabilities of the reference policy.
        old_logps (torch.Tensor): [Batch, Token_length] Log probabilities of the old policy.
        pad_mask (torch.Tensor): [Batch, Token_length] Boolean mask, True for tokens that count
            towards the loss.
        logits_to_keep (int): Number of completion positions; the last dimension of
            `pad_mask` must broadcast against it.
        rewards (torch.Tensor): [Batch] Rewards for each generation. They are standardised
            over the whole batch (mean 0, unit std) to obtain the advantages.
        beta (float) = 0.2: A hyperparameter for weighting the KL divergence term.
        epsilon (float) = 0.2: A float hyperparameter for clipping the importance weights.

    Returns:
        torch.Tensor: The scalar GRPO loss.
    """
    B = logps.shape[0]
    assert B > 1, "Batch * Num generations should be greater than 1"

    # Standardise the rewards across the batch to get one advantage per row.
    rewards_shaped = rewards.view(-1, B)
    advantages = (rewards_shaped - rewards_shaped.mean(dim=1, keepdim=True)) / \
        (rewards_shaped.std(dim=1, keepdim=True) + 1e-8)
    # [B, 1] so that it broadcasts over the token dimension.
    advantages = advantages.view(-1).unsqueeze(1)

    # Per-token KL estimator: exp(d) - d - 1 with d = ref_logps - logps.
    log_ratio = ref_logps - logps
    per_token_kl = torch.exp(log_ratio) - log_ratio - 1

    # Importance weights pi_theta / pi_theta_old and their clipped version.
    importance_weights = torch.exp(logps - old_logps)
    importance_weights_clipped = torch.clamp(importance_weights, 1 - epsilon, 1 + epsilon)

    # The arange comparison is always true; it only pins the token dimension to
    # `logits_to_keep` before combining with the padding mask.
    completion_mask = torch.arange(logits_to_keep, device=logps.device)[None, :] >= 0
    completion_mask = completion_mask & pad_mask

    # Clipped surrogate objective (to be maximised).
    surrogate = torch.min(advantages * importance_weights, advantages * importance_weights_clipped)

    # Negate once to turn the objective into a loss.
    token_loss = -(surrogate - beta * per_token_kl) * completion_mask

    # Average over valid tokens. `token_loss` is already a loss, so it must not
    # be negated a second time here.
    loss = token_loss.sum() / completion_mask.sum()

    return loss
def l2norm_fwd(
    x: torch.Tensor,
    eps: float = 1e-6,
    output_dtype: Optional[torch.dtype] = None
):
    """Normalise `x` along its last dimension with the fused forward kernel.

    The input is flattened to 2D, one kernel program is launched per row, and
    the result is reshaped back to the original shape. `output_dtype`
    overrides the dtype of the returned tensor when given.
    """
    original_shape = x.shape
    x = x.reshape(-1, original_shape[-1])
    # dtype=None makes empty_like fall back to the dtype of x
    y = torch.empty_like(x, dtype=output_dtype)
    assert y.stride(-1) == 1
    M, N = x.shape
    # A whole row must fit in one block of at most 64KB
    max_fused_size = 65536 // x.element_size()
    BLOCK_N = min(max_fused_size, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    grid = (M,)
    l2norm_fwd_kernel[grid](x, y, N, eps, BLOCK_N)
    return y.reshape(original_shape)
class L2NormFunction(torch.autograd.Function):
    """Autograd function pairing `l2norm_fwd` with `l2norm_bwd`."""

    @staticmethod
    @input_guard
    def forward(
        ctx,
        x,
        eps=1e-6,
        output_dtype=None
    ):
        # Stash what backward needs before handing off to the kernel wrapper
        ctx.save_for_backward(x)
        ctx.eps, ctx.x_dtype = eps, x.dtype
        return l2norm_fwd(x, eps, output_dtype)

    @staticmethod
    @input_guard
    def backward(ctx, dy):
        (x,) = ctx.saved_tensors
        # Only x is differentiable; eps and output_dtype get no gradient
        return l2norm_bwd(x, dy, ctx.eps), None, None
def group_norm_ref(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    num_groups: int,
    residual: torch.Tensor = None,
    eps: float = 1e-5,
    is_rms_norm: bool = False,
    prenorm: bool = False,
    upcast: bool = False
):
    """Pure-PyTorch reference for (RMS) group normalisation over the last dim.

    Args:
        x: input whose last dimension is split into `num_groups` groups.
        weight: per-channel scale of shape `(hidden,)`, or None for no scale.
        bias: per-channel shift of shape `(hidden,)`, or None for no shift.
        num_groups: number of groups the last dimension is divided into.
        residual: optional tensor added to `x` before normalising.
        eps: value added to the variance for numerical stability.
        is_rms_norm: skip mean subtraction (RMS norm) when True.
        prenorm: also return the pre-normalisation sum `x + residual`.
        upcast: do the computation in float32.

    Returns:
        The normalised tensor cast back to the dtype of `x`, or a tuple
        `(out, x_plus_residual)` when `prenorm` is True.
    """
    dtype = x.dtype
    if upcast:
        # weight/bias are optional (modules without affine parameters pass None)
        weight = weight.float() if weight is not None else None
        bias = bias.float() if bias is not None else None
        x = x.float()
        residual = residual.float() if residual is not None else residual
    if residual is not None:
        x = (x + residual).to(x.dtype)
    # Value handed back for prenorm, kept in the flat (ungrouped) layout.
    residual = x

    def _split_groups(t):
        # (..., g * d) -> (..., g, d)
        return t.reshape(*t.shape[:-1], num_groups, -1)

    x = _split_groups(x)
    if weight is not None:
        weight = _split_groups(weight)
    if bias is not None:
        bias = _split_groups(bias)
    if not is_rms_norm:
        x = x - x.mean(dim=-1, keepdim=True)
    rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
    out = x * rstd
    if weight is not None:
        out = out * weight
    if bias is not None:
        out = out + bias
    # (..., g, d) -> (..., g * d)
    out = out.reshape(*out.shape[:-2], -1)
    out = out.to(dtype)
    return out if not prenorm else (out, residual)
# Fused forward kernel for LayerNorm / RMSNorm with optional residual add and
# optional per-group affine parameters.  One program normalises one row of N
# elements; the whole row must fit in a single BLOCK_N-wide block.
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [1, 2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
)
@triton.jit
def layer_norm_fwd_kernel(
    X,  # pointer to the input
    Y,  # pointer to the output
    W,  # pointer to the weights
    B,  # pointer to the biases
    RESIDUAL,  # pointer to the residual added to the input
    RESIDUAL_OUT,  # pointer to where input + residual is written
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    N,  # number of columns in X
    G,  # number of groups
    eps,  # epsilon to avoid division by zero
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_RESIDUAL: tl.constexpr,
    STORE_RESIDUAL_OUT: tl.constexpr,
    HAS_WEIGHT: tl.constexpr,
    HAS_BIAS: tl.constexpr
):
    # Map the program id to the row of X and Y it should compute.
    row = tl.program_id(0)
    # Consecutive rows cycle through the G groups; each group has its own
    # N-wide slice of W and B.
    group = row % G
    X += row * N
    Y += row * N
    if HAS_RESIDUAL:
        RESIDUAL += row * N
    if STORE_RESIDUAL_OUT:
        RESIDUAL_OUT += row * N
    # Compute mean and variance
    cols = tl.arange(0, BLOCK_N)
    x = tl.load(X + cols, mask=cols < N, other=0.0).to(tl.float32)
    if HAS_RESIDUAL:
        residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
        x += residual
    if STORE_RESIDUAL_OUT:
        # Persist the pre-normalisation value (input + residual).
        tl.store(RESIDUAL_OUT + cols, x, mask=cols < N)
    if not IS_RMS_NORM:
        mean = tl.sum(x, axis=0) / N
        tl.store(Mean + row, mean)
        # Zero the padding lanes so they do not contribute to the variance.
        xbar = tl.where(cols < N, x - mean, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    else:
        # RMS norm: no mean subtraction, `var` is the mean of squares.
        xbar = tl.where(cols < N, x, 0.0)
        var = tl.sum(xbar * xbar, axis=0) / N
    rstd = 1 / tl.sqrt(var + eps)
    tl.store(Rstd + row, rstd)
    # Normalize and apply linear transformation
    mask = cols < N
    if HAS_WEIGHT:
        w = tl.load(W + group * N + cols, mask=mask).to(tl.float32)
    if HAS_BIAS:
        b = tl.load(B + group * N + cols, mask=mask).to(tl.float32)
    x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd

    # Pick the affine form matching the compile-time flags.
    y = tl.fma(x_hat, w, b) if HAS_WEIGHT and HAS_BIAS else \
        x_hat * w if HAS_WEIGHT else \
        x_hat + b if HAS_BIAS else x_hat
    # Write output
    y = tl.cast(y, dtype=Y.dtype.element_ty, fp_downcast_rounding="rtne")
    tl.store(Y + cols, y, mask=mask)
# Fused backward kernel for LayerNorm / RMSNorm with optional groups.
#
# Each program serves exactly one group and walks a strided set of rows
# (stride G, so every visited row belongs to that group).  It writes dx row by
# row and accumulates partial weight/bias gradients locally, storing them into
# row `row_block_id` of DW / DB at the end.
# RECOMPUTE_OUTPUT is derived from whether a Y buffer was passed.
@triton.heuristics({
    "RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None
})
@triton.autotune(
    configs=[
        triton.Config({}, num_warps=num_warps, num_stages=num_stages)
        for num_warps in [1, 2, 4, 8, 16, 32]
        for num_stages in [2, 3, 4]
    ],
    key=["N", "HAS_DRESIDUAL", "STORE_DRESIDUAL", "IS_RMS_NORM", "HAS_BIAS"],
)
@triton.jit
def layer_norm_bwd_kernel(
    X,  # pointer to the input
    W,  # pointer to the weights
    B,  # pointer to the biases
    Y,  # pointer to the output to be recomputed
    DY,  # pointer to the output gradient
    DX,  # pointer to the input gradient
    DW,  # pointer to the partial sum of weights gradient
    DB,  # pointer to the partial sum of biases gradient
    DRESIDUAL,  # pointer to an extra gradient that is added to dx
    DRESIDUAL_IN,  # pointer to a second buffer that also receives dx
    Mean,  # pointer to the mean
    Rstd,  # pointer to the 1/std
    M,  # number of rows in X
    N,  # number of columns in X
    G,  # number of groups
    rows_per_program,
    programs_per_group,
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
    HAS_DRESIDUAL: tl.constexpr,
    STORE_DRESIDUAL: tl.constexpr,
    HAS_WEIGHT: tl.constexpr,
    HAS_BIAS: tl.constexpr,
    RECOMPUTE_OUTPUT: tl.constexpr,
):
    row_block_id = tl.program_id(0)
    # Split the flat program id into (group, index of the program inside it).
    group_id, program_id_in_group = row_block_id // programs_per_group, row_block_id % programs_per_group

    # First row is offset by group_id and rows advance in steps of G, so every
    # visited row satisfies row % G == group_id.
    row_start = group_id + program_id_in_group * G * rows_per_program
    row_end = min(row_start + G * rows_per_program, M)

    cols = tl.arange(0, BLOCK_N)
    mask = cols < N

    if HAS_WEIGHT:
        w = tl.load(W + group_id * N + cols, mask=mask).to(tl.float32)
        dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if RECOMPUTE_OUTPUT and HAS_BIAS:
        # The bias value itself is only needed to rebuild y.
        b = tl.load(B + group_id * N + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)

    for row in range(row_start, row_end, G):
        # Load data to SRAM
        x = tl.load(X + row * N + cols, mask=mask, other=0).to(tl.float32)
        dy = tl.load(DY + row * N + cols, mask=mask, other=0).to(tl.float32)
        if not IS_RMS_NORM:
            mean = tl.load(Mean + row)
        rstd = tl.load(Rstd + row)
        # Compute dx
        xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
        # Padding lanes must not leak into the reductions below.
        xhat = tl.where(mask, xhat, 0.0)
        if RECOMPUTE_OUTPUT:
            y = xhat * w if HAS_WEIGHT else xhat
            if HAS_BIAS:
                y = y + b
            tl.store(Y + row * N + cols, y, mask=mask)
        wdy = dy
        if HAS_WEIGHT:
            wdy = dy * w
            dw += dy * xhat
        if HAS_BIAS:
            db += dy
        if not IS_RMS_NORM:
            # c1, c2: row means of xhat * wdy and of wdy.
            c1 = tl.sum(xhat * wdy, axis=0) / N
            c2 = tl.sum(wdy, axis=0) / N
            dx = (wdy - (xhat * c1 + c2)) * rstd
        else:
            # No mean was subtracted in the forward, so the c2 term vanishes.
            c1 = tl.sum(xhat * wdy, axis=0) / N
            dx = (wdy - xhat * c1) * rstd
        if HAS_DRESIDUAL:
            dres = tl.load(DRESIDUAL + row * N + cols, mask=mask, other=0).to(tl.float32)
            dx += dres
        # Write dx
        dx = tl.cast(dx, dtype=DX.dtype.element_ty, fp_downcast_rounding="rtne")
        if STORE_DRESIDUAL:
            tl.store(DRESIDUAL_IN + row * N + cols, dx, mask=mask)
        tl.store(DX + row * N + cols, dx, mask=mask)

    # One partial-sum row per program.
    if HAS_WEIGHT:
        tl.store(DW + row_block_id * N + cols, dw, mask=mask)
    if HAS_BIAS:
        tl.store(DB + row_block_id * N + cols, db, mask=mask)
class LayerNormFunction(torch.autograd.Function):
    """Autograd function for fused (group) LayerNorm / RMSNorm with an
    optional residual connection, built on `layer_norm_fwd` and
    `layer_norm_bwd`."""

    @staticmethod
    @input_guard
    def forward(
        ctx,
        x,
        weight,
        bias,
        residual=None,
        eps=1e-5,
        prenorm=False,
        residual_in_fp32=False,
        is_rms_norm=False,
        num_groups=1
    ):
        full_shape = x.shape
        hidden = full_shape[-1]
        if hidden % num_groups != 0:
            raise ValueError('num_channels must be divisible by num_groups')

        # Flatten so that every row holds the channels of a single group
        x = x.reshape(-1, hidden // num_groups)

        has_residual = residual is not None
        if has_residual:
            assert residual.shape == full_shape
            residual = residual.reshape_as(x)
            residual_dtype = residual.dtype
        elif residual_in_fp32:
            residual_dtype = torch.float32
        else:
            residual_dtype = None

        y, mean, rstd, residual_out = layer_norm_fwd(
            x,
            weight,
            bias,
            eps,
            residual,
            residual_dtype=residual_dtype,
            is_rms_norm=is_rms_norm,
            num_groups=num_groups
        )

        # residual_out is the tensor that was actually normalised
        ctx.save_for_backward(residual_out, weight, bias, mean, rstd)
        ctx.x_shape_og = full_shape
        ctx.eps = eps
        ctx.is_rms_norm = is_rms_norm
        ctx.num_groups = num_groups
        ctx.has_residual = has_residual
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype

        y = y.reshape(full_shape)
        if prenorm:
            return y, residual_out.reshape(full_shape)
        return y

    @staticmethod
    @input_guard
    def backward(ctx, dy, *args):
        x, weight, bias, mean, rstd = ctx.saved_tensors
        dy = dy.reshape(-1, dy.shape[-1] // ctx.num_groups)
        assert dy.shape == x.shape

        # With prenorm the second forward output carries its own gradient
        dresidual = None
        if ctx.prenorm:
            dresidual = args[0].reshape(-1, x.shape[-1])
            assert dresidual.shape == x.shape

        dx, dw, db, dresidual_in = layer_norm_bwd(
            dy,
            x,
            weight,
            bias,
            ctx.eps,
            mean,
            rstd,
            dresidual,
            ctx.has_residual,
            ctx.is_rms_norm,
            x_dtype=ctx.x_dtype,
            num_groups=ctx.num_groups
        )

        out_shape = ctx.x_shape_og
        grad_residual = dresidual_in.reshape(out_shape) if ctx.has_residual else None
        # One entry per forward argument after ctx; the five trailing
        # non-tensor options get no gradient
        return (
            dx.reshape(out_shape),
            dw,
            db,
            grad_residual,
            None,
            None,
            None,
            None,
            None
        )
torch.Tensor = None, + eps: float = 1e-5, + prenorm: bool = False, + residual_in_fp32: bool = False +): + return layer_norm_linear( + x=x, + norm_weight=norm_weight, + norm_bias=norm_bias, + linear_weight=linear_weight, + linear_bias=linear_bias, + residual=residual, + eps=eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=True + ) + + +def group_norm_linear( + x: torch.Tensor, + norm_weight: torch.Tensor, + norm_bias: torch.Tensor, + linear_weight: torch.Tensor, + linear_bias: torch.Tensor, + residual: torch.Tensor = None, + eps: float = 1e-5, + prenorm: bool = False, + residual_in_fp32: bool = False, + is_rms_norm: bool = False, + num_groups: int = 1 +): + return layer_norm_linear( + x=x, + norm_weight=norm_weight, + norm_bias=norm_bias, + linear_weight=linear_weight, + linear_bias=linear_bias, + residual=residual, + eps=eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=is_rms_norm, + num_groups=num_groups + ) + + +class LayerNorm(nn.Module): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5 + ) -> LayerNorm: + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False): + return layer_norm( + 
x, + self.weight, + self.bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32 + ) + + +class GroupNorm(nn.Module): + + def __init__( + self, + num_groups: int, + hidden_size: int, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5, + is_rms_norm: bool = False + ) -> GroupNorm: + super().__init__() + + if hidden_size % num_groups != 0: + raise ValueError('num_channels must be divisible by num_groups') + + self.num_groups = num_groups + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + self.is_rms_norm = is_rms_norm + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.num_groups}, {self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + if self.is_rms_norm: + s += f", is_rms_norm={self.is_rms_norm}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False): + return group_norm( + x, + self.weight, + self.bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=self.is_rms_norm, + num_groups=self.num_groups + ) + + +class RMSNorm(nn.Module): + + def __init__( + self, + hidden_size: int, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5 + ) -> RMSNorm: + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + + self.register_parameter("weight", None) + 
self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False): + return rms_norm( + x, + self.weight, + self.bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + ) + + +class LayerNormLinearFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward( + ctx, + x, + norm_weight, + norm_bias, + linear_weight, + linear_bias, + residual=None, + eps=1e-5, + prenorm=False, + residual_in_fp32=False, + is_rms_norm=False, + num_groups=1 + ): + x_shape_og = x.shape + + if x.shape[-1] % num_groups != 0: + raise ValueError('num_channels must be divisible by num_groups') + # reshape input data into 2D tensor + x = x.reshape(-1, (x.shape[-1] // num_groups)) + if residual is not None: + assert residual.shape == x_shape_og + residual = residual.reshape_as(x) + residual_dtype = ( + residual.dtype + if residual is not None + else (torch.float32 if residual_in_fp32 else None) + ) + y, mean, rstd, residual_out = layer_norm_fwd( + x, + norm_weight, + norm_bias, + eps, + residual, + out_dtype=None if not torch.is_autocast_enabled() else torch.get_autocast_gpu_dtype(), + residual_dtype=residual_dtype, + is_rms_norm=is_rms_norm, + num_groups=num_groups + ) + y = y.reshape(x_shape_og) + dtype = torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype + linear_weight = linear_weight.to(dtype) + linear_bias = 
linear_bias.to(dtype) if linear_bias is not None else None + out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias) + # We don't store y, will be recomputed in the backward pass to save memory + ctx.save_for_backward(residual_out, norm_weight, norm_bias, linear_weight, mean, rstd) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.is_rms_norm = is_rms_norm + ctx.num_groups = num_groups + ctx.has_residual = residual is not None + ctx.prenorm = prenorm + ctx.x_dtype = x.dtype + ctx.linear_bias_is_none = linear_bias is None + return out if not prenorm else (out, residual_out.reshape(x_shape_og)) + + @staticmethod + @input_guard + def backward(ctx, dout, *args): + x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors + dout = dout.reshape(-1, dout.shape[-1]) + dy = F.linear(dout, linear_weight.t()) + dy = dy.reshape(-1, (dy.shape[-1] // ctx.num_groups)) + dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0) + assert dy.shape == x.shape + if ctx.prenorm: + dresidual = args[0] + dresidual = dresidual.reshape(-1, x.shape[-1]) + assert dresidual.shape == x.shape + else: + dresidual = None + dx, dnorm_weight, dnorm_bias, dresidual_in, y = layer_norm_bwd( + dy, + x, + norm_weight, + norm_bias, + ctx.eps, + mean, + rstd, + dresidual, + ctx.has_residual, + ctx.is_rms_norm, + x_dtype=ctx.x_dtype, + recompute_output=True, + num_groups=ctx.num_groups + ) + dlinear_weight = torch.einsum("bo,bi->oi", dout, y.view(-1, linear_weight.shape[-1])) + return ( + dx.reshape(ctx.x_shape_og), + dnorm_weight, + dnorm_bias, + dlinear_weight, + dlinear_bias, + dresidual_in.reshape(ctx.x_shape_og) if ctx.has_residual else None, + None, + None, + None, + None, + None + ) + + +class LayerNormLinear(nn.Module): + + def __init__( + self, + hidden_size, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5 + ) -> LayerNormLinear: + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + 
self.eps = eps + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward(self, x, weight, bias, residual=None, prenorm=False, residual_in_fp32=False): + return layer_norm_linear( + x=x, + norm_weight=self.weight, + norm_bias=self.bias, + linear_weight=weight, + linear_bias=bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=False + ) + + +class GroupNormLinear(nn.Module): + + def __init__( + self, + num_groups: int, + hidden_size: int, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5, + is_rms_norm: bool = False + ) -> GroupNormLinear: + super().__init__() + + if hidden_size % num_groups != 0: + raise ValueError('num_channels must be divisible by num_groups') + + self.num_groups = num_groups + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + self.is_rms_norm = is_rms_norm + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.num_groups}, 
{self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + if self.is_rms_norm: + s += f", is_rms_norm={self.is_rms_norm}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward(self, x, weight, bias, residual=None, prenorm=False, residual_in_fp32=False): + return layer_norm_linear( + x=x, + norm_weight=self.weight, + norm_bias=self.bias, + linear_weight=weight, + linear_bias=bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=self.is_rms_norm, + num_groups=self.num_groups + ) + + +class RMSNormLinear(nn.Module): + + def __init__( + self, + hidden_size, + elementwise_affine: bool = True, + bias: bool = False, + eps: float = 1e-5 + ) -> RMSNormLinear: + super().__init__() + + self.hidden_size = hidden_size + self.elementwise_affine = elementwise_affine + self.eps = eps + + self.register_parameter("weight", None) + self.register_parameter("bias", None) + if elementwise_affine: + self.weight = nn.Parameter(torch.empty(hidden_size)) + if bias: + self.bias = nn.Parameter(torch.empty(hidden_size)) + + self.reset_parameters() + + def reset_parameters(self): + if self.elementwise_affine: + nn.init.ones_(self.weight) + if self.bias is not None: + nn.init.zeros_(self.bias) + + def __repr__(self) -> str: + s = f"{self.__class__.__name__}({self.hidden_size}" + if not self.elementwise_affine: + s += f", elementwise_affine={self.elementwise_affine}" + s += f", eps={self.eps}" + s += ")" + return s + + def forward(self, x, weight, bias, residual=None, prenorm=False, residual_in_fp32=False): + return layer_norm_linear( + x=x, + norm_weight=self.weight, + norm_bias=self.bias, + linear_weight=weight, + linear_bias=bias, + residual=residual, + eps=self.eps, + prenorm=prenorm, + residual_in_fp32=residual_in_fp32, + is_rms_norm=True + ) + + +class NormParallel(ParallelStyle): + + def __init__(self, *, sequence_dim: int = 1, use_local_output: bool = False): + 
super().__init__() + self.sequence_sharding = (Shard(sequence_dim),) + self.use_local_output = use_local_output + + def _replicate_module_fn( + self, name: str, module: nn.Module, device_mesh: DeviceMesh + ): + for p_name, param in module.named_parameters(): + # simple replication with fixed ones_ init from LayerNorm/RMSNorm, which allow + # us to simply just use from_local + replicated_param = torch.nn.Parameter( + DTensor.from_local(param, device_mesh, [Replicate()], run_check=False) + ) + module.register_parameter(p_name, replicated_param) + + @staticmethod + def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): + input_tensor = inputs[0] + if isinstance(input_tensor, DTensor): + # if the passed in input DTensor is not sharded on the sequence dim, we need to redistribute it + if input_tensor.placements != sequence_sharding: + input_tensor = input_tensor.redistribute( + placements=sequence_sharding, async_op=True + ) + return input_tensor + elif isinstance(input_tensor, torch.Tensor): + # assume the input passed in already sharded on the sequence dim and create the DTensor + return DTensor.from_local( + input_tensor, device_mesh, sequence_sharding, run_check=False + ) + else: + raise ValueError( + f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}" + ) + + @staticmethod + def _prepare_output_fn(use_local_output, mod, outputs, device_mesh): + return outputs.to_local() if use_local_output else outputs + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + return distribute_module( + module, + device_mesh, + self._replicate_module_fn, + partial(self._prepare_input_fn, self.sequence_sharding), + partial(self._prepare_output_fn, self.use_local_output), + ) diff --git a/fla/modules/layernorm_gated.py b/fla/modules/layernorm_gated.py new file mode 100644 index 0000000000000000000000000000000000000000..1a72ff839dc021e484d55486cc11a9c9f85863fe --- /dev/null +++ b/fla/modules/layernorm_gated.py @@ -0,0 
+1,528 @@ +# Copyright (c) 2024, Tri Dao. +# Based on the Triton LayerNorm tutorial: https://triton-lang.org/main/getting-started/tutorials/05-layer-norm.html +# For the backward pass, we keep weight_grad and bias_grad in registers and accumulate. +# This backward pass is faster for dimensions up to 8k, but after that it's much slower due to register spilling. +# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine. + +import math +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl +from einops import rearrange + +from fla.utils import get_multiprocessor_count, input_guard + + +def rms_norm_ref(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, upcast=True): + dtype = x.dtype + weight = weight.float() + bias = bias.float() if bias is not None else None + if upcast: + x = x.float() + z = z.float() if z is not None else z + if z is not None and not norm_before_gate: + x = x * F.silu(z) + if group_size is None: + rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps) + out = (x * rstd * weight) + bias if bias is not None else (x * rstd * weight) + else: + x_group = rearrange(x, "... (g d) -> ... g d", d=group_size) + rstd = 1 / torch.sqrt((x_group.square()).mean(dim=-1, keepdim=True) + eps) + out = rearrange(x_group * rstd, "... g d -> ... 
(g d)") * weight + if bias is not None: + out = out + bias + if z is not None and norm_before_gate: + out *= F.silu(z) + return out.to(dtype) + + +@triton.heuristics({ + "HAS_BIAS": lambda args: args["B"] is not None, + "HAS_Z": lambda args: args["Z"] is not None, +}) +@triton.jit +def layer_norm_fwd_kernel( + X, # pointer to the input + Y, # pointer to the output + W, # pointer to the weights + B, # pointer to the biases + Z, # pointer to the other branch + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_y_row, + stride_z_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + BLOCK_N: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_Z: tl.constexpr, + NORM_BEFORE_GATE: tl.constexpr, + IS_RMS_NORM: tl.constexpr, +): + # Map the program id to the row of X and Y it should compute. + row = tl.program_id(0) + group = tl.program_id(1) + X += row * stride_x_row + group * N + Y += row * stride_y_row + group * N + if HAS_Z: + Z += row * stride_z_row + group * N + if not IS_RMS_NORM: + Mean += group * M + Rstd += group * M + W += group * N + if HAS_BIAS: + B += group * N + # Compute mean and variance + cols = tl.arange(0, BLOCK_N) + x = tl.load(X + cols, mask=cols < N, other=0.).to(tl.float32) + if HAS_Z and not NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=cols < N).to(tl.float32) + x *= z * tl.sigmoid(z) + if not IS_RMS_NORM: + mean = tl.sum(x, axis=0) / N + tl.store(Mean + row, mean) + xbar = tl.where(cols < N, x - mean, 0.) + var = tl.sum(xbar * xbar, axis=0) / N + else: + xbar = tl.where(cols < N, x, 0.) 
+ var = tl.sum(xbar * xbar, axis=0) / N + rstd = 1 / tl.sqrt(var + eps) + tl.store(Rstd + row, rstd) + # Normalize and apply linear transformation + mask = cols < N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if HAS_BIAS: + b = tl.load(B + cols, mask=mask).to(tl.float32) + x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + y = x_hat * w + b if HAS_BIAS else x_hat * w + if HAS_Z and NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=mask).to(tl.float32) + y *= z * tl.sigmoid(z) + # Write output + tl.store(Y + cols, y, mask=mask) + + +def layer_norm_fwd( + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + z: torch.Tensor = None, + out: torch.Tensor = None, + group_size: int = None, + norm_before_gate: bool = True, + is_rms_norm: bool = False, +): + M, N = x.shape + if group_size is None: + group_size = N + assert N % group_size == 0 + ngroups = N // group_size + assert x.stride(-1) == 1 + if z is not None: + assert z.stride(-1) == 1 + assert z.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + assert bias.stride(-1) == 1 + assert bias.shape == (N,) + # allocate output + if out is not None: + assert out.shape == x.shape + else: + out = torch.empty_like(x) + assert out.stride(-1) == 1 + mean = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device) if not is_rms_norm else None + rstd = torch.empty((ngroups * M, ), dtype=torch.float32, device=x.device) + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size)) + if group_size > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + num_warps = min(max(BLOCK_N // 256, 1), 8) + grid = (M, ngroups) + layer_norm_fwd_kernel[grid]( + x, + out, + weight, + bias, + z, + mean, + rstd, + x.stride(0), + out.stride(0), + z.stride(0) if z is not None else 
0, + M, + group_size, + eps, + BLOCK_N=BLOCK_N, + NORM_BEFORE_GATE=norm_before_gate, + IS_RMS_NORM=is_rms_norm, + num_warps=num_warps + ) + return out, mean, rstd + + +@triton.heuristics({ + "HAS_BIAS": lambda args: args["B"] is not None, + "HAS_Z": lambda args: args["Z"] is not None, + "RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None, +}) +@triton.jit +def layer_norm_bwd_kernel( + X, # pointer to the input + W, # pointer to the weights + B, # pointer to the biases + Z, # pointer to the other branch + Y, # pointer to the output to be recomputed + DY, # pointer to the output gradient + DX, # pointer to the input gradient + DW, # pointer to the partial sum of weights gradient + DB, # pointer to the partial sum of biases gradient + DZ, # pointer to the other branch + Mean, # pointer to the mean + Rstd, # pointer to the 1/std + stride_x_row, # how much to increase the pointer when moving by 1 row + stride_z_row, + stride_y_row, + stride_dy_row, + stride_dx_row, + stride_dz_row, + stride_dw_row, + stride_db_row, + M, # number of rows in X + N, # number of columns in X + eps, # epsilon to avoid division by zero + rows_per_program, + NORM_BEFORE_GATE: tl.constexpr, + IS_RMS_NORM: tl.constexpr, + HAS_BIAS: tl.constexpr, + HAS_Z: tl.constexpr, + RECOMPUTE_OUTPUT: tl.constexpr, + BLOCK_N: tl.constexpr, +): + # Map the program id to the elements of X, DX, and DY it should compute. 
+ row_block_id = tl.program_id(0) + group = tl.program_id(1) + row_start = row_block_id * rows_per_program + cols = tl.arange(0, BLOCK_N) + mask = cols < N + X += row_start * stride_x_row + group * N + if HAS_Z: + Z += row_start * stride_z_row + group * N + DZ += row_start * stride_dz_row + group * N + DY += row_start * stride_dy_row + group * N + DX += row_start * stride_dx_row + group * N + if RECOMPUTE_OUTPUT: + Y += row_start * stride_y_row + group * N + if not IS_RMS_NORM: + Mean += group * M + Rstd += group * M + W += group * N + w = tl.load(W + cols, mask=mask).to(tl.float32) + if (RECOMPUTE_OUTPUT or HAS_Z) and HAS_BIAS: + B += group * N + b = tl.load(B + cols, mask=mask, other=0.).to(tl.float32) + dw = tl.zeros((BLOCK_N,), dtype=tl.float32) + if HAS_BIAS: + db = tl.zeros((BLOCK_N,), dtype=tl.float32) + row_end = min((row_block_id + 1) * rows_per_program, M) + for row in range(row_start, row_end): + # Load data to SRAM + x = tl.load(X + cols, mask=mask, other=0).to(tl.float32) + dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32) + if not IS_RMS_NORM: + mean = tl.load(Mean + row) + if HAS_Z and not NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=mask, other=0.).to(tl.float32) + x_og = x + x = x_og * z * tl.sigmoid(z) + rstd = tl.load(Rstd + row) + # Compute dx + xhat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd + xhat = tl.where(mask, xhat, 0.) 
+ if HAS_Z and NORM_BEFORE_GATE: + z = tl.load(Z + cols, mask=mask, other=0.).to(tl.float32) + z_sigmoid = tl.sigmoid(z) + y = xhat * w + b if HAS_BIAS else xhat * w + if RECOMPUTE_OUTPUT: + tl.store(Y + cols, y * z * z_sigmoid, mask=mask) + dz = dy * y * z_sigmoid * (1 + z * (1 - z_sigmoid)) + tl.store(DZ + cols, dz, mask=mask) + dy *= z * z_sigmoid + else: + if RECOMPUTE_OUTPUT: + y = xhat * w + b if HAS_BIAS else xhat * w + tl.store(Y + cols, y, mask=mask) + wdy = w * dy + c1 = tl.sum(xhat * wdy, axis=0) / N + if not IS_RMS_NORM: + c2 = tl.sum(wdy, axis=0) / N + dx = (wdy - (xhat * c1 + c2)) * rstd + else: + dx = (wdy - xhat * c1) * rstd + dw += dy * xhat + if HAS_BIAS: + db += dy + if HAS_Z and not NORM_BEFORE_GATE: + z_sigmoid = tl.sigmoid(z) + dz = dx * x_og * z_sigmoid * (1 + z * (1 - z_sigmoid)) + tl.store(DZ + cols, dz, mask=mask) + dx *= z * z_sigmoid + # Write dx + tl.store(DX + cols, dx, mask=mask) + + X += stride_x_row + if HAS_Z: + Z += stride_z_row + DZ += stride_dz_row + if RECOMPUTE_OUTPUT: + Y += stride_y_row + DY += stride_dy_row + DX += stride_dx_row + tl.store(DW + row_block_id * stride_dw_row + group * N + cols, dw, mask=mask) + if HAS_BIAS: + tl.store(DB + row_block_id * stride_db_row + group * N + cols, db, mask=mask) + + +def layer_norm_bwd( + dy: torch.Tensor, + x: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + eps: float, + mean: torch.Tensor, + rstd: torch.Tensor, + z: torch.Tensor = None, + group_size: int = None, + norm_before_gate: bool = True, + is_rms_norm: bool = False, + recompute_output: bool = False, + dz: torch.Tensor = None, + out: torch.Tensor = None, +): + M, N = x.shape + if group_size is None: + group_size = N + assert N % group_size == 0 + ngroups = N // group_size + assert x.stride(-1) == 1 + assert dy.stride(-1) == 1 + assert dy.shape == (M, N) + if z is not None: + assert z.stride(-1) == 1 + assert z.shape == (M, N) + assert weight.shape == (N,) + assert weight.stride(-1) == 1 + if bias is not None: + 
assert bias.stride(-1) == 1 + assert bias.shape == (N,) + # allocate output + dx = torch.empty_like(x) + if dz is not None: + assert z is not None + assert dz.shape == z.shape + assert dz.stride(-1) == 1 + else: + dz = torch.empty_like(z) if z is not None else None + if recompute_output: + if out is None: + out = torch.empty_like(x) + assert out.shape == x.shape + + # Less than 64KB per feature: enqueue fused kernel + MAX_FUSED_SIZE = 65536 // x.element_size() + BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(group_size)) + if group_size > BLOCK_N: + raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.") + # heuristics for number of warps + num_warps = min(max(BLOCK_N // 256, 1), 8) + sm_count = get_multiprocessor_count(x.device.index) + # If group size is small (e.g., 64), we're only using 1 warp. So having just 108 programs + # would limit the occupancy. + nrow_groups = math.ceil(sm_count * math.ceil(4 / num_warps) / ngroups) + _dw = torch.empty((nrow_groups, N), dtype=torch.float32, device=weight.device) + _db = torch.empty((nrow_groups, N), dtype=torch.float32, device=bias.device) if bias is not None else None + rows_per_program = math.ceil(M / nrow_groups) + grid = (nrow_groups, ngroups) + layer_norm_bwd_kernel[grid]( + x, + weight, + bias, + z, + out if recompute_output else None, + dy, + dx, + _dw, + _db, + dz, + mean, + rstd, + x.stride(0), + z.stride(0) if z is not None else 0, + 0 if not recompute_output else out.stride(0), + dy.stride(0), + dx.stride(0), + dz.stride(0) if dz is not None else 0, + _dw.stride(0), + _db.stride(0) if _db is not None else 0, + M, group_size, eps, + rows_per_program, + BLOCK_N=BLOCK_N, + NORM_BEFORE_GATE=norm_before_gate, + IS_RMS_NORM=is_rms_norm, + num_warps=num_warps + ) + dw = _dw.sum(0).to(weight.dtype) + db = _db.sum(0).to(bias.dtype) if bias is not None else None + return (dx, dw, db, dz) if not recompute_output else (dx, dw, db, dz, out) + + +class LayerNormFn(torch.autograd.Function): + + 
@input_guard + @staticmethod + def forward(ctx, x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, + is_rms_norm=False): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z)) + """ + + x_shape_og = x.shape + # reshape input data into 2D tensor + x = x.reshape(-1, x.shape[-1]) + if x.stride(-1) != 1: + x = x.contiguous() + if z is not None: + assert z.shape == x_shape_og + z = z.reshape(-1, z.shape[-1]) + if z.stride(-1) != 1: + z = z.contiguous() + weight = weight.contiguous() + if bias is not None: + bias = bias.contiguous() + y, mean, rstd = layer_norm_fwd( + x, + weight, + bias, + eps, + z=z, + group_size=group_size, + norm_before_gate=norm_before_gate, + is_rms_norm=is_rms_norm, + ) + ctx.save_for_backward(x, weight, bias, mean, rstd, z) + ctx.x_shape_og = x_shape_og + ctx.eps = eps + ctx.group_size = group_size + ctx.norm_before_gate = norm_before_gate + ctx.is_rms_norm = is_rms_norm + return y.reshape(x_shape_og) + + @input_guard + @staticmethod + def backward(ctx, dy): + x, weight, bias, mean, rstd, z = ctx.saved_tensors + dy = dy.reshape(-1, dy.shape[-1]) + if dy.stride(-1) != 1: + dy = dy.contiguous() + assert dy.shape == x.shape + dx, dw, db, dz = layer_norm_bwd( + dy, + x, + weight, + bias, + ctx.eps, + mean, + rstd, + z, + ctx.group_size, + ctx.norm_before_gate, + ctx.is_rms_norm + ) + dx = dx.reshape(ctx.x_shape_og) + dz = dz.reshape(ctx.x_shape_og) if dz is not None else None + return dx, dw, db, dz, None, None, None, None + + +def layernorm_fn(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True, is_rms_norm=False): + return LayerNormFn.apply(x, weight, bias, z, eps, group_size, norm_before_gate, is_rms_norm) + + +def rmsnorm_fn(x, weight, bias, z=None, eps=1e-6, group_size=None, norm_before_gate=True): + return LayerNormFn.apply(x, weight, bias, z, eps, group_size, norm_before_gate, True) + + +class LayerNormGated(nn.Module): + + def __init__( + self, + hidden_size, + 
eps: float = 1e-5, + group_size: Optional[int] = None, + norm_before_gate: bool = True, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + """If group_size is not None, we do GroupNorm with each group having group_size elements. + group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). + """ + + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.bias = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + torch.nn.init.zeros_(self.bias) + + def forward(self, x, z=None): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z)) + """ + return layernorm_fn(x, self.weight, self.bias, z=z, group_size=self.group_size, eps=self.eps, + norm_before_gate=self.norm_before_gate) + + +class RMSNormGated(nn.Module): + + def __init__( + self, + hidden_size, + eps: float = 1e-5, + group_size: Optional[int] = None, + norm_before_gate: bool = False, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + """If group_size is not None, we do GroupNorm with each group having group_size elements. + group_size=None is equivalent to group_size=hidden_size (i.e. there's only 1 group). 
+ """ + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.empty(hidden_size, **factory_kwargs)) + self.register_parameter("bias", None) + self.group_size = group_size + self.norm_before_gate = norm_before_gate + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) + + def forward(self, x, z=None): + """If z is not None, we do norm(x) * silu(z) if norm_before_gate, else norm(x * silu(z)) + """ + return rmsnorm_fn(x, self.weight, self.bias, z=z, eps=self.eps, group_size=self.group_size, + norm_before_gate=self.norm_before_gate) diff --git a/fla/modules/mlp.py b/fla/modules/mlp.py new file mode 100644 index 0000000000000000000000000000000000000000..b5f35aa6910ad143eb632fc28684043704e8741a --- /dev/null +++ b/fla/modules/mlp.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING, Any, Optional + +import torch +import torch.nn as nn +from torch.distributed import DeviceMesh +from torch.distributed.tensor import DTensor, Placement, Replicate, Shard, distribute_module +from torch.distributed.tensor.parallel import ParallelStyle + +from fla.modules.activations import swiglu, swiglu_linear + +if TYPE_CHECKING: + from transformers.processing_utils import Unpack + + +class GatedMLP(nn.Module): + + def __init__( + self, + hidden_size: int, + hidden_ratio: Optional[int] = None, + intermediate_size: Optional[int] = None, + hidden_act: str = 'swish', + fuse_swiglu: bool = True + ) -> GatedMLP: + super().__init__() + + self.hidden_size = hidden_size + # the final number of params is `hidden_ratio * hidden_size^2` + # `intermediate_size` is chosen to be a multiple of 256 closest to `2/3 * hidden_size * hidden_ratio` + if hidden_ratio is None: + hidden_ratio = 4 + if intermediate_size is None: + intermediate_size = 
int(hidden_size * hidden_ratio * 2 / 3) + intermediate_size = 256 * ((intermediate_size + 256 - 1) // 256) + self.hidden_ratio = hidden_ratio + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.fuse_swiglu = fuse_swiglu + + if hidden_act != 'swish': + raise ValueError(f'Unsupported hidden_act: {hidden_act}') + + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + if self.fuse_swiglu: + self.swiglu_linear = SwiGLULinear() + + def forward( + self, + x: torch.Tensor, + **kwargs: Unpack[Any] + ) -> torch.Tensor: + gate, y = self.gate_proj(x), self.up_proj(x) + if self.fuse_swiglu: + return self.swiglu_linear(gate, y, self.down_proj.weight, self.down_proj.bias) + else: + return self.down_proj(swiglu(gate, y)) + + +class SwiGLULinear(nn.Module): + + def forward(self, x, y, weight, bias): + return swiglu_linear(x, y, weight, bias) + + +class SwiGLULinearParallel(ParallelStyle): + def __init__( + self, + *, + input_layouts: Optional[Placement] = None, + output_layouts: Optional[Placement] = None, + use_local_output: bool = True, + ): + super().__init__() + self.input_layouts = (input_layouts or Shard(-1),) + self.output_layouts = (output_layouts or Replicate(),) + self.desired_input_layouts = (Shard(-1),) + self.use_local_output = use_local_output + + @staticmethod + def _prepare_input_fn( + input_layouts, desired_input_layouts, mod, inputs, device_mesh + ): + x, y, weight, bias = inputs + if not isinstance(x, DTensor): + x = DTensor.from_local(x, device_mesh, input_layouts, run_check=False) + if x.placements != desired_input_layouts: + x = x.redistribute(placements=desired_input_layouts, async_op=True) + + if not isinstance(y, DTensor): + y = DTensor.from_local(y, device_mesh, input_layouts, run_check=False) + if y.placements != 
desired_input_layouts: + y = y.redistribute(placements=desired_input_layouts, async_op=True) + + if not isinstance(weight, DTensor): + weight = DTensor.from_local(weight, device_mesh, (Shard(1),)) + + if bias is not None and not isinstance(bias, DTensor): + bias = DTensor.from_local(bias, device_mesh, (Replicate(),)) + + return x, y, weight, bias + + @staticmethod + def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): + # Rowwise sharding produces partial output, depending on output layouts: + # 1. to replicate -> allreduce + # 2. to shard -> reduce_scatter + if outputs.placements != output_layouts: + outputs = outputs.redistribute(placements=output_layouts, async_op=True) + # back to local tensor if use_local_output is True + return outputs.to_local() if use_local_output else outputs + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + return distribute_module( + module, + device_mesh, + partition_fn=None, + input_fn=partial(self._prepare_input_fn, self.input_layouts, self.desired_input_layouts), + output_fn=partial(self._prepare_output_fn, self.output_layouts, self.use_local_output) + ) diff --git a/fla/modules/parallel.py b/fla/modules/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..3561476adeed63be51edbee270106291d6a646ac --- /dev/null +++ b/fla/modules/parallel.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional + +import torch.nn as nn +from torch.distributed import DeviceMesh +from torch.distributed.tensor import DTensor, distribute_module +from torch.distributed.tensor.parallel import ParallelStyle +from torch.distributed.tensor.placement_types import Placement + + +class PrepareModuleWeight(ParallelStyle): + def __init__(self, *, layouts: Optional[Placement] = None): + super().__init__() + self.layouts = layouts + + def _replicate_module_fn( + self, + name: str, + module: nn.Module, + 
device_mesh: DeviceMesh + ): + for p_name, param in module.named_parameters(): + replicated_param = nn.Parameter( + DTensor.from_local(param, device_mesh, [self.layouts], run_check=False) + ) + module.register_parameter(p_name, replicated_param) + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + return distribute_module( + module, + device_mesh, + partition_fn=self._replicate_module_fn, + input_fn=None, + output_fn=None + ) diff --git a/fla/modules/seq_to_myopic.py b/fla/modules/seq_to_myopic.py new file mode 100644 index 0000000000000000000000000000000000000000..aa6dee800a6d27bdf4810ef9b7ae4e84ba68ff86 --- /dev/null +++ b/fla/modules/seq_to_myopic.py @@ -0,0 +1,111 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + +@triton.autotune( + configs=[ + triton.Config({'BLOCK_SIZE_V': block_size_v}, num_warps=num_warp) + for block_size_v in [256, 512, 1024, 2048] + for num_warp in [1, 2, 4, 8] + ], + key=['V'], +) +@triton.jit +def _seq_to_myopic_kernel( + seq_ptr, + output_ptr, + B, + T_total, + T, + V, + pad_token_id, + window_size, + T_val, + stride_seq_b, + stride_seq_t, + stride_out_b, + stride_out_t, + stride_out_v, + BLOCK_SIZE_V: tl.constexpr, +): + b = tl.program_id(0) + v_block = tl.program_id(1) + + v_start = v_block * BLOCK_SIZE_V + v_end = tl.minimum(v_start + BLOCK_SIZE_V, V) + v_idx = tl.arange(0, BLOCK_SIZE_V) + v = v_start + v_idx + mask = v < V + + next_occurrence = tl.full((BLOCK_SIZE_V,), T_val, dtype=tl.int64) + + for t in range(T_total - 1, -1, -1): + token = tl.load(seq_ptr + b * stride_seq_b + t * stride_seq_t) + + token_valid = (token != pad_token_id) & (token >= 0) & (token < V) + in_block = (token >= v_start) & (token < v_end) + + if token_valid: + if in_block: + local_v = token - v_start + next_occurrence = tl.where(v_idx == local_v, t, next_occurrence) + + if t < T: + distance = next_occurrence - t + valid = (distance < window_size) + value = 
tl.where(valid, window_size - distance, float('-inf')) + + output_offset = ( + b * stride_out_b + + t * stride_out_t + + v * stride_out_v + ) + tl.store(output_ptr + output_offset, value, mask=mask) + +def seq_to_myopic( + seq: torch.Tensor, + vocab_size: int, + window_size: int, + pad_token_id: int = -100 +) -> torch.Tensor: + """ + Triton-optimized myopic sequence processing with autotuned block size. + + :param seq: Input sequence of shape (B, T + window_size) + :param vocab_size: Size of the vocabulary + :param window_size: How far ahead to look for next occurrences + :param pad_token_id: Padding token ID + :return: Tensor of shape (B, T, V) with window_size - distance for tokens in window, else -inf + """ + B, T_total = seq.shape + T = T_total - window_size + + assert T > 0, "T_total must be greater than window_size to produce valid output." + + output = torch.empty((B, T, vocab_size), device=seq.device, dtype=torch.float16) + if not output.is_contiguous(): + output = output.contiguous() + + # Let autotune select the best BLOCK_SIZE_V based on vocab_size + grid = (B, triton.cdiv(vocab_size, 128)) # Start with minimum block size + + _seq_to_myopic_kernel[grid]( + seq, + output, + B, + T_total, + T, + vocab_size, + pad_token_id, + window_size, + T_total, + seq.stride(0), + seq.stride(1), + output.stride(0), + output.stride(1), + output.stride(2), + ) + + return output \ No newline at end of file diff --git a/fla/ops/abc/__init__.py b/fla/ops/abc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fdac8d900fc51485a55716443ee1f00424b522b9 --- /dev/null +++ b/fla/ops/abc/__init__.py @@ -0,0 +1,7 @@ +# -*- coding: utf-8 -*- + +from .chunk import chunk_abc + +__all__ = [ + 'chunk_abc' +] diff --git a/fla/ops/abc/chunk.py b/fla/ops/abc/chunk.py new file mode 100644 index 0000000000000000000000000000000000000000..8538e04800cd71414782ff72668df1fbd97984b1 --- /dev/null +++ b/fla/ops/abc/chunk.py @@ -0,0 +1,1116 @@ +# -*- coding: utf-8 -*- +# 
Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional, Tuple + +import torch +import triton +import triton.language as tl + +from fla.ops.utils import logcumsumexp_fwd_kernel, softmax_bwd, softmax_fwd +from fla.ops.utils.op import exp +from fla.utils import input_guard + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_fwd_kernel_h( + k, + v, + z, + h, + h0, + ht, + T, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NT: tl.constexpr, + NORMK: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr +): + i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + + b_h = tl.zeros([BK, BV], dtype=tl.float32) + if USE_INITIAL_STATE: + p_h = tl.make_block_ptr(h0 + i_bh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + b_h += tl.load(p_h, boundary_check=(0, 1)).to(tl.float32) + if NORMK: + p_z0 = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), (i_k * BK,), (BK,), (0,)) + else: + p_z0 = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), (i_v * BV,), (BV,), (0,)) + b_zp = tl.load(p_z0).to(tl.float32) + for i_t in range(NT): + p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_h = tl.make_block_ptr(h + i_bh * NT*K*V + i_t * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1)) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + if NORMK: + p_zc = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), ((i_t * BT + BT - 1) * K + i_k * BK,), (BK,), (0,)) + # [BK,] + b_zc = tl.load(p_zc, boundary_check=(0,)) + b_r, b_zp = exp(b_zp - b_zc), b_zc + # [BK, BV] + b_h = b_h * b_r[:, None] + b_k = exp(b_k - b_zc[:, None]).to(b_k.dtype) + else: + p_zc = 
tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), ((i_t * BT + BT - 1) * V + i_v * BV,), (BV,), (0,)) + # [BV,] + b_zc = tl.load(p_zc, boundary_check=(0,)) + b_r, b_zp = exp(b_zp - b_zc), b_zc + # [BK, BV] + b_h = b_h * b_r[None, :] + b_v = exp(b_v - b_zc[None, :]).to(b_v.dtype) + # [BK, BV] + b_h += tl.dot(b_k, b_v, allow_tf32=False) + + if STORE_FINAL_STATE: + p_h = tl.make_block_ptr(ht + i_bh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_fwd_kernel_intra_K( + v, + z, + o, + A, + T, + V: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BV: tl.constexpr, + NC: tl.constexpr +): + i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_t, i_i = i_c // NC, i_c % NC + + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), ((i_t * BT + i_i * BC) * V + i_v * BV,), (BV,), (0,)) + # [BV,] + b_zn = tl.load(p_zn, boundary_check=(0,)) + # [BC, BV] + b_o = tl.zeros([BC, BV], dtype=tl.float32) + for i_j in range(0, i_i): + p_A = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0)) + # [BC, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BC, BC] + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_o += tl.dot(b_A, exp(b_v - b_zn[None, :]).to(b_v.dtype), allow_tf32=False) + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_o *= exp(b_zn[None, :] - b_z) + + o_i = tl.arange(0, BC) + o_A = i_bh * T * BT + (i_t * BT + i_i * BC + tl.arange(0, BC)) * BT + i_i * BC + m_A = (i_t * BT + i_i * BC + tl.arange(0, BC)) < T + for j in range(0, BC): + p_v = tl.make_block_ptr(v + i_bh * T*V, (T * V,), (1,), ((i_t * BT + i_i * BC + j) * V + i_v * BV,), (BV,), 
(0,)) + # [BC,] + b_A = tl.load(A + o_A + j, mask=m_A, other=0) + # [BV,] + b_v = tl.load(p_v, boundary_check=(0,)).to(tl.float32) + # [BC, BV] + # avoid 0 * inf = inf + m_i = o_i[:, None] >= j + b_o += tl.where(m_i, b_A[:, None] * exp(b_v[None, :] - b_z), 0) + p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_fwd_kernel_K( + q, + k, + z, + h, + o, + A, + scale, + T, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NT: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_p = tl.maximum(i_t * BT - 1, 0) + + o_i = tl.arange(0, BT) + m_s = o_i[:, None] >= o_i[None, :] + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_h = tl.make_block_ptr(h + i_bh * NT*K*V + i_t * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # [BT, BV] + b_o += tl.dot(b_q, b_h, allow_tf32=False) + # [BT, BT] + b_A += tl.dot(b_q, b_k, allow_tf32=False) + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + # [BT, BV] + b_z = tl.load(p_z, boundary_check=(0, 1)) + # [BT, BV] + p_zp = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), (i_p * V + i_v * BV,), (BV,), (0,)) + b_zp = tl.load(p_zp, 
boundary_check=(0,)) + b_o = b_o * exp(b_zp[None, :] - b_z) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + p_A = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + # [BT, BT] + b_A = tl.where(m_s, b_A, 0.) + if i_v == 0: + tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_fwd_kernel_intra_V( + q, + k, + z, + A, + scale, + T, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + NC: tl.constexpr +): + i_k, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_t, i_i, i_j = i_c // (NC * NC), (i_c % (NC * NC)) // NC, (i_c % (NC * NC)) % NC + n_bh = tl.num_programs(2) + + if i_i > i_j: + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT + i_j * BC), (BK, BC), (0, 1)) + p_z = tl.make_block_ptr(z + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + p_A = tl.make_block_ptr(A + (i_k*n_bh+i_bh)*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), ((i_t * BT + i_i * BC) * K + i_k * BK,), (BK,), (0,)) + # [BK,] + b_zn = tl.load(p_zn, boundary_check=(0,)) + # [BC, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_q = (b_q * exp(b_zn[None, :] - b_z) * scale).to(b_q.dtype) + # [BK, BC] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_k = exp(b_k - b_zn[:, None]).to(b_k.dtype) + # [BC, BC] + b_A = tl.dot(b_q, b_k, allow_tf32=False) + tl.store(p_A, b_A.to(A.dtype.element_ty), boundary_check=(0, 1)) + elif i_i == i_j: + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (T * K,), (1,), ((i_t * BT + i_j * BC) * K + i_k * 
BK,), (BK,), (0,)) + p_z = tl.make_block_ptr(z + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + # [BC, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_z = tl.load(p_z, boundary_check=(0, 1)) + + o_i = tl.arange(0, BC) + o_A = (i_bh + i_k * n_bh) * T * BT + (i_t * BT + i_i * BC + tl.arange(0, BC)) * BT + i_j * BC + m_A = (i_t * BT + i_i * BC + tl.arange(0, BC)) < T + for j in range(0, BC): + # [BK,] + b_k = tl.load(p_k, boundary_check=(0,)).to(tl.float32) + # [BC,] + b_A = tl.sum(b_q * exp(b_k[None, :] - b_z) * scale, 1) + b_A = tl.where(o_i >= j, b_A, 0.) + tl.store(A + o_A + j, b_A.to(b_q.dtype), mask=m_A) + + p_k = tl.advance(p_k, (K,)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_fwd_kernel_V( + q, + v, + z, + h, + o, + A, + scale, + T, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NT: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_p = tl.maximum(i_t * BT - 1, 0) + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_z = tl.make_block_ptr(z + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_h = tl.make_block_ptr(h + i_bh * NT*K*V + i_t * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + p_zp = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), (i_p * K + i_k * BK,), (BK,), (0,)) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BK] + b_z = tl.load(p_z, boundary_check=(0, 1)) + # [BT, BK] + b_zp = tl.load(p_zp, boundary_check=(0,)) + b_q = (b_q * exp(b_zp[None, :] - b_z)).to(b_q.dtype) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # works but dkw, owing to divine benevolence + # [BT, BV] + if i_k >= 0: + b_o += tl.dot(b_q, b_h, allow_tf32=False) + p_v = tl.make_block_ptr(v + i_bh * T*V, 
(T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_A = tl.make_block_ptr(A + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, BT] + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_o += tl.dot(b_A.to(b_v.dtype), b_v, allow_tf32=False) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_dh( + q, + z, + do, + dh, + scale, + T, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NT: tl.constexpr, + NORMK: tl.constexpr +): + i_k, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + + b_dh = tl.zeros([BK, BV], dtype=tl.float32) + b_zp = tl.full([BK if NORMK else BV], float('inf'), dtype=tl.float32) + for i_t in range(NT - 1, -1, -1): + i_p = tl.maximum(i_t * BT - 1, 0) + p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dh = tl.make_block_ptr(dh + i_bh * NT*K*V + i_t * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + # [BK, BT] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + + tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1)) + if NORMK: + p_z = tl.make_block_ptr(z + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_zc = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), (i_p * K + i_k * BK,), (BK,), (0,)) + # [BK,] + b_zc = tl.load(p_zc, boundary_check=(0,)) + b_r, b_zp = exp(b_zc - b_zp), b_zc + # [BK, BT] + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_q = (b_q * exp(b_zc[:, None] - b_z)).to(b_q.dtype) + # [BK, BV] + b_dh = b_dh * b_r[:, None] + 
else: + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_zc = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), (i_p * V + i_v * BV,), (BV,), (0,)) + # [BV,] + b_zc = tl.load(p_zc, boundary_check=(0,)) + b_r, b_zp = exp(b_zc - b_zp), b_zc + # [BT, BV] + b_z = tl.load(p_z, boundary_check=(0,)) + b_do = (b_do * exp(b_zc[None, :] - b_z)).to(b_do.dtype) + # [BK, BV] + b_dh = b_dh * b_r[None, :] + # [BK, BV] + b_dh += tl.dot(b_q, b_do, allow_tf32=False) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_V( + k, + v, + z, + h, + A, + do, + dh, + dq, + dk, + dv, + dA, + scale, + T, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NT: tl.constexpr +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_p = tl.maximum(i_t * BT - 1, 0) + n_bh = tl.num_programs(2) + + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_zc = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), ((i_t * BT + BT - 1) * K + i_k * BK,), (BK,), (0,)) + p_A = tl.make_block_ptr(A + i_bh * T * BT, (BT, T), (1, BT), (0, i_t * BT), (BT, BT), (0, 1)) + + # [BK,] + b_zc = tl.load(p_zc, boundary_check=(0,)) + # [BT, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_k = exp(b_k - b_zc[None, :]).to(b_k.dtype) + # [BT, BT] + b_A = tl.load(p_A, boundary_check=(0, 1)) + + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + b_dA = tl.zeros([BT, BT], dtype=tl.float32) + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_h = tl.make_block_ptr(h + i_bh * NT*K*V + i_t * V * K, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dh = tl.make_block_ptr(dh + i_bh * NT*K*V + i_t * K*V, (K, V), (V, 1), (i_k 
* BK, i_v * BV), (BK, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv + (i_k*n_bh+i_bh) * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # [BT, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BK, BV] + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + # [BT, BV] + b_dv = tl.dot(b_k, b_dh, allow_tf32=False) + if i_k == 0: + b_dv += tl.dot(b_A.to(b_do.dtype), b_do, allow_tf32=False) + b_do = (b_do * scale).to(b_do.dtype) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + # [BT, BT] + b_dA += tl.dot(b_do, tl.trans(b_v), allow_tf32=False) + # [BT, BK] + b_dq += tl.dot(b_do, b_h, allow_tf32=False) + # [BT, BK] + b_dk += tl.dot(b_v, tl.trans(b_dh), allow_tf32=False) + p_z = tl.make_block_ptr(z + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_zp = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), (i_p * K + i_k * BK,), (BK,), (0,)) + # [BK,] + b_zp = tl.load(p_zp, boundary_check=(0,)) + # [BT, BK] + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_z = exp(b_zp[None, :] - b_z) + # [BT, BK] + b_dq = b_dq * b_z + b_dk = b_dk * b_k + + p_dq = tl.make_block_ptr(dq + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dA = tl.make_block_ptr(dA + i_bh * T * BT, (T, BT,), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + + o_i = tl.arange(0, BT) + m_s = o_i[:, None] >= o_i[None, :] + # [BT, BT] + b_dA = tl.where(m_s, b_dA, 0.).to(b_k.dtype) + if i_k == 0: + tl.store(p_dA, b_dA.to(p_dA.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_intra_V( + q, + k, + z, + dA, + dq, + dk, + T, + K: 
tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + NC: tl.constexpr +): + i_k, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_t, i_i = i_c // NC, i_c % NC + + p_z = tl.make_block_ptr(z + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), ((i_t * BT + i_i * BC) * K + i_k * BK,), (BK,), (0,)) + # [BK,] + b_zn = tl.load(p_zn, boundary_check=(0,)) + # [BC, BK] + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_zq = exp(b_zn[None, :] - b_z) + b_dq = tl.zeros([BC, BK], dtype=tl.float32) + for i_j in range(0, i_i): + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_dA = tl.make_block_ptr(dA + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)) + # [BC, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kz = exp(b_k - b_zn[None, :]).to(b_k.dtype) + # [BC, BC] + b_dA = tl.load(p_dA, boundary_check=(0, 1)) + # [BC, BK] + b_dq += tl.dot(b_dA, b_kz, allow_tf32=False) + b_dq *= b_zq + + o_i = tl.arange(0, BC) + o_dA = i_bh * T * BT + (i_t * BT + i_i * BC + tl.arange(0, BC)) * BT + i_i * BC + m_dA = (i_t * BT + i_i * BC + tl.arange(0, BC)) < T + for j in range(0, BC): + p_kj = tl.make_block_ptr(k + i_bh * T*K, (T * K,), (1,), ((i_t * BT + i_i*BC+j) * K + i_k * BK,), (BK,), (0,)) + # [BC,] + b_dA = tl.load(dA + o_dA + j, mask=m_dA, other=0) + # [BK,] + b_kj = tl.load(p_kj, boundary_check=(0,)).to(tl.float32) + # [BC, BK] + m_i = o_i[:, None] >= j + # [BC, BK] + b_dq += tl.where(m_i, b_dA[:, None] * exp(b_kj[None, :] - b_z), 0.) 
+ p_dq = tl.make_block_ptr(dq + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + + tl.debug_barrier() + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*K, (T*K,), (1,), ((i_t * BT + i_i * BC + BC - 1) * K + i_k * BK,), (BK,), (0,)) + # [BK,] + b_zn = tl.load(p_zn, boundary_check=(0,)) + # [BC, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kz = exp(b_k - b_zn[None, :]) + b_dk = tl.zeros([BC, BK], dtype=tl.float32) + for i_j in range(i_i + 1, NC): + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_z = tl.make_block_ptr(z + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_dA = tl.make_block_ptr(dA + i_bh * T * BT, (T, BT), (BT, 1), (i_t * BT + i_j * BC, i_i * BC), (BC, BC), (1, 0)) + # [BC, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_qz = (b_q * exp(b_zn[None, :] - b_z)).to(b_q.dtype) + # [BC, BC] + b_dA = tl.load(p_dA, boundary_check=(0, 1)) + # [BC, BK] + b_dk += tl.dot(tl.trans(b_dA), b_qz, allow_tf32=False) + b_dk *= b_kz + + o_dA = i_bh * T * BT + (i_t * BT + i_i * BC) * BT + i_i * BC + tl.arange(0, BC) + for j in range(0, BC): + p_qj = tl.make_block_ptr(q + i_bh * T*K, (T * K,), (1,), ((i_t * BT + i_i * BC + j) * K + i_k * BK,), (BK,), (0,)) + p_zj = tl.make_block_ptr(z + i_bh * T*K, (T * K,), (1,), ((i_t * BT + i_i * BC + j) * K + i_k * BK,), (BK,), (0,)) + # [BC,] + b_dA = tl.load(dA + o_dA + j * BT, mask=(i_t * BT + i_i * BC + j < T), other=0) + # [BK,] + b_qj = tl.load(p_qj, boundary_check=(0,)).to(tl.float32) + b_zj = tl.load(p_zj, boundary_check=(0,)).to(tl.float32) + # [BC, BK] + m_i = o_i[:, None] <= j + b_dk += tl.where(m_i, b_dA[:, None] * b_qj[None, :] * exp(b_k - b_zj[None, :]), 0.) 
+ p_dk = tl.make_block_ptr(dk + i_bh * T*K, (T, K), (K, 1), (i_t * BT + i_i * BC, i_k * BK), (BC, BK), (1, 0)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_intra_K( + v, + z, + do, + dA, + scale, + T, + V: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BV: tl.constexpr, + NC: tl.constexpr +): + i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_t, i_i, i_j = i_c // (NC * NC), (i_c % (NC * NC)) // NC, (i_c % (NC * NC)) % NC + n_bh = tl.num_programs(2) + + if i_i > i_j: + p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v * BV, i_t * BT + i_j * BC), (BV, BC), (0, 1)) + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), ((i_t * BT + i_i * BC) * V + i_v * BV,), (BV,), (0,)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + p_dA = tl.make_block_ptr(dA+(i_bh+i_v*n_bh)*T*BT, (T, BT), (BT, 1), (i_t * BT + i_i * BC, i_j * BC), (BC, BC), (1, 0)) + # [BV,] + b_zn = tl.load(p_zn, boundary_check=(0,)) + # [BC, BV] + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_do = (b_do * exp(b_zn[None, :] - b_z) * scale).to(b_do.dtype) + # [BV, BC] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_v = exp(b_v - b_zn[:, None]).to(b_v.dtype) + # [BC, BC] + b_dA = tl.dot(b_do, b_v, allow_tf32=False) + tl.store(p_dA, b_dA.to(dA.dtype.element_ty), boundary_check=(0, 1)) + elif i_i == i_j: + p_v = tl.make_block_ptr(v + i_bh * T*V, (T * V,), (1,), ((i_t * BT + i_j * BC) * V + i_v * BV,), (BV,), (0,)) + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + # [BC, BV] + b_z = 
tl.load(p_z, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) * scale + + o_i = tl.arange(0, BC) + o_A = (i_bh + i_v * n_bh) * T * BT + (i_t * BT + i_i * BC + tl.arange(0, BC)) * BT + i_j * BC + m_A = (i_t * BT + i_i * BC + tl.arange(0, BC)) < T + for j in range(0, BC): + # [BV,] + b_v = tl.load(p_v, boundary_check=(0,)).to(tl.float32) + # [BC,] + b_dA = tl.sum(b_do * exp(b_v[None, :] - b_z), 1) + b_dA = tl.where(o_i >= j, b_dA, 0) + tl.store(dA + o_A + j, b_dA.to(b_do.dtype), mask=m_A) + + p_v = tl.advance(p_v, (V,)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_K( + q, + k, + v, + z, + h, + A, + do, + dh, + dq, + dk, + dv, + dA, + scale, + T, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NT: tl.constexpr +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_p = tl.maximum(i_t * BT - 1, 0) + n_bh = tl.num_programs(2) + + o_i = tl.arange(0, BT) + m_s = o_i[:, None] >= o_i[None, :] + + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_A = tl.make_block_ptr(A + (i_k*n_bh+i_bh) * T * BT, (T, BT, ), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BT, BT] + b_A = tl.dot((b_q * scale).to(b_q.dtype), tl.trans(b_k), allow_tf32=False) + b_A = tl.where(m_s, b_A, 0.) 
+ tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) + + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_zp = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), (i_p * V + i_v * BV,), (BV,), (0,)) + p_zc = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), ((i_t * BT + BT - 1) * V + i_v * BV,), (BV,), (0,)) + p_h = tl.make_block_ptr(h + i_bh * NT*K*V + i_t * K*V, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dh = tl.make_block_ptr(dh + i_bh * NT*K*V + i_t * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv + (i_k*n_bh+i_bh) * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + # [BV,] + b_zp = tl.load(p_zp, boundary_check=(0,)) + b_zc = tl.load(p_zc, boundary_check=(0,)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_v = exp(b_v - b_zc[None, :]).to(b_v.dtype) + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_z = exp(b_zp[None, :] - b_z) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # [BT, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_do = (b_do * b_z * scale).to(b_do.dtype) + # [BK, BV] + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + # [BT, BK] + b_dq += tl.dot(b_do, b_h, allow_tf32=False) + b_dk += tl.dot(b_v, tl.trans(b_dh), allow_tf32=False) + # [BT, BV] + b_dv = b_v * tl.dot(b_k, b_dh, allow_tf32=False) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + p_dA = tl.make_block_ptr(dA + i_bh * T * BT, (T, BT, ), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + # [BT, BT] + b_dA = tl.load(p_dA, boundary_check=(0, 1)) + # [BT, BK] + b_dq += tl.dot(b_dA, b_k, 
allow_tf32=False) + b_dk += tl.dot(tl.trans(b_dA).to(b_k.dtype), b_q, allow_tf32=False) + + p_dq = tl.make_block_ptr(dq + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_intra_KV( + v, + z, + A, + do, + dv, + T, + V: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BV: tl.constexpr, + NC: tl.constexpr +): + i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_t, i_i = i_c // NC, i_c % NC + + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*V, (T*V,), (1,), ((i_t * BT + i_i * BC + BC - 1) * V + i_v * BV,), (BV,), (0,)) + # [BV,] + b_zn = tl.load(p_zn, boundary_check=(0,)) + # [BC, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_dv = tl.zeros([BC, BV], dtype=tl.float32) + for i_j in range(i_i + 1, NC): + p_z = tl.make_block_ptr(z + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0)) + p_A = tl.make_block_ptr(A + i_bh * T * BT, (BT, T), (1, BT), (i_i * BC, i_t * BT + i_j * BC), (BC, BC), (0, 1)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_j * BC, i_v * BV), (BC, BV), (1, 0)) + # [BC, BV] + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_do = (b_do * exp(b_zn[None, :] - b_z)).to(b_do.dtype) + # [BC, BC] + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_dv += tl.dot(b_A, b_do, allow_tf32=False) + b_dv *= exp(b_v - b_zn[None, :]) + + o_i = tl.arange(0, BC) + for j in range(0, BC): + p_z = tl.make_block_ptr(z + i_bh * T*V, (T * V,), (1,), ((i_t * BT + i_i * BC + j) * V + i_v * BV,), (BV,), (0,)) + p_A = 
tl.make_block_ptr(A + i_bh * T * BT, (T * BT,), (1,), ((i_t * BT + i_i * BC + j) * BT + i_i * BC,), (BC,), (0,)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T * V,), (1,), ((i_t * BT + i_i * BC + j) * V + i_v * BV,), (BV,), (0,)) + # [BC,] + b_A = tl.load(p_A, boundary_check=(0,)) + # [BV,] + b_z = tl.load(p_z, boundary_check=(0,)) + b_do = tl.load(p_do, boundary_check=(0,)) + # [BC, BV] + m_i = o_i[:, None] <= j + b_dv += tl.where(m_i, exp(b_v - b_z[None, :]) * b_A[:, None] * b_do[None, :], 0.) + p_dv = tl.make_block_ptr(dv + i_bh * T*V, (T, V), (V, 1), (i_t * BT + i_i * BC, i_v * BV), (BC, BV), (1, 0)) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_rcum_inter( + s, + z, + ss, + doo, + T, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + NT: tl.constexpr +): + i_m, i_bh = tl.program_id(0), tl.program_id(1) + + b_sp = tl.zeros([BS,], dtype=tl.float32) + b_zp = tl.full([BS,], float('inf'), dtype=tl.float32) + for i_t in range(NT - 1, -1, -1): + p_s = tl.make_block_ptr(s + i_bh * T*S, (T, S), (S, 1), (i_t * BT, i_m * BS), (BT, BS), (1, 0)) + p_z = tl.make_block_ptr(z + i_bh * T*S, (T, S), (S, 1), (i_t * BT, i_m * BS), (BT, BS), (1, 0)) + p_zc = tl.make_block_ptr(z + i_bh * T*S, (T*S,), (1,), ((i_t * BT) * S + i_m * BS,), (BS,), (0,)) + p_ss = tl.make_block_ptr(ss + i_bh * T*S, (T, S), (S, 1), (i_t * BT, i_m * BS), (BT, BS), (1, 0)) + p_doo = tl.make_block_ptr(doo + i_bh * T*S, (T, S), (S, 1), (i_t * BT, i_m * BS), (BT, BS), (1, 0)) + # [BS,] + b_zc = tl.load(p_zc, boundary_check=(0,)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)) + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_ss = tl.load(p_ss, boundary_check=(0, 1)) + + b_doo = exp(b_s - b_zp[None, :]) * b_sp[None, :] + tl.store(p_doo, b_doo.to(p_doo.dtype.element_ty), boundary_check=(0, 1)) + # [BS,] + b_sp = b_sp * exp(b_zc - b_zp) + tl.sum(b_ss * exp(b_zc[None, :] - b_z), 0) + b_zp = b_zc + + 
+@triton.jit(do_not_specialize=['T']) +def chunk_abc_bwd_kernel_rcum_intra( + s, + z, + ss, + doo, + T, + S: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BS: tl.constexpr, + NC: tl.constexpr +): + i_s, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_t, i_i = i_c // NC, i_c % NC + + o_i = tl.arange(0, BC) + m_o = tl.full([BC, BC], 1., dtype=tl.float32) + + p_s = tl.make_block_ptr(s + i_bh * T*S, (T, S), (S, 1), (i_t * BT + i_i * BC, i_s * BS), (BC, BS), (1, 0)) + p_zn = tl.make_block_ptr(z + i_bh * T*S, (T*S,), (1,), ((i_t * BT + i_i * BC + BC - 1) * S + i_s * BS,), (BS,), (0,)) + p_doo = tl.make_block_ptr(doo + i_bh * T*S, (T, S), (S, 1), (i_t * BT + i_i * BC, i_s * BS), (BC, BS), (1, 0)) + # [BC, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)) + # [BS,] z at the last row of the current sub-chunk + b_zn = tl.load(p_zn, boundary_check=(0,)) + + b_doo = tl.zeros([BC, BS], dtype=tl.float32) + for i_j in range(i_i + 1, NC): + p_z = tl.make_block_ptr(z + i_bh * T*S, (T, S), (S, 1), (i_t * BT + i_j * BC, i_s * BS), (BC, BS), (1, 0)) + p_ss = tl.make_block_ptr(ss + i_bh * T*S, (T, S), (S, 1), (i_t * BT + i_j * BC, i_s * BS), (BC, BS), (1, 0)) + # [BC, BS] + b_z = tl.load(p_z, boundary_check=(0, 1)) + b_ss = tl.load(p_ss, boundary_check=(0, 1)) + # [BC, BS] accumulate over the later sub-chunks (i_j > i_i) of the same chunk + b_doo += b_ss * exp(b_zn[None, :] - b_z) + b_doo = exp(b_s - b_zn[None, :]) * tl.dot(m_o.to(b_s.dtype), b_doo.to(b_s.dtype), allow_tf32=False) + + for j in range(0, BC): + p_z = tl.make_block_ptr(z + i_bh * T*S, (T*S,), (1,), ((i_t * BT + i_i * BC + j) * S + i_s * BS,), (BS,), (0,)) + p_ss = tl.make_block_ptr(ss + i_bh * T*S, (T*S,), (1,), ((i_t * BT + i_i * BC + j) * S + i_s * BS,), (BS,), (0,)) + # [BS,] row j of the current sub-chunk + b_z = tl.load(p_z, boundary_check=(0,)) + b_ss = tl.load(p_ss, boundary_check=(0,)) + # [BC, BS] row j only contributes to rows o_i <= j + m_i = o_i[:, None] <= j + b_doo += tl.where(m_i, exp(b_s - b_z[None, :]) * b_ss[None, :], 0.)
+ b_doo += tl.load(p_doo, boundary_check=(0, 1)) + tl.store(p_doo, b_doo.to(p_doo.dtype.element_ty), boundary_check=(0, 1)) + + +class ChunkABCFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward(ctx, q, k, v, s, initial_state, output_final_state): + B, H, T, K, V, M = *q.shape, v.shape[-1], s.shape[-1] + BT, BC = 64, 16 + BK = min(64, triton.next_power_of_2(K)) + BV = min(64, triton.next_power_of_2(V)) + BM = min(64, triton.next_power_of_2(M)) + NT, NC = triton.cdiv(T, BT), triton.cdiv(BT, BC) + NV, NM = triton.cdiv(V, BV), triton.cdiv(M, BM) + num_warps = 4 if BK == 64 else 2 + num_stages = 1 + + def fwd_pre(s, B, H, T, S): + # keep the cumulative normalizer z in fp32 + z = torch.empty_like(s, dtype=torch.float) + grid = (B * H,) + logcumsumexp_fwd_kernel[grid]( + s, z, + T=T, S=S + ) + return z + + def fwd_inner(q, k, v, z, B, H, T, K, V, BT, BK, BV, NT, normk=False, h0=None, ht=None): + NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV) + h = q.new_empty(B, H, NT * K, V) + grid = (NV, NK, B * H) + chunk_abc_fwd_kernel_h[grid]( + k, v, z, h, h0, ht, + T=T, K=K, V=V, BT=BT, BK=BK, BV=BV, NT=NT, + NORMK=normk, + USE_INITIAL_STATE=h0 is not None, + STORE_FINAL_STATE=ht is not None, + num_warps=num_warps, + num_stages=num_stages + ) + return h + + final_state = None + if output_final_state: + final_state = (q.new_empty(B, H, K, M, dtype=torch.float), + q.new_empty(B, H, M, V, dtype=torch.float)) + + z = fwd_pre(s, B, H, T, M) + scale = K ** -0.5 + hk = fwd_inner( + q=q, k=k, v=s, z=z, + B=B, H=H, T=T, K=K, V=M, BT=BT, BK=BK, BV=BM, NT=NT, + normk=False, + h0=initial_state[0] if initial_state is not None else None, + ht=final_state[0] if final_state is not None else None + ) + ok1 = torch.empty_like(s) + Ak = q.new_empty(B, H, T, BT) + grid = (NM, NT, B * H) + chunk_abc_fwd_kernel_K[grid]( + q, k, z, hk, ok1, Ak, + scale=scale, + T=T, K=K, V=M, BT=BT, BK=BK, BV=BM, NT=NT, + num_warps=num_warps, + num_stages=num_stages + ) + ok0 =
torch.empty_like(s) + grid = (NM, NT * NC, B * H) + chunk_abc_fwd_kernel_intra_K[grid]( + s, z, ok0, Ak, + T=T, V=M, BT=BT, BC=BC, BV=BM, NC=NC, + num_warps=2, + num_stages=num_stages + ) + ok = ok0.add_(ok1) + + scale = 1. + # p is kept in fp32 for safe softmax backward + p = softmax_fwd(ok, dtype=torch.float) + qv = p.to(q.dtype) + + scale = 1. + hv = fwd_inner( + q=qv, k=s, v=v, z=z, + B=B, H=H, T=T, K=M, V=V, BT=BT, BK=BM, BV=BV, NT=NT, + normk=True, + h0=initial_state[1] if initial_state is not None else None, + ht=final_state[1] if final_state is not None else None + ) + Av = q.new_zeros(NM, B, H, T, BT) + grid = (NM, NT * NC * NC, B * H) + chunk_abc_fwd_kernel_intra_V[grid]( + qv, s, z, Av, + scale=scale, + T=T, K=M, BT=BT, BC=BC, BK=BM, NC=NC, + num_warps=2, + num_stages=num_stages + ) + Av = Av.sum(0) + ov = torch.empty_like(v) + grid = (NV, NT, B * H) + chunk_abc_fwd_kernel_V[grid]( + qv, v, z, hv, ov, Av, + scale=scale, + T=T, + K=M, + V=V, + BT=BT, + BK=BM, + BV=BV, + NT=NT, + num_warps=num_warps, + num_stages=num_stages + ) + ctx.save_for_backward(q, k, v, s, z, ok, p, hk, hv, Av) + ctx.BT = BT + return ov, final_state + + @staticmethod + @input_guard + def backward(ctx, dov, dht=None): + q, k, v, s, z, ok, p, hk, hv, Av = ctx.saved_tensors + B, H, T, K, V, M = *q.shape, v.shape[-1], s.shape[-1] + BT, BC = ctx.BT, 16 + BK = min(64, triton.next_power_of_2(K)) + BV = min(64, triton.next_power_of_2(V)) + BM = min(64, triton.next_power_of_2(M)) + NT, NC = triton.cdiv(T, BT), triton.cdiv(BT, BC) + NK, NM = triton.cdiv(K, BK), triton.cdiv(M, BM) + num_warps = 4 if BK == 64 else 2 + num_stages = 1 + + def bwd_inner(q, z, do, B, H, T, K, V, BT, BK, BV, NT, scale, normk=False): + NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV) + dh = q.new_empty(B, H, NT * K, V) + grid = (NK, NV, B * H) + chunk_abc_bwd_kernel_dh[grid]( + q, z, do, dh, + scale=scale, + T=T, K=K, V=V, BT=BT, BK=BK, BV=BV, NT=NT, + NORMK=normk, + num_warps=num_warps, + num_stages=num_stages + ) +
return dh + + def bwd_post(s, z, ss, B, H, T, S, BT, BC, BS, NT, NC, NS): + doo = torch.empty_like(s) + grid = (NS, B * H) + chunk_abc_bwd_kernel_rcum_inter[grid]( + s, z, ss, doo, + T=T, S=S, BT=BT, BS=BS, NT=NT, + num_warps=num_warps, + num_stages=num_stages + ) + grid = (NS, NT * NC, B * H) + chunk_abc_bwd_kernel_rcum_intra[grid]( + s, z, ss, doo, + T=T, S=S, BT=BT, BC=BC, BS=BS, NC=NC, + num_warps=num_warps, + num_stages=num_stages + ) + return doo + + scale = 1. + qv = p.to(q.dtype) + dhv = bwd_inner( + qv, z, dov, + B=B, H=H, T=T, K=M, V=V, BT=BT, BK=BM, BV=BV, NT=NT, + scale=scale, + normk=True + ) + dp1 = torch.empty_like(p) + dsv1 = torch.empty_like(s, dtype=torch.float) + dv = v.new_empty(NM, *v.shape) + dAv = q.new_zeros(B, H, T, BT) + grid = (NM, NT, B * H) + chunk_abc_bwd_kernel_V[grid]( + s, v, z, hv, Av, dov, dhv, dp1, dsv1, dv, dAv, + scale=scale, + T=T, K=M, V=V, BT=BT, BK=BM, BV=BV, NT=NT, + num_warps=num_warps, + num_stages=num_stages + ) + dv = dv.sum(0) + dp0 = torch.empty_like(p) + dsv0 = s.new_zeros(s.shape, dtype=torch.float) + grid = (NM, NT * NC, B * H) + chunk_abc_bwd_kernel_intra_V[grid]( + qv, s, z, dAv, dp0, dsv0, + T=T, K=M, BT=BT, BC=BC, BK=BM, NC=NC, + num_warps=2, + num_stages=num_stages + ) + dp = dp1.add_(dp0) + dsv = dsv1.add_(dsv0) + + # softmax gradient, equivalent to: + # dok = p * (dp - (p * dp).sum(-1, True)) + dok = softmax_bwd(p, dp, dtype=ok.dtype) + + scale = K ** -0.5 + dhk = bwd_inner( + q, z, dok, + B=B, H=H, T=T, K=K, V=M, BT=BT, BK=BK, BV=BM, NT=NT, + scale=scale, + normk=False + ) + dAk = q.new_zeros(NM, B, H, T, BT) + grid = (NM, NT * NC * NC, B * H) + chunk_abc_bwd_kernel_intra_K[grid]( + s, z, dok, dAk, + scale=scale, + T=T, V=M, BT=BT, BC=BC, BV=BM, NC=NC, + num_warps=2, + num_stages=num_stages + ) + dAk = dAk.sum(0) + + Ak = q.new_zeros(NK, B, H, T, BT) + dq = torch.empty_like(q) + dk = torch.empty_like(k) + dsk1 = s.new_empty(NK, *s.shape, dtype=torch.float) + grid = (NK, NT, B * H) +
chunk_abc_bwd_kernel_K[grid]( + q, k, s, z, hk, Ak, dok, dhk, dq, dk, dsk1, dAk, + scale=scale, + T=T, K=K, V=M, BT=BT, BK=BK, BV=BM, NT=NT, + num_warps=num_warps, + num_stages=num_stages + ) + Ak = Ak.sum(0) + dsk1 = dsk1.sum(0) + dsk0 = torch.empty_like(s, dtype=torch.float) + grid = (NM, NT * NC, B * H) + chunk_abc_bwd_kernel_intra_KV[grid]( + s, z, Ak, dok, dsk0, + T=T, V=M, BT=BT, BC=BC, BV=BM, NC=NC, + num_warps=2, + num_stages=num_stages + ) + ds = dsv.add_(dsk1.add_(dsk0)) + ds -= bwd_post(s, z, ok * dok + p * dp, B, H, T, M, BT, BC, BM, NT, NC, NM) + ds = ds.to(s.dtype) + return dq, dk, dv, ds, None, None + + +@torch.compiler.disable +def chunk_abc( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + s: torch.Tensor, + initial_state: Optional[Tuple[torch.Tensor]] = None, + output_final_state: bool = False, + head_first: bool = True +) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Apply `ChunkABCFunction` to `(q, k, v, s)`, transposing to/from the head-first layout when `head_first=False`. + Args: + q (torch.Tensor): + queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]` + k (torch.Tensor): + keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]` + v (torch.Tensor): + values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]` + s (torch.Tensor): + slot representations of shape `[B, H, T, M]` if `head_first=True` else `[B, T, H, M]` + initial_state (Optional[Tuple[torch.Tensor, torch.Tensor]]): + Initial states of shape `[B, H, K, M]` and `[B, H, M, V]`. Default: `None`. + output_final_state (bool): + Whether to output the final state of shape `[B, H, K, M]` and `[B, H, M, V]`. Default: `False`. + head_first (bool): + Whether the inputs are in the head-first format. + Default: `True`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`. + final_state (Optional[Tuple[torch.Tensor, torch.Tensor]]): + Final states of shape `[B, H, K, M]` and `[B, H, M, V]` if `output_final_state=True` else `None`.
+ """ + if not head_first: + q, k, v, s = map(lambda x: x.transpose(1, 2), (q, k, v, s)) + o, final_state = ChunkABCFunction.apply(q, k, v, s, initial_state, output_final_state) + if not head_first: + o = o.transpose(1, 2) + return o, final_state diff --git a/fla/ops/abc/naive.py b/fla/ops/abc/naive.py new file mode 100644 index 0000000000000000000000000000000000000000..a7f25c40db73bcf33d1599761be0008cc5be7c59 --- /dev/null +++ b/fla/ops/abc/naive.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +from typing import Optional + +import torch +from einops import repeat + + +def naive_recurrent_abc( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + s: torch.Tensor, + g: Optional[torch.Tensor] = None, + scale: Optional[int] = None, + initial_state: Optional[torch.Tensor] = None, + output_final_state: Optional[bool] = False +) -> torch.Tensor: + dtype = q.dtype + + NG = q.shape[1]//k.shape[1] + # [batch_size, n_heads, seq_len, n_slots] + if g is None: + z = s.float().logcumsumexp(2) + g = torch.cat((z[:, :, :1], z[:, :, :-1]), 2) - z + s = torch.exp(s - z) + q, k, v, s, g = map(lambda x: x.float(), (q, k, v, s, g)) + k, v, s, g = map(lambda x: repeat(x, 'b h t d -> b (h g) t d', g=NG), (k, v, s, g)) + if initial_state is not None: + initial_state = tuple(map(lambda x: repeat(x, 'b h k v -> b (h g) k v', g=NG), initial_state)) + + B, H, T, K, V, M = *q.shape, v.shape[-1], s.shape[-1] + + hk = torch.zeros(B, H, K, M, dtype=torch.float, device=q.device) + ok = torch.zeros_like(s) + + if scale is None: + scale = q.shape[-1] ** -0.5 + + final_state = None + if initial_state is not None: + hk += initial_state[0] + + for i in range(T): + q_i = q[:, :, i] * scale + k_i = k[:, :, i] + v_i = s[:, :, i] + g_i = g[:, :, i].exp() + hk = hk * g_i[..., None, :] + k_i[..., None] * v_i[..., None, :] + ok[:, :, i] = (q_i[..., None] * hk).sum(-2) + + qv = ok.softmax(-1) + hv = torch.zeros(B, H, M, V, dtype=torch.float, device=q.device) + ov = torch.zeros_like(v) + if initial_state
is not None: + hv += initial_state[1] + + for i in range(T): + q_i = qv[:, :, i] + k_i = s[:, :, i] + v_i = v[:, :, i] + g_i = g[:, :, i].exp() + hv = hv * g_i[..., :, None] + k_i[..., None] * v_i[..., None, :] + ov[:, :, i] = (q_i[..., None] * hv).sum(-2) + + if output_final_state: + final_state = (hk.view(B, -1, NG, K, M)[:, :, 0], hv.view(B, -1, NG, M, V)[:, :, 0]) + return ov.to(dtype), final_state + + +def naive_cumsum_abc( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + s: torch.Tensor +) -> torch.Tensor: + """ + A simple implementation of vanilla ABC that is more aligned with the descriptions in the paper. + This is just for demonstration purposes, with no guarantee of numerical stability. Returns `(o, None)`. + """ + + dtype = q.dtype + q, k, v, s = map(lambda x: x.float(), (q, k, v, s)) + + scale = q.shape[-1] ** -0.5 + # [batch_size, n_heads, seq_len, n_slots] + s = (s - s.max(2, True)[0]).exp() + z = s.cumsum(2) + # [batch_size, n_heads, seq_len, n_slots, d_head] + K = (s.unsqueeze(-1) * k.unsqueeze(-2)).cumsum(2) / z.unsqueeze(-1) + V = (s.unsqueeze(-1) * v.unsqueeze(-2)).cumsum(2) / z.unsqueeze(-1) + # [batch_size, n_heads, seq_len, n_slots] + p = torch.einsum('...d,...md->...m', q * scale, K).softmax(-1) + # [batch_size, n_heads, seq_len, d_head] + o = torch.einsum('...m,...md->...d', p, V) + return o.to(dtype), None diff --git a/fla/ops/attn/__pycache__/parallel.cpython-311.pyc b/fla/ops/attn/__pycache__/parallel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0507a93ec6b469e4b020dc8fe7bbf519db16e7c1 Binary files /dev/null and b/fla/ops/attn/__pycache__/parallel.cpython-311.pyc differ diff --git a/fla/ops/attn/parallel.py b/fla/ops/attn/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..d19a2e1b13398bf81cb503564c8652ff5735eee3 --- /dev/null +++ b/fla/ops/attn/parallel.py @@ -0,0 +1,629 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional
+ +import torch +import triton +import triton.language as tl +from einops import rearrange, reduce + +from fla.ops.common.utils import prepare_chunk_indices +from fla.ops.utils.op import exp, log +from fla.utils import autocast_custom_bwd, autocast_custom_fwd, check_shared_mem, contiguous + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4] + ([8] if check_shared_mem('hopper') else []) + for num_stages in [2, 3, 4, 5] + ], + key=['B', 'H', 'G', 'K', 'V', 'BK', 'BV'], +) +@triton.jit +def parallel_attn_fwd_kernel( + q, + k, + v, + o, + lse, + scale, + offsets, + indices, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_hq = i_bh // HQ, i_bh % HQ + i_h = i_hq // G + + if USE_OFFSETS: + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + i_n = i_b + bos, eos = i_n * T, i_n * T + T + + p_q = tl.make_block_ptr(q + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, 0), (BT, BK), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_lse = tl.make_block_ptr(lse + bos * HQ + i_hq, (T,), (HQ,), (i_t * BT,), (BT,), (0,)) + + # the Q block is kept in the shared memory throughout the whole kernel + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BV] + b_o = tl.zeros([BT, BV], dtype=tl.float32) + + b_m = tl.full([BT], float('-inf'), dtype=tl.float32) + b_acc = tl.zeros([BT], 
dtype=tl.float32) + for i_s in range(0, i_t * BT, BS): + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0)) + # [BK, BS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BS, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, BS] + b_s = tl.dot(b_q, b_k) + + # [BT, BS] + b_m, b_mp = tl.maximum(b_m, tl.max(b_s, 1)), b_m + b_r = exp(b_mp - b_m) + # [BT, BS] + b_p = exp(b_s - b_m[:, None]) + # [BT] + b_acc = b_acc * b_r + tl.sum(b_p, 1) + # [BT, BV] + b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v) + + b_mp = b_m + + # [BT] + o_q = i_t * BT + tl.arange(0, BT) + for i_s in range(i_t * BT, min((i_t + 1) * BT, T), BS): + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0)) + + # [BS] + o_k = i_s + tl.arange(0, BS) + # [BK, BS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BS, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, BS] + b_s = tl.dot(b_q, b_k) + b_s = tl.where(o_q[:, None] >= o_k[None, :], b_s, float('-inf')) + + # [BT] + b_m, b_mp = tl.maximum(b_m, tl.max(b_s, 1)), b_m + b_r = exp(b_mp - b_m) + # [BT, BS] + b_p = exp(b_s - b_m[:, None]) + # [BT] + b_acc = b_acc * b_r + tl.sum(b_p, 1) + # [BT, BV] + b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v) + + b_mp = b_m + b_o = b_o / b_acc[:, None] + b_m += log(b_acc) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_lse, b_m.to(p_lse.dtype.element_ty), boundary_check=(0,)) + + +@triton.jit +def parallel_attn_bwd_kernel_preprocess( + o, + do, + delta, + B: tl.constexpr, + V: tl.constexpr +): + i_n = tl.program_id(0) + o_d = tl.arange(0, B) + m_d = o_d < V + + b_o = tl.load(o + i_n * V + o_d, mask=m_d, other=0) + b_do = tl.load(do + i_n * V + o_d, mask=m_d, 
other=0).to(tl.float32) + b_delta = tl.sum(b_o * b_do) + + tl.store(delta + i_n, b_delta.to(delta.dtype.element_ty)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4] + ([8] if check_shared_mem('hopper') else []) + for num_stages in [2, 3, 4, 5] + ], + key=['B', 'H', 'G', 'K', 'V', 'BK', 'BV'], +) +@triton.jit(do_not_specialize=['T']) +def parallel_attn_bwd_kernel_dq( + q, + k, + v, + lse, + delta, + do, + dq, + scale, + offsets, + indices, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_hq = i_bh // HQ, i_bh % HQ + i_h = i_hq // G + + if USE_OFFSETS: + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + i_n = i_b + bos, eos = i_n * T, i_n * T + T + + p_q = tl.make_block_ptr(q + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, 0), (BT, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, 0), (BT, BK), (1, 0)) + p_do = tl.make_block_ptr(do + (bos * HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_lse = tl.make_block_ptr(lse + bos * HQ + i_hq, (T,), (HQ,), (i_t * BT,), (BT,), (0,)) + p_delta = tl.make_block_ptr(delta + bos * HQ + i_hq, (T,), (HQ,), (i_t * BT,), (BT,), (0,)) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BT] + b_lse = tl.load(p_lse, boundary_check=(0,)) + b_delta = tl.load(p_delta, 
boundary_check=(0,)) + + # [BT, BK] + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + for i_s in range(0, i_t * BT, BS): + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (V, T), (1, H*V), (i_v * BV, i_s), (BV, BS), (0, 1)) + # [BK, BS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BV, BS] + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # [BT, BS] + b_s = tl.dot(b_q, b_k) + b_p = exp(b_s - b_lse[:, None]) + + # [BT, BV] @ [BV, BS] -> [BT, BS] + b_dp = tl.dot(b_do, b_v) + b_ds = b_p * (b_dp.to(tl.float32) - b_delta[:, None]) + # [BT, BS] @ [BS, BK] -> [BT, BK] + b_dq += tl.dot(b_ds.to(b_k.dtype), tl.trans(b_k)) + + # [BT] + o_q = i_t * BT + tl.arange(0, BT) + for i_s in range(i_t * BT, min((i_t + 1) * BT, T), BS): + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (V, T), (1, H*V), (i_v * BV, i_s), (BV, BS), (0, 1)) + # [BS] + o_k = i_s + tl.arange(0, BS) + # [BK, BS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BV, BS] + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # [BT, BS] + b_s = tl.dot(b_q, b_k) + b_p = exp(b_s - b_lse[:, None]) + b_p = tl.where(o_q[:, None] >= o_k[None, :], b_p, 0) + + # [BT, BV] @ [BV, BS] -> [BT, BS] + b_dp = tl.dot(b_do, b_v) + b_ds = b_p * (b_dp.to(tl.float32) - b_delta[:, None]) + # [BT, BS] @ [BS, BK] -> [BT, BK] + b_dq += tl.dot(b_ds.to(b_k.dtype), tl.trans(b_k)) + + b_dq *= scale + + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [1, 2, 4] + ([8] if check_shared_mem('hopper') else []) + for num_stages in [2, 3, 4, 5] + ], + key=['B', 'H', 'G', 'K', 'V', 'BK', 'BV'], +) +@triton.jit(do_not_specialize=['T']) +def 
parallel_attn_bwd_kernel_dkv( + q, + k, + v, + lse, + delta, + do, + dk, + dv, + offsets, + indices, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_hq = i_bh // HQ, i_bh % HQ + i_h = i_hq // G + + if USE_OFFSETS: + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + i_n = i_b + bos, eos = i_n * T, i_n * T + T + + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, 0), (BT, BK), (1, 0)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dk = tl.make_block_ptr(dk + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_t * BT, 0), (BT, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + # [BT, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_dv = tl.zeros([BT, BV], dtype=tl.float32) + + o_k = i_t * BT + tl.arange(0, BT) + for i_s in range(i_t * BT, min((i_t + 1) * BT, T), BS): + p_q = tl.make_block_ptr(q + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_s, 0), (BS, BK), (1, 0)) + p_do = tl.make_block_ptr(do + (bos * HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0)) + p_lse = tl.make_block_ptr(lse + bos * HQ + i_hq, (T,), (HQ,), (i_s,), (BS,), (0,)) + p_delta = tl.make_block_ptr(delta + bos * HQ + i_hq, (T,), (HQ,), (i_s,), (BS,), (0,)) + + # [BS] + o_q = i_s + tl.arange(0, BS) + # [BS, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * 
scale).to(b_q.dtype) + # [BS, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BS] + b_lse = tl.load(p_lse, boundary_check=(0,)) + b_delta = tl.load(p_delta, boundary_check=(0,)) + # [BT, BS] + b_s = tl.dot(b_k, tl.trans(b_q)) + b_p = exp(b_s - b_lse[None, :]) + b_p = tl.where(o_k[:, None] <= o_q[None, :], b_p, 0) + # [BT, BS] @ [BS, BV] -> [BT, BV] + b_dv += tl.dot(b_p.to(b_do.dtype), b_do) + # [BT, BV] @ [BV, BS] -> [BT, BS] + b_dp = tl.dot(b_v, tl.trans(b_do)) + # [BT, BS] + b_ds = b_p * (b_dp - b_delta[None, :]) + # [BT, BS] @ [BS, BK] -> [BT, BK] + b_dk += tl.dot(b_ds.to(b_q.dtype), b_q) + + for i_s in range((i_t + 1) * BT, tl.cdiv(T, BS) * BS, BS): + p_q = tl.make_block_ptr(q + (bos * HQ + i_hq) * K, (T, K), (HQ*K, 1), (i_s, 0), (BS, BK), (1, 0)) + p_do = tl.make_block_ptr(do + (bos * HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0)) + p_lse = tl.make_block_ptr(lse + bos * HQ + i_hq, (T,), (HQ,), (i_s,), (BS,), (0,)) + p_delta = tl.make_block_ptr(delta + bos * HQ + i_hq, (T,), (HQ,), (i_s,), (BS,), (0,)) + + # [BS] + o_q = i_s + tl.arange(0, BS) + # [BS, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BS, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BS] + b_lse = tl.load(p_lse, boundary_check=(0,)) + b_delta = tl.load(p_delta, boundary_check=(0,)) + # [BT, BS] + b_s = tl.dot(b_k, tl.trans(b_q)) + b_p = exp(b_s - b_lse[None, :]) + # [BT, BS] @ [BS, BV] -> [BT, BV] + b_dv += tl.dot(b_p.to(b_do.dtype), b_do) + # [BT, BV] @ [BV, BS] -> [BT, BS] + b_dp = tl.dot(b_v, tl.trans(b_do)) + # [BT, BS] + b_ds = b_p * (b_dp - b_delta[None, :]) + # [BT, BS] @ [BS, BK] -> [BT, BK] + b_dk += tl.dot(b_ds.to(b_q.dtype), b_q) + + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +def parallel_attn_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: float, + chunk_size: int = 128, + offsets: 
Optional[torch.LongTensor] = None, + indices: Optional[torch.LongTensor] = None, +): + B, T, H, K, V = *k.shape, v.shape[-1] + HQ = q.shape[2] + G = HQ // H + BT = chunk_size + if check_shared_mem('hopper', q.device.index): + BS = min(64, max(16, triton.next_power_of_2(T))) + BK = min(256, max(16, triton.next_power_of_2(K))) + BV = min(256, max(16, triton.next_power_of_2(V))) + elif check_shared_mem('ampere', q.device.index): + BS = min(32, max(16, triton.next_power_of_2(T))) + BK = min(256, max(16, triton.next_power_of_2(K))) + BV = min(128, max(16, triton.next_power_of_2(V))) + else: + BS = min(32, max(16, triton.next_power_of_2(T))) + BK = min(256, max(16, triton.next_power_of_2(K))) + BV = min(64, max(16, triton.next_power_of_2(V))) + NK = triton.cdiv(K, BK) + NV = triton.cdiv(V, BV) + NT = triton.cdiv(T, BT) if offsets is None else len(indices) + assert NK == 1, "The key dimension can not be larger than 256" + + o = torch.empty(B, T, HQ, V, dtype=v.dtype, device=q.device) + lse = torch.empty(B, T, HQ, dtype=torch.float, device=q.device) + + grid = (NV, NT, B * HQ) + parallel_attn_fwd_kernel[grid]( + q=q, + k=k, + v=v, + o=o, + lse=lse, + scale=scale, + offsets=offsets, + indices=indices, + B=B, + T=T, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + BT=BT, + BS=BS, + BK=BK, + BV=BV, + ) + return o, lse + + +def parallel_attn_bwd_preprocess( + o: torch.Tensor, + do: torch.Tensor +): + V = o.shape[-1] + delta = torch.empty_like(o[..., 0], dtype=torch.float32) + parallel_attn_bwd_kernel_preprocess[(delta.numel(),)]( + o=o, + do=do, + delta=delta, + B=triton.next_power_of_2(V), + V=V, + ) + return delta + + +def parallel_attn_bwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + o: torch.Tensor, + lse: torch.Tensor, + do: torch.Tensor, + scale: float = None, + chunk_size: int = 128, + offsets: Optional[torch.LongTensor] = None, + indices: Optional[torch.LongTensor] = None, +): + B, T, H, K, V = *k.shape, v.shape[-1] + HQ = q.shape[2] + G = HQ // H + BT = 
chunk_size + BS = max(16, triton.next_power_of_2(T)) + BS = min(32, BS) if check_shared_mem('ampere') else min(16, BS) + BK = max(16, triton.next_power_of_2(K)) + BV = max(16, triton.next_power_of_2(V)) + NV = triton.cdiv(V, BV) + NT = triton.cdiv(T, BT) if offsets is None else len(indices) + + delta = parallel_attn_bwd_preprocess(o, do) + + dq = torch.empty(B, T, HQ, K, dtype=k.dtype if H == HQ else torch.float, device=q.device) + dk = torch.empty(B, T, HQ, K, dtype=k.dtype if H == HQ else torch.float, device=q.device) + dv = torch.empty(B, T, HQ, V, dtype=v.dtype if H == HQ else torch.float, device=q.device) + grid = (NV, NT, B * HQ) + parallel_attn_bwd_kernel_dq[grid]( + q=q, + k=k, + v=v, + lse=lse, + delta=delta, + do=do, + dq=dq, + offsets=offsets, + indices=indices, + scale=scale, + T=T, + B=B, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + BT=BT, + BS=BS, + BK=BK, + BV=BV + ) + parallel_attn_bwd_kernel_dkv[grid]( + q=q, + k=k, + v=v, + lse=lse, + delta=delta, + do=do, + dk=dk, + dv=dv, + offsets=offsets, + indices=indices, + scale=scale, + T=T, + B=B, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + BT=BT, + BS=BS, + BK=BK, + BV=BV + ) + dk = reduce(dk, 'b t (h g) k -> b t h k', g=G, reduction='sum') + dv = reduce(dv, 'b t (h g) v -> b t h v', g=G, reduction='sum') + return dq, dk, dv + + +@torch.compile +class ParallelAttentionFunction(torch.autograd.Function): + + @staticmethod + @contiguous + @autocast_custom_fwd + def forward(ctx, q, k, v, scale, offsets): + ctx.dtype = q.dtype + + chunk_size = min(128, max(16, triton.next_power_of_2(q.shape[1]))) + # 2-d indices denoting the offsets of chunks in each sequence + # for example, if the passed `offsets` is [0, 100, 356] and `chunk_size` is 64, + # then there are 2 and 4 chunks in the 1st and 2nd sequences respectively, and `indices` will be + # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] + indices = prepare_chunk_indices(offsets, chunk_size) if offsets is not None else None + + o, lse = parallel_attn_fwd( + q=q, + k=k, 
+ v=v, + scale=scale, + chunk_size=chunk_size, + offsets=offsets, + indices=indices + ) + ctx.save_for_backward(q, k, v, o, lse) + ctx.chunk_size = chunk_size + ctx.offsets = offsets + ctx.indices = indices + ctx.scale = scale + return o.to(q.dtype) + + @staticmethod + @contiguous + @autocast_custom_bwd + def backward(ctx, do): + q, k, v, o, lse = ctx.saved_tensors + dq, dk, dv = parallel_attn_bwd( + q=q, + k=k, + v=v, + o=o, + lse=lse, + do=do, + scale=ctx.scale, + chunk_size=ctx.chunk_size, + offsets=ctx.offsets, + indices=ctx.indices + ) + return dq.to(q), dk.to(k), dv.to(v), None, None, None, None, None, None, None, None + + +def parallel_attn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: Optional[float] = None, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = False +) -> torch.Tensor: + r""" + Args: + q (torch.Tensor): + queries of shape `[B, T, HQ, K]` if `head_first=False` else `[B, HQ, T, K]`. + k (torch.Tensor): + keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`. + GQA will be applied if HQ is divisible by H. + v (torch.Tensor): + values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`. + scale (Optional[int]): + Scale factor for attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + head_first (Optional[bool]): + Whether the inputs are in the head-first format. Default: `False`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`. 
+ """ + if scale is None: + scale = k.shape[-1] ** -0.5 + if cu_seqlens is not None: + assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided" + if head_first: + q, k, v = map(lambda x: rearrange(x, 'b h t d -> b t h d'), (q, k, v)) + o = ParallelAttentionFunction.apply(q, k, v, scale, cu_seqlens) + if head_first: + o = rearrange(o, 'b t h d -> b h t d') + return o diff --git a/fla/ops/based/__pycache__/parallel.cpython-311.pyc b/fla/ops/based/__pycache__/parallel.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3b02db9bd5227b7a05bdb042441fdeed54c910d Binary files /dev/null and b/fla/ops/based/__pycache__/parallel.cpython-311.pyc differ diff --git a/fla/ops/based/parallel.py b/fla/ops/based/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..d4621ea5838bc410a33b1b0f0af40b3c322f02b5 --- /dev/null +++ b/fla/ops/based/parallel.py @@ -0,0 +1,410 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional + +import torch +import triton +import triton.language as tl + +from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard + +# Based: An Educational and Effective Sequence Mixer +# https://hazyresearch.stanford.edu/blog/2023-12-11-zoology2-based + + +@triton.jit(do_not_specialize=['T']) +def parallel_based_fwd_kernel( + q, + k, + v, + o, + z, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BTL: tl.constexpr, + BTS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, +): + # i_c: chunk index. 
used for sequence parallelism + i_kv, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + NV = tl.cdiv(V, BV) + i_k = i_kv // (NV) + i_v = i_kv % (NV) + + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (i_c * BTL, i_k * BK), (BTL, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, 0), (BK, BTS), (0, 1)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (0, i_v * BV), (BTS, BV), (1, 0)) + + # [BQ, BD] block Q, in the shared memory throughout the whole kernel + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + b_o = tl.zeros([BTL, BV], dtype=tl.float32) + b_z = tl.zeros([BTL], dtype=tl.float32) + + # Q block and K block have no overlap + # no need for mask, thereby saving flops + for _ in range(0, i_c * BTL, BTS): + # [BK, BTS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + + # [BTS, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BTL, BTS] + b_s = tl.dot(b_q, (b_k), allow_tf32=False) + b_s = 1 + b_s + 0.5 * b_s * b_s + b_z += tl.sum(b_s, axis=1) + + # [BQ, BD] + b_o = b_o + tl.dot(b_s.to(b_v.dtype), b_v, allow_tf32=False) + p_k = tl.advance(p_k, (0, BTS)) + p_v = tl.advance(p_v, (BTS, 0)) + + # # rescale interchunk output + tl.debug_barrier() + o_q = tl.arange(0, BTL) + # # sync threads, easy for compiler to optimize + # tl.debug_barrier() + + o_k = tl.arange(0, BTS) + p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, i_c * BTL), (BK, BTS), (0, 1)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_c * BTL, i_v * BV), (BTS, BV), (1, 0)) + # Q block and K block have overlap. 
masks required + for _ in range(i_c * BTL, (i_c + 1) * BTL, BTS): + # [BK, BTS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BTS, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BTL, BTS] + m_s = o_q[:, None] >= o_k[None, :] + b_s = tl.dot(b_q, b_k, allow_tf32=False) + b_s = 1 + b_s + 0.5 * b_s * b_s + b_s = tl.where(m_s, b_s, 0) + b_z += tl.sum(b_s, axis=1) + # [BTL, BV] + b_o += tl.dot(b_s.to(b_q.dtype), b_v, allow_tf32=False) + + p_k = tl.advance(p_k, (0, BTS)) + p_v = tl.advance(p_v, (BTS, 0)) + o_k += BTS + + p_o = tl.make_block_ptr(o + (i_bh + B * H * i_k) * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTL, BV), (1, 0)) + p_z = z + (i_bh + B * H * i_k) * T + i_c * BTL + tl.arange(0, BTL) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_z, b_z.to(p_z.dtype.element_ty), mask=((i_c * BTL + tl.arange(0, BTL)) < T)) + + +@triton.jit +def _parallel_based_bwd_dq( + i_bh, + i_c, + i_k, + i_v, + q, + k, + v, + do, + dz, + dq, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + BTL: tl.constexpr, + BTS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, +): + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i_c * BTL, i_v * BV), (BTL, BV), (1, 0)) + p_q = tl.make_block_ptr(q + (i_bh) * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + b_do = tl.load(p_do, boundary_check=(0, 1)).to(b_q.dtype) + b_dq = tl.zeros([BTL, BK], dtype=tl.float32) + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (0, i_k * BK), (BTS, BK), (1, 0)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v * BV, 0), (BV, BTS), (0, 1)) + p_dz = dz + i_bh * T + i_c * BTL + tl.arange(0, BTL) + b_dz = tl.load(p_dz, mask=(i_c * BTL + tl.arange(0, BTL)) < T) + + for _ in range(0, i_c * BTL, BTS): + # [BTS, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BV, BTS] + b_v = tl.load(p_v, boundary_check=(0, 1)) 
+ # [BTL, BTS] + b_ds = tl.dot(b_do, b_v, allow_tf32=False) + if i_v == 0: + b_ds += b_dz[:, None] + else: + b_ds = b_ds + b_s = tl.dot(b_q, tl.trans(b_k), allow_tf32=False) + # [BQ, BD] + b_dq += tl.dot((b_ds * (1 + b_s)).to(b_v.dtype), b_k, allow_tf32=False) + p_k = tl.advance(p_k, (BTS, 0)) + p_v = tl.advance(p_v, (0, BTS)) + + b_dq *= scale + o_q = tl.arange(0, BTL) + o_k = tl.arange(0, BTS) + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_c * BTL, i_k * BK), (BTS, BK), (1, 0)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v * BV, i_c * BTL), (BV, BTS), (0, 1)) + # Q block and K block have overlap. masks required + for _ in range(i_c * BTL, (i_c + 1) * BTL, BTS): + # [BTS, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BV, BTS] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BTL, BTS] + m_s = o_q[:, None] >= o_k[None, :] + b_ds = tl.dot(b_do, b_v, allow_tf32=False) + if i_v == 0: + b_ds += b_dz[:, None] + else: + b_ds = b_ds + b_ds = tl.where(m_s, b_ds, 0) * scale + b_s = tl.dot(b_q, tl.trans(b_k), allow_tf32=False) + b_s = tl.where(m_s, b_s, 0) + # [BTL, BK] + b_dq += tl.dot((b_ds + b_ds * b_s).to(b_k.dtype), b_k, allow_tf32=False) + p_k = tl.advance(p_k, (BTS, 0)) + p_v = tl.advance(p_v, (0, BTS)) + o_k += BTS + p_dq = tl.make_block_ptr(dq + (i_bh + B * H * i_v) * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0)) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + return + + +@triton.jit +def _parallel_based_bwd_dkv( + i_bh, + i_c, + i_k, + i_v, + q, + k, + v, + do, + dz, + dk, + dv, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + BTL: tl.constexpr, + BTS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, +): + # compute dk dv + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_c * BTL, i_k * BK), (BTL, BK), (1, 0)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (i_c * BTL, i_v * BV), (BTL, BV), (1, 0)) + b_k, b_v = tl.load(p_k, 
boundary_check=(0, 1)), tl.load(p_v, boundary_check=(0, 1)) + b_dk, b_dv = tl.zeros([BTL, BK], dtype=tl.float32), tl.zeros([BTL, BV], dtype=tl.float32) + + for i in range((tl.cdiv(T, BTS) * BTS)-BTS, (i_c + 1) * BTL - BTS, -BTS): + p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k * BK, i), (BK, BTS), (0, 1)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (V, T), (1, V), (i_v * BV, i), (BV, BTS), (0, 1)) + p_dz = dz + i_bh * T + i + tl.arange(0, BTS) + b_q = tl.load(p_q, boundary_check=(0, 1)) # [BK, BTS] + b_do = tl.load(p_do, boundary_check=(0, 1)).to(b_q.dtype) # [BV, BTS] + b_dz = tl.load(p_dz, mask=(i + tl.arange(0, BTS)) < T) + b_s = tl.dot(b_k.to(b_q.dtype), b_q, allow_tf32=False) * scale # [BTL, BTS] + b_s2 = 1 + b_s + 0.5 * b_s * b_s + b_dv += tl.dot(b_s2.to(b_q.dtype), tl.trans(b_do), allow_tf32=False) + b_ds = tl.dot(b_v, b_do, allow_tf32=False) * scale + if i_v == 0: + b_ds += b_dz[None, :] * scale + else: + b_ds = b_ds + b_dk += tl.dot((b_ds + b_ds * b_s).to(b_q.dtype), tl.trans(b_q), allow_tf32=False) + + tl.debug_barrier() + o_q, o_k = tl.arange(0, BTS), tl.arange(0, BTL) + for i in range(i_c*BTL, (i_c+1)*BTL, BTS): + p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k * BK, i), (BK, BTS), (0, 1)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (V, T), (1, V), (i_v * BV, i), (BV, BTS), (0, 1)) + p_dz = dz + i_bh * T + i + tl.arange(0, BTS) + b_q = tl.load(p_q, boundary_check=(0, 1)) # [BD, BQ] + b_do = tl.load(p_do, boundary_check=(0, 1)).to(b_q.dtype) + b_dz = tl.load(p_dz, mask=(i + tl.arange(0, BTS)) < T) + # [BK, BQ] + m_s = o_k[:, None] <= o_q[None, :] + b_s = tl.dot(b_k, b_q, allow_tf32=False) * scale + b_s2 = 1 + b_s + 0.5 * b_s * b_s + b_s = tl.where(m_s, b_s, 0) + b_s2 = tl.where(m_s, b_s2, 0) + + b_ds = tl.dot(b_v, b_do, allow_tf32=False) + if i_v == 0: + b_ds += b_dz[None, :] + else: + b_ds = b_ds + b_ds = tl.where(m_s, b_ds, 0) * scale + # [BK, BD] + b_dv += tl.dot(b_s2.to(b_q.dtype), tl.trans(b_do), allow_tf32=False) + b_dk 
+= tl.dot((b_ds + b_ds * b_s).to(b_q.dtype), tl.trans(b_q), allow_tf32=False) + o_q += BTS + + p_dk = tl.make_block_ptr(dk + (i_bh + B * H * i_v) * T*K, (T, K), (K, 1), (i_c*BTL, i_k*BK), (BTL, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (i_bh + B * H * i_k) * T*V, (T, V), (V, 1), (i_c*BTL, i_v*BV), (BTL, BV), (1, 0)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + return + + +@triton.jit(do_not_specialize=['T']) +def parallel_based_bwd_kernel( + q, + k, + v, + do, + dz, + dq, + dk, + dv, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BTL: tl.constexpr, + BTS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, +): + i_kv, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + NV = tl.cdiv(V, BV) + i_k = i_kv // (NV) + i_v = i_kv % NV + _parallel_based_bwd_dq( + i_bh, i_c, i_k, i_v, + q, k, v, do, dz, dq, + scale, T, B, H, BTL, BTS, BK, BV, K, V + ) + tl.debug_barrier() + _parallel_based_bwd_dkv( + i_bh, i_c, i_k, i_v, + q, k, v, do, dz, dk, dv, + scale, T, B, H, BTL, BTS, BK, BV, K, V + ) + + +class ParallelBasedFunction(torch.autograd.Function): + + @staticmethod + @input_guard + @autocast_custom_fwd + def forward(ctx, q, k, v, scale): + BTL, BTS = 128, 32 + assert BTL % BTS == 0 + # assert q.shape[-1] % 16 == 0 + BK = min(128, triton.next_power_of_2(k.shape[-1])) + BV = min(128, triton.next_power_of_2(v.shape[-1])) + BK, BV = max(BK, 16), max(BV, 16) + B, H, T, K, V = *k.shape, v.shape[-1] + num_stages = 2 + num_warps = 4 + NK = triton.cdiv(K, BK) + NV = triton.cdiv(V, BV) + grid = (NK * NV, triton.cdiv(T, BTL), B * H) + + assert NK == 1, "will encounter some synchronization issue if not." 
+ + o = torch.empty(NK, B, H, T, V, device=q.device) + z = torch.empty(NK, B, H, T, device=q.device) + parallel_based_fwd_kernel[grid]( + q, k, v, o, z, + scale, + B=B, + H=H, + T=T, + K=K, + V=V, + BTL=BTL, + BTS=BTS, + BK=BK, + BV=BV, + num_warps=num_warps, + num_stages=num_stages + ) + ctx.save_for_backward(q, k, v) + ctx.scale = scale + return o.sum(0).to(q.dtype), z.sum(0).to(q.dtype) + + @staticmethod + @input_guard + @autocast_custom_bwd + def backward(ctx, do, dz): + q, k, v = ctx.saved_tensors + scale = ctx.scale + BTL, BTS = 64, 32 + assert BTL % BTS == 0 + BK = min(128, triton.next_power_of_2(k.shape[-1])) + BV = min(128, triton.next_power_of_2(v.shape[-1])) + BK, BV = max(BK, 16), max(BV, 16) + B, H, T, K, V = *k.shape, v.shape[-1] + num_stages = 2 + num_warps = 4 + NK = triton.cdiv(K, BK) + NV = triton.cdiv(V, BV) + grid = (NK * NV, triton.cdiv(T, BTL), B * H) + + assert NK == 1, "will encounter some synchronization issue if not" + + dq = torch.empty(NV, B, H, T, K, dtype=q.dtype, device=q.device) + dk = torch.empty(NV, B, H, T, K, dtype=q.dtype, device=q.device) + dv = torch.empty(NK, B, H, T, V, dtype=q.dtype, device=q.device) + + parallel_based_bwd_kernel[grid]( + q, k, v, do, dz, dq, dk, dv, + scale, + B=B, + H=H, + T=T, + K=K, + V=V, + BTL=BTL, + BTS=BTS, + BK=BK, + BV=BV, + num_warps=num_warps, + num_stages=num_stages + ) + + return dq.sum(0).to(q.dtype), dk.sum(0).to(k.dtype), dv.sum(0).to(v.dtype), None + + +triton_parallel_based = ParallelBasedFunction.apply + + +def parallel_based( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: Optional[float] = None, + use_norm: bool = True, + head_first: bool = True +): + assert q.shape[-1] <= 128, "only support feature dim up to 128" + if scale is None: + scale = q.shape[-1] ** -0.5 + if not head_first: + q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v)) + o, z = triton_parallel_based(q, k, v, scale) + if use_norm: + o = o / (z[..., None] + 1e-6) + if not head_first: + o = 
o.transpose(1, 2) + return o.to(q.dtype) diff --git a/fla/ops/common/__pycache__/__init__.cpython-311.pyc b/fla/ops/common/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db6acc806c9281a59d44eeaa0058ffcc526032d5 Binary files /dev/null and b/fla/ops/common/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/ops/common/__pycache__/chunk_scaled_dot_kkt.cpython-311.pyc b/fla/ops/common/__pycache__/chunk_scaled_dot_kkt.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df206c9053bd60fa80252e90ff32121792866d9d Binary files /dev/null and b/fla/ops/common/__pycache__/chunk_scaled_dot_kkt.cpython-311.pyc differ diff --git a/fla/ops/common/__pycache__/fused_recurrent.cpython-311.pyc b/fla/ops/common/__pycache__/fused_recurrent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a7d9a1bb29d93a4e8ad2effdb3bcf03a2d45bf1 Binary files /dev/null and b/fla/ops/common/__pycache__/fused_recurrent.cpython-311.pyc differ diff --git a/fla/ops/common/chunk_h.py b/fla/ops/common/chunk_h.py new file mode 100644 index 0000000000000000000000000000000000000000..0aa5a7a93b9741968fa03ab630eb8aba062ccc5f --- /dev/null +++ b/fla/ops/common/chunk_h.py @@ -0,0 +1,422 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional, Tuple + +import torch +import triton +import triton.language as tl + +from fla.ops.common.utils import prepare_chunk_offsets +from fla.ops.utils.op import exp +from fla.utils import check_shared_mem + +BKV_LIST = [32, 64] if check_shared_mem() else [16, 32] + + +@triton.heuristics({ + 'USE_INITIAL_STATE': lambda args: args['h0'] is not None, + 'STORE_FINAL_STATE': lambda args: args['ht'] is not None, + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BKV_LIST + 
for BV in BKV_LIST + for num_warps in [1, 2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=['BT', 'USE_G', 'USE_GK', 'USE_GV'] +) +@triton.jit(do_not_specialize=['T']) +def chunk_fwd_kernel_h( + k, + v, + h, + g, + gk, + gv, + h0, + ht, + offsets, + split_offsets, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_GV: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + USE_OFFSETS: tl.constexpr, + HEAD_FIRST: tl.constexpr +): + i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_n, i_h = i_nh // H, i_nh % H + if USE_OFFSETS: + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + NS = tl.cdiv(T, BS) + boh = tl.load(split_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + NS = tl.cdiv(T, BS) + boh = i_n * NS + + # [BK, BV] + b_h = tl.zeros([BK, BV], dtype=tl.float32) + if USE_INITIAL_STATE: + p_h0 = tl.make_block_ptr(h0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + b_h = tl.load(p_h0, boundary_check=(0, 1)).to(tl.float32) + + for i_t in range(NT): + i_s = i_t // (BS // BT) + if HEAD_FIRST: + p_k = tl.make_block_ptr(k + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_v = tl.make_block_ptr(v + i_nh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + o_h = (i_nh * NS + i_s).to(tl.int64) * K*V + p_h = tl.make_block_ptr(h + o_h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + else: + p_k = tl.make_block_ptr(k + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_v = tl.make_block_ptr(v + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + o_h = ((boh + i_s) * H + i_h).to(tl.int64) * K*V + p_h = tl.make_block_ptr(h + o_h, (K, V), 
(V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + if i_t % (BS // BT) == 0: + tl.store(p_h, b_h.to(p_h.dtype.element_ty), boundary_check=(0, 1)) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + last_idx = min((i_t + 1) * BT, T) - 1 + + # scalar decay + if USE_G: + if HEAD_FIRST: + b_g_last = tl.load(g + i_nh * T + last_idx) + p_g = g + i_nh * T + i_t * BT + tl.arange(0, BT) + p_g = tl.max_contiguous(tl.multiple_of(p_g, BT), BT) + else: + b_g_last = tl.load(g + bos * H + last_idx * H + i_h) + p_g = g + bos*H + (i_t * BT + tl.arange(0, BT)) * H + i_h + b_h *= exp(b_g_last) + b_g = tl.load(p_g, mask=(i_t * BT + tl.arange(0, BT) < T), other=0.) + b_v = (b_v * exp(b_g_last - b_g)[:, None]).to(b_v.dtype) + + # vector decay, h = Diag(gk) @ h + if USE_GK: + if HEAD_FIRST: + p_gk = tl.make_block_ptr(gk + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_gk_last = gk + i_nh * T*K + last_idx * K + i_k * BK + tl.arange(0, BK) + p_gk_last = tl.max_contiguous(tl.multiple_of(p_gk_last, BK), BK) + else: + p_gk = tl.make_block_ptr(gk + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_gk_last = gk + (bos + last_idx) * H*K + i_h * K + i_k * BK + tl.arange(0, BK) + + b_gk_last = tl.load(p_gk_last, mask=(i_k * BK + tl.arange(0, BK) < K), other=0.) 
+ b_h *= exp(b_gk_last)[:, None] + + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_k = (b_k * exp(b_gk_last[:, None] - b_gk)).to(b_k.dtype) + + # vector decay, h = h @ Diag(gv) + if USE_GV: + if HEAD_FIRST: + p_gv = tl.make_block_ptr(gv + i_nh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_gv_last = gv + i_nh * T*V + last_idx * V + i_v * BV + tl.arange(0, BV) + p_gv_last = tl.max_contiguous(tl.multiple_of(p_gv_last, BV), BV) + else: + p_gv = tl.make_block_ptr(gv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_gv_last = gv + (bos + last_idx) * H*V + i_h * V + i_v * BV + tl.arange(0, BV) + + b_gv_last = tl.load(p_gv_last, mask=(i_v * BV + tl.arange(0, BV) < V), other=0.) + b_h *= exp(b_gv_last)[None, :] + + b_gv = tl.load(p_gv, boundary_check=(0, 1)) + b_v = (b_v * exp(b_gv_last[None, :] - b_gv)).to(b_v.dtype) + + b_h += tl.dot(b_k, b_v) + + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 'STORE_INITIAL_STATE_GRADIENT': lambda args: args['dh0'] is not None, + 'USE_FINAL_STATE_GRADIENT': lambda args: args['dht'] is not None, + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BKV_LIST + for BV in BKV_LIST + for num_warps in [1, 2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=['BT', 'USE_G', 'USE_GK', 'USE_GV'] +) +@triton.jit(do_not_specialize=['T']) +def chunk_bwd_kernel_dh( + q, + g, + gk, + gv, + do, + dh, + dht, + dh0, + offsets, + split_offsets, + scale, + T, + HQ: tl.constexpr, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + NG: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_GV: tl.constexpr, 
+ STORE_INITIAL_STATE_GRADIENT: tl.constexpr, + USE_FINAL_STATE_GRADIENT: tl.constexpr, + USE_OFFSETS: tl.constexpr, + HEAD_FIRST: tl.constexpr +): + i_k, i_v, i_nh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_bg = i_nh // NG + i_n, i_hq = i_nh // HQ, i_nh % HQ + i_h = i_hq // NG + if USE_OFFSETS: + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + NS = tl.cdiv(T, BS) + boh = tl.load(split_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + NS = tl.cdiv(T, BS) + boh = i_n * NS + + # [BK, BV] + b_dh = tl.zeros([BK, BV], dtype=tl.float32) + if USE_FINAL_STATE_GRADIENT: + p_dht = tl.make_block_ptr(dht + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + b_dh += tl.load(p_dht, boundary_check=(0, 1)).to(tl.float32) + + for i_t in range(NT - 1, -1, -1): + i_s = i_t // (BS // BT) + if HEAD_FIRST: + o_dh = (i_nh * NS + i_s).to(tl.int64) * K*V + p_dh = tl.make_block_ptr(dh + o_dh, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + else: + o_dh = ((boh + i_s) * H + i_h).to(tl.int64) * K*V + p_dh = tl.make_block_ptr(dh + o_dh, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + if i_t % (BS // BT) == 0: + tl.store(p_dh, b_dh.to(p_dh.dtype.element_ty), boundary_check=(0, 1)) + last_idx = min(i_t * BT + BT, T) - 1 + # [BK, BT] + if HEAD_FIRST: + p_q = tl.make_block_ptr(q + i_nh * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_do = tl.make_block_ptr(do + i_nh * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + else: + p_q = tl.make_block_ptr(q + (bos*HQ + i_hq) * K, (K, T), (1, HQ*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_do = tl.make_block_ptr(do + (bos*HQ + i_hq) * V, (T, V), (HQ*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + + if USE_G: 
+ if HEAD_FIRST: + p_g = g + i_bg * T + i_t * BT + tl.arange(0, BT) + p_g = tl.max_contiguous(tl.multiple_of(p_g, BT), BT) + b_g_last = tl.load(g + i_bg * T + last_idx) + else: + p_g = g + (bos + i_t * BT + tl.arange(0, BT)) * H + i_h + b_g_last = tl.load(g + (bos + last_idx) * H + i_h) + b_g = tl.load(p_g, mask=(i_t * BT + tl.arange(0, BT) < T), other=0.) + b_q = (b_q * exp(b_g)[None, :]).to(b_q.dtype) + + b_dh *= exp(b_g_last) + + if USE_GK: + if HEAD_FIRST: + p_gk = tl.make_block_ptr(gk + i_bg * T*K, (K, T), (1, K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_gk_last = gk + (i_bg * T + last_idx) * K + i_k * BK + tl.arange(0, BK) + p_gk_last = tl.max_contiguous(tl.multiple_of(p_gk_last, BK), BK) + else: + p_gk = tl.make_block_ptr(gk + (bos*H + i_h) * K, (K, T), (1, H*K), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_gk_last = gk + (bos + last_idx) * H*K + i_h * K + i_k * BK + tl.arange(0, BK) + + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_q = (b_q * exp(b_gk)).to(b_q.dtype) + b_gk_last = tl.load(p_gk_last, mask=(i_k * BK + tl.arange(0, BK) < K), other=0.) + b_dh *= exp(b_gk_last)[:, None] + + if USE_GV: + if HEAD_FIRST: + p_gv = tl.make_block_ptr(gv + i_bg * T*V, (T, V), (V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_gv_last = gv + (i_bg * T + last_idx) * V + i_v * BV + tl.arange(0, BV) + p_gv_last = tl.max_contiguous(tl.multiple_of(p_gv_last, BV), BV) + else: + p_gv = tl.make_block_ptr(gv + (bos*H + i_h) * V, (T, V), (H*V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_gv_last = gv + (bos + last_idx) * H*V + i_h * V + i_v * BV + tl.arange(0, BV) + + b_gv = tl.load(p_gv, boundary_check=(0, 1)) + b_do = (b_do * exp(b_gv)).to(b_do.dtype) + + b_gv_last = tl.load(p_gv_last, mask=(i_v * BV + tl.arange(0, BV) < V), other=0.) 
+ b_dh *= exp(b_gv_last)[None, :] + + b_dh += tl.dot(b_q, b_do) + + if STORE_INITIAL_STATE_GRADIENT: + p_dh0 = tl.make_block_ptr(dh0 + i_nh * K*V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + tl.store(p_dh0, b_dh.to(p_dh0.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_fwd_h( + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + gk: torch.Tensor, + gv: torch.Tensor, + h0: torch.Tensor, + output_final_state: bool, + offsets: Optional[torch.Tensor] = None, + head_first: bool = True, + chunk_size: int = 64, + split_size: Optional[int] = None, + states_in_fp32: bool = False +) -> Tuple[torch.Tensor, torch.Tensor]: + if head_first: + B, H, T, K, V = *k.shape, v.shape[-1] + else: + B, T, H, K, V = *k.shape, v.shape[-1] + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + BS = BT if split_size is None else min(split_size, max(16, triton.next_power_of_2(T))) + assert BS % BT == 0, f"The `split_size` (got {BS}) must be a multiple of `chunk_size` {BT}" + # N: the actual number of sequences in the batch with either equal or variable lengths + if offsets is None: + split_offsets, N, NS = None, B, triton.cdiv(T, BS) + else: + split_offsets = prepare_chunk_offsets(offsets, BS) + N, NS = len(offsets) - 1, split_offsets[-1] + + if head_first: + h = k.new_empty(B, H, NS, K, V, dtype=k.dtype if not states_in_fp32 else torch.float) + else: + h = k.new_empty(B, NS, H, K, V, dtype=k.dtype if not states_in_fp32 else torch.float) + ht = k.new_empty(N, H, K, V, dtype=torch.float) if output_final_state else None + def grid(meta): return (triton.cdiv(K, meta['BK']), triton.cdiv(V, meta['BV']), N * H) + chunk_fwd_kernel_h[grid]( + k=k, + v=v, + h=h, + g=g, + gk=gk, + gv=gv, + h0=h0, + ht=ht, + offsets=offsets, + split_offsets=split_offsets, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BS=BS, + USE_G=g is not None, + USE_GK=gk is not None, + USE_GV=gv is not None, + HEAD_FIRST=head_first + ) + return h, ht + + +def chunk_bwd_dh( + q: torch.Tensor, + k: 
torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + gk: torch.Tensor, + gv: torch.Tensor, + do: torch.Tensor, + h0: torch.Tensor, + dht: torch.Tensor, + scale: float, + offsets: Optional[torch.Tensor] = None, + head_first: bool = True, + chunk_size: int = 64, + split_size: Optional[int] = None, + states_in_fp32: bool = False +) -> Tuple[torch.Tensor, torch.Tensor]: + if head_first: + B, H, T, K, V = *k.shape, v.shape[-1] + HQ = q.shape[1] + else: + B, T, H, K, V = *k.shape, v.shape[-1] + HQ = q.shape[2] + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + BS = BT if split_size is None else min(split_size, max(16, triton.next_power_of_2(T))) + assert BS % BT == 0, f"The `split_size` (got {BS}) must be a multiple of `chunk_size` {BT}" + # N: the actual number of sequences in the batch with either equal or variable lengths + # NG: number of groups in GQA + if offsets is None: + split_offsets, N, NS = None, B, triton.cdiv(T, BS) + else: + split_offsets = prepare_chunk_offsets(offsets, BS) + N, NS = len(offsets) - 1, split_offsets[-1] + NG = HQ // H + + if head_first: + dh = k.new_empty(B, HQ, NS, K, V, dtype=k.dtype if not states_in_fp32 else torch.float) + else: + dh = k.new_empty(B, NS, HQ, K, V, dtype=k.dtype if not states_in_fp32 else torch.float) + dh0 = torch.empty_like(h0, dtype=torch.float) if h0 is not None else None + + def grid(meta): return (triton.cdiv(K, meta['BK']), triton.cdiv(V, meta['BV']), N * H) + chunk_bwd_kernel_dh[grid]( + q=q, + g=g, + gk=gk, + gv=gv, + do=do, + dh=dh, + dht=dht, + dh0=dh0, + offsets=offsets, + split_offsets=split_offsets, + scale=scale, + T=T, + HQ=HQ, + H=H, + K=K, + V=V, + BT=BT, + BS=BS, + NG=NG, + USE_G=g is not None, + USE_GK=gk is not None, + USE_GV=gv is not None, + HEAD_FIRST=head_first + ) + return dh, dh0 diff --git a/fla/ops/common/chunk_o.py b/fla/ops/common/chunk_o.py new file mode 100644 index 0000000000000000000000000000000000000000..b1e99d1d28bebc49994deaef04c252be74b2d570 --- /dev/null +++ 
b/fla/ops/common/chunk_o.py @@ -0,0 +1,668 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional, Tuple + +import torch +import triton +import triton.language as tl + +from fla.ops.utils.op import exp, safe_exp +from fla.utils import check_shared_mem, is_nvidia_hopper + +BKV_LIST = [64, 128] if check_shared_mem() else [32, 64] +NUM_WARPS = [2, 4] if is_nvidia_hopper else [2, 4, 8] + + +@triton.heuristics({ + 'USE_G': lambda args: args['g'] is not None, + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({'BK': BK, 'BV': BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BKV_LIST + for BV in BKV_LIST + for num_warps in NUM_WARPS + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'V', 'BT'], +) +@triton.jit(do_not_specialize=['T']) +def chunk_fwd_kernel_o( + q, + k, + v, + h, + g, + o, + offsets, + indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_OFFSETS: tl.constexpr, + HEAD_FIRST: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_tg = i_t + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + s_qk = K if HEAD_FIRST else H*K + s_vo = V if HEAD_FIRST else H*V + s_g = 1 if HEAD_FIRST else H + # offset calculation + q += (i_bh * T*K) if HEAD_FIRST else ((bos * H + i_h) * K) + k += (i_bh * T*K) if HEAD_FIRST else ((bos * H + i_h) * K) + v += (i_bh * T*V) if HEAD_FIRST else ((bos * H + i_h) * V) + o += (i_bh * T*V) if HEAD_FIRST else ((bos * H + i_h) * V) + h += ((i_bh * NT + 
i_t).to(tl.int64) * K*V) if HEAD_FIRST else ((i_tg * H + i_h).to(tl.int64) * K*V) + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + b_A = tl.zeros([BT, BT], dtype=tl.float32) + + for i_k in range(tl.cdiv(K, BK)): + p_q = tl.make_block_ptr(q, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (K, T), (1, s_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + p_h = tl.make_block_ptr(h, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + + # [BT, BK] @ [BK, BV] -> [BT, BV] + b_o += tl.dot(b_q, b_h) + # [BT, BK] @ [BK, BT] -> [BT, BT] + b_A += tl.dot(b_q, b_k) + + if USE_G: + g += (i_bh * T) if HEAD_FIRST else (bos * H + i_h) + p_g = tl.make_block_ptr(g, (T,), (s_g,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + b_o = b_o * exp(b_g)[:, None] + b_A = b_A * safe_exp(b_g[:, None] - b_g[None, :]) + + o_i = tl.arange(0, BT) + m_A = o_i[:, None] >= o_i[None, :] + b_A = tl.where(m_A, b_A, 0) + + p_v = tl.make_block_ptr(v, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # to fix mma -> mma layout conversion + # already solved by triton v3.2 or higher + b_o = b_o * scale + tl.dot(b_A.to(b_v.dtype), b_v) * scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None, + 'USE_G': lambda args: args['g'] is not None, + 'USE_DW': lambda args: args['dw'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in NUM_WARPS + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'USE_G', 'USE_DW'], +) 
+@triton.jit(do_not_specialize=['T']) +def chunk_bwd_kernel_dqkwg( + q, + k, + v, + h, + g, + do, + dh, + dq, + dk, + dg, + w, + dv, + dw, + offsets, + indices, + scale, + B: tl.constexpr, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_DW: tl.constexpr, + USE_OFFSETS: tl.constexpr, + HEAD_FIRST: tl.constexpr +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if USE_G: + dg += i_k * B * H * T + if USE_OFFSETS: + i_tg = i_t + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + # offset calculation + v += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + do += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + h += (i_bh * NT + i_t).to(tl.int64) * K*V if HEAD_FIRST else (i_tg * H + i_h).to(tl.int64) * K*V + dh += (i_bh * NT + i_t).to(tl.int64) * K*V if HEAD_FIRST else (i_tg * H + i_h).to(tl.int64) * K*V + q += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + k += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + dq += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + dk += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + s_qk = K if HEAD_FIRST else H*K + s_vo = V if HEAD_FIRST else H*V + s_g = 1 if HEAD_FIRST else H + + # for delta rule only + if USE_DW: + dw += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + dv += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + w += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + b_ds = tl.zeros([BT, BT], dtype=tl.float32) + b_dg_last = tl.zeros([1,], dtype=tl.float32) if USE_G else None + b_dw = 
tl.zeros([BT, BK], dtype=tl.float32) if USE_DW else None + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_h = tl.make_block_ptr(h, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + p_dh = tl.make_block_ptr(dh, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + if USE_G: + b_dg_last += (tl.sum(b_h * b_dh)) + # [BT, BV] @ [BV, BT] -> [BT, BT] + b_ds += tl.dot(b_do, tl.trans(b_v)) + # [BT, BV] @ [BV, BK] -> [BT, BK] + b_dq += tl.dot(b_do, b_h.to(b_do.dtype)) + # [BT, BV] @ [BV, BK] -> [BT, BK] + b_dk += tl.dot(b_v, b_dh.to(b_v.dtype)) + if USE_DW: + p_dv = tl.make_block_ptr(dv, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_dv = tl.load(p_dv, boundary_check=(0, 1)) + b_dw += tl.dot(b_dv.to(b_v.dtype), b_h.to(b_v.dtype)) + + if USE_DW and not USE_G: + p_dw = tl.make_block_ptr(dw, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_dw, -b_dw.to(p_dw.dtype.element_ty), boundary_check=(0, 1)) + + tl.debug_barrier() + o_i = tl.arange(0, BT) + p_q = tl.make_block_ptr(q, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + + p_dq = tl.make_block_ptr(dq, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + + if USE_G: + b_dg = tl.zeros([BT,], dtype=tl.float32) + g += i_bh * T if HEAD_FIRST else bos * H + i_h + dg += i_bh * T if HEAD_FIRST else bos * H + i_h + p_g = 
tl.make_block_ptr(g, (T,), (s_g,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + b_g_last = tl.load(g + (min(i_t * BT + BT, T) - 1) * s_g) + b_dg_last *= exp(b_g_last) + + if USE_DW: + p_w = tl.make_block_ptr(w, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dw = tl.make_block_ptr(dw, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_dw = b_dw * exp(b_g)[:, None] + tl.store(p_dw, -b_dw.to(p_dw.dtype.element_ty), boundary_check=(0, 1)) + b_dg -= tl.sum(b_w * b_dw, axis=1) + + b_dq = b_dq * exp(b_g)[:, None] * scale + b_dg += tl.sum(b_dq * b_q, axis=1) + + b_dk = b_dk * safe_exp(-b_g + b_g_last)[:, None] + b_dg -= tl.sum(b_k * b_dk, axis=1) + b_dg_last += tl.sum(b_dk * b_k) + + b_ds = tl.where(o_i[:, None] >= o_i[None, :], b_ds * safe_exp(b_g[:, None] - b_g[None, :]), 0) * scale + b_ds2 = b_ds * tl.dot(b_q, tl.trans(b_k)) + b_dg += tl.sum(b_ds2, axis=1) + b_dg -= tl.sum(b_ds2, axis=0) + + b_ds = b_ds.to(b_k.dtype) + # [BT, BK] + b_dq += tl.dot(b_ds, b_k) + b_dk += tl.dot(tl.trans(b_ds), b_q) + p_dg = tl.make_block_ptr(dg, (T,), (s_g,), (i_t * BT,), (BT,), (0,)) + # (SY 09/21) revcumsum in a separate kernel due to strange triton compiler issue + # b_dg = tl.dot(tl.where(o_i[:, None] <= o_i[None, :], 1., 0.), b_dg, allow_tf32=False) + b_dg_last) + b_dg = tl.where(o_i < min(BT, T-i_t*BT) - 1, b_dg, b_dg + b_dg_last) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0,)) + else: + b_ds = tl.where(o_i[:, None] >= o_i[None, :], b_ds, 0) + b_ds = b_ds.to(b_k.dtype) + b_dq += tl.dot(b_ds, b_k) + b_dk += tl.dot(tl.trans(b_ds), b_q) * scale + b_dq *= scale + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 
'USE_OFFSETS': lambda args: args['offsets'] is not None, + 'USE_G': lambda args: args['g'] is not None, +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'USE_G'], +) +@triton.jit(do_not_specialize=['T']) +def chunk_bwd_kernel_dv( + q, + k, + g, + do, + dv, + dh, + offsets, + indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_OFFSETS: tl.constexpr, + HEAD_FIRST: tl.constexpr +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if USE_OFFSETS: + i_tg = i_t + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + b_dv = tl.zeros([BT, BV], dtype=tl.float32) + + # offset calculation + q += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + k += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + do += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + dv += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + s_qk = K if HEAD_FIRST else H*K + s_vo = V if HEAD_FIRST else H*V + s_g = 1 if HEAD_FIRST else H + dh += (i_bh * NT + i_t).to(tl.int64) * K*V if HEAD_FIRST else (i_tg * H + i_h).to(tl.int64) * K*V + + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_q = tl.make_block_ptr(q, (K, T), (1, s_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_A += tl.dot(b_k, b_q) + p_dh = 
tl.make_block_ptr(dh, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + b_dv += tl.dot(b_k, b_dh.to(b_k.dtype)) + + if USE_G: + g += (i_bh * T) if HEAD_FIRST else (bos * H + i_h) + p_g = tl.make_block_ptr(g, (T,), (s_g,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + b_g_last = tl.load(g + (min(i_t * BT + BT, T) - 1) * s_g) + b_dv *= safe_exp(-b_g + b_g_last)[:, None] + + mask = (tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :]) + if USE_G: + b_A = tl.where(mask, b_A * safe_exp(b_g[None, :] - b_g[:, None]) * scale, 0).to(do.dtype.element_ty) + else: + b_A = tl.where(mask, b_A * scale, 0).to(do.dtype.element_ty) + p_do = tl.make_block_ptr(do, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_dv += tl.dot(b_A.to(b_do.dtype), b_do) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 'USE_G': lambda args: args['g'] is not None, + 'USE_OFFSETS': lambda args: args['offsets'] is not None, +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps, num_stages=num_stages) + for num_warps in NUM_WARPS + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'V', 'BT', 'BK', 'BV', 'USE_G'], +) +@triton.jit(do_not_specialize=['T']) +def chunk_bwd_kernel_dv_local( + q, + k, + g, + do, + dv, + offsets, + indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_OFFSETS: tl.constexpr, + HEAD_FIRST: tl.constexpr +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if USE_OFFSETS: + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 
1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + # offset calculation + q += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + k += i_bh * T*K if HEAD_FIRST else (bos * H + i_h) * K + do += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + dv += i_bh * T*V if HEAD_FIRST else (bos * H + i_h) * V + s_qk = K if HEAD_FIRST else H*K + s_vo = V if HEAD_FIRST else H*V + s_g = 1 if HEAD_FIRST else H + + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k, (T, K), (s_qk, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_q = tl.make_block_ptr(q, (K, T), (1, s_qk), (i_k * BK, i_t * BT), (BK, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_A += tl.dot(b_k, b_q) + + if USE_G: + g += (i_bh * T) if HEAD_FIRST else (bos * H + i_h) + p_g = tl.make_block_ptr(g, (T,), (s_g,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + + mask = (tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :]) + if USE_G: + b_A = tl.where(mask, b_A * safe_exp(b_g[None, :] - b_g[:, None]) * scale, 0).to(do.dtype.element_ty) + else: + b_A = tl.where(mask, b_A * scale, 0).to(do.dtype.element_ty) + + for i_v in range(tl.cdiv(V, BV)): + p_do = tl.make_block_ptr(do, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv, (T, V), (s_vo, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_dv = tl.dot(b_A.to(b_do.dtype), b_do) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_fwd_o( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + h: torch.Tensor, + g: Optional[torch.Tensor] = None, # cumsum of log decay + scale: Optional[float] = None, + offsets: Optional[torch.LongTensor] = None, + indices: Optional[torch.LongTensor] = None, + head_first: bool = True, + chunk_size: int = 64 +) -> torch.Tensor: + if head_first: + B, H, T, K, V = 
*q.shape, v.shape[-1] + else: + B, T, H, K, V = *q.shape, v.shape[-1] + if scale is None: + scale = k.shape[-1] ** -0.5 + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + NT = triton.cdiv(T, BT) if offsets is None else len(indices) + + o = torch.empty_like(v) + + def grid(meta): return (triton.cdiv(V, meta['BV']), NT, B * H) + chunk_fwd_kernel_o[grid]( + q, + k, + v, + h, + g, + o, + offsets, + indices, + scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + HEAD_FIRST=head_first + ) + return o + + +def chunk_bwd_dv( + q: torch.Tensor, + k: torch.Tensor, + g: torch.Tensor, + do: torch.Tensor, + dh: torch.Tensor, + scale: float, + offsets: Optional[torch.LongTensor] = None, + indices: Optional[torch.LongTensor] = None, + head_first: bool = True, + chunk_size: int = 64 +) -> torch.Tensor: + if head_first: + B, H, T, K, V = *k.shape, do.shape[-1] + else: + B, T, H, K, V = *k.shape, do.shape[-1] + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + # H100 can have larger block size; check_shared_mem must be *called* so the 32-wide fallback stays reachable + if check_shared_mem('hopper', k.device.index): + CONST_TILING = 128 + elif check_shared_mem(): + CONST_TILING = 64 + else: + CONST_TILING = 32 + BK = min(triton.next_power_of_2(K), CONST_TILING) + BV = min(triton.next_power_of_2(V), CONST_TILING) + NT = triton.cdiv(T, BT) if offsets is None else len(indices) + NV = triton.cdiv(V, BV) + + dv = torch.empty_like(do) + grid = (NV, NT, B * H) + chunk_bwd_kernel_dv[grid]( + q, + k, + g, + do, + dv, + dh, + offsets, + indices, + scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + HEAD_FIRST=head_first + ) + return dv + + +def chunk_bwd_dv_local( + q: torch.Tensor, + k: torch.Tensor, + g: torch.Tensor, + do: torch.Tensor, + dh: torch.Tensor, + scale: float, + offsets: Optional[torch.LongTensor] = None, + indices: Optional[torch.LongTensor] = None, + head_first: bool = True, + chunk_size: int = 64 +) -> torch.Tensor: + if head_first: + B, H, T, K, V = *k.shape, do.shape[-1] + else: + B, T, H, K, V = *k.shape, do.shape[-1] + BT = 
min(chunk_size, max(16, triton.next_power_of_2(T))) + # H100 can have larger block size; check_shared_mem must be *called* so the 32-wide fallback stays reachable + if check_shared_mem('hopper', k.device.index): + CONST_TILING = 128 + elif check_shared_mem(): + CONST_TILING = 64 + else: + CONST_TILING = 32 + BK = min(triton.next_power_of_2(K), CONST_TILING) + BV = min(triton.next_power_of_2(V), CONST_TILING) + NT = triton.cdiv(T, BT) if offsets is None else len(indices) + + dv = torch.empty_like(do) + grid = (NT, B * H) + chunk_bwd_kernel_dv_local[grid]( + q, + k, + g, + do, + dv, + offsets, + indices, + scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + HEAD_FIRST=head_first + ) + return dv + + +def chunk_bwd_dqkwg( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + do: torch.Tensor, + h: torch.Tensor, + dh: torch.Tensor, + dv: Optional[torch.Tensor] = None, + w: Optional[torch.Tensor] = None, + offsets: Optional[torch.LongTensor] = None, + indices: Optional[torch.LongTensor] = None, + chunk_size: int = 64, + scale: float = 1.0, + head_first: bool = True, +) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + + if head_first: + B, H, T, K, V = *k.shape, v.shape[-1] + else: + B, T, H, K, V = *k.shape, v.shape[-1] + BT = min(chunk_size, max(16, triton.next_power_of_2(T))) + NT = triton.cdiv(T, BT) if offsets is None else len(indices) + + CONST_TILING = 64 if check_shared_mem() else 32 + BK = min(triton.next_power_of_2(K), CONST_TILING) + BV = min(triton.next_power_of_2(V), CONST_TILING) + NK = triton.cdiv(K, BK) + dq = torch.empty_like(q) + dk = torch.empty_like(k) + dg = torch.empty(NK, *g.shape, dtype=torch.float32, device=g.device) if g is not None else None + dw = torch.empty_like(w) if w is not None else None + + grid = (NK, NT, B * H) + chunk_bwd_kernel_dqkwg[grid]( + q=q, + k=k, + v=v, + h=h, + g=g, + do=do, + dh=dh, + dv=dv, + w=w, + dw=dw, + dq=dq, + dk=dk, + dg=dg, + offsets=offsets, + indices=indices, + scale=scale, + B=B, + T=T, + H=H, + K=K, + V=V, + BT=BT, + BK=BK, + BV=BV, + 
HEAD_FIRST=head_first + ) + + if dg is not None: + dg = dg.sum(0) + return dq, dk, dw, dg diff --git a/fla/ops/common/chunk_scaled_dot_kkt.py b/fla/ops/common/chunk_scaled_dot_kkt.py new file mode 100644 index 0000000000000000000000000000000000000000..ff30664dce50a8869dd6198aaecea2ab6a171704 --- /dev/null +++ b/fla/ops/common/chunk_scaled_dot_kkt.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional + +import torch +import triton +import triton.language as tl + +from fla.ops.common.utils import prepare_chunk_indices + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({'BK': BK}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64, 128] + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=['H', 'K', 'BT', 'USE_OFFSETS'], +) +@triton.jit(do_not_specialize=['T']) +def chunk_scaled_dot_kkt_fwd_kernel( + k, + beta, + A, + offsets, + indices, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + HEAD_FIRST: tl.constexpr, + USE_OFFSETS: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if USE_OFFSETS: + i_n, i_t = tl.load(indices + i_t * 2).to(tl.int32), tl.load(indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + o_t = tl.arange(0, BT) + + if HEAD_FIRST: + p_beta = tl.make_block_ptr(beta + i_bh * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + else: + p_beta = tl.make_block_ptr(beta + bos*H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_beta = tl.load(p_beta, boundary_check=(0,)) + + b_A = tl.zeros([BT, BT], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + if HEAD_FIRST: + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + 
else: + p_k = tl.make_block_ptr(k + (bos*H + i_h) * K, (T, K), (H*K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_kb = b_k * b_beta[:, None] + b_A += tl.dot(b_kb.to(b_k.dtype), tl.trans(b_k)) + + b_A = tl.where(o_t[:, None] > o_t[None, :], b_A, 0) + if HEAD_FIRST: + p_A = tl.make_block_ptr(A + i_bh * T*BT, (T, BT), (BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + else: + p_A = tl.make_block_ptr(A + (bos*H + i_h) * BT, (T, BT), (BT*H, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + tl.store(p_A, b_A.to(p_A.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_scaled_dot_kkt_fwd( + k: torch.Tensor, + beta: torch.Tensor, + cu_seqlens: Optional[torch.LongTensor], + head_first: bool = False, + chunk_size: int = 64, + output_dtype: torch.dtype = torch.float32 +) -> torch.Tensor: + r""" + Compute the strictly lower-triangular part of beta * K * K^T within each chunk. + + Args: + k (torch.Tensor): + The key tensor of shape `[B, T, H, K]` if not `head_first` else `[B, H, T, K]`. + beta (torch.Tensor): + The beta tensor of shape `[B, T, H]` if not `head_first` else `[B, H, T]`. + cu_seqlens (Optional[torch.LongTensor]): + The cumulative sequence lengths of the input tensor. + Required (no default); pass `None` to chunk the full length `T` without per-sequence offsets. + head_first (bool): + If False, the input/output tensor is in the shape of `[B, T, H, K]`. + If True, the input/output tensor is in the shape of `[B, H, T, K]`. + Default: False + chunk_size (int): + The chunk size. Default: 64. + output_dtype (torch.dtype): + The dtype of the output tensor. Default: `torch.float32` + + Returns: + beta * K * K^T of shape `[B, T, H, BT]` if not `head_first` else `[B, H, T, BT]`, + where `BT` is the chunk size. 
+ """ + if head_first: + B, H, T, K = k.shape + else: + B, T, H, K = k.shape + BT = chunk_size + indices = prepare_chunk_indices(cu_seqlens, BT) if cu_seqlens is not None else None + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(indices) + A = torch.empty(B, *((H, T) if head_first else (T, H)), BT, device=k.device, dtype=output_dtype) + chunk_scaled_dot_kkt_fwd_kernel[(NT, B * H)]( + k=k, + beta=beta, + A=A, + offsets=cu_seqlens, + indices=indices, + T=T, + H=H, + K=K, + BT=BT, + HEAD_FIRST=head_first + ) + return A diff --git a/fla/ops/delta_rule/README.md b/fla/ops/delta_rule/README.md new file mode 100644 index 0000000000000000000000000000000000000000..607b0d583c7ec2904c18c0f1d86fb0ec2dfdf583 --- /dev/null +++ b/fla/ops/delta_rule/README.md @@ -0,0 +1,90 @@ +# Chunkwise-form Parallelism of DeltaNet + +This section expands on the formulation presented in Appendix B of the DeltaNet paper.[^1] + +To reduce notational clutter, we focus on the first chunk, denoting $\mathbf{S}^r=\mathbf{S}_{[1]}^r$. By partially expanding the recurrence, we have: +```math +\begin{equation} +\begin{aligned} +\mathbf{S}^r &= \underbrace{\left(\prod_{i=1}^r \mathbf{I} - \beta^i \boldsymbol{k}^i \boldsymbol{k}^{i\top} \right)}_{:= \mathbf{P}^r} \cdot\mathbf{S}^{0} + \overbrace{\sum_{i=1}^{r} \underbrace{\left(\prod_{j=i+1}^r \mathbf{I} - \beta^j \boldsymbol{k}^j \boldsymbol{k}^{j\top} \right)}_{:= \mathbf{P}_{i+1}^r}\beta^i \boldsymbol{k}^i\boldsymbol{v}^{i\top}}^{:=\mathbf{H}^r} \\ +&=\mathbf{P}^r \cdot \mathbf{S}^{0} + \mathbf{H}^r +\end{aligned} +\end{equation} +``` + +where $\mathbf{P}_i^r$ involves cumulative products of generalized Householder matrices. +We abbreviate $\mathbf{P}_1^r$ as $\mathbf{P}^r$. 
+This can be optimized using the classical WY representation: +```math +\begin{equation} +\mathbf{P}^{r} = \mathbf{I} - \sum_{i=1}^{r}\boldsymbol{k}^i\boldsymbol{w}^{i\top} \in \mathbb{R}^{d_k \times d_k};\qquad +\boldsymbol{w}^r = \beta^r \left(\boldsymbol{k}^r - \sum_{i=1}^{r-1} \left(\boldsymbol{k}^{r\top}\boldsymbol{k}^i \right)\boldsymbol{w}^i \right) \in \mathbb{R}^{d_k} +\end{equation} +``` + +We prove this by induction: +```math +\begin{align*} +\mathbf{P}^{r} &= \prod_{i=1}^r \mathbf{I} - \beta^i \boldsymbol{k}^i \boldsymbol{k}^{i\top} \\ +&= \left(\mathbf{I} - \beta^r \boldsymbol{k}^r \boldsymbol{k}^{r\top}\right)\mathbf{P}^{r-1} \\ +&= \left(\mathbf{I} - \beta^r \boldsymbol{k}^r \boldsymbol{k}^{r\top}\right)\left(\mathbf{I} - \sum_{i=1}^{r-1}\boldsymbol{k}^i\boldsymbol{w}^{i\top}\right) \\ +&= \mathbf{I} - \sum_{i=1}^{r-1}\boldsymbol{k}^i\boldsymbol{w}^{i\top} - \beta^r \boldsymbol{k}^r \boldsymbol{k}^{r\top} + \beta^r\boldsymbol{k}^r \boldsymbol{k}^{r\top} \left(\sum_{i=1}^{r-1}\boldsymbol{k}^i\boldsymbol{w}^{i\top}\right) \\ +&= \mathbf{I} - \sum_{i=1}^{r-1}\boldsymbol{k}^i\boldsymbol{w}^{i\top} - \beta^r \boldsymbol{k}^r \left(\boldsymbol{k}^{r} - \left(\sum_{i=1}^{r-1}\left(\boldsymbol{k}^{r\top} \boldsymbol{k}^i\right)\boldsymbol{w}^{i}\right) \right)^\top \\ +&= \mathbf{I} - \sum_{i=1}^{r}\boldsymbol{k}^i\boldsymbol{w}^{i\top} +\end{align*} +``` + +Similarly, $\mathbf{H}^r$ can be represented as: +```math +\begin{equation} +\mathbf{H}^{r} = \sum_{i=1}^{r} \boldsymbol{k}^i \boldsymbol{u}^{i\top} \in \mathbb{R}^{d_k \times d_v};\qquad \boldsymbol{u}^r = \beta^r \left(\boldsymbol{v}^r - \sum_{i=1}^{r-1} \left(\boldsymbol{k}^{r\top}\boldsymbol{k}^i\right) \boldsymbol{u}^i \right)\in \mathbb{R}^{d_v} +\end{equation} +``` + +This can also be proven by induction: +```math +\begin{align*} +\mathbf{H}^{r} &= \sum_{i=1}^{r} \mathbf{P}_{i+1}^r \beta^i \boldsymbol{k}^i \boldsymbol{v}^{i\top}\\ +&= \left(\mathbf{I} - \beta^r \boldsymbol{k}^r 
\boldsymbol{k}^{r\top}\right) \mathbf{H}^{r-1} + \beta^r \boldsymbol{k}^r \boldsymbol{v}^{r\top}\\ +&= \sum_{i=1}^{r-1}\boldsymbol{k}^i \boldsymbol{u}^{i\top} - \beta^r \boldsymbol{k}^r \boldsymbol{k}^{r\top} \sum_{i=1}^{r-1}\boldsymbol{k}^i \boldsymbol{u}^{i\top} +\beta^r \boldsymbol{k}^r \boldsymbol{v}^{r\top}\\ +&= \sum_{i=1}^{r-1}\boldsymbol{k}^i \boldsymbol{u}^{i\top} + \boldsymbol{k}^r \left(\beta^r \boldsymbol{v}^{r\top}-\beta^r \boldsymbol{k}^{r\top} \sum_{i=1}^{r-1}\boldsymbol{k}^i \boldsymbol{u}^{i\top}\right) \\ +&= \sum_{i=1}^{r-1}\boldsymbol{k}^i \boldsymbol{u}^{i\top} + \boldsymbol{k}^r \beta^r\left(\boldsymbol{v}^{r}-\sum_{i=1}^{r-1}\left(\boldsymbol{k}^{r\top}\boldsymbol{k}^{i}\right)\boldsymbol{u}^{i} \right)^\top \\ +&=\sum_{i=1}^{r} \boldsymbol{k}^i \boldsymbol{u}^{i\top} +\end{align*} +``` + +In matrix form, $\mathbf{P}$ and $\mathbf{H}$ can be written as: +```math +\begin{equation} +\mathbf{P}=\mathbf{I}-\mathbf{K}^\top\mathbf{W} \in \mathbb{R}^{d_k \times d_k}, \qquad\mathbf{H}=\mathbf{K}^\top\mathbf{U} \in \mathbb{R}^{d_k\times d_v} +\end{equation} +``` + +Now we can derive the matrix form of $\mathbf{W}$ and $\mathbf{U}$: +```math +\begin{align*} +\mathbf{W} &= \mathrm{diag}(\beta) \mathbf{K} - \mathrm{tril}(\mathrm{diag}(\beta) \mathbf{K}\mathbf{K}^\top, -1)\mathbf{W}\\ +\left(\mathbf{I} + \mathrm{tril}(\mathrm{diag}(\beta) \mathbf{K}\mathbf{K}^\top, -1)\right) \mathbf{W} &= \mathrm{diag}(\beta) \mathbf{K} +\end{align*} +``` +A similar process holds for $\mathbf{U}$. 
We can further write $\mathbf{W}$ and $\mathbf{U}$ in matrix form: +```math +\begin{align*} +\mathbf{T} &= \left(\mathbf{I} + \mathrm{tril}\left(\mathrm{diag}(\beta)\mathbf{K} \mathbf{K}^\top,-1\right)\right)^{-1}\mathrm{diag}\left(\beta\right)\in \mathbb{R}^{C \times C}\\ +\mathbf{W} &= \mathbf{T} \mathbf{K}\in \mathbb{R}^{C \times d_k}\\ +\mathbf{U} &= \mathbf{T}\mathbf{V}\in \mathbb{R}^{C \times d_v} +\end{align*} +``` + +Substituting these back into the original equations yields a hardware-efficient chunkwise algorithm for DeltaNet that leverages matrix multiplications, enabling tensor core based GPU optimization: +```math +\begin{equation} +\begin{aligned} +\mathbf{S} &= \mathbf{P}\cdot\mathbf{S}^0 + \mathbf{H} \\ +&= \mathbf{S}^0 + \mathbf{K}^\top (\mathbf{U} -\mathbf{W} \mathbf{S}^0) \in \mathbb{R}^{d_k \times d_v}\\ +\mathbf{O} &= \mathbf{Q} \mathbf{S}^0 + (\mathbf{Q} \mathbf{K}^{\top} \odot \mathbf{M}) \left(\mathbf{U} - \mathbf{W} \mathbf{S}^0\right) \in \mathbb{R}^{C \times d_v} +\end{aligned} +\end{equation} +``` + +[^1]: https://arxiv.org/abs/2406.06484 diff --git a/fla/ops/delta_rule/__pycache__/__init__.cpython-311.pyc b/fla/ops/delta_rule/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5fdfdcdf5f84f4c0caeba15ca26fbfa3b6f75948 Binary files /dev/null and b/fla/ops/delta_rule/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/ops/gated_delta_rule/__pycache__/wy_fast.cpython-311.pyc b/fla/ops/gated_delta_rule/__pycache__/wy_fast.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dbed299ed89e30fbd0f69686df11baaff07f7c5b Binary files /dev/null and b/fla/ops/gated_delta_rule/__pycache__/wy_fast.cpython-311.pyc differ diff --git a/fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_A_fwd.cpython-311.pyc b/fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_A_fwd.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..4a890d67a1ae19a91d86fa980ed491403ea80d3c Binary files /dev/null and b/fla/ops/generalized_delta_rule/dplr/__pycache__/chunk_A_fwd.cpython-311.pyc differ diff --git a/fla/ops/lightning_attn/__init__.py b/fla/ops/lightning_attn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c28c3af59f61d32cbb68a63926ac67fa2bb73447 --- /dev/null +++ b/fla/ops/lightning_attn/__init__.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +from .chunk import chunk_lightning_attn +from .fused_recurrent import fused_recurrent_lightning_attn + +__all__ = [ + 'chunk_lightning_attn', + 'fused_recurrent_lightning_attn' +] diff --git a/fla/ops/linear_attn/__pycache__/fused_recurrent.cpython-311.pyc b/fla/ops/linear_attn/__pycache__/fused_recurrent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..44cbdc83b9df49259717effa2976cc067948b706 Binary files /dev/null and b/fla/ops/linear_attn/__pycache__/fused_recurrent.cpython-311.pyc differ diff --git a/fla/ops/linear_attn/chunk.py b/fla/ops/linear_attn/chunk.py new file mode 100644 index 0000000000000000000000000000000000000000..8283e707923389e5c0f4e8294f7c491277f7243d --- /dev/null +++ b/fla/ops/linear_attn/chunk.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Yu Zhang, Songlin Yang + +from typing import Optional, Tuple + +import torch + +from fla.ops.linear_attn.utils import normalize_output +from fla.ops.simple_gla import chunk_simple_gla + + +@torch.compiler.disable +def chunk_linear_attn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: Optional[float] = None, + initial_state: Optional[torch.Tensor] = None, + output_final_state: bool = False, + normalize: bool = True, + head_first: bool = True +) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + q (torch.Tensor): + queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]` + k (torch.Tensor): + keys of shape `[B, H, T, K]` if 
`head_first=True` else `[B, T, H, K]` + v (torch.Tensor): + values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]` + scale (Optional[float]): + Scale factor for the linear attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + initial_state (Optional[torch.Tensor]): + Initial state of shape `[B, H, K, V]`. Default: `None`. + output_final_state (bool): + Whether to output the final state of shape `[B, H, K, V]`. Default: `False`. + normalize (bool): + Whether to normalize the output. Default: `True`. + head_first (bool): + Whether the inputs are in the head-first format. Default: `True`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]` + final_state (Optional[torch.Tensor]): + Final state of shape `[B, H, K, V]` if `output_final_state=True` else `None` + """ + + if scale is None: + scale = k.shape[-1] ** -0.5 + + o, final_state = chunk_simple_gla( + q=q, + k=k, + v=v, + scale=scale, + g=None, + initial_state=initial_state, + output_final_state=output_final_state, + head_first=head_first + ) + if normalize: + o = normalize_output(q * scale, k, o) + return o, final_state diff --git a/fla/ops/linear_attn/fused_recurrent.py b/fla/ops/linear_attn/fused_recurrent.py new file mode 100644 index 0000000000000000000000000000000000000000..b50b8c7bfb470b69be5ba3327de24ed07ffa974d --- /dev/null +++ b/fla/ops/linear_attn/fused_recurrent.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2024, Songlin Yang, Yu Zhang + +from typing import Optional, Tuple + +import torch +import triton +import triton.language as tl + +from fla.ops.linear_attn.utils import normalize_output +from fla.utils import input_guard + + +@triton.jit +def fused_recurrent_linear_attn_fwd_kernel( + q, # query [B, H, L, K] + k, # key [B, H, L, K] + v, # value [B, H, L, V] + o, # output [NK, B, H, L, V] (one partial result per K-block) + h0, # initial hidden state [B, H, K, V] + ht, # final hidden state [B, H, K, V] + + s_k_h, # stride size: L * K + s_v_h, 
# stride size: L * V + + scale, + B, # batch size + H, # H + T, # T + K: tl.constexpr, # K + V: tl.constexpr, # V + BK: tl.constexpr, # BLOCK SIZE along the K dimension + BV: tl.constexpr, # BLOCK SIZE along the V dimension + USE_INITIAL_STATE: tl.constexpr, # whether to use initial state + STORE_FINAL_STATE: tl.constexpr, # whether to store final state +): + # program ids: V-block index, K-block index, flattened (batch, head) index + i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + + p_q = q + i_bh * s_k_h + i_k * BK + tl.arange(0, BK) + p_k = k + i_bh * s_k_h + i_k * BK + tl.arange(0, BK) + p_v = v + i_bh * s_v_h + i_v * BV + tl.arange(0, BV) + p_o = o + (i_bh + i_k * B * H) * s_v_h + i_v * BV + tl.arange(0, BV) + + mask_bk = (i_k * BK + tl.arange(0, BK)) < K + mask_bv = (i_v * BV + tl.arange(0, BV)) < V + mask_kv = mask_bk[None, :] & mask_bv[:, None] + + b_h = tl.zeros([BV, BK], dtype=tl.float32) + + if USE_INITIAL_STATE: + p_h0 = h0 + i_bh * K * V + (i_k * BK + tl.arange(0, BK)[None, :]) * V + (i_v * BV + tl.arange(0, BV)[:, None]) + b_h += tl.load(p_h0, mask=mask_kv, other=0).to(tl.float32) + + for _ in range(0, T): + b_k = tl.load(p_k, mask=mask_bk, other=0).to(tl.float32) + b_v = tl.load(p_v, mask=mask_bv, other=0).to(tl.float32) + b_q = tl.load(p_q, mask=mask_bk, other=0).to(tl.float32) * scale + + b_h += b_k[None, :] * b_v[:, None] + b_o = b_h * b_q[None, :] + b_o = tl.sum(b_o, axis=1) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), mask=mask_bv) + + p_q += K + p_k += K + p_o += V + p_v += V + + if STORE_FINAL_STATE: + p_ht = ht + i_bh * K * V + (i_k * BK + tl.arange(0, BK)[None, :]) * V + (i_v * BV + tl.arange(0, BV)[:, None]) + tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), mask=mask_kv) + + +# Similar to Algorithm 1 of https://arxiv.org/abs/2006.16236 +@triton.jit +def fused_recurrent_linear_attn_bwd_kernel( + q, # query [B, H, L, K] + k, # key [B, H, L, K] + v, # value [B, H, L, V] + + do, # gradient of output [B, H, L, V] + dq, # gradient of query [NV, B, H, L, K] + dk, # gradient of key [NV, B, 
H, L, K] + dv, # gradient of value [NK, B, H, L, V] + h0, # initial hidden state initialization [B, H, K, V] + + s_k_h, # stride size: L * K + s_v_h, # stride size: L * V + scale, # K ** -0.5 + + B, # B + H, # H + T, # T + K: tl.constexpr, # K + V: tl.constexpr, # V + BK: tl.constexpr, # BLOCK SIZE along the K dimension + BV: tl.constexpr, # BLOCK SIZE along the V dimension + USE_INITIAL_STATE: tl.constexpr, # whether to use initial state +): + i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + + p_q = q + i_bh * s_k_h + i_k * BK + tl.arange(0, BK) + p_k = k + i_bh * s_k_h + i_k * BK + tl.arange(0, BK) + p_v = v + i_bh * s_v_h + i_v * BV + tl.arange(0, BV) + p_do = do + i_bh * s_v_h + i_v * BV + tl.arange(0, BV) + + p_dq = dq + (i_bh + i_v * B * H) * s_k_h + i_k * BK + tl.arange(0, BK) + mask_bk = i_k * BK + tl.arange(0, BK) < K + mask_bv = i_v * BV + tl.arange(0, BV) < V + + b_h = tl.zeros([BK, BV], dtype=tl.float32) + + if USE_INITIAL_STATE: + mask_kv = mask_bk[:, None] & mask_bv[None, :] + p_h0 = h0 + i_bh * K * V + (i_k * BK + tl.arange(0, BK)[:, None]) * V + (i_v * BV + tl.arange(0, BV)[None, :]) + b_h += tl.load(p_h0, mask=mask_kv, other=0).to(tl.float32) + + for _ in range(0, T): + b_k = tl.load(p_k, mask=mask_bk, other=0).to(tl.float32) + b_v = tl.load(p_v, mask=mask_bv, other=0).to(tl.float32) + b_do = tl.load(p_do, mask=mask_bv, other=0).to(tl.float32) + + b_h += b_k[:, None] * b_v[None, :] + _d_q = b_h * b_do[None, :] + d_q = tl.sum(_d_q, axis=1) * scale + tl.store(p_dq, d_q.to(p_dq.dtype.element_ty), mask=mask_bk) + + p_k += K + p_do += V + p_v += V + p_dq += K + + # sync threads + tl.debug_barrier() + + p_q = q + i_bh * s_k_h + i_k * BK + tl.arange(0, BK) + (T - 1) * K + p_k = k + i_bh * s_k_h + i_k * BK + tl.arange(0, BK) + (T - 1) * K + p_do = do + i_bh * s_v_h + i_v * BV + tl.arange(0, BV) + (T - 1) * V + p_v = v + i_bh * s_v_h + i_v * BV + tl.arange(0, BV) + (T - 1) * V + p_dk = dk + (i_bh + i_v * B * H) * s_k_h + i_k * BK + 
tl.arange(0, BK) + (T - 1) * K + p_dv = dv + (i_bh + i_k * B * H) * s_v_h + i_v * BV + tl.arange(0, BV) + (T - 1) * V + d_h = tl.zeros([BK, BV], dtype=tl.float32) + + for _ in range(T): + b_do = tl.load(p_do, mask=mask_bv, other=0).to(tl.float32) + b_q = tl.load(p_q, mask=mask_bk, other=0).to(tl.float32) * scale + b_k = tl.load(p_k, mask=mask_bk, other=0).to(tl.float32) + b_v = tl.load(p_v, mask=mask_bv, other=0).to(tl.float32) + d_h += b_q[:, None] * b_do[None, :] + d_k = tl.sum(d_h * b_v[None, :], axis=1) + d_v = tl.sum(d_h * b_k[:, None], axis=0) + + tl.store(p_dk, d_k.to(p_dk.dtype.element_ty), mask=mask_bk) + tl.store(p_dv, d_v.to(p_dv.dtype.element_ty), mask=mask_bv) + + p_do -= V + p_q -= K + p_k -= K + p_v -= V + p_dk -= K + p_dv -= V + + +class FusedRecurrentLinearAttentionFunction(torch.autograd.Function): + + @staticmethod + @input_guard + def forward(ctx, q, k, v, scale, initial_state=None, output_final_state=False): + B, H, T, K = q.shape + V = v.shape[-1] + + BK, BV = min(K, 32), min(V, 32) + NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV) + num_warps = 1 + num_stages = 1 + + o = q.new_empty(NK, B, H, T, V) + final_state = q.new_empty(B, H, K, V) if output_final_state else None + + grid = (NV, NK, B * H) + fused_recurrent_linear_attn_fwd_kernel[grid]( + q, k, v, o, initial_state, final_state, + q.stride(1), + v.stride(1), scale, + B=B, H=H, T=T, K=K, V=V, BK=BK, BV=BV, + USE_INITIAL_STATE=initial_state is not None, + STORE_FINAL_STATE=final_state is not None, + num_warps=num_warps, + num_stages=num_stages + ) + + o = o.sum(0) + ctx.save_for_backward(q, k, v, initial_state) + ctx.scale = scale + return o, final_state + + @staticmethod + @input_guard + def backward(ctx, do, dht=None): + q, k, v, initial_state = ctx.saved_tensors + B, H, T, K = q.shape + V = v.shape[-1] + scale = ctx.scale + + BK, BV = min(K, 32), min(V, 32) + NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV) + num_warps = 1 + num_stages = 1 + + dq = q.new_empty(NV, B, H, T, K) + dk = 
q.new_empty(NV, B, H, T, K) + dv = q.new_empty(NK, B, H, T, V) + grid = (NV, NK, B * H) + + fused_recurrent_linear_attn_bwd_kernel[grid]( + q, k, v, do, dq, dk, dv, initial_state, + q.stride(1), + v.stride(1), + scale, + B=B, H=H, T=T, K=K, V=V, BK=BK, BV=BV, + USE_INITIAL_STATE=initial_state is not None, + num_warps=num_warps, + num_stages=num_stages + ) + dq = dq.sum(0) + dk = dk.sum(0) + dv = dv.sum(0) + return dq, dk, dv, None, None, None + + +def fused_recurrent_linear_attn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: Optional[float] = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + normalize: bool = False, + head_first: bool = True +) -> Tuple[torch.Tensor, torch.Tensor]: + if scale is None: + scale = q.shape[-1] ** -0.5 + if not head_first: + q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v)) + o, final_state = FusedRecurrentLinearAttentionFunction.apply(q, k, v, scale, initial_state, output_final_state) + if normalize: + o = normalize_output(q * scale, k, o) + if not head_first: + o = o.transpose(1, 2) + return o, final_state diff --git a/fla/ops/linear_attn/naive.py b/fla/ops/linear_attn/naive.py new file mode 100644 index 0000000000000000000000000000000000000000..b6ecf2718fcac8eef80f445ed02b95f36329f3c4 --- /dev/null +++ b/fla/ops/linear_attn/naive.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +from typing import Optional, Tuple + +import torch +from einops import rearrange + +from fla.ops.linear_attn.utils import normalize_output + + +def naive_chunk_linear_attn( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: Optional[float] = None, + normalize: bool = False +) -> Tuple[torch.Tensor, torch.Tensor]: + if scale is None: + scale = q.shape[-1] ** -0.5 + chunk_size = 64 + q = rearrange(q, 'b h (n c) d -> b h n c d', c=chunk_size) * scale + k = rearrange(k, 'b h (n c) d -> b h n c d', c=chunk_size) + v = rearrange(v, 'b h (n c) d -> b h n c d', c=chunk_size) + kv = k.transpose(-1, 
-2) @ v + kv = kv.cumsum(2) + kv = torch.cat([torch.zeros_like(kv[:, :, :1]), kv[:, :, :-1]], dim=2) + inter = q @ kv + intra = (( + q @ k.transpose(-1, -2)).masked_fill_( + torch.triu(torch.ones(chunk_size, chunk_size, dtype=bool, device=q.device), diagonal=1), + 0 + )) @ v + o = inter + intra + if normalize: + o = normalize_output(q * scale, k, o) + return rearrange(o, 'b h n c d -> b h (n c) d') diff --git a/fla/ops/linear_attn/utils.py b/fla/ops/linear_attn/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b444376833f5d512af6fc2db387db75a43a92e5d --- /dev/null +++ b/fla/ops/linear_attn/utils.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- + +import torch + + +@torch.jit.script +def normalize_output(q, k, o): + k = k.cumsum(-2) + z = (q * k).sum(-1, keepdim=True) + return o / (z + 1e-10) diff --git a/fla/ops/nsa/__pycache__/__init__.cpython-311.pyc b/fla/ops/nsa/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6328d041eb0359a51e0c9f0a1fad7f75b4188b4f Binary files /dev/null and b/fla/ops/nsa/__pycache__/__init__.cpython-311.pyc differ diff --git a/fla/ops/nsa/parallel.py b/fla/ops/nsa/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..7e89d964c7357ceeabaaeb9500849ce6cbdecfad --- /dev/null +++ b/fla/ops/nsa/parallel.py @@ -0,0 +1,1435 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import warnings +from typing import Optional, Union + +import torch +import triton +import triton.language as tl +from einops import rearrange + +from fla.ops.common.utils import prepare_chunk_indices, prepare_chunk_offsets, prepare_lens, prepare_token_indices +from fla.ops.nsa.utils import _bitonic_merge +from fla.ops.utils import mean_pooling +from fla.ops.utils.op import exp, log +from fla.utils import autocast_custom_bwd, autocast_custom_fwd, check_shared_mem, contiguous + +try: + from flash_attn import flash_attn_func, 
flash_attn_varlen_func +except ImportError: + warnings.warn( + "Flash Attention is not installed. Please install it via `pip install flash-attn --no-build-isolation`", + category=ImportWarning + ) + flash_attn_func = None + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK', 'BV'], +) +@triton.jit +def parallel_nsa_compression_fwd_kernel( + q, + k, + v, + o, + lse, + scale, + offsets, + token_indices, + chunk_offsets, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BC: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, +): + i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + boc = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + boc = i_b * tl.cdiv(T, BS) + + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + + # the Q block is kept in the shared memory throughout the whole kernel + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + # the number of compression representations in total + TC = tl.cdiv(T, BS) + # the number of compression representations required to iterate over + # incomplete compression blocks are not included + NC = (i_t + 1) // BS + + p_o = tl.make_block_ptr(o + (bos + i_t) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) + # [G, BV] + b_o = tl.zeros([G, BV], dtype=tl.float32) + # max scores for the current block + b_m = tl.full([G], float('-inf'), 
dtype=tl.float32) + # lse = log(acc) + m + b_acc = tl.zeros([G], dtype=tl.float32) + + for i_c in range(0, NC, BC): + o_c = i_c + tl.arange(0, BC) + + p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (K, TC), (1, H*K), (0, i_c), (BK, BC), (0, 1)) + p_v = tl.make_block_ptr(v + (boc * H + i_h) * V, (TC, V), (H*V, 1), (i_c, i_v * BV), (BC, BV), (1, 0)) + # [BK, BC] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BC, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [G, BC] + b_s = tl.dot(b_q, b_k) + b_s = tl.where((o_c < NC)[None, :], b_s, float('-inf')) + + # [G] + b_m, b_mp = tl.maximum(b_m, tl.max(b_s, 1)), b_m + b_r = exp(b_mp - b_m) + # [G, BC] + b_p = exp(b_s - b_m[:, None]) + # [G] + b_acc = b_acc * b_r + tl.sum(b_p, 1) + + # [G, BV] + b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v) + + b_mp = b_m + if NC == 0: + b_lse = tl.zeros([G], dtype=tl.float32) + else: + b_o = b_o / b_acc[:, None] + b_lse = b_m + log(b_acc) + + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + if i_v == 0: + tl.store(lse + (bos + i_t) * HQ + i_h * G + tl.arange(0, G), b_lse.to(lse.dtype.element_ty)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None, +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK', 'BV'], +) +@triton.jit(do_not_specialize=['T']) +def parallel_nsa_compression_bwd_kernel_dq( + q, + k, + v, + lse, + delta, + do, + dq, + scale, + offsets, + token_indices, + chunk_offsets, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BC: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr +): + i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) + 
bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + boc = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + boc = i_b * tl.cdiv(T, BS) + + q += (bos + i_t) * HQ*K + do += (bos + i_t) * HQ*V + lse += (bos + i_t) * HQ + delta += (bos + i_t) * HQ + dq += (i_v * B * T + bos + i_t) * HQ*K + + p_q = tl.make_block_ptr(q, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + p_do = tl.make_block_ptr(do, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_lse = lse + i_h * G + tl.arange(0, G) + p_delta = delta + i_h * G + tl.arange(0, G) + + # the number of compression representations in total + TC = tl.cdiv(T, BS) + # the number of compression representations required to iterate over + # incomplete compression blocks are not included + NC = (i_t + 1) // BS + + # [G, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [G] + b_lse = tl.load(p_lse) + b_delta = tl.load(p_delta) + + # [G, BK] + b_dq = tl.zeros([G, BK], dtype=tl.float32) + for i_c in range(0, NC, BC): + o_c = i_c + tl.arange(0, BC) + p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (K, TC), (1, H*K), (0, i_c), (BK, BC), (0, 1)) + p_v = tl.make_block_ptr(v + (boc * H + i_h) * V, (V, TC), (1, H*V), (i_v * BV, i_c), (BV, BC), (0, 1)) + # [BK, BC] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BV, BC] + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # [G, BC] + b_s = tl.dot(b_q, b_k) + b_p = exp(b_s - b_lse[:, None]) + b_p = tl.where((o_c < NC)[None, :], b_p, 0) + + # [G, BV] @ [BV, BC] -> [G, BC] + b_dp = tl.dot(b_do, b_v) + b_ds = b_p * (b_dp.to(tl.float32) - b_delta[:, None]) + # [G, BC] @ [BC, BK] -> [G, BK] + b_dq += tl.dot(b_ds.to(b_k.dtype), tl.trans(b_k)) + b_dq *= scale + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) 
+ + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK', 'BV'], +) +@triton.jit(do_not_specialize=['T']) +def parallel_nsa_compression_bwd_kernel_dkv( + q, + k, + v, + lse, + delta, + do, + dk, + dv, + offsets, + chunk_indices, + chunk_offsets, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BC: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr +): + i_v, i_c, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_c = tl.load(chunk_indices + i_c * 2).to(tl.int32), tl.load(chunk_indices + i_c * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + boc = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + boc = i_b * tl.cdiv(T, BS) + + # the number of compression representations in total + TC = tl.cdiv(T, BS) + + p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (TC, K), (H*K, 1), (i_c * BC, 0), (BC, BK), (1, 0)) + p_v = tl.make_block_ptr(v + (boc * H + i_h) * V, (TC, V), (H*V, 1), (i_c * BC, i_v * BV), (BC, BV), (1, 0)) + p_dk = tl.make_block_ptr(dk + (i_v * B*T*H + boc * H + i_h) * K, (TC, K), (H*K, 1), (i_c * BC, 0), (BC, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (i_v * B*T*H + boc * H + i_h) * V, (TC, V), (H*V, 1), (i_c * BC, i_v * BV), (BC, BV), (1, 0)) + + # [BC, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dk = tl.zeros([BC, BK], dtype=tl.float32) + # [BC, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_dv = tl.zeros([BC, BV], dtype=tl.float32) + + for i in range(i_c * BC * BS, T): + o_c = i_c * BC + tl.arange(0, BC) + + p_q = tl.make_block_ptr(q + (bos + i) * HQ*K, (HQ, K), (K, 1), (i_h * 
G, 0), (G, BK), (1, 0)) + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + p_do = tl.make_block_ptr(do + (bos + i) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_lse = lse + (bos + i) * HQ + i_h * G + tl.arange(0, G) + p_delta = delta + (bos + i) * HQ + i_h * G + tl.arange(0, G) + # [G, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [G] + b_lse = tl.load(p_lse) + b_delta = tl.load(p_delta) + # [BC, G] + b_s = tl.dot(b_k, tl.trans(b_q)) + b_p = exp(b_s - b_lse[None, :]) + b_p = tl.where((i >= max(0, (o_c + 1) * BS - 1))[:, None], b_p, 0) + # [BC, G] @ [G, BV] -> [BC, BV] + b_dv += tl.dot(b_p.to(b_do.dtype), b_do) + # [BC, BV] @ [BV, G] -> [BC, G] + b_dp = tl.dot(b_v, tl.trans(b_do)) + # [BC, G] + b_ds = b_p * (b_dp - b_delta[None, :]) + # [BC, G] @ [G, BK] -> [BC, BK] + b_dk += tl.dot(b_ds.to(b_q.dtype), b_q) + + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK'], +) +@triton.jit +def parallel_nsa_kernel_topk( + q, + k, + lse, + scale, + block_indices, + offsets, + token_indices, + chunk_offsets, + T, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + S: tl.constexpr, + BC: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + USE_OFFSETS: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + boc = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + boc = 
i_b * tl.cdiv(T, BS) + + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + + # the Q block is kept in the shared memory throughout the whole kernel + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + # the number of compression representations in total + TC = tl.cdiv(T, BS) + # the number of compression representations required to iterate over + # incomplete compression blocks are not included + NC = (i_t + 1) // BS + ################################ + # 1. lse computation + ################################ + if lse is not None: + b_lse = tl.load(lse + (bos + i_t) * HQ + i_h * G + tl.arange(0, G)) + else: + # max scores for the current block + b_m = tl.full([G], float('-inf'), dtype=tl.float32) + # lse = log(acc) + m + b_acc = tl.zeros([G], dtype=tl.float32) + for i_c in range(0, NC, BC): + o_c = i_c + tl.arange(0, BC) + + p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (K, TC), (1, H*K), (0, i_c), (BK, BC), (0, 1)) + # [BK, BC] + b_k = tl.load(p_k, boundary_check=(0, 1)) + + # [G, BC] + b_s = tl.dot(b_q, b_k) + b_s = tl.where((o_c < NC)[None, :], b_s, float('-inf')) + + # [G] + b_m, b_mp = tl.maximum(b_m, tl.max(b_s, 1)), b_m + b_r = exp(b_mp - b_m) + # [G, BC] + b_p = exp(b_s - b_m[:, None]) + # [G] + b_acc = b_acc * b_r + tl.sum(b_p, 1) + + b_mp = b_m + if NC == 0: + b_lse = tl.zeros([G], dtype=tl.float32) + else: + b_lse = b_m + log(b_acc) + + ################################ + # 2. 
topk selection + ################################ + # [BC] + b_i = tl.full([BC], -1, dtype=tl.float32) + o_i = tl.zeros([BC], dtype=tl.int32) + m_i = tl.arange(0, BC) < BC//2 + for i_c in range(0, i_t // BS + 1, BC): + o_c = i_c + tl.arange(0, BC) + + p_k = tl.make_block_ptr(k + (boc * H + i_h) * K, (K, TC), (1, H*K), (0, i_c), (BK, BC), (0, 1)) + # [BK, BC] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [G, BC] + b_s = tl.dot(b_q, b_k) + b_s = tl.where((i_t // BS > o_c)[None, :], b_s, float('-inf')) + # [G, BC] + b_p = tl.where((i_t // BS == o_c)[None, :], float(1.0), exp(b_s - b_lse[:, None])) + # the importance scores of the current block + # [BC] + b_i, b_ip = tl.sum(b_p, 0), b_i + o_i, o_ip = tl.where(o_c <= i_t // BS, o_c + 1, 0), o_i + + n_dims: tl.constexpr = tl.standard._log2(b_i.shape[0]) + for i in tl.static_range(1, n_dims): + b_i, o_i = _bitonic_merge(b_i, o_i.to(tl.int32), i, 2, n_dims) + + if i_c != 0: + b_i, o_i = _bitonic_merge(b_i, o_i.to(tl.int32), n_dims, False, n_dims) + b_i_new = b_ip * m_i + b_i * (1 - m_i) + o_i_new = o_ip * m_i + o_i * (1 - m_i) + b_i, o_i = _bitonic_merge(b_i_new, o_i_new.to(tl.int32), n_dims, True, n_dims) + else: + b_i, o_i = _bitonic_merge(b_i, o_i.to(tl.int32), n_dims, True, n_dims) + + m_top = tl.arange(0, BC//S) == 0 + b_top = tl.sum(m_top[:, None] * tl.reshape(o_i - 1, [BC//S, S]), 0) + + p_b = tl.make_block_ptr(block_indices + (bos + i_t) * H*S, (H*S,), (1,), (i_h * S,), (S,), (0,)) + tl.store(p_b, b_top.to(p_b.dtype.element_ty)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None, + 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor), +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK', 'BV'], +) +@triton.jit +def parallel_nsa_fwd_kernel( + q, + k, + v, + o, + lse, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + H: tl.constexpr, + HQ: tl.constexpr, + 
G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr +): + i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + k += (bos * H + i_h) * K + v += (bos * H + i_h) * V + block_indices += (bos + i_t) * H*S + i_h * S + + if USE_BLOCK_COUNTS: + NS = tl.load(block_counts + (bos + i_t) * H + i_h) + else: + NS = S + + p_q = tl.make_block_ptr(q + (bos + i_t) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + # the Q block is kept in the shared memory throughout the whole kernel + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + p_o = tl.make_block_ptr(o + (bos + i_t) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_lse = lse + (bos + i_t) * HQ + i_h * G + tl.arange(0, G) + # [G, BV] + b_o = tl.zeros([G, BV], dtype=tl.float32) + + b_m = tl.full([G], float('-inf'), dtype=tl.float32) + b_acc = tl.zeros([G], dtype=tl.float32) + for i in range(NS): + i_s = tl.load(block_indices + i).to(tl.int32) * BS + if i_s <= i_t and i_s >= 0: + p_k = tl.make_block_ptr(k, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1)) + p_v = tl.make_block_ptr(v, (T, V), (H*V, 1), (i_s, i_v * BV), (BS, BV), (1, 0)) + # [BK, BS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BS, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [G, BS] + b_s = tl.dot(b_q, b_k) + b_s = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_s, float('-inf')) + + # [G] + b_m, b_mp = tl.maximum(b_m, tl.max(b_s, 1)), b_m + b_r = exp(b_mp - b_m) + # [G, BS] + b_p = exp(b_s - b_m[:, None]) + # [G] 
+ b_acc = b_acc * b_r + tl.sum(b_p, 1) + # [G, BV] + b_o = b_o * b_r[:, None] + tl.dot(b_p.to(b_q.dtype), b_v) + + b_mp = b_m + b_o = b_o / b_acc[:, None] + b_m += log(b_acc) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_lse, b_m.to(p_lse.dtype.element_ty)) + + +@triton.heuristics({ + 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor) +}) +@triton.jit +def parallel_nsa_kernel_mask( + block_indices, + block_counts, + block_mask, + T: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + NS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr +): + i_t, i_b, i_hs = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_h, i_s = i_hs // S, i_hs % S + + b_i = tl.load(block_indices + i_b * T * H * S + i_t * H * S + i_h * S + i_s) + if USE_BLOCK_COUNTS: + b_m = b_i * BS <= i_t and i_s < tl.load(block_counts + i_b * T * H + i_t * H + i_h) + else: + b_m = b_i * BS <= i_t + + if b_i < NS and b_i >= 0: + tl.store(block_mask + i_b * T * H * NS + i_t * H * NS + i_h * NS + b_i, b_m.to(block_mask.dtype.element_ty)) + + +@triton.jit +def parallel_nsa_bwd_kernel_preprocess( + o, + do, + delta, + B: tl.constexpr, + V: tl.constexpr +): + i_n = tl.program_id(0) + o_d = tl.arange(0, B) + m_d = o_d < V + + b_o = tl.load(o + i_n * V + o_d, mask=m_d, other=0) + b_do = tl.load(do + i_n * V + o_d, mask=m_d, other=0).to(tl.float32) + b_delta = tl.sum(b_o * b_do) + + tl.store(delta + i_n, b_delta.to(delta.dtype.element_ty)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None, + 'USE_BLOCK_COUNTS': lambda args: isinstance(args['block_counts'], torch.Tensor) +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK', 'BV'], +) +@triton.jit(do_not_specialize=['T']) +def parallel_nsa_bwd_kernel_dq( + q, + k, + v, + lse, + delta, + do, + dq, + scale, + block_indices, + block_counts, + offsets, + token_indices, + T, + 
B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + S: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr, + USE_BLOCK_COUNTS: tl.constexpr +): + i_t, i_v, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_t = tl.load(token_indices + i_t * 2).to(tl.int32), tl.load(token_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + q += (bos + i_t) * HQ*K + do += (bos + i_t) * HQ*V + lse += (bos + i_t) * HQ + delta += (bos + i_t) * HQ + dq += (i_v * B * T + bos + i_t) * HQ*K + block_indices += (bos + i_t) * H*S + i_h * S + + if USE_BLOCK_COUNTS: + NS = tl.load(block_counts + (bos + i_t) * H + i_h) + else: + NS = S + + k += (bos * H + i_h) * K + v += (bos * H + i_h) * V + + p_q = tl.make_block_ptr(q, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + p_do = tl.make_block_ptr(do, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_lse = lse + i_h * G + tl.arange(0, G) + p_delta = delta + i_h * G + tl.arange(0, G) + + # [G, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [G] + b_lse = tl.load(p_lse) + b_delta = tl.load(p_delta) + + # [G, BK] + b_dq = tl.zeros([G, BK], dtype=tl.float32) + for i in range(NS): + i_s = tl.load(block_indices + i).to(tl.int32) * BS + if i_s <= i_t and i_s >= 0: + p_k = tl.make_block_ptr(k, (K, T), (1, H*K), (0, i_s), (BK, BS), (0, 1)) + p_v = tl.make_block_ptr(v, (V, T), (1, H*V), (i_v * BV, i_s), (BV, BS), (0, 1)) + # [BK, BS] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BV, BS] + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # [G, BS] + b_s = 
tl.dot(b_q, b_k) + b_p = exp(b_s - b_lse[:, None]) + b_p = tl.where((i_t >= (i_s + tl.arange(0, BS)))[None, :], b_p, 0) + + # [G, BV] @ [BV, BS] -> [G, BS] + b_dp = tl.dot(b_do, b_v) + b_ds = b_p * (b_dp.to(tl.float32) - b_delta[:, None]) + # [G, BS] @ [BS, BK] -> [G, BK] + b_dq += tl.dot(b_ds.to(b_k.dtype), tl.trans(b_k)) + b_dq *= scale + + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics({ + 'USE_OFFSETS': lambda args: args['offsets'] is not None +}) +@triton.autotune( + configs=[ + triton.Config({}, num_warps=num_warps) + for num_warps in [1, 2, 4] + ], + key=['BS', 'BK', 'BV'], +) +@triton.jit(do_not_specialize=['T']) +def parallel_nsa_bwd_kernel_dkv( + q, + k, + v, + lse, + delta, + do, + dk, + dv, + block_mask, + offsets, + chunk_indices, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + HQ: tl.constexpr, + G: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + M: tl.constexpr, + BS: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_OFFSETS: tl.constexpr +): + i_v, i_s, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + + if USE_OFFSETS: + i_n, i_s = tl.load(chunk_indices + i_s * 2).to(tl.int32), tl.load(chunk_indices + i_s * 2 + 1).to(tl.int32) + bos, eos = tl.load(offsets + i_n).to(tl.int32), tl.load(offsets + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H*K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) + p_dk = tl.make_block_ptr(dk + (i_v * B*T*H + bos * H + i_h) * K, (T, K), (H*K, 1), (i_s * BS, 0), (BS, BK), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H*V, 1), (i_s * BS, i_v * BV), (BS, BV), (1, 0)) + + # [BS, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dk = tl.zeros([BS, BK], dtype=tl.float32) + # [BS, BV] + b_v = 
tl.load(p_v, boundary_check=(0, 1)) + b_dv = tl.zeros([BS, BV], dtype=tl.float32) + + for i in range(i_s * BS, T): + b_m = tl.load(block_mask + (bos + i) * H*M + i_h * M + i_s) + if b_m: + p_q = tl.make_block_ptr(q + (bos + i) * HQ*K, (HQ, K), (K, 1), (i_h * G, 0), (G, BK), (1, 0)) + # [G, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + + p_do = tl.make_block_ptr(do + (bos + i) * HQ*V, (HQ, V), (V, 1), (i_h * G, i_v * BV), (G, BV), (1, 0)) + p_lse = lse + (bos + i) * HQ + i_h * G + tl.arange(0, G) + p_delta = delta + (bos + i) * HQ + i_h * G + tl.arange(0, G) + # [G, BV] + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [G] + b_lse = tl.load(p_lse) + b_delta = tl.load(p_delta) + # [BS, G] + b_s = tl.dot(b_k, tl.trans(b_q)) + b_p = exp(b_s - b_lse[None, :]) + b_p = tl.where((i >= (i_s * BS + tl.arange(0, BS)))[:, None], b_p, 0) + # [BS, G] @ [G, BV] -> [BS, BV] + b_dv += tl.dot(b_p.to(b_do.dtype), b_do) + # [BS, BV] @ [BV, G] -> [BS, G] + b_dp = tl.dot(b_v, tl.trans(b_do)) + # [BS, G] + b_ds = b_p * (b_dp - b_delta[None, :]) + # [BS, G] @ [G, BK] -> [BS, BK] + b_dk += tl.dot(b_ds.to(b_q.dtype), b_q) + + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +def parallel_nsa_compression_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + block_size: int, + scale: float, + offsets: Optional[torch.LongTensor] = None, + token_indices: Optional[torch.LongTensor] = None, +): + B, T, HQ, K, V = *q.shape, v.shape[-1] + H = k.shape[2] + G = HQ // H + BC = BS = block_size + if check_shared_mem('hopper', q.device.index): + BK = min(256, triton.next_power_of_2(K)) + BV = min(256, triton.next_power_of_2(V)) + else: + BK = min(128, triton.next_power_of_2(K)) + BV = min(128, triton.next_power_of_2(V)) + NK = triton.cdiv(K, BK) + NV = triton.cdiv(V, BV) + assert NK == 1, "The key dimension can not be larger than 256" + + chunk_offsets = 
prepare_chunk_offsets(offsets, BS) if offsets is not None else None + + grid = (T, NV, B * H) + o = torch.empty(B, T, HQ, V, dtype=v.dtype, device=q.device) + lse = torch.empty(B, T, HQ, dtype=torch.float, device=q.device) + + parallel_nsa_compression_fwd_kernel[grid]( + q=q, + k=k, + v=v, + o=o, + lse=lse, + scale=scale, + offsets=offsets, + token_indices=token_indices, + chunk_offsets=chunk_offsets, + T=T, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + BC=BC, + BS=BS, + BK=BK, + BV=BV, + ) + return o, lse + + +def parallel_nsa_compression_bwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + o: torch.Tensor, + lse: torch.Tensor, + do: torch.Tensor, + block_size: int = 64, + scale: float = None, + offsets: Optional[torch.LongTensor] = None, + token_indices: Optional[torch.LongTensor] = None, +): + B, T, HQ, K, V = *q.shape, v.shape[-1] + H = k.shape[2] + G = HQ // H + BC = BS = block_size + BK = triton.next_power_of_2(K) + BV = min(128, triton.next_power_of_2(v.shape[-1])) + NV = triton.cdiv(V, BV) + if offsets is not None: + lens = prepare_lens(offsets) + chunk_indices = torch.cat([torch.arange(n) for n in triton.cdiv(triton.cdiv(lens, BS), BC).tolist()]) + chunk_indices = torch.stack([chunk_indices.eq(0).cumsum(0) - 1, chunk_indices], 1).to(offsets) + chunk_offsets = prepare_chunk_offsets(offsets, BS) + NC = len(chunk_indices) + else: + chunk_indices, chunk_offsets = None, None + NC = triton.cdiv(triton.cdiv(T, BS), BC) + + delta = parallel_nsa_bwd_preprocess(o, do) + + dq = torch.empty(NV, *q.shape, dtype=q.dtype if NV == 1 else torch.float, device=q.device) + grid = (T, NV, B * H) + parallel_nsa_compression_bwd_kernel_dq[grid]( + q=q, + k=k, + v=v, + lse=lse, + delta=delta, + do=do, + dq=dq, + scale=scale, + offsets=offsets, + token_indices=token_indices, + chunk_offsets=chunk_offsets, + T=T, + B=B, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + BC=BC, + BS=BS, + BK=BK, + BV=BV + ) + dq = dq.sum(0) + + dk = torch.empty(NV, *k.shape, dtype=k.dtype if NV == 1 else 
class ParallelNSACompressionFunction(torch.autograd.Function):
    """Autograd wrapper around the compressed-attention forward/backward launchers.

    ``forward`` returns the attention output (cast to the dtype of ``q``) together
    with the log-sum-exp tensor produced by ``parallel_nsa_compression_fwd``.
    ``backward`` only propagates gradients to ``q``, ``k`` and ``v``.
    """

    @staticmethod
    @contiguous
    @autocast_custom_fwd
    def forward(
        ctx,
        q,
        k,
        v,
        block_size,
        scale,
        offsets
    ):
        ctx.dtype = q.dtype

        # `token_indices` holds one (sequence id, position in sequence) pair per token.
        # E.g. offsets [0, 2, 6] describe two sequences of 2 and 4 tokens, giving
        # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]].
        if offsets is None:
            token_indices = None
        else:
            token_indices = prepare_token_indices(offsets)

        out, log_sum_exp = parallel_nsa_compression_fwd(
            q=q,
            k=k,
            v=v,
            block_size=block_size,
            scale=scale,
            offsets=offsets,
            token_indices=token_indices
        )

        # Everything the backward pass needs: tensors go through autograd's
        # bookkeeping, plain Python values are stashed on the context.
        ctx.save_for_backward(q, k, v, out, log_sum_exp)
        ctx.offsets = offsets
        ctx.token_indices = token_indices
        ctx.block_size = block_size
        ctx.scale = scale
        return out.to(q.dtype), log_sum_exp

    @staticmethod
    @contiguous
    @autocast_custom_bwd
    def backward(ctx, do, *args):
        # `*args` swallows the incoming gradient w.r.t. the second output (lse);
        # it is not used.
        q, k, v, out, log_sum_exp = ctx.saved_tensors
        grad_q, grad_k, grad_v = parallel_nsa_compression_bwd(
            q=q,
            k=k,
            v=v,
            o=out,
            lse=log_sum_exp,
            do=do,
            block_size=ctx.block_size,
            scale=ctx.scale,
            offsets=ctx.offsets,
            token_indices=ctx.token_indices
        )
        # One entry per forward input: q, k, v, then block_size / scale / offsets.
        return grad_q.to(q), grad_k.to(k), grad_v.to(v), None, None, None
+) -> torch.LongTensor: + B, T, HQ, K = q.shape + H = k.shape[2] + G = HQ // H + S = block_counts if isinstance(block_counts, int) else block_counts.max().item() + S = triton.next_power_of_2(S) + # here we set BC = BS, but beware that they are actually decoupled + BC = BS = block_size + BK = triton.next_power_of_2(K) + + block_indices = torch.zeros(B, T, H, S, dtype=torch.int32, device=q.device) + token_indices = prepare_token_indices(offsets) if offsets is not None else None + chunk_offsets = prepare_chunk_offsets(offsets, BS) if offsets is not None else None + grid = (T, B * H) + parallel_nsa_kernel_topk[grid]( + q=q, + k=k, + lse=lse, + scale=scale, + block_indices=block_indices, + offsets=offsets, + token_indices=token_indices, + chunk_offsets=chunk_offsets, + T=T, + H=H, + HQ=HQ, + G=G, + K=K, + S=S, + BC=BC, + BS=BS, + BK=BK + ) + return block_indices + + +def parallel_nsa_fwd( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + block_indices: torch.LongTensor, + block_counts: Union[torch.LongTensor, int], + block_size: int, + scale: float, + offsets: Optional[torch.LongTensor] = None, + token_indices: Optional[torch.LongTensor] = None, +): + B, T, H, K, V, S = *k.shape, v.shape[-1], block_indices.shape[-1] + HQ = q.shape[2] + G = HQ // H + BS = block_size + if check_shared_mem('hopper', q.device.index): + BK = min(256, triton.next_power_of_2(K)) + BV = min(256, triton.next_power_of_2(V)) + else: + BK = min(128, triton.next_power_of_2(K)) + BV = min(128, triton.next_power_of_2(V)) + NK = triton.cdiv(K, BK) + NV = triton.cdiv(V, BV) + assert NK == 1, "The key dimension can not be larger than 256" + + grid = (T, NV, B * H) + o = torch.empty(B, T, HQ, V, dtype=v.dtype, device=q.device) + lse = torch.empty(B, T, HQ, dtype=torch.float, device=q.device) + + parallel_nsa_fwd_kernel[grid]( + q=q, + k=k, + v=v, + o=o, + lse=lse, + scale=scale, + block_indices=block_indices, + block_counts=block_counts, + offsets=offsets, + token_indices=token_indices, + T=T, + 
def parallel_nsa_block_mask(
    block_indices: torch.LongTensor,
    block_counts: Union[torch.LongTensor, int],
    offsets: torch.LongTensor,
    block_size: int,
):
    """Launch `parallel_nsa_kernel_mask` to fill a boolean `[B, T, H, NS]` block mask.

    `NS` is the number of `block_size`-sized blocks needed to cover the longest
    sequence: the maximum length derived from `offsets` when it is given,
    otherwise `T`.
    """
    B, T, H, S = block_indices.shape
    BS = block_size

    longest = T if offsets is None else prepare_lens(offsets).max().item()
    NS = triton.cdiv(longest, BS)

    block_mask = torch.zeros(B, T, H, NS, dtype=torch.bool, device=block_indices.device)
    launch_grid = (T, B, H*S)
    parallel_nsa_kernel_mask[launch_grid](
        block_indices=block_indices,
        block_counts=block_counts,
        block_mask=block_mask,
        T=T,
        H=H,
        S=S,
        BS=BS,
        NS=NS
    )
    return block_mask


def parallel_nsa_bwd_preprocess(
    o: torch.Tensor,
    do: torch.Tensor
):
    """Launch `parallel_nsa_bwd_kernel_preprocess` and return its float32 `delta` output.

    `delta` has the shape of `o` without its last dimension; one program is
    launched per element of `delta`.
    """
    V = o.shape[-1]
    delta = torch.empty_like(o[..., 0], dtype=torch.float32)
    launch_grid = (delta.numel(),)
    parallel_nsa_bwd_kernel_preprocess[launch_grid](
        o=o,
        do=do,
        delta=delta,
        B=triton.next_power_of_2(V),
        V=V,
    )
    return delta
T=T, + B=B, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + S=S, + BS=BS, + BK=BK, + BV=BV + ) + dq = dq.sum(0) + + if offsets is not None: + chunk_indices = prepare_chunk_indices(offsets, BS) + NS = len(chunk_indices) + else: + chunk_indices = None + NS = triton.cdiv(T, BS) + + # [B, T, H, M] + block_mask = parallel_nsa_block_mask(block_indices, block_counts, offsets, block_size) + dk = torch.empty(NV, *k.shape, dtype=k.dtype if NV == 1 else torch.float, device=q.device) + dv = torch.empty(v.shape, dtype=v.dtype, device=q.device) + + grid = (NV, NS, B * H) + parallel_nsa_bwd_kernel_dkv[grid]( + q=q, + k=k, + v=v, + lse=lse, + delta=delta, + do=do, + dk=dk, + dv=dv, + block_mask=block_mask, + offsets=offsets, + chunk_indices=chunk_indices, + scale=scale, + T=T, + B=B, + H=H, + HQ=HQ, + G=G, + K=K, + V=V, + M=block_mask.shape[-1], + BS=BS, + BK=BK, + BV=BV + ) + dk = dk.sum(0) + return dq, dk, dv + + +@torch.compile +class ParallelNSAFunction(torch.autograd.Function): + + @staticmethod + @contiguous + @autocast_custom_fwd + def forward(ctx, q, k, v, block_indices, block_counts, block_size, scale, offsets): + ctx.dtype = q.dtype + + # 2-d sequence indices denoting the offsets of tokens in each sequence + # for example, if the passed `offsets` is [0, 2, 6], + # then there are 2 and 4 tokens in the 1st and 2nd sequences respectively, and `token_indices` will be + # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]] + token_indices = prepare_token_indices(offsets) if offsets is not None else None + + o, lse = parallel_nsa_fwd( + q=q, + k=k, + v=v, + block_indices=block_indices, + block_counts=block_counts, + block_size=block_size, + scale=scale, + offsets=offsets, + token_indices=token_indices + ) + ctx.save_for_backward(q, k, v, o, lse) + ctx.block_indices = block_indices + ctx.block_counts = block_counts + ctx.offsets = offsets + ctx.token_indices = token_indices + ctx.block_size = block_size + ctx.scale = scale + return o.to(q.dtype) + + @staticmethod + @contiguous + 
def parallel_nsa(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    g_cmp: torch.Tensor,
    g_slc: torch.Tensor,
    g_swa: torch.Tensor,
    block_indices: Optional[torch.LongTensor] = None,
    block_counts: Union[torch.LongTensor, int] = 16,
    block_size: int = 64,
    window_size: int = 0,
    scale: Optional[float] = None,
    cu_seqlens: Optional[torch.LongTensor] = None,
    head_first: bool = False
) -> torch.Tensor:
    r"""
    Combine compressed, selected and (optionally) sliding-window attention,
    each weighted by its gate score.

    Args:
        q (torch.Tensor):
            queries of shape `[B, T, HQ, K]` if `head_first=False` else `[B, HQ, T, K]`.
        k (torch.Tensor):
            keys of shape `[B, T, H, K]` if `head_first=False` else `[B, H, T, K]`.
            GQA is enforced here. The number of query heads (HQ) must be a multiple of `16 * H`,
            i.e. the group size `HQ / H` must be a multiple of 16.
        v (torch.Tensor):
            values of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
        g_cmp (torch.Tensor):
            Gate score for compressed attention of shape `[B, T, HQ]` if `head_first=False` else `[B, HQ, T]`.
            If `None`, the compression branch is skipped and `block_indices` must be provided.
        g_slc (torch.Tensor):
            Gate score for selected attention of shape `[B, T, HQ]` if `head_first=False` else `[B, HQ, T]`.
        g_swa (torch.Tensor):
            Gate score for sliding attention of shape `[B, T, HQ]` if `head_first=False` else `[B, HQ, T]`.
            Only used when `window_size > 0`.
        block_indices (torch.LongTensor):
            Block indices of shape `[B, T, H, S]` if `head_first=False` else `[B, H, T, S]`.
            `S` is the number of selected blocks for each query token, which is set to 16 in the paper.
            If `g_cmp` is provided, the passed `block_indices` will be ignored and
            recomputed by top-k selection over the compressed keys.
        block_counts (Optional[Union[torch.LongTensor, int]]):
            Number of selected blocks for each query.
            If a tensor is provided, with shape `[B, T, H]` if `head_first=False` else `[B, H, T]`,
            each query can select a different number of blocks.
            If not provided, it will default to 16.
        block_size (int):
            Selected block size. Default: 64.
        window_size (int):
            Sliding window size. Default: 0.
        scale (Optional[float]):
            Scale factor for attention scores.
            If not provided, it will default to `1 / sqrt(K)`. Default: `None`.
        head_first (Optional[bool]):
            Whether the inputs are in the head-first format. Default: `False`.
        cu_seqlens (torch.LongTensor):
            Cumulative sequence lengths of shape `[N+1]` used for variable-length training,
            consistent with the FlashAttention API.

    Returns:
        o (torch.Tensor):
            Outputs of shape `[B, T, HQ, V]` if `head_first=False` else `[B, HQ, T, V]`.
    """
    assert block_counts is not None, "block counts must be provided for selection"
    if scale is None:
        scale = k.shape[-1] ** -0.5
    if cu_seqlens is not None:
        assert q.shape[0] == 1, "batch size must be 1 when cu_seqlens are provided"
    if head_first:
        # Everything below works in the `[B, T, H, ...]` layout.
        q, k, v = map(lambda x: rearrange(x, 'b h t d -> b t h d'), (q, k, v))
        g_cmp, g_slc, g_swa = map(lambda x: rearrange(x, 'b h t -> b t h') if x is not None else None, (g_cmp, g_slc, g_swa))
        # User-supplied block indices follow the same head-first convention as the
        # other inputs, so they have to be transposed as well.
        if block_indices is not None:
            block_indices = rearrange(block_indices, 'b h t s -> b t h s')
        if not isinstance(block_counts, int):
            block_counts = rearrange(block_counts, 'b h t -> b t h')
    assert q.shape[2] % (k.shape[2] * 16) == 0, "Group size must be a multiple of 16 in NSA"

    o_cmp, lse_cmp = None, None
    if g_cmp is not None:
        # Compressed keys/values are only needed by the compression branch and the
        # top-k selection that depends on it, so pool them lazily.
        k_cmp, v_cmp = mean_pooling(k, block_size, cu_seqlens), mean_pooling(v, block_size, cu_seqlens)
        o_cmp, lse_cmp = parallel_nsa_compression(
            q=q,
            k=k_cmp,
            v=v_cmp,
            block_size=block_size,
            scale=scale,
            offsets=cu_seqlens
        )
        if block_indices is not None:
            warnings.warn("`block_indices` will be ignored when `g_cmp` is provided")
        block_indices = parallel_nsa_topk(
            q=q,
            k=k_cmp,
            lse=lse_cmp,
            block_counts=block_counts,
            block_size=block_size,
            scale=scale,
            offsets=cu_seqlens
        )
    # Fail early with a readable message instead of inside the kernel launch.
    assert block_indices is not None, "`block_indices` must be provided when `g_cmp` is None"

    o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
    o = o_slc * g_slc.unsqueeze(-1)
    if o_cmp is not None:
        o = torch.addcmul(o, o_cmp, g_cmp.unsqueeze(-1))
    if window_size > 0:
        if cu_seqlens is not None:
            # The packed length is passed as the maximum sequence length.
            max_seqlen = q.shape[1]
            o_swa = flash_attn_varlen_func(
                q.squeeze(0), k.squeeze(0), v.squeeze(0),
                cu_seqlens_q=cu_seqlens,
                cu_seqlens_k=cu_seqlens,
                max_seqlen_q=max_seqlen,
                max_seqlen_k=max_seqlen,
                causal=True,
                window_size=(window_size-1, 0)
            ).unsqueeze(0)
        else:
            o_swa = flash_attn_func(
                q, k, v,
                causal=True,
                window_size=(window_size-1, 0)
            )
        o = torch.addcmul(o, o_swa, g_swa.unsqueeze(-1))
    if head_first:
        o = rearrange(o, 'b t h d -> b h t d')
    return o
# One compare-and-swap round of the bitonic network.
#   x      : values being sorted
#   ids    : indices permuted together with `x`
#   flip   : per-element (or scalar) flag inverting the comparison direction
#   i      : which round this is; sets the distance between compared partners
#   n_dims : log2 of the length of the sorted dimension
# Returns the updated `(x, ids)` pair.
@triton.jit
def _compare_and_swap(
    x,
    ids,
    flip,
    i: tl.constexpr,
    n_dims: tl.constexpr,
):
    # number of independent sequences packed in `x`
    n_outer: tl.constexpr = x.numel >> n_dims
    # view each sequence as [groups, 2, stride] so that the two partners of every
    # comparison sit at positions 0 and 1 of the middle axis
    shape: tl.constexpr = [n_outer * 2**i, 2, 2**(n_dims - i - 1)]
    y = tl.reshape(x, shape)
    # slice left/right with 'stride' 2**(n_dims - i - 1)
    # (the masked sum over the size-2 axis picks exactly one of the two partners)
    mask = tl.arange(0, 2)[None, :, None]
    left = tl.broadcast_to(tl.sum(y * (1 - mask), 1)[:, None, :], shape).to(y.dtype)
    right = tl.broadcast_to(tl.sum(y * mask, 1)[:, None, :], shape).to(y.dtype)
    left = tl.reshape(left, x.shape)
    right = tl.reshape(right, x.shape)
    # idx: the same left/right extraction, applied to the indices
    y_idx = tl.reshape(ids, shape)
    left_idx = tl.broadcast_to(tl.sum(y_idx * (1 - mask), 1)[:, None, :], shape)
    right_idx = tl.broadcast_to(tl.sum(y_idx * mask, 1)[:, None, :], shape)
    left_idx = tl.reshape(left_idx, x.shape).to(y_idx.dtype)
    right_idx = tl.reshape(right_idx, x.shape).to(y_idx.dtype)
    # actual compare-and-swap
    # values are bit-cast to a signed integer of the same width so the swap can be
    # expressed as an XOR
    idtype = tl.core.get_int_dtype(bitwidth=x.dtype.primitive_bitwidth, signed=True)
    ileft = left.to(idtype, bitcast=True)
    iright = right.to(idtype, bitcast=True)
    ix = x.to(idtype, bitcast=True)

    # swap when the pair is out of order for the direction selected by `flip`
    cond = (left > right) != flip
    # XOR-ing an element with (left ^ right) turns it into its partner;
    # XOR-ing with zero leaves it untouched
    ret = ix ^ tl.where(cond, ileft ^ iright, tl.zeros_like(ix))
    new_ids = ids ^ tl.where(cond, left_idx ^ right_idx, tl.zeros_like(ids))
    return ret.to(x.dtype, bitcast=True), new_ids


# One merge stage of the bitonic sort: `stage` compare-and-swap rounds.
#   order : 0 or 1 is passed through as a uniform `flip` value;
#           2 builds an alternating flip pattern for the intermediate stages
# Returns the updated `(x, ids)` pair.
@triton.jit
def _bitonic_merge(
    x,
    ids,
    stage: tl.constexpr,
    order: tl.constexpr,
    n_dims: tl.constexpr,
):
    n_outer: tl.constexpr = x.numel >> n_dims
    tl.static_assert(stage <= n_dims)
    # flip denotes whether to re-arrange sub-sequences of elements in ascending or
    # descending order.
    # if flip = 00000000... then all elements will be re-arranged ascendingly at this stage
    # if flip = 00110011... then all the elements will be re-arranged alternatingly (with
    # a stride of 2) at this stage
    if order == 2:
        shape: tl.constexpr = [n_outer * 2**(n_dims - 1 - stage), 2, 2**stage]
        flip = tl.reshape(tl.broadcast_to(tl.arange(0, 2)[None, :, None], shape), x.shape)
    else:
        flip = order
    # perform `stage` rounds of `compare-and-swap`
    for i in tl.static_range(stage):
        x, ids = _compare_and_swap(x, ids, flip, i + (n_dims - stage), n_dims)
    return x, ids


# Bitonic argsort along the last dimension of `x`.
#   x          : values to sort
#   ids        : indices permuted together with the values
#   dim        : must be the last dimension (or None, which selects it)
#   descending : passed as the ordering flag of the final merge stage
# Returns `(sorted_x, permuted_ids)`.
# NOTE(review): n_dims comes from log2 of the dimension length, so this presumably
# requires a power-of-two length - confirm against callers.
@triton.jit
def argsort(
    x,
    ids,
    dim: tl.constexpr = None,
    descending: tl.constexpr = tl.core.CONSTEXPR_0,
):
    # handle default dimension or check that it is the most minor dim
    _dim: tl.constexpr = len(x.shape) - 1 if dim is None else dim
    tl.static_assert(_dim == len(x.shape) - 1, "only minor dimension is currently supported")
    # iteratively run bitonic merge-sort steps
    n_dims: tl.constexpr = log2(x.shape[_dim])

    # intermediate stages use the alternating pattern (order=2); only the last
    # stage applies the requested ordering flag
    for i in tl.static_range(1, n_dims + 1):
        x, ids = _bitonic_merge(x, ids, i, 2 if i < n_dims else descending, n_dims)
    return x, ids
def naive_parallel_rebased(
    q: torch.Tensor,
    k: torch.Tensor,
    v: torch.Tensor,
    scale: Optional[float] = None,
    use_norm: bool = True,
) -> torch.Tensor:
    """Reference (non-fused) causal attention with squared dot-product scores.

    Scores are `((q * scale) @ k^T) ** 2`, masked with a lower-triangular
    `T x T` mask where `T = q.shape[-2]`. When `scale` is `None` it defaults to
    `q.shape[-1] ** -0.5`. With `use_norm=True` the output is divided by the
    row-wise sum of the masked scores plus `1e-6`.
    """
    seq_len = q.shape[-2]
    if scale is None:
        scale = q.shape[-1] ** -0.5

    scores = torch.matmul(q * scale, k.transpose(-2, -1)).pow(2)

    # Zero out every position that looks at a later key.
    causal = torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device).tril()
    scores.masked_fill_(causal.logical_not(), 0)

    out = torch.matmul(scores, v)
    if not use_norm:
        return out

    denom = scores.sum(-1).unsqueeze(-1) + 1e-6
    return out / denom
b/fla/ops/retention/fused_chunk.py new file mode 100644 index 0000000000000000000000000000000000000000..ff068f647d9829604ca8bd5dca05db46735919ff --- /dev/null +++ b/fla/ops/retention/fused_chunk.py @@ -0,0 +1,365 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional, Tuple + +import torch +import triton +import triton.language as tl +from packaging import version + +from fla.utils import autocast_custom_bwd, autocast_custom_fwd, input_guard + + +@triton.jit(do_not_specialize=['T']) +def fused_chunk_retention_fwd_kernel( + q, + k, + v, + o, + h0, + ht, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + CHECK: tl.constexpr +): + # indices + i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_h = i_bh % H + + o_i = tl.arange(0, BT) + # decay rate given the head index + b_b = tl.math.log2(1 - tl.math.exp2(-5 - i_h * 1.0)) + + # d_b: overall decay for the entire chunk + # d_o: cumulative decay from the start of the chunk + # d_h: cumulative decay from the end of the chunk + d_b, d_o, d_h = tl.math.exp2(BT * b_b), tl.math.exp2((o_i + 1) * b_b), tl.math.exp2((BT - o_i - 1) * b_b) + + # [BT, BT] + m_s = o_i[:, None] >= o_i[None, :] + d_s = tl.where(m_s, tl.math.exp2((o_i[:, None] - o_i[None, :]) * b_b), 0) + # [BK, BV] + b_h = tl.zeros([BK, BV], dtype=tl.float32) + + # make block pointers + p_q = tl.make_block_ptr(q + i_bh * T*K, (T, K), (K, 1), (0, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (K, T), (1, K), (i_k * BK, 0), (BK, BT), (0, 1)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (T, V), (V, 1), (0, i_v * BV), (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o + (i_k*B*H+i_bh).to(tl.int64) * T*V, (T, V), (V, 1), (0, i_v * BV), (BT, BV), (1, 0)) + + if USE_INITIAL_STATE: + p_h = tl.make_block_ptr(h0 + i_bh 
* K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + b_h = tl.load(p_h, boundary_check=(0, 1)).to(tl.float32) + + NT = tl.cdiv(T, BT) + for i in range(0, NT): + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BK, BT] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + + # [BT, BT] + b_s = tl.dot(b_q, b_k, allow_tf32=False) * d_s + # [BT, BV] + b_o = tl.dot(b_s.to(b_q.dtype), b_v, allow_tf32=False) + if CHECK and i == 0: + b_o += tl.dot(b_q, b_h.to(b_q.dtype), allow_tf32=False) * d_o[:, None] + b_h = d_b * b_h + tl.dot(b_k, (b_v * d_h[:, None]).to(b_k.dtype), allow_tf32=False) + else: + b_o += tl.dot(b_q, b_h.to(b_q.dtype), allow_tf32=False) * d_o[:, None] + if i == NT - 1 and (T % BT) != 0: + d_b = tl.math.exp2((T % BT) * b_b) + d_h = tl.math.exp2(((T % BT) - o_i - 1) * b_b) + b_h = d_b * b_h + tl.dot(b_k, (b_v * d_h[:, None]).to(b_k.dtype), allow_tf32=False) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + p_q = tl.advance(p_q, (BT, 0)) + p_k = tl.advance(p_k, (0, BT)) + p_v = tl.advance(p_v, (BT, 0)) + p_o = tl.advance(p_o, (BT, 0)) + + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht + i_bh * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + tl.store(p_ht, b_h.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.jit(do_not_specialize=['T']) +def fused_chunk_retention_bwd_kernel( + q, + k, + v, + do, + dq, + dk, + dv, + h0, + scale, + T, + B: tl.constexpr, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + CHECK: tl.constexpr +): + i_v, i_k, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_h = i_bh % H + + o_i = tl.arange(0, BT) + b_b = tl.math.log2(1 - tl.math.exp2(-5 - i_h * 1.0)) + d_q, d_k = tl.math.exp2((o_i+1) * b_b) * scale, tl.math.exp2((BT - o_i - 1) * b_b) + d_b = 
tl.math.exp2(BT * b_b) + + m_s = o_i[:, None] >= o_i[None, :] + d_s = tl.where(m_s, tl.math.exp2((o_i[:, None] - o_i[None, :]) * b_b), 0) * scale + # [BV, BK] + b_h = tl.zeros([BV, BK], dtype=tl.float32) + if USE_INITIAL_STATE: + p_h = tl.make_block_ptr(h0 + i_bh * K * V, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + b_h = tl.load(p_h, boundary_check=(0, 1)).to(tl.float32) + + for i in range(0, tl.cdiv(T, BT)): + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (i * BT, i_k * BK), (BT, BK), (1, 0)) + p_v = tl.make_block_ptr(v + i_bh * T*V, (V, T), (1, V), (i_v * BV, i * BT), (BV, BT), (0, 1)) + p_do = tl.make_block_ptr(do + i_bh * T*V, (T, V), (V, 1), (i * BT, i_v * BV), (BT, BV), (1, 0)) + p_dq = tl.make_block_ptr(dq + (i_bh + i_v*B*H).to(tl.int64) * T*K, (T, K), (K, 1), (i*BT, i_k*BK), (BT, BK), (1, 0)) + + # [BT, K] + b_k = tl.load(p_k, boundary_check=(0, 1)) + # [V, BT] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, V] + b_do = tl.load(p_do, boundary_check=(0, 1)) + b_dd = (b_do * d_q[:, None]).to(b_do.dtype) + + # [BT, BT] + b_ds = tl.dot(b_do, b_v, allow_tf32=False) + b_ds = (b_ds * d_s).to(b_k.dtype) + # [BT, K] + b_dq = tl.dot(b_ds, b_k, allow_tf32=False) + # [V, K] + if CHECK and i == 0: + b_dq += tl.dot(b_dd, b_h.to(b_k.dtype), allow_tf32=False) + b_h = d_b * b_h + tl.dot((b_v * d_k[None, :]).to(b_k.dtype), b_k, allow_tf32=False) + else: + b_dq += tl.dot(b_dd, b_h.to(b_k.dtype), allow_tf32=False) + b_h = d_b * b_h + tl.dot((b_v * d_k[None, :]).to(b_k.dtype), b_k, allow_tf32=False) + + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + + # sync threads + b_h = None + tl.debug_barrier() + d_s = tl.trans(d_s) + # [BK, BV] + b_dh = tl.zeros([BK, BV], dtype=tl.float32) + for i in range(1, tl.cdiv(T, BT) + 1): + p_q = tl.make_block_ptr(q + i_bh * T*K, (K, T), (1, K), (i_k * BK, T - i * BT), (BK, BT), (0, 1)) + p_k = tl.make_block_ptr(k + i_bh * T*K, (T, K), (K, 1), (T - i * BT, i_k * BK), (BT, BK), (1, 0)) + p_v = 
class FusedChunkRetentionFunction(torch.autograd.Function):
    """Autograd wrapper around the fused-chunk retention Triton kernels.

    Inputs are head-first: `k` is unpacked as `[B, H, T, K]` and `v` contributes
    its last dimension `V`. `forward` returns `(o, final_state)`; `backward`
    returns gradients for `q`, `k` and `v` only. No gradient is produced for
    `initial_state`, and the incoming gradient of the final state (`dht`) is
    not used.
    """

    @staticmethod
    @input_guard
    @autocast_custom_fwd
    def forward(ctx, q, k, v, scale, initial_state, output_final_state):
        B, H, T, K, V = *k.shape, v.shape[-1]

        BT = 64
        BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
        NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
        num_stages = 1
        num_warps = 4

        # One partial output per key block; the partials are summed after the launch.
        o = q.new_empty(NK, B, H, T, V)

        if output_final_state:
            final_state = q.new_empty(B, H, K, V, dtype=torch.float, requires_grad=False)
        else:
            final_state = None
        # the bug still exists even for Triton 2.2 on H100 GPUs
        # so we always enable initial checks
        CHECK = True
        if version.parse(triton.__version__) < version.parse('2.2.0'):
            import warnings
            warnings.warn(
                "Triton<2.2.0 detected for running this kernel, "
                "which is known to have some weird compiler issues (refer to https://github.com/openai/triton/issues/2852) "
                "that lead to significant precision loss. "
                "We've add some initial condition checks to resolve this, sadly at the sacrifice of the speed. "
                "For optimal performance, it is recommended to install Triton>=2.2.0 (if possible)."
            )

        grid = (NV, NK, B * H)
        fused_chunk_retention_fwd_kernel[grid](
            q,
            k,
            v,
            o,
            initial_state,
            final_state,
            scale,
            T=T,
            B=B,
            H=H,
            K=K,
            V=V,
            BT=BT,
            BK=BK,
            BV=BV,
            USE_INITIAL_STATE=initial_state is not None,
            STORE_FINAL_STATE=output_final_state,
            CHECK=CHECK,
            num_warps=num_warps,
            num_stages=num_stages
        )

        o = o.sum(0)
        ctx.save_for_backward(q, k, v, initial_state)
        ctx.CHECK = CHECK
        # The backward kernel must use the very same scale as the forward pass;
        # recomputing `K ** -0.5` there silently breaks gradients for custom scales.
        ctx.scale = scale
        return o.to(q.dtype), final_state

    @staticmethod
    @input_guard
    @autocast_custom_bwd
    def backward(ctx, do, dht=None):
        q, k, v, initial_state = ctx.saved_tensors
        B, H, T, K, V = *k.shape, v.shape[-1]
        scale = ctx.scale

        BT = 64
        BK, BV = min(triton.next_power_of_2(K), 64), min(triton.next_power_of_2(V), 64)
        NK, NV = triton.cdiv(K, BK), triton.cdiv(V, BV)
        num_stages = 1
        num_warps = 4

        # dq/dk get one partial per value block, dv one per key block;
        # each is reduced over its leading dimension after the launch.
        dq = q.new_empty(NV, B, H, T, K)
        dk = q.new_empty(NV, B, H, T, K)
        dv = q.new_empty(NK, B, H, T, V)
        grid = (NV, NK, B * H)

        fused_chunk_retention_bwd_kernel[grid](
            q,
            k,
            v,
            do,
            dq,
            dk,
            dv,
            initial_state,
            scale,
            T=T,
            B=B,
            H=H,
            K=K,
            V=V,
            BT=BT,
            BK=BK,
            BV=BV,
            USE_INITIAL_STATE=initial_state is not None,
            CHECK=ctx.CHECK,
            num_warps=num_warps,
            num_stages=num_stages
        )
        dq = dq.sum(0)
        dk = dk.sum(0)
        dv = dv.sum(0)
        # One entry per forward input: q, k, v, scale, initial_state, output_final_state.
        return dq.to(q.dtype), dk.to(k.dtype), dv.to(v.dtype), None, None, None
+ """ + if scale is None: + scale = k.shape[-1] ** -0.5 + if not head_first: + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + o, final_state = FusedChunkRetentionFunction.apply(q, k, v, scale, initial_state, output_final_state) + if not head_first: + o = o.transpose(1, 2) + return o, final_state diff --git a/fla/ops/retention/parallel.py b/fla/ops/retention/parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..8186fc78d43674d777bd9732980e31701004b2b3 --- /dev/null +++ b/fla/ops/retention/parallel.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +from typing import Optional, Tuple + +import torch + +from fla.ops.simple_gla.parallel import parallel_simple_gla + + +def parallel_retention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + scale: Optional[float] = None, + output_attentions: bool = False, + cu_seqlens: Optional[torch.LongTensor] = None, + head_first: bool = True +) -> Tuple[torch.Tensor, torch.Tensor]: + r""" + Args: + q (torch.Tensor): + queries of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]` + k (torch.Tensor): + keys of shape `[B, H, T, K]` if `head_first=True` else `[B, T, H, K]` + v (torch.Tensor): + values of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]` + scale (Optional[int]): + Scale factor for attention scores. + If not provided, it will default to `1 / sqrt(K)`. Default: `None`. + output_attentions (bool): + Whether to output the materialized attention scores of shape [B, H, T, T]. Default: `False`. + cu_seqlens (torch.LongTensor): + Cumulative sequence lengths of shape `[N+1]` used for variable-length training, + consistent with the FlashAttention API. + head_first (Optional[bool]): + Whether the inputs are in the head-first format. Default: `True`. + + Returns: + o (torch.Tensor): + Outputs of shape `[B, H, T, V]` if `head_first=True` else `[B, T, H, V]`. 
+ attn (torch.Tensor): + Attention scores of shape `[B, H, T, T]` if `output_attentions=True` else `None` + """ + if head_first: + n_heads = q.shape[1] + else: + n_heads = q.shape[2] + s = (1 - q.new_tensor(2., dtype=torch.float).pow(-5. - q.new_tensor(range(n_heads), dtype=torch.float))).log() + if head_first: + g = s[None, :, None].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous() + else: + g = s[None, None, :].expand(q.shape[0], q.shape[1], q.shape[2]).contiguous() + + return parallel_simple_gla( + q=q, + k=k, + v=v, + scale=scale, + g=g, + output_attentions=output_attentions, + head_first=head_first, + cu_seqlens=cu_seqlens + ) diff --git a/flame/utils/checkpoint.py b/flame/utils/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..839ac7df075c3bfca6747855781953c8a82a4c28 --- /dev/null +++ b/flame/utils/checkpoint.py @@ -0,0 +1,50 @@ +import os +import glob +import re +import shutil +from torchtitan.tools.logging import logger + + +def cleanup_local_checkpoints(checkpoint_dir: str, keep_latest_k: int): + """Removes older checkpoint directories locally, keeping only the latest k for both DCP and HF formats.""" + if keep_latest_k <= 0: + return # Keep all checkpoints + + logger.info(f"Cleaning up local checkpoints in {checkpoint_dir}, keeping latest {keep_latest_k}") + + # Cleanup DCP checkpoints (step-*) + dcp_checkpoints = sorted( + glob.glob(os.path.join(checkpoint_dir, "step-*")), + key=lambda x: int(re.search(r"step-(\d+)", os.path.basename(x)).group(1)) if re.search(r"step-(\d+)", os.path.basename(x)) and not x.endswith("-hf") else -1, + reverse=True + ) + # Filter out HF format directories + dcp_checkpoints = [d for d in dcp_checkpoints if not d.endswith("-hf")] + + if len(dcp_checkpoints) > keep_latest_k: + checkpoints_to_delete = dcp_checkpoints[keep_latest_k:] + logger.info(f"Deleting {len(checkpoints_to_delete)} old DCP checkpoints: {[os.path.basename(c) for c in checkpoints_to_delete]}") + for ckpt_path in 
checkpoints_to_delete: + if os.path.isdir(ckpt_path): # Ensure it's a directory + try: + shutil.rmtree(ckpt_path) + except OSError as e: + logger.error(f"Error removing directory {ckpt_path}: {e}") + + + # Cleanup HF checkpoints (step-*-hf) + hf_checkpoints = sorted( + glob.glob(os.path.join(checkpoint_dir, "step-*-hf")), + key=lambda x: int(re.search(r"step-(\d+)-hf", os.path.basename(x)).group(1)) if re.search(r"step-(\d+)-hf", os.path.basename(x)) else -1, + reverse=True + ) + + if len(hf_checkpoints) > keep_latest_k: + checkpoints_to_delete = hf_checkpoints[keep_latest_k:] + logger.info(f"Deleting {len(checkpoints_to_delete)} old HF checkpoints: {[os.path.basename(c) for c in checkpoints_to_delete]}") + for ckpt_path in checkpoints_to_delete: + if os.path.isdir(ckpt_path): # Ensure it's a directory + try: + shutil.rmtree(ckpt_path) + except OSError as e: + logger.error(f"Error removing directory {ckpt_path}: {e}") diff --git a/logs/none_75lcom2m/attempt_0/1/stderr.log b/logs/none_75lcom2m/attempt_0/1/stderr.log new file mode 100644 index 0000000000000000000000000000000000000000..02a97941d5116ccc5bf4d15323697d841fee8a96 --- /dev/null +++ b/logs/none_75lcom2m/attempt_0/1/stderr.log @@ -0,0 +1,17 @@ +Traceback (most recent call last): + File "", line 198, in _run_module_as_main + File "", line 88, in _run_code + File "/workspace/flame/flame/train.py", line 19, in + import fla # noqa + ^^^^^^^^^^ + File "/workspace/flame/fla/__init__.py", line 23, in + from fla.models import ( + File "/workspace/flame/fla/models/__init__.py", line 4, in + from fla.models.bitnet import BitNetConfig, BitNetForCausalLM, BitNetModel + File "/workspace/flame/fla/models/bitnet/__init__.py", line 8, in + AutoConfig.register(BitNetConfig.model_type, BitNetConfig) + File "/workspace/flame/.venv/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py", line 1211, in register + CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok) + File 
"/workspace/flame/.venv/lib/python3.11/site-packages/transformers/models/auto/configuration_auto.py", line 905, in register + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") +ValueError: 'bitnet' is already used by a Transformers config, pick another name. diff --git a/logs/none_vngrbiu1/attempt_0/2/stderr.log b/logs/none_vngrbiu1/attempt_0/2/stderr.log new file mode 100644 index 0000000000000000000000000000000000000000..643ae1cc2446868302e39419030f0e5812fd2544 --- /dev/null +++ b/logs/none_vngrbiu1/attempt_0/2/stderr.log @@ -0,0 +1,6638 @@ +wandb: Appending key for api.wandb.ai to your netrc file: /root/.netrc +wandb: Currently logged in as: zaydzuhri to https://api.wandb.ai. Use `wandb login --relogin` to force relogin +Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured. +[titan] 2025-06-13 12:41:23,509 - root - INFO - Starting job: default job +[titan] 2025-06-13 12:41:23,510 - root - INFO - { + "activation_checkpoint": { + "mode": "none", + "selective_ac_option": "2" + }, + "activation_offload": { + "mode": "none" + }, + "checkpoint": { + "async_mode": "disabled", + "convert_to_hf_on_save": false, + "create_seed_checkpoint": false, + "enable_checkpoint": true, + "exclude_from_loading": [], + "export_dtype": "float32", + "folder": "checkpoint", + "hf_repo_base_name": null, + "hf_upload_enabled": false, + "hf_upload_format": "dcp", + "interval": 5000, + "interval_type": "steps", + "keep_latest_k": 2, + "load_step": -1, + "model_weights_only": false + }, + "comm": { + "init_timeout_seconds": 300, + "trace_buf_size": 20000, + "train_timeout_seconds": 100 + }, + "experimental": { + "context_parallel_degree": 1, + "context_parallel_rotate_method": "allgather", + "custom_model_path": "", + "enable_async_tensor_parallel": false, + "enable_compiled_autograd": false, + "pipeline_parallel_degree": 1, + "pipeline_parallel_microbatches": null, + 
"pipeline_parallel_schedule": "1F1B", + "pipeline_parallel_schedule_csv": "", + "pipeline_parallel_split_points": [] + }, + "fault_tolerance": { + "enable": false, + "group_size": 0, + "min_replica_size": 1, + "replica_id": 0 + }, + "float8": { + "enable_fsdp_float8_all_gather": false, + "force_recompute_fp8_weight_in_bwd": false, + "precompute_float8_dynamic_scale_for_fsdp": false, + "recipe_name": null + }, + "job": { + "config_file": "flame/models/fla.toml", + "description": "default job", + "dump_folder": "exp/mtp.120M.batch8.seqlen2048.context2048.warmup1000.update1.steps15000.nft4.lr5e-4.cosine", + "print_args": true, + "use_for_integration_test": false + }, + "lr_scheduler": { + "decay_ratio": null, + "decay_type": "cosine", + "lr_min": 0.1, + "warmup_steps": 1000 + }, + "memory_estimation": { + "disable_fake_mode": false, + "enabled": false + }, + "metrics": { + "disable_color_printing": false, + "enable_tensorboard": false, + "enable_wandb": true, + "log_freq": 5, + "save_for_all_ranks": false, + "save_tb_folder": "tb" + }, + "model": { + "config": "configs/mtp_transformer_120M.json", + "converters": [], + "name": "fla", + "print_after_conversion": false, + "tokenizer_path": "fla-hub/transformer-1.3B-100B" + }, + "optimizer": { + "early_step_in_backward": false, + "eps": 1e-15, + "implementation": "fused", + "lr": 0.0005, + "name": "AdamW" + }, + "profiling": { + "enable_memory_snapshot": false, + "enable_profiling": true, + "profile_freq": 512, + "save_memory_snapshot_folder": "memory_snapshot", + "save_traces_folder": "profile_trace" + }, + "training": { + "batch_size": 8, + "compile": true, + "context_len": 2048, + "data_dir": null, + "data_files": null, + "data_parallel_replicate_degree": 1, + "data_parallel_shard_degree": -1, + "data_probs": null, + "dataset": "fla-hub/slimpajama-test", + "dataset_name": "default", + "dataset_split": "train", + "deterministic": false, + "disable_loss_parallel": false, + "enable_cpu_offload": false, + 
"fsdp_reshard_after_forward": "default", + "gc_freq": 50, + "gradient_accumulation_steps": 1, + "max_norm": 1.0, + "mixed_precision_param": "bfloat16", + "mixed_precision_reduce": "float32", + "num_workers": 1, + "persistent_workers": false, + "pin_memory": false, + "prefetch_factor": 2, + "seed": 69, + "seq_len": 2048, + "skip_nan_inf": true, + "steps": 15000, + "streaming": true, + "tensor_parallel_degree": 1, + "varlen": false + } +} +[titan] 2025-06-13 12:41:23,511 - root - INFO - [GC] Initial GC collection. 0.00 seconds. +[titan] 2025-06-13 12:41:23,819 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config +[titan] 2025-06-13 12:41:23,821 - root - INFO - CUDA capacity: NVIDIA GeForce RTX 4090 with 23.64GiB memory +[titan] 2025-06-13 12:41:23,823 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name +[titan] 2025-06-13 12:41:23,823 - root - WARNING - Peak flops undefined for: NVIDIA GeForce RTX 4090, fallback to A100 +[titan] 2025-06-13 12:41:23,823 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14 +[titan] 2025-06-13 12:41:23,823 - root - INFO - Building 1-D device mesh with ['dp_shard'], [4] +[titan] 2025-06-13 12:41:23,834 - root - INFO - Loading tokenizer... 
+[titan] 2025-06-13 12:41:24,794 - root - INFO - LlamaTokenizerFast(name_or_path='fla-hub/transformer-1.3B-100B', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '', 'eos_token': '', 'unk_token': ''}, clean_up_tokenization_spaces=False, added_tokens_decoder={ + 0: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 1: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), + 2: AddedToken("", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), +} +) +[titan] 2025-06-13 12:41:24,794 - root - INFO - Loading dataset fla-hub/slimpajama-test:default +[titan] 2025-06-13 12:41:28,525 - root - INFO - IterableDataset({ + features: ['text', 'meta'], + num_shards: 5 +}) +[titan] 2025-06-13 12:41:28,525 - root - INFO - Shuffling the dataset with seed 69 +[titan] 2025-06-13 12:41:28,526 - root - INFO - Loading model config from configs/mtp_transformer_120M.json +[titan] 2025-06-13 12:41:28,529 - root - INFO - Building dataloader... 
+[titan] 2025-06-13 12:41:28,532 - root - INFO - Building model from the config +MTPTransformerConfig { + "attention_bias": false, + "bos_token_id": 1, + "elementwise_affine": true, + "eos_token_id": 2, + "fuse_cross_entropy": true, + "fuse_norm": false, + "fuse_swiglu": true, + "hidden_act": "swish", + "hidden_ratio": 4, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": null, + "max_position_embeddings": 4096, + "model_type": "mtp_transformer", + "n_future_tokens": 4, + "norm_eps": 1e-06, + "num_heads": 12, + "num_hidden_layers": 14, + "num_kv_heads": null, + "qk_norm": false, + "qkv_bias": false, + "rope_theta": 10000.0, + "tie_word_embeddings": true, + "transformers_version": "4.51.3", + "use_cache": true, + "use_custom_backward": false, + "vocab_size": 32000, + "window_size": null +} + +[titan] 2025-06-13 12:41:28,874 - root - INFO -  +MTPTransformerForCausalLM( + (model): MTPTransformerModel( + (embeddings): Embedding(32000, 768) + (layers): ModuleList( + (0-9): 10 x MTPTransformerBlock( + (attn_norm): RMSNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (q_proj): Linear(in_features=768, out_features=768, bias=False) + (k_proj): Linear(in_features=768, out_features=768, bias=False) + (v_proj): Linear(in_features=768, out_features=768, bias=False) + (o_proj): Linear(in_features=768, out_features=768, bias=False) + (rotary): RotaryEmbedding(dim=64, base=10000.0, interleaved=False, pos_idx_in_fp32=True) + ) + (mlp_norm): RMSNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): GatedMLP( + (gate_proj): Linear(in_features=768, out_features=2048, bias=False) + (up_proj): Linear(in_features=768, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=768, bias=False) + (swiglu_linear): SwiGLULinear() + ) + ) + ) + (extra_heads): ModuleList( + (0-3): 4 x MTPTransformerBlock( + (attn_norm): RMSNorm((768,), eps=1e-06, elementwise_affine=True) + (attn): Attention( + (q_proj): 
Linear(in_features=768, out_features=768, bias=False) + (k_proj): Linear(in_features=768, out_features=768, bias=False) + (v_proj): Linear(in_features=768, out_features=768, bias=False) + (o_proj): Linear(in_features=768, out_features=768, bias=False) + (rotary): RotaryEmbedding(dim=64, base=10000.0, interleaved=False, pos_idx_in_fp32=True) + ) + (mlp_norm): RMSNorm((768,), eps=1e-06, elementwise_affine=True) + (mlp): GatedMLP( + (gate_proj): Linear(in_features=768, out_features=2048, bias=False) + (up_proj): Linear(in_features=768, out_features=2048, bias=False) + (down_proj): Linear(in_features=2048, out_features=768, bias=False) + (swiglu_linear): SwiGLULinear() + ) + ) + ) + (norm): RMSNorm((768,), eps=1e-06, elementwise_affine=True) + ) + (lm_head): Linear(in_features=768, out_features=32000, bias=False) + (criterion): FusedLinearCrossEntropyLoss() +) + +[titan] 2025-06-13 12:41:28,903 - root - INFO - Compiling each block with torch.compile +[titan] 2025-06-13 12:41:28,903 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile +[titan] 2025-06-13 12:41:28,905 - root - INFO - Compiling the entire model with torch.compile +[titan] 2025-06-13 12:41:28,966 - root - INFO - Applied FSDP to the model +[titan] 2025-06-13 12:41:29,105 - root - INFO - CUDA memory usage for model: 0.15GiB(0.63%) +[titan] 2025-06-13 12:41:29,117 - root - INFO - Checkpointing active. 
Checkpoints will be loaded from and saved to exp/mtp.120M.batch8.seqlen2048.context2048.warmup1000.update1.steps15000.nft4.lr5e-4.cosine/checkpoint +[titan] 2025-06-13 12:41:29,118 - root - INFO - CUDA capacity: NVIDIA GeForce RTX 4090 with 23.64GiB memory +[titan] 2025-06-13 12:41:29,119 - root - WARNING - Error running lspci: [Errno 2] No such file or directory: 'lspci', fallback to use device_name +[titan] 2025-06-13 12:41:29,120 - root - WARNING - Peak flops undefined for: NVIDIA GeForce RTX 4090, fallback to A100 +[titan] 2025-06-13 12:41:29,148 - root - INFO - ***** Running training ***** +[titan] 2025-06-13 12:41:29,148 - root - INFO -  Training starts at step 1 +[titan] 2025-06-13 12:41:29,148 - root - INFO -  Number of tokens per sequence = 2,048 +[titan] 2025-06-13 12:41:29,148 - root - INFO -  Gradient Accumulation steps = 1 +[titan] 2025-06-13 12:41:29,149 - root - INFO -  Instantaneous batch size (per device) = 8 +[titan] 2025-06-13 12:41:29,149 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 32 (65,536 tokens) +[titan] 2025-06-13 12:41:29,149 - root - INFO -  Total optimization steps = 15,000 (983,040,000 tokens) +[titan] 2025-06-13 12:41:29,149 - root - INFO -  Warmup steps = 1,000 (65,536,000 tokens) +[titan] 2025-06-13 12:41:29,149 - root - INFO -  Number of parameters = 123,688,704  +[titan] 2025-06-13 12:41:29,149 - root - INFO - Profiling active. Traces will be saved at exp/mtp.120M.batch8.seqlen2048.context2048.warmup1000.update1.steps15000.nft4.lr5e-4.cosine/profile_trace +[titan] 2025-06-13 12:41:31,999 - fla.models.transformer_mtp.modeling_transformer - WARNING - `return_all_heads=False` is incompatible with training. Setting `return_all_heads=True`... +/workspace/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:679: UserWarning: Graph break due to unsupported builtin cuda_utils.get_device_properties. This function is either a Python builtin (e.g. 
_warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind). If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround. If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use torch.compiler.allow_in_graph. + torch._dynamo.utils.warn_once(msg) +[titan] 2025-06-13 12:42:07,190 - root - INFO - step: 1 loss: 42.1888 memory: 6.28GiB(26.55%) tps: 430 tflops: 0.43 mfu: 0.14% global_avg_ntp_loss: 10.5542 global_avg_mtp_loss: 31.6346 +[titan] 2025-06-13 12:42:07,191 - root - INFO - lr: 9.9900e-07 gnorm: 14.25 [ 0:00:38<6 days, 14:36:57] +[titan] 2025-06-13 12:42:07,191 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-06-13 12:42:09,174 - root - INFO - [GC] GC collection invoked by checkpointer. 0.09 seconds. +[titan] 2025-06-13 12:42:09,175 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 1.98 seconds. 
+[titan] 2025-06-13 12:42:09,175 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40 +[titan] 2025-06-13 12:42:11,775 - root - INFO - step: 5 loss: 42.0057 memory: 6.46GiB(27.34%) tps: 14,299 tflops: 14.39 mfu: 4.61% global_avg_ntp_loss: 10.5065 global_avg_mtp_loss: 31.4992 +[titan] 2025-06-13 12:42:11,775 - root - INFO - lr: 2.9970e-06 gnorm: 17.24 [ 0:00:42<1 day, 11:31:57] +[titan] 2025-06-13 12:42:14,978 - root - INFO - step: 10 loss: 41.5915 memory: 6.46GiB(27.34%) tps: 25,578 tflops: 25.74 mfu: 8.25% global_avg_ntp_loss: 10.4041 global_avg_mtp_loss: 31.1874 +[titan] 2025-06-13 12:42:14,978 - root - INFO - lr: 5.4945e-06 gnorm: 14.13 [ 0:00:45<19:05:38] +[titan] 2025-06-13 12:42:18,130 - root - INFO - step: 15 loss: 41.1751 memory: 6.46GiB(27.34%) tps: 25,998 tflops: 26.16 mfu: 8.39% global_avg_ntp_loss: 10.2974 global_avg_mtp_loss: 30.8777 +[titan] 2025-06-13 12:42:18,130 - root - INFO - lr: 7.9920e-06 gnorm: 13.78 [ 0:00:49<13:35:58] +[titan] 2025-06-13 12:42:21,628 - root - INFO - step: 20 loss: 39.6408 memory: 6.46GiB(27.34%) tps: 23,419 tflops: 23.57 mfu: 7.55% global_avg_ntp_loss: 9.9200 global_avg_mtp_loss: 29.7207 +[titan] 2025-06-13 12:42:21,628 - root - INFO - lr: 1.0490e-05 gnorm: 14.33 [ 0:00:52<10:55:26] +[titan] 2025-06-13 12:42:24,816 - root - INFO - step: 25 loss: 38.9274 memory: 6.46GiB(27.34%) tps: 25,699 tflops: 25.86 mfu: 8.29% global_avg_ntp_loss: 9.7311 global_avg_mtp_loss: 29.1963 +[titan] 2025-06-13 12:42:24,816 - root - INFO - lr: 1.2987e-05 gnorm: 10.49 [ 0:00:55< 9:16:00] +[titan] 2025-06-13 12:42:28,156 - root - INFO - step: 30 loss: 37.9038 memory: 6.46GiB(27.34%) tps: 24,529 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 9.4779 global_avg_mtp_loss: 28.4260 +[titan] 2025-06-13 12:42:28,157 - root - INFO - lr: 1.5485e-05 gnorm: 8.97 [ 0:00:59< 8:10:57] +[titan] 2025-06-13 12:42:31,486 - root - INFO - step: 35 loss: 37.5654 memory: 6.46GiB(27.34%) tps: 24,602 tflops: 24.76 mfu: 7.94% 
global_avg_ntp_loss: 9.3920 global_avg_mtp_loss: 28.1734 +[titan] 2025-06-13 12:42:31,487 - root - INFO - lr: 1.7982e-05 gnorm: 7.72 [ 0:01:02< 7:24:24] +[titan] 2025-06-13 12:42:34,908 - root - INFO - step: 40 loss: 37.0058 memory: 6.46GiB(27.34%) tps: 23,946 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 9.2515 global_avg_mtp_loss: 27.7543 +[titan] 2025-06-13 12:42:34,908 - root - INFO - lr: 2.0480e-05 gnorm: 7.38 [ 0:01:05< 6:50:02] +[titan] 2025-06-13 12:42:38,128 - root - INFO - step: 45 loss: 36.4447 memory: 6.46GiB(27.34%) tps: 25,449 tflops: 25.61 mfu: 8.21% global_avg_ntp_loss: 9.1124 global_avg_mtp_loss: 27.3324 +[titan] 2025-06-13 12:42:38,128 - root - INFO - lr: 2.2977e-05 gnorm: 8.22 [ 0:01:09< 6:22:11] +[titan] 2025-06-13 12:42:41,005 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:42:41,692 - root - INFO - step: 50 loss: 35.8707 memory: 6.46GiB(27.34%) tps: 22,986 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 8.9665 global_avg_mtp_loss: 26.9042 +[titan] 2025-06-13 12:42:41,692 - root - INFO - lr: 2.5475e-05 gnorm: 7.45 [ 0:01:12< 6:01:37] +[titan] 2025-06-13 12:42:44,957 - root - INFO - step: 55 loss: 36.0886 memory: 6.46GiB(27.34%) tps: 25,098 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 9.0209 global_avg_mtp_loss: 27.0678 +[titan] 2025-06-13 12:42:44,957 - root - INFO - lr: 2.7972e-05 gnorm: 6.35 [ 0:01:15< 5:43:25] +[titan] 2025-06-13 12:42:48,358 - root - INFO - step: 60 loss: 35.4873 memory: 6.46GiB(27.34%) tps: 24,089 tflops: 24.24 mfu: 7.77% global_avg_ntp_loss: 8.8714 global_avg_mtp_loss: 26.6159 +[titan] 2025-06-13 12:42:48,358 - root - INFO - lr: 3.0470e-05 gnorm: 6.58 [ 0:01:19< 5:28:48] +[titan] 2025-06-13 12:42:51,632 - root - INFO - step: 65 loss: 35.2990 memory: 6.46GiB(27.34%) tps: 25,024 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 8.8239 global_avg_mtp_loss: 26.4752 +[titan] 2025-06-13 12:42:51,632 - root - INFO - lr: 3.2967e-05 gnorm: 6.05 [ 0:01:22< 5:15:57] +[titan] 2025-06-13 
12:42:54,790 - root - INFO - step: 70 loss: 34.7382 memory: 6.46GiB(27.34%) tps: 25,939 tflops: 26.10 mfu: 8.37% global_avg_ntp_loss: 8.6833 global_avg_mtp_loss: 26.0549 +[titan] 2025-06-13 12:42:54,791 - root - INFO - lr: 3.5465e-05 gnorm: 5.81 [ 0:01:25< 5:04:30] +[titan] 2025-06-13 12:42:58,112 - root - INFO - step: 75 loss: 33.9558 memory: 6.46GiB(27.34%) tps: 24,664 tflops: 24.82 mfu: 7.96% global_avg_ntp_loss: 8.4874 global_avg_mtp_loss: 25.4684 +[titan] 2025-06-13 12:42:58,113 - root - INFO - lr: 3.7962e-05 gnorm: 6.32 [ 0:01:28< 4:55:07] +[titan] 2025-06-13 12:43:02,594 - root - INFO - step: 80 loss: 33.2195 memory: 6.46GiB(27.34%) tps: 18,281 tflops: 18.40 mfu: 5.90% global_avg_ntp_loss: 8.3018 global_avg_mtp_loss: 24.9177 +[titan] 2025-06-13 12:43:02,594 - root - INFO - lr: 4.0460e-05 gnorm: 5.90 [ 0:01:33< 4:50:31] +[titan] 2025-06-13 12:43:06,012 - root - INFO - step: 85 loss: 33.2370 memory: 6.46GiB(27.34%) tps: 23,972 tflops: 24.13 mfu: 7.73% global_avg_ntp_loss: 8.3067 global_avg_mtp_loss: 24.9303 +[titan] 2025-06-13 12:43:06,012 - root - INFO - lr: 4.2957e-05 gnorm: 5.11 [ 0:01:36< 4:43:20] +[titan] 2025-06-13 12:43:09,246 - root - INFO - step: 90 loss: 32.4404 memory: 6.46GiB(27.34%) tps: 25,333 tflops: 25.49 mfu: 8.17% global_avg_ntp_loss: 8.1068 global_avg_mtp_loss: 24.3337 +[titan] 2025-06-13 12:43:09,246 - root - INFO - lr: 4.5455e-05 gnorm: 5.16 [ 0:01:40< 4:36:25] +[titan] 2025-06-13 12:43:12,426 - root - INFO - step: 95 loss: 32.5616 memory: 6.46GiB(27.34%) tps: 25,759 tflops: 25.92 mfu: 8.31% global_avg_ntp_loss: 8.1361 global_avg_mtp_loss: 24.4255 +[titan] 2025-06-13 12:43:12,427 - root - INFO - lr: 4.7952e-05 gnorm: 4.44 [ 0:01:43< 4:30:06] +[titan] 2025-06-13 12:43:15,067 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:43:15,619 - root - INFO - step: 100 loss: 31.9831 memory: 6.46GiB(27.34%) tps: 25,662 tflops: 25.83 mfu: 8.28% global_avg_ntp_loss: 7.9910 global_avg_mtp_loss: 23.9921 +[titan] 2025-06-13 12:43:15,620 - root - INFO - lr: 5.0450e-05 gnorm: 4.29 [ 0:01:46< 4:24:26] +[titan] 2025-06-13 12:43:18,964 - root - INFO - step: 105 loss: 32.4180 memory: 6.46GiB(27.34%) tps: 24,496 tflops: 24.65 mfu: 7.90% global_avg_ntp_loss: 8.0989 global_avg_mtp_loss: 24.3191 +[titan] 2025-06-13 12:43:18,964 - root - INFO - lr: 5.2947e-05 gnorm: 4.58 [ 0:01:49< 4:19:40] +[titan] 2025-06-13 12:43:22,143 - root - INFO - step: 110 loss: 31.1803 memory: 6.46GiB(27.34%) tps: 25,773 tflops: 25.94 mfu: 8.31% global_avg_ntp_loss: 7.7861 global_avg_mtp_loss: 23.3941 +[titan] 2025-06-13 12:43:22,143 - root - INFO - lr: 5.5445e-05 gnorm: 3.34 [ 0:01:53< 4:14:57] +[titan] 2025-06-13 12:43:25,639 - root - INFO - step: 115 loss: 31.0521 memory: 6.46GiB(27.34%) tps: 23,432 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 7.7475 global_avg_mtp_loss: 23.3047 +[titan] 2025-06-13 12:43:25,640 - root - INFO - lr: 5.7942e-05 gnorm: 2.38 [ 0:01:56< 4:11:20] +[titan] 2025-06-13 12:43:29,186 - root - INFO - step: 120 loss: 30.5678 memory: 6.46GiB(27.34%) tps: 23,102 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 7.6161 global_avg_mtp_loss: 22.9517 +[titan] 2025-06-13 12:43:29,186 - root - INFO - lr: 6.0440e-05 gnorm: 3.08 [ 0:02:00< 4:08:06] +[titan] 2025-06-13 12:43:32,377 - root - INFO - step: 125 loss: 30.2913 memory: 6.46GiB(27.34%) tps: 25,677 tflops: 25.84 mfu: 8.28% global_avg_ntp_loss: 7.5304 global_avg_mtp_loss: 22.7609 +[titan] 2025-06-13 12:43:32,377 - root - INFO - lr: 6.2937e-05 gnorm: 2.72 [ 0:02:03< 4:04:25] +[titan] 2025-06-13 12:43:35,890 - root - INFO - step: 130 loss: 30.4391 memory: 6.46GiB(27.34%) tps: 23,322 tflops: 23.47 mfu: 7.52% global_avg_ntp_loss: 7.5700 global_avg_mtp_loss: 22.8692 +[titan] 2025-06-13 12:43:35,890 - root - INFO - lr: 6.5435e-05 gnorm: 1.92 [ 0:02:06< 
4:01:38] +[titan] 2025-06-13 12:43:39,119 - root - INFO - step: 135 loss: 30.0987 memory: 6.46GiB(27.34%) tps: 25,372 tflops: 25.53 mfu: 8.18% global_avg_ntp_loss: 7.4791 global_avg_mtp_loss: 22.6195 +[titan] 2025-06-13 12:43:39,119 - root - INFO - lr: 6.7932e-05 gnorm: 1.95 [ 0:02:09< 3:58:32] +[titan] 2025-06-13 12:43:42,333 - root - INFO - step: 140 loss: 29.6857 memory: 6.46GiB(27.34%) tps: 25,492 tflops: 25.65 mfu: 8.22% global_avg_ntp_loss: 7.3648 global_avg_mtp_loss: 22.3210 +[titan] 2025-06-13 12:43:42,333 - root - INFO - lr: 7.0430e-05 gnorm: 2.54 [ 0:02:13< 3:55:38] +[titan] 2025-06-13 12:43:45,802 - root - INFO - step: 145 loss: 29.5397 memory: 6.46GiB(27.34%) tps: 23,619 tflops: 23.77 mfu: 7.62% global_avg_ntp_loss: 7.3205 global_avg_mtp_loss: 22.2191 +[titan] 2025-06-13 12:43:45,802 - root - INFO - lr: 7.2927e-05 gnorm: 2.37 [ 0:02:16< 3:53:21] +[titan] 2025-06-13 12:43:48,312 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:43:48,961 - root - INFO - step: 150 loss: 29.5723 memory: 6.46GiB(27.34%) tps: 25,934 tflops: 26.10 mfu: 8.37% global_avg_ntp_loss: 7.3335 global_avg_mtp_loss: 22.2388 +[titan] 2025-06-13 12:43:48,961 - root - INFO - lr: 7.5425e-05 gnorm: 4.00 [ 0:02:19< 3:50:42] +[titan] 2025-06-13 12:43:52,286 - root - INFO - step: 155 loss: 29.7593 memory: 6.46GiB(27.34%) tps: 24,638 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 7.3781 global_avg_mtp_loss: 22.3812 +[titan] 2025-06-13 12:43:52,287 - root - INFO - lr: 7.7922e-05 gnorm: 1.97 [ 0:02:23< 3:48:30] +[titan] 2025-06-13 12:43:56,891 - root - INFO - step: 160 loss: 30.0964 memory: 6.46GiB(27.34%) tps: 17,792 tflops: 17.91 mfu: 5.74% global_avg_ntp_loss: 7.4577 global_avg_mtp_loss: 22.6387 +[titan] 2025-06-13 12:43:56,891 - root - INFO - lr: 8.0420e-05 gnorm: 1.92 [ 0:02:27< 3:48:24] +[titan] 2025-06-13 12:44:00,306 - root - INFO - step: 165 loss: 29.6146 memory: 6.46GiB(27.34%) tps: 23,996 tflops: 24.15 mfu: 7.74% global_avg_ntp_loss: 7.3337 
global_avg_mtp_loss: 22.2808 +[titan] 2025-06-13 12:44:00,306 - root - INFO - lr: 8.2917e-05 gnorm: 2.25 [ 0:02:31< 3:46:31] +[titan] 2025-06-13 12:44:03,499 - root - INFO - step: 170 loss: 29.4113 memory: 6.46GiB(27.34%) tps: 25,658 tflops: 25.82 mfu: 8.28% global_avg_ntp_loss: 7.2713 global_avg_mtp_loss: 22.1401 +[titan] 2025-06-13 12:44:03,499 - root - INFO - lr: 8.5415e-05 gnorm: 1.86 [ 0:02:34< 3:44:25] +[titan] 2025-06-13 12:44:06,683 - root - INFO - step: 175 loss: 29.1579 memory: 6.46GiB(27.34%) tps: 25,728 tflops: 25.89 mfu: 8.30% global_avg_ntp_loss: 7.1773 global_avg_mtp_loss: 21.9806 +[titan] 2025-06-13 12:44:06,684 - root - INFO - lr: 8.7912e-05 gnorm: 1.73 [ 0:02:37< 3:42:26] +[titan] 2025-06-13 12:44:10,259 - root - INFO - step: 180 loss: 29.5335 memory: 6.46GiB(27.34%) tps: 22,912 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 7.2738 global_avg_mtp_loss: 22.2597 +[titan] 2025-06-13 12:44:10,260 - root - INFO - lr: 9.0410e-05 gnorm: 1.52 [ 0:02:41< 3:41:05] +[titan] 2025-06-13 12:44:13,715 - root - INFO - step: 185 loss: 28.9624 memory: 6.46GiB(27.34%) tps: 23,712 tflops: 23.86 mfu: 7.65% global_avg_ntp_loss: 7.1176 global_avg_mtp_loss: 21.8448 +[titan] 2025-06-13 12:44:13,715 - root - INFO - lr: 9.2907e-05 gnorm: 2.12 [ 0:02:44< 3:39:39] +[titan] 2025-06-13 12:44:16,984 - root - INFO - step: 190 loss: 29.0751 memory: 6.46GiB(27.34%) tps: 25,065 tflops: 25.22 mfu: 8.08% global_avg_ntp_loss: 7.1362 global_avg_mtp_loss: 21.9389 +[titan] 2025-06-13 12:44:16,984 - root - INFO - lr: 9.5405e-05 gnorm: 6.53 [ 0:02:47< 3:38:02] +[titan] 2025-06-13 12:44:20,467 - root - INFO - step: 195 loss: 30.6211 memory: 6.46GiB(27.34%) tps: 23,520 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 7.5277 global_avg_mtp_loss: 23.0933 +[titan] 2025-06-13 12:44:20,467 - root - INFO - lr: 9.7902e-05 gnorm: 4.94 [ 0:02:51< 3:36:47] +[titan] 2025-06-13 12:44:23,456 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:44:24,135 - root - INFO - step: 200 loss: 29.4882 memory: 6.46GiB(27.34%) tps: 22,336 tflops: 22.48 mfu: 7.20% global_avg_ntp_loss: 7.1992 global_avg_mtp_loss: 22.2891 +[titan] 2025-06-13 12:44:24,136 - root - INFO - lr: 1.0040e-04 gnorm: 2.83 [ 0:02:54< 3:35:49] +[titan] 2025-06-13 12:44:27,478 - root - INFO - step: 205 loss: 29.5622 memory: 6.46GiB(27.34%) tps: 24,510 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 7.2264 global_avg_mtp_loss: 22.3359 +[titan] 2025-06-13 12:44:27,478 - root - INFO - lr: 1.0290e-04 gnorm: 3.05 [ 0:02:58< 3:34:30] +[titan] 2025-06-13 12:44:30,865 - root - INFO - step: 210 loss: 28.8507 memory: 6.46GiB(27.34%) tps: 24,192 tflops: 24.35 mfu: 7.80% global_avg_ntp_loss: 7.0371 global_avg_mtp_loss: 21.8136 +[titan] 2025-06-13 12:44:30,865 - root - INFO - lr: 1.0539e-04 gnorm: 3.27 [ 0:03:01< 3:33:18] +[titan] 2025-06-13 12:44:33,930 - root - INFO - step: 215 loss: 28.4966 memory: 6.46GiB(27.34%) tps: 26,732 tflops: 26.90 mfu: 8.62% global_avg_ntp_loss: 6.9337 global_avg_mtp_loss: 21.5629 +[titan] 2025-06-13 12:44:33,930 - root - INFO - lr: 1.0789e-04 gnorm: 1.74 [ 0:03:04< 3:31:47] +[titan] 2025-06-13 12:44:37,009 - root - INFO - step: 220 loss: 29.3886 memory: 6.46GiB(27.34%) tps: 26,608 tflops: 26.78 mfu: 8.58% global_avg_ntp_loss: 7.1560 global_avg_mtp_loss: 22.2326 +[titan] 2025-06-13 12:44:37,009 - root - INFO - lr: 1.1039e-04 gnorm: 3.84 [ 0:03:07< 3:30:21] +[titan] 2025-06-13 12:44:40,496 - root - INFO - step: 225 loss: 27.3423 memory: 6.46GiB(27.34%) tps: 23,494 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 6.6378 global_avg_mtp_loss: 20.7045 +[titan] 2025-06-13 12:44:40,496 - root - INFO - lr: 1.1289e-04 gnorm: 4.18 [ 0:03:11< 3:29:25] +[titan] 2025-06-13 12:44:43,717 - root - INFO - step: 230 loss: 29.1740 memory: 6.46GiB(27.34%) tps: 25,435 tflops: 25.60 mfu: 8.20% global_avg_ntp_loss: 7.0846 global_avg_mtp_loss: 22.0894 +[titan] 2025-06-13 12:44:43,718 - root - INFO - lr: 1.1538e-04 gnorm: 2.57 [ 0:03:14< 
3:28:15] +[titan] 2025-06-13 12:44:46,972 - root - INFO - step: 235 loss: 29.1274 memory: 6.46GiB(27.34%) tps: 25,175 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 7.0639 global_avg_mtp_loss: 22.0636 +[titan] 2025-06-13 12:44:46,972 - root - INFO - lr: 1.1788e-04 gnorm: 2.41 [ 0:03:17< 3:27:09] +[titan] 2025-06-13 12:44:50,394 - root - INFO - step: 240 loss: 29.1559 memory: 6.46GiB(27.34%) tps: 23,941 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 7.0581 global_avg_mtp_loss: 22.0978 +[titan] 2025-06-13 12:44:50,394 - root - INFO - lr: 1.2038e-04 gnorm: 2.65 [ 0:03:21< 3:26:16] +[titan] 2025-06-13 12:44:53,648 - root - INFO - step: 245 loss: 27.9617 memory: 6.46GiB(27.34%) tps: 25,178 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 6.7413 global_avg_mtp_loss: 21.2204 +[titan] 2025-06-13 12:44:53,649 - root - INFO - lr: 1.2288e-04 gnorm: 3.41 [ 0:03:24< 3:25:16] +[titan] 2025-06-13 12:44:56,261 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:44:56,921 - root - INFO - step: 250 loss: 28.8521 memory: 6.46GiB(27.34%) tps: 25,035 tflops: 25.19 mfu: 8.08% global_avg_ntp_loss: 6.9724 global_avg_mtp_loss: 21.8798 +[titan] 2025-06-13 12:44:56,921 - root - INFO - lr: 1.2537e-04 gnorm: 3.48 [ 0:03:27< 3:24:18] +[titan] 2025-06-13 12:45:00,431 - root - INFO - step: 255 loss: 28.5249 memory: 6.46GiB(27.34%) tps: 23,342 tflops: 23.49 mfu: 7.53% global_avg_ntp_loss: 6.8707 global_avg_mtp_loss: 21.6542 +[titan] 2025-06-13 12:45:00,431 - root - INFO - lr: 1.2787e-04 gnorm: 2.28 [ 0:03:31< 3:23:37] +[titan] 2025-06-13 12:45:03,710 - root - INFO - step: 260 loss: 26.9207 memory: 6.46GiB(27.34%) tps: 24,988 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 6.4975 global_avg_mtp_loss: 20.4231 +[titan] 2025-06-13 12:45:03,710 - root - INFO - lr: 1.3037e-04 gnorm: 6.24 [ 0:03:34< 3:22:44] +[titan] 2025-06-13 12:45:07,271 - root - INFO - step: 265 loss: 28.7229 memory: 6.46GiB(27.34%) tps: 23,006 tflops: 23.15 mfu: 7.42% global_avg_ntp_loss: 6.8949 
global_avg_mtp_loss: 21.8280 +[titan] 2025-06-13 12:45:07,271 - root - INFO - lr: 1.3287e-04 gnorm: 2.62 [ 0:03:38< 3:22:08] +[titan] 2025-06-13 12:45:11,112 - root - INFO - step: 270 loss: 28.6888 memory: 6.46GiB(27.34%) tps: 21,327 tflops: 21.46 mfu: 6.88% global_avg_ntp_loss: 6.8856 global_avg_mtp_loss: 21.8032 +[titan] 2025-06-13 12:45:11,113 - root - INFO - lr: 1.3536e-04 gnorm: 2.83 [ 0:03:41< 3:21:49] +[titan] 2025-06-13 12:45:14,483 - root - INFO - step: 275 loss: 28.7971 memory: 6.46GiB(27.34%) tps: 24,309 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 6.8816 global_avg_mtp_loss: 21.9155 +[titan] 2025-06-13 12:45:14,483 - root - INFO - lr: 1.3786e-04 gnorm: 2.74 [ 0:03:45< 3:21:05] +[titan] 2025-06-13 12:45:17,644 - root - INFO - step: 280 loss: 28.7302 memory: 6.46GiB(27.34%) tps: 25,915 tflops: 26.08 mfu: 8.36% global_avg_ntp_loss: 6.8909 global_avg_mtp_loss: 21.8393 +[titan] 2025-06-13 12:45:17,645 - root - INFO - lr: 1.4036e-04 gnorm: 2.70 [ 0:03:48< 3:20:12] +[titan] 2025-06-13 12:45:21,161 - root - INFO - step: 285 loss: 28.2382 memory: 6.46GiB(27.34%) tps: 23,297 tflops: 23.45 mfu: 7.51% global_avg_ntp_loss: 6.7162 global_avg_mtp_loss: 21.5220 +[titan] 2025-06-13 12:45:21,162 - root - INFO - lr: 1.4286e-04 gnorm: 2.29 [ 0:03:52< 3:19:39] +[titan] 2025-06-13 12:45:24,304 - root - INFO - step: 290 loss: 27.8770 memory: 6.46GiB(27.34%) tps: 26,070 tflops: 26.24 mfu: 8.41% global_avg_ntp_loss: 6.6516 global_avg_mtp_loss: 21.2254 +[titan] 2025-06-13 12:45:24,305 - root - INFO - lr: 1.4535e-04 gnorm: 2.76 [ 0:03:55< 3:18:47] +[titan] 2025-06-13 12:45:27,431 - root - INFO - step: 295 loss: 28.7626 memory: 6.46GiB(27.34%) tps: 26,200 tflops: 26.37 mfu: 8.45% global_avg_ntp_loss: 6.8184 global_avg_mtp_loss: 21.9443 +[titan] 2025-06-13 12:45:27,432 - root - INFO - lr: 1.4785e-04 gnorm: 1.78 [ 0:03:58< 3:17:57] +[titan] 2025-06-13 12:45:30,432 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:45:31,003 - root - INFO - step: 300 loss: 28.3571 memory: 6.46GiB(27.34%) tps: 22,941 tflops: 23.09 mfu: 7.40% global_avg_ntp_loss: 6.7046 global_avg_mtp_loss: 21.6525 +[titan] 2025-06-13 12:45:31,003 - root - INFO - lr: 1.5035e-04 gnorm: 2.45 [ 0:04:01< 3:17:30] +[titan] 2025-06-13 12:45:34,437 - root - INFO - step: 305 loss: 28.6218 memory: 6.46GiB(27.34%) tps: 23,861 tflops: 24.01 mfu: 7.70% global_avg_ntp_loss: 6.8074 global_avg_mtp_loss: 21.8145 +[titan] 2025-06-13 12:45:34,437 - root - INFO - lr: 1.5285e-04 gnorm: 2.24 [ 0:04:05< 3:16:57] +[titan] 2025-06-13 12:45:37,814 - root - INFO - step: 310 loss: 28.5719 memory: 6.46GiB(27.34%) tps: 24,259 tflops: 24.41 mfu: 7.82% global_avg_ntp_loss: 6.7458 global_avg_mtp_loss: 21.8261 +[titan] 2025-06-13 12:45:37,814 - root - INFO - lr: 1.5534e-04 gnorm: 3.01 [ 0:04:08< 3:16:23] +[titan] 2025-06-13 12:45:41,765 - root - INFO - step: 315 loss: 27.6310 memory: 6.46GiB(27.34%) tps: 20,737 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 6.5348 global_avg_mtp_loss: 21.0962 +[titan] 2025-06-13 12:45:41,765 - root - INFO - lr: 1.5784e-04 gnorm: 4.03 [ 0:04:12< 3:16:16] +[titan] 2025-06-13 12:45:45,332 - root - INFO - step: 320 loss: 28.5232 memory: 6.46GiB(27.34%) tps: 22,970 tflops: 23.12 mfu: 7.41% global_avg_ntp_loss: 6.7117 global_avg_mtp_loss: 21.8116 +[titan] 2025-06-13 12:45:45,332 - root - INFO - lr: 1.6034e-04 gnorm: 3.53 [ 0:04:16< 3:15:52] +[titan] 2025-06-13 12:45:48,488 - root - INFO - step: 325 loss: 28.0294 memory: 6.46GiB(27.34%) tps: 25,960 tflops: 26.13 mfu: 8.37% global_avg_ntp_loss: 6.6055 global_avg_mtp_loss: 21.4239 +[titan] 2025-06-13 12:45:48,488 - root - INFO - lr: 1.6284e-04 gnorm: 3.01 [ 0:04:19< 3:15:09] +[titan] 2025-06-13 12:45:51,800 - root - INFO - step: 330 loss: 27.8623 memory: 6.46GiB(27.34%) tps: 24,734 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 6.5924 global_avg_mtp_loss: 21.2699 +[titan] 2025-06-13 12:45:51,800 - root - INFO - lr: 1.6533e-04 gnorm: 3.27 [ 0:04:22< 
3:14:35] +[titan] 2025-06-13 12:45:55,081 - root - INFO - step: 335 loss: 28.6007 memory: 6.46GiB(27.34%) tps: 24,972 tflops: 25.13 mfu: 8.05% global_avg_ntp_loss: 6.7154 global_avg_mtp_loss: 21.8853 +[titan] 2025-06-13 12:45:55,081 - root - INFO - lr: 1.6783e-04 gnorm: 3.00 [ 0:04:25< 3:14:01] +[titan] 2025-06-13 12:45:58,428 - root - INFO - step: 340 loss: 28.3437 memory: 6.46GiB(27.34%) tps: 24,481 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 6.6496 global_avg_mtp_loss: 21.6941 +[titan] 2025-06-13 12:45:58,428 - root - INFO - lr: 1.7033e-04 gnorm: 2.83 [ 0:04:29< 3:13:30] +[titan] 2025-06-13 12:46:01,725 - root - INFO - step: 345 loss: 29.0933 memory: 6.46GiB(27.34%) tps: 24,850 tflops: 25.01 mfu: 8.02% global_avg_ntp_loss: 6.9141 global_avg_mtp_loss: 22.1792 +[titan] 2025-06-13 12:46:01,725 - root - INFO - lr: 1.7283e-04 gnorm: 4.80 [ 0:04:32< 3:12:58] +[titan] 2025-06-13 12:46:04,530 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:46:05,218 - root - INFO - step: 350 loss: 27.8976 memory: 6.46GiB(27.34%) tps: 23,456 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 6.5295 global_avg_mtp_loss: 21.3682 +[titan] 2025-06-13 12:46:05,218 - root - INFO - lr: 1.7532e-04 gnorm: 2.98 [ 0:04:36< 3:12:35] +[titan] 2025-06-13 12:46:08,576 - root - INFO - step: 355 loss: 27.8422 memory: 6.46GiB(27.34%) tps: 24,399 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 6.4872 global_avg_mtp_loss: 21.3550 +[titan] 2025-06-13 12:46:08,576 - root - INFO - lr: 1.7782e-04 gnorm: 2.18 [ 0:04:39< 3:12:07] +[titan] 2025-06-13 12:46:11,936 - root - INFO - step: 360 loss: 27.7755 memory: 6.46GiB(27.34%) tps: 24,383 tflops: 24.54 mfu: 7.86% global_avg_ntp_loss: 6.4725 global_avg_mtp_loss: 21.3030 +[titan] 2025-06-13 12:46:11,936 - root - INFO - lr: 1.8032e-04 gnorm: 2.41 [ 0:04:42< 3:11:39] +[titan] 2025-06-13 12:46:15,207 - root - INFO - step: 365 loss: 27.9619 memory: 6.46GiB(27.34%) tps: 25,048 tflops: 25.21 mfu: 8.08% global_avg_ntp_loss: 6.5313 
global_avg_mtp_loss: 21.4306 +[titan] 2025-06-13 12:46:15,207 - root - INFO - lr: 1.8282e-04 gnorm: 2.23 [ 0:04:46< 3:11:09] +[titan] 2025-06-13 12:46:18,435 - root - INFO - step: 370 loss: 28.0171 memory: 6.46GiB(27.34%) tps: 25,378 tflops: 25.54 mfu: 8.19% global_avg_ntp_loss: 6.6265 global_avg_mtp_loss: 21.3905 +[titan] 2025-06-13 12:46:18,436 - root - INFO - lr: 1.8531e-04 gnorm: 3.81 [ 0:04:49< 3:10:38] +[titan] 2025-06-13 12:46:21,893 - root - INFO - step: 375 loss: 27.6718 memory: 6.46GiB(27.34%) tps: 23,694 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 6.3848 global_avg_mtp_loss: 21.2870 +[titan] 2025-06-13 12:46:21,894 - root - INFO - lr: 1.8781e-04 gnorm: 2.45 [ 0:04:52< 3:10:16] +[titan] 2025-06-13 12:46:25,134 - root - INFO - step: 380 loss: 28.1686 memory: 6.46GiB(27.34%) tps: 25,280 tflops: 25.44 mfu: 8.15% global_avg_ntp_loss: 6.5272 global_avg_mtp_loss: 21.6415 +[titan] 2025-06-13 12:46:25,135 - root - INFO - lr: 1.9031e-04 gnorm: 2.55 [ 0:04:55< 3:09:47] +[titan] 2025-06-13 12:46:28,475 - root - INFO - step: 385 loss: 28.0375 memory: 6.46GiB(27.34%) tps: 24,527 tflops: 24.68 mfu: 7.91% global_avg_ntp_loss: 6.4783 global_avg_mtp_loss: 21.5592 +[titan] 2025-06-13 12:46:28,475 - root - INFO - lr: 1.9281e-04 gnorm: 2.02 [ 0:04:59< 3:09:22] +[titan] 2025-06-13 12:46:31,707 - root - INFO - step: 390 loss: 27.4924 memory: 6.46GiB(27.34%) tps: 25,348 tflops: 25.51 mfu: 8.18% global_avg_ntp_loss: 6.3977 global_avg_mtp_loss: 21.0946 +[titan] 2025-06-13 12:46:31,707 - root - INFO - lr: 1.9530e-04 gnorm: 3.02 [ 0:05:02< 3:08:53] +[titan] 2025-06-13 12:46:35,050 - root - INFO - step: 395 loss: 27.8223 memory: 6.46GiB(27.34%) tps: 24,510 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 6.4204 global_avg_mtp_loss: 21.4019 +[titan] 2025-06-13 12:46:35,050 - root - INFO - lr: 1.9780e-04 gnorm: 3.04 [ 0:05:05< 3:08:30] +[titan] 2025-06-13 12:46:37,513 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:46:38,197 - root - INFO - step: 400 loss: 27.7397 memory: 6.46GiB(27.34%) tps: 26,029 tflops: 26.20 mfu: 8.40% global_avg_ntp_loss: 6.3872 global_avg_mtp_loss: 21.3525 +[titan] 2025-06-13 12:46:38,198 - root - INFO - lr: 2.0030e-04 gnorm: 1.89 [ 0:05:09< 3:07:59] +[titan] 2025-06-13 12:46:41,244 - root - INFO - step: 405 loss: 27.7350 memory: 6.46GiB(27.34%) tps: 26,888 tflops: 27.06 mfu: 8.67% global_avg_ntp_loss: 6.3411 global_avg_mtp_loss: 21.3939 +[titan] 2025-06-13 12:46:41,245 - root - INFO - lr: 2.0280e-04 gnorm: 2.97 [ 0:05:12< 3:07:26] +[titan] 2025-06-13 12:46:44,596 - root - INFO - step: 410 loss: 27.4134 memory: 6.46GiB(27.34%) tps: 24,448 tflops: 24.60 mfu: 7.89% global_avg_ntp_loss: 6.3127 global_avg_mtp_loss: 21.1008 +[titan] 2025-06-13 12:46:44,596 - root - INFO - lr: 2.0529e-04 gnorm: 3.19 [ 0:05:15< 3:07:04] +[titan] 2025-06-13 12:46:48,011 - root - INFO - step: 415 loss: 27.2564 memory: 6.46GiB(27.34%) tps: 23,990 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 6.2383 global_avg_mtp_loss: 21.0182 +[titan] 2025-06-13 12:46:48,011 - root - INFO - lr: 2.0779e-04 gnorm: 2.81 [ 0:05:18< 3:06:45] +[titan] 2025-06-13 12:46:51,229 - root - INFO - step: 420 loss: 27.4610 memory: 6.46GiB(27.34%) tps: 25,459 tflops: 25.62 mfu: 8.21% global_avg_ntp_loss: 6.2807 global_avg_mtp_loss: 21.1803 +[titan] 2025-06-13 12:46:51,230 - root - INFO - lr: 2.1029e-04 gnorm: 2.85 [ 0:05:22< 3:06:20] +[titan] 2025-06-13 12:46:54,524 - root - INFO - step: 425 loss: 26.8515 memory: 6.46GiB(27.34%) tps: 24,869 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 6.1559 global_avg_mtp_loss: 20.6956 +[titan] 2025-06-13 12:46:54,524 - root - INFO - lr: 2.1279e-04 gnorm: 2.74 [ 0:05:25< 3:05:57] +[titan] 2025-06-13 12:46:58,320 - root - INFO - step: 430 loss: 27.2735 memory: 6.46GiB(27.34%) tps: 21,581 tflops: 21.72 mfu: 6.96% global_avg_ntp_loss: 6.1884 global_avg_mtp_loss: 21.0851 +[titan] 2025-06-13 12:46:58,320 - root - INFO - lr: 2.1528e-04 gnorm: 2.72 [ 0:05:29< 
3:05:53] +[titan] 2025-06-13 12:47:01,406 - root - INFO - step: 435 loss: 27.4422 memory: 6.46GiB(27.34%) tps: 26,548 tflops: 26.72 mfu: 8.56% global_avg_ntp_loss: 6.2508 global_avg_mtp_loss: 21.1914 +[titan] 2025-06-13 12:47:01,407 - root - INFO - lr: 2.1778e-04 gnorm: 2.44 [ 0:05:32< 3:05:24] +[titan] 2025-06-13 12:47:04,933 - root - INFO - step: 440 loss: 27.3640 memory: 6.46GiB(27.34%) tps: 23,230 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 6.2041 global_avg_mtp_loss: 21.1599 +[titan] 2025-06-13 12:47:04,933 - root - INFO - lr: 2.2028e-04 gnorm: 2.30 [ 0:05:35< 3:05:10] +[titan] 2025-06-13 12:47:08,234 - root - INFO - step: 445 loss: 27.6714 memory: 6.46GiB(27.34%) tps: 24,821 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 6.3305 global_avg_mtp_loss: 21.3410 +[titan] 2025-06-13 12:47:08,234 - root - INFO - lr: 2.2278e-04 gnorm: 2.27 [ 0:05:39< 3:04:50] +[titan] 2025-06-13 12:47:10,905 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:47:11,454 - root - INFO - step: 450 loss: 27.2493 memory: 6.46GiB(27.34%) tps: 25,441 tflops: 25.60 mfu: 8.21% global_avg_ntp_loss: 6.1587 global_avg_mtp_loss: 21.0906 +[titan] 2025-06-13 12:47:11,455 - root - INFO - lr: 2.2527e-04 gnorm: 2.72 [ 0:05:42< 3:04:27] +[titan] 2025-06-13 12:47:14,987 - root - INFO - step: 455 loss: 27.4793 memory: 6.46GiB(27.34%) tps: 23,191 tflops: 23.34 mfu: 7.48% global_avg_ntp_loss: 6.2614 global_avg_mtp_loss: 21.2179 +[titan] 2025-06-13 12:47:14,987 - root - INFO - lr: 2.2777e-04 gnorm: 2.79 [ 0:05:45< 3:04:14] +[titan] 2025-06-13 12:47:18,499 - root - INFO - step: 460 loss: 27.1990 memory: 6.46GiB(27.34%) tps: 23,331 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 6.1364 global_avg_mtp_loss: 21.0626 +[titan] 2025-06-13 12:47:18,499 - root - INFO - lr: 2.3027e-04 gnorm: 1.88 [ 0:05:49< 3:04:01] +[titan] 2025-06-13 12:47:21,885 - root - INFO - step: 465 loss: 26.4726 memory: 6.46GiB(27.34%) tps: 24,198 tflops: 24.35 mfu: 7.81% global_avg_ntp_loss: 6.0383 
global_avg_mtp_loss: 20.4343 +[titan] 2025-06-13 12:47:21,885 - root - INFO - lr: 2.3277e-04 gnorm: 2.32 [ 0:05:52< 3:03:45] +[titan] 2025-06-13 12:47:25,101 - root - INFO - step: 470 loss: 27.6194 memory: 6.46GiB(27.34%) tps: 25,475 tflops: 25.64 mfu: 8.22% global_avg_ntp_loss: 6.2852 global_avg_mtp_loss: 21.3343 +[titan] 2025-06-13 12:47:25,101 - root - INFO - lr: 2.3526e-04 gnorm: 2.09 [ 0:05:55< 3:03:23] +[titan] 2025-06-13 12:47:28,386 - root - INFO - step: 475 loss: 26.7046 memory: 6.46GiB(27.34%) tps: 24,944 tflops: 25.10 mfu: 8.05% global_avg_ntp_loss: 6.0263 global_avg_mtp_loss: 20.6783 +[titan] 2025-06-13 12:47:28,386 - root - INFO - lr: 2.3776e-04 gnorm: 2.27 [ 0:05:59< 3:03:04] +[titan] 2025-06-13 12:47:31,758 - root - INFO - step: 480 loss: 26.4942 memory: 6.46GiB(27.34%) tps: 24,298 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 6.0358 global_avg_mtp_loss: 20.4584 +[titan] 2025-06-13 12:47:31,758 - root - INFO - lr: 2.4026e-04 gnorm: 3.72 [ 0:06:02< 3:02:48] +[titan] 2025-06-13 12:47:34,823 - root - INFO - step: 485 loss: 26.7233 memory: 6.46GiB(27.34%) tps: 26,732 tflops: 26.90 mfu: 8.62% global_avg_ntp_loss: 6.0231 global_avg_mtp_loss: 20.7001 +[titan] 2025-06-13 12:47:34,823 - root - INFO - lr: 2.4276e-04 gnorm: 2.47 [ 0:06:05< 3:02:23] +[titan] 2025-06-13 12:47:38,901 - root - INFO - step: 490 loss: 27.0197 memory: 6.46GiB(27.34%) tps: 20,086 tflops: 20.21 mfu: 6.48% global_avg_ntp_loss: 6.0996 global_avg_mtp_loss: 20.9201 +[titan] 2025-06-13 12:47:38,902 - root - INFO - lr: 2.4525e-04 gnorm: 1.96 [ 0:06:09< 3:02:28] +[titan] 2025-06-13 12:47:41,700 - root - INFO - step: 495 loss: 26.7648 memory: 6.46GiB(27.34%) tps: 29,278 tflops: 29.46 mfu: 9.44% global_avg_ntp_loss: 6.1238 global_avg_mtp_loss: 20.6410 +[titan] 2025-06-13 12:47:41,700 - root - INFO - lr: 2.4775e-04 gnorm: 2.63 [ 0:06:12< 3:01:56] +[titan] 2025-06-13 12:47:44,444 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:47:45,056 - root - INFO - step: 500 loss: 27.3079 memory: 6.46GiB(27.34%) tps: 24,407 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 6.1168 global_avg_mtp_loss: 21.1910 +[titan] 2025-06-13 12:47:45,057 - root - INFO - lr: 2.5025e-04 gnorm: 2.20 [ 0:06:15< 3:01:40] +[titan] 2025-06-13 12:47:48,533 - root - INFO - step: 505 loss: 27.2190 memory: 6.46GiB(27.34%) tps: 23,564 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 6.1992 global_avg_mtp_loss: 21.0198 +[titan] 2025-06-13 12:47:48,534 - root - INFO - lr: 2.5275e-04 gnorm: 2.71 [ 0:06:19< 3:01:28] +[titan] 2025-06-13 12:47:51,614 - root - INFO - step: 510 loss: 26.0190 memory: 6.46GiB(27.34%) tps: 26,593 tflops: 26.76 mfu: 8.58% global_avg_ntp_loss: 5.8499 global_avg_mtp_loss: 20.1691 +[titan] 2025-06-13 12:47:51,615 - root - INFO - lr: 2.5524e-04 gnorm: 2.07 [ 0:06:22< 3:01:05] +[titan] 2025-06-13 12:47:53,177 - root - INFO - Dumping profiler traces at step 512 +[titan] 2025-06-13 12:47:53,258 - root - INFO - Finished dumping profiler traces in 0.08 seconds +[titan] 2025-06-13 12:47:55,243 - root - INFO - step: 515 loss: 26.6682 memory: 6.46GiB(27.34%) tps: 22,576 tflops: 22.72 mfu: 7.28% global_avg_ntp_loss: 5.9412 global_avg_mtp_loss: 20.7269 +[titan] 2025-06-13 12:47:55,244 - root - INFO - lr: 2.5774e-04 gnorm: 1.89 [ 0:06:26< 3:00:58] +[titan] 2025-06-13 12:47:58,616 - root - INFO - step: 520 loss: 25.8300 memory: 6.46GiB(27.34%) tps: 24,296 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 5.8827 global_avg_mtp_loss: 19.9474 +[titan] 2025-06-13 12:47:58,616 - root - INFO - lr: 2.6024e-04 gnorm: 4.22 [ 0:06:29< 3:00:44] +[titan] 2025-06-13 12:48:02,053 - root - INFO - step: 525 loss: 25.0639 memory: 6.46GiB(27.34%) tps: 23,835 tflops: 23.99 mfu: 7.69% global_avg_ntp_loss: 5.6694 global_avg_mtp_loss: 19.3945 +[titan] 2025-06-13 12:48:02,053 - root - INFO - lr: 2.6274e-04 gnorm: 4.88 [ 0:06:32< 3:00:32] +[titan] 2025-06-13 12:48:05,758 - root - INFO - step: 530 loss: 26.7810 memory: 
6.46GiB(27.34%) tps: 22,113 tflops: 22.25 mfu: 7.13% global_avg_ntp_loss: 6.0413 global_avg_mtp_loss: 20.7397 +[titan] 2025-06-13 12:48:05,758 - root - INFO - lr: 2.6523e-04 gnorm: 3.73 [ 0:06:36< 3:00:27] +[titan] 2025-06-13 12:48:08,735 - root - INFO - step: 535 loss: 26.8855 memory: 6.46GiB(27.34%) tps: 27,520 tflops: 27.70 mfu: 8.88% global_avg_ntp_loss: 6.0347 global_avg_mtp_loss: 20.8508 +[titan] 2025-06-13 12:48:08,735 - root - INFO - lr: 2.6773e-04 gnorm: 2.53 [ 0:06:39< 3:00:03] +[titan] 2025-06-13 12:48:12,522 - root - INFO - step: 540 loss: 26.9649 memory: 6.46GiB(27.34%) tps: 21,634 tflops: 21.77 mfu: 6.98% global_avg_ntp_loss: 5.9941 global_avg_mtp_loss: 20.9708 +[titan] 2025-06-13 12:48:12,522 - root - INFO - lr: 2.7023e-04 gnorm: 1.87 [ 0:06:43< 3:00:00] +[titan] 2025-06-13 12:48:15,764 - root - INFO - step: 545 loss: 26.6519 memory: 6.46GiB(27.34%) tps: 25,274 tflops: 25.43 mfu: 8.15% global_avg_ntp_loss: 5.9282 global_avg_mtp_loss: 20.7237 +[titan] 2025-06-13 12:48:15,764 - root - INFO - lr: 2.7273e-04 gnorm: 2.84 [ 0:06:46< 2:59:43] +[titan] 2025-06-13 12:48:18,527 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:48:19,166 - root - INFO - step: 550 loss: 26.4560 memory: 6.46GiB(27.34%) tps: 24,082 tflops: 24.24 mfu: 7.77% global_avg_ntp_loss: 5.8798 global_avg_mtp_loss: 20.5762 +[titan] 2025-06-13 12:48:19,166 - root - INFO - lr: 2.7522e-04 gnorm: 1.92 [ 0:06:49< 2:59:31] +[titan] 2025-06-13 12:48:22,448 - root - INFO - step: 555 loss: 26.6798 memory: 6.46GiB(27.34%) tps: 24,962 tflops: 25.12 mfu: 8.05% global_avg_ntp_loss: 6.0242 global_avg_mtp_loss: 20.6557 +[titan] 2025-06-13 12:48:22,448 - root - INFO - lr: 2.7772e-04 gnorm: 2.67 [ 0:06:53< 2:59:16] +[titan] 2025-06-13 12:48:26,023 - root - INFO - step: 560 loss: 26.6921 memory: 6.46GiB(27.34%) tps: 22,920 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 5.9768 global_avg_mtp_loss: 20.7153 +[titan] 2025-06-13 12:48:26,023 - root - INFO - lr: 2.8022e-04 gnorm: 2.34 [ 0:06:56< 2:59:08] +[titan] 2025-06-13 12:48:29,236 - root - INFO - step: 565 loss: 26.6977 memory: 6.46GiB(27.34%) tps: 25,495 tflops: 25.66 mfu: 8.22% global_avg_ntp_loss: 5.9077 global_avg_mtp_loss: 20.7900 +[titan] 2025-06-13 12:48:29,237 - root - INFO - lr: 2.8272e-04 gnorm: 2.60 [ 0:07:00< 2:58:51] +[titan] 2025-06-13 12:48:32,909 - root - INFO - step: 570 loss: 26.3573 memory: 6.46GiB(27.34%) tps: 22,305 tflops: 22.45 mfu: 7.19% global_avg_ntp_loss: 5.8093 global_avg_mtp_loss: 20.5480 +[titan] 2025-06-13 12:48:32,910 - root - INFO - lr: 2.8521e-04 gnorm: 1.78 [ 0:07:03< 2:58:47] +[titan] 2025-06-13 12:48:35,985 - root - INFO - step: 575 loss: 26.2929 memory: 6.46GiB(27.34%) tps: 26,642 tflops: 26.81 mfu: 8.59% global_avg_ntp_loss: 5.7928 global_avg_mtp_loss: 20.5001 +[titan] 2025-06-13 12:48:35,985 - root - INFO - lr: 2.8771e-04 gnorm: 2.08 [ 0:07:06< 2:58:27] +[titan] 2025-06-13 12:48:39,515 - root - INFO - step: 580 loss: 26.4730 memory: 6.46GiB(27.34%) tps: 23,209 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 5.8972 global_avg_mtp_loss: 20.5758 +[titan] 2025-06-13 12:48:39,516 - root - INFO - lr: 2.9021e-04 gnorm: 2.00 [ 0:07:10< 
2:58:19] +[titan] 2025-06-13 12:48:42,904 - root - INFO - step: 585 loss: 26.7014 memory: 6.46GiB(27.34%) tps: 24,175 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 5.8936 global_avg_mtp_loss: 20.8078 +[titan] 2025-06-13 12:48:42,905 - root - INFO - lr: 2.9271e-04 gnorm: 2.77 [ 0:07:13< 2:58:07] +[titan] 2025-06-13 12:48:46,283 - root - INFO - step: 590 loss: 27.0254 memory: 6.46GiB(27.34%) tps: 24,252 tflops: 24.41 mfu: 7.82% global_avg_ntp_loss: 6.0663 global_avg_mtp_loss: 20.9591 +[titan] 2025-06-13 12:48:46,283 - root - INFO - lr: 2.9520e-04 gnorm: 2.06 [ 0:07:17< 2:57:55] +[titan] 2025-06-13 12:48:49,942 - root - INFO - step: 595 loss: 26.2415 memory: 6.46GiB(27.34%) tps: 22,385 tflops: 22.53 mfu: 7.22% global_avg_ntp_loss: 5.7651 global_avg_mtp_loss: 20.4764 +[titan] 2025-06-13 12:48:49,943 - root - INFO - lr: 2.9770e-04 gnorm: 1.71 [ 0:07:20< 2:57:50] +[titan] 2025-06-13 12:48:52,292 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:48:53,028 - root - INFO - step: 600 loss: 26.2138 memory: 6.46GiB(27.34%) tps: 26,552 tflops: 26.72 mfu: 8.56% global_avg_ntp_loss: 5.8629 global_avg_mtp_loss: 20.3510 +[titan] 2025-06-13 12:48:53,028 - root - INFO - lr: 3.0020e-04 gnorm: 2.69 [ 0:07:23< 2:57:32] +[titan] 2025-06-13 12:48:56,161 - root - INFO - step: 605 loss: 26.9608 memory: 6.46GiB(27.34%) tps: 26,149 tflops: 26.32 mfu: 8.43% global_avg_ntp_loss: 6.1317 global_avg_mtp_loss: 20.8291 +[titan] 2025-06-13 12:48:56,162 - root - INFO - lr: 3.0270e-04 gnorm: 3.67 [ 0:07:26< 2:57:15] +[titan] 2025-06-13 12:48:59,440 - root - INFO - step: 610 loss: 26.6352 memory: 6.46GiB(27.34%) tps: 24,993 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 5.8912 global_avg_mtp_loss: 20.7440 +[titan] 2025-06-13 12:48:59,440 - root - INFO - lr: 3.0519e-04 gnorm: 1.70 [ 0:07:30< 2:57:01] +[titan] 2025-06-13 12:49:03,452 - root - INFO - step: 615 loss: 26.4638 memory: 6.46GiB(27.34%) tps: 20,421 tflops: 20.55 mfu: 6.59% global_avg_ntp_loss: 5.8763 
global_avg_mtp_loss: 20.5875 +[titan] 2025-06-13 12:49:03,453 - root - INFO - lr: 3.0769e-04 gnorm: 2.35 [ 0:07:34< 2:57:05] +[titan] 2025-06-13 12:49:07,137 - root - INFO - step: 620 loss: 26.3830 memory: 6.46GiB(27.34%) tps: 22,232 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 5.8260 global_avg_mtp_loss: 20.5570 +[titan] 2025-06-13 12:49:07,138 - root - INFO - lr: 3.1019e-04 gnorm: 1.99 [ 0:07:37< 2:57:01] +[titan] 2025-06-13 12:49:11,160 - root - INFO - step: 625 loss: 25.7997 memory: 6.46GiB(27.34%) tps: 20,369 tflops: 20.50 mfu: 6.57% global_avg_ntp_loss: 5.8202 global_avg_mtp_loss: 19.9795 +[titan] 2025-06-13 12:49:11,160 - root - INFO - lr: 3.1269e-04 gnorm: 2.06 [ 0:07:41< 2:57:05] +[titan] 2025-06-13 12:49:15,068 - root - INFO - step: 630 loss: 26.6903 memory: 6.46GiB(27.34%) tps: 20,959 tflops: 21.09 mfu: 6.76% global_avg_ntp_loss: 5.9297 global_avg_mtp_loss: 20.7607 +[titan] 2025-06-13 12:49:15,069 - root - INFO - lr: 3.1518e-04 gnorm: 2.30 [ 0:07:45< 2:57:06] +[titan] 2025-06-13 12:49:18,670 - root - INFO - step: 635 loss: 26.6453 memory: 6.46GiB(27.34%) tps: 22,750 tflops: 22.90 mfu: 7.34% global_avg_ntp_loss: 5.8230 global_avg_mtp_loss: 20.8223 +[titan] 2025-06-13 12:49:18,670 - root - INFO - lr: 3.1768e-04 gnorm: 1.58 [ 0:07:49< 2:57:00] +[titan] 2025-06-13 12:49:22,161 - root - INFO - step: 640 loss: 26.2067 memory: 6.46GiB(27.34%) tps: 23,468 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 5.7201 global_avg_mtp_loss: 20.4867 +[titan] 2025-06-13 12:49:22,161 - root - INFO - lr: 3.2018e-04 gnorm: 1.84 [ 0:07:52< 2:56:52] +[titan] 2025-06-13 12:49:25,756 - root - INFO - step: 645 loss: 25.8132 memory: 6.46GiB(27.34%) tps: 22,790 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 5.6809 global_avg_mtp_loss: 20.1323 +[titan] 2025-06-13 12:49:25,756 - root - INFO - lr: 3.2268e-04 gnorm: 2.16 [ 0:07:56< 2:56:46] +[titan] 2025-06-13 12:49:28,862 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:49:29,505 - root - INFO - step: 650 loss: 25.5448 memory: 6.46GiB(27.34%) tps: 21,850 tflops: 21.99 mfu: 7.05% global_avg_ntp_loss: 5.6304 global_avg_mtp_loss: 19.9143 +[titan] 2025-06-13 12:49:29,506 - root - INFO - lr: 3.2517e-04 gnorm: 2.43 [ 0:08:00< 2:56:43] +[titan] 2025-06-13 12:49:32,843 - root - INFO - step: 655 loss: 27.8460 memory: 6.46GiB(27.34%) tps: 24,546 tflops: 24.70 mfu: 7.92% global_avg_ntp_loss: 6.2843 global_avg_mtp_loss: 21.5618 +[titan] 2025-06-13 12:49:32,844 - root - INFO - lr: 3.2767e-04 gnorm: 7.72 [ 0:08:03< 2:56:32] +[titan] 2025-06-13 12:49:36,265 - root - INFO - step: 660 loss: 24.6012 memory: 6.46GiB(27.34%) tps: 23,947 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 5.4140 global_avg_mtp_loss: 19.1871 +[titan] 2025-06-13 12:49:36,265 - root - INFO - lr: 3.3017e-04 gnorm: 4.72 [ 0:08:07< 2:56:22] +[titan] 2025-06-13 12:49:39,536 - root - INFO - step: 665 loss: 25.6338 memory: 6.46GiB(27.34%) tps: 25,045 tflops: 25.20 mfu: 8.08% global_avg_ntp_loss: 5.6522 global_avg_mtp_loss: 19.9817 +[titan] 2025-06-13 12:49:39,536 - root - INFO - lr: 3.3267e-04 gnorm: 1.83 [ 0:08:10< 2:56:10] +[titan] 2025-06-13 12:49:43,014 - root - INFO - step: 670 loss: 25.9451 memory: 6.46GiB(27.34%) tps: 23,558 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 5.7127 global_avg_mtp_loss: 20.2324 +[titan] 2025-06-13 12:49:43,014 - root - INFO - lr: 3.3516e-04 gnorm: 2.34 [ 0:08:13< 2:56:02] +[titan] 2025-06-13 12:49:46,586 - root - INFO - step: 675 loss: 25.9510 memory: 6.46GiB(27.34%) tps: 22,938 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 5.7408 global_avg_mtp_loss: 20.2102 +[titan] 2025-06-13 12:49:46,586 - root - INFO - lr: 3.3766e-04 gnorm: 2.81 [ 0:08:17< 2:55:55] +[titan] 2025-06-13 12:49:49,837 - root - INFO - step: 680 loss: 26.4284 memory: 6.46GiB(27.34%) tps: 25,197 tflops: 25.36 mfu: 8.13% global_avg_ntp_loss: 5.7665 global_avg_mtp_loss: 20.6619 +[titan] 2025-06-13 12:49:49,838 - root - INFO - lr: 3.4016e-04 gnorm: 1.61 [ 0:08:20< 
2:55:43] +[titan] 2025-06-13 12:49:53,053 - root - INFO - step: 685 loss: 25.9515 memory: 6.46GiB(27.34%) tps: 25,476 tflops: 25.64 mfu: 8.22% global_avg_ntp_loss: 5.6830 global_avg_mtp_loss: 20.2685 +[titan] 2025-06-13 12:49:53,054 - root - INFO - lr: 3.4266e-04 gnorm: 1.92 [ 0:08:23< 2:55:29] +[titan] 2025-06-13 12:49:56,348 - root - INFO - step: 690 loss: 26.2744 memory: 6.46GiB(27.34%) tps: 24,870 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 5.7327 global_avg_mtp_loss: 20.5417 +[titan] 2025-06-13 12:49:56,348 - root - INFO - lr: 3.4515e-04 gnorm: 1.94 [ 0:08:27< 2:55:18] +[titan] 2025-06-13 12:49:59,959 - root - INFO - step: 695 loss: 26.2439 memory: 6.46GiB(27.34%) tps: 22,687 tflops: 22.83 mfu: 7.32% global_avg_ntp_loss: 5.7310 global_avg_mtp_loss: 20.5130 +[titan] 2025-06-13 12:49:59,960 - root - INFO - lr: 3.4765e-04 gnorm: 2.04 [ 0:08:30< 2:55:13] +[titan] 2025-06-13 12:50:02,351 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:50:02,911 - root - INFO - step: 700 loss: 25.5073 memory: 6.46GiB(27.34%) tps: 27,754 tflops: 27.93 mfu: 8.95% global_avg_ntp_loss: 5.5517 global_avg_mtp_loss: 19.9556 +[titan] 2025-06-13 12:50:02,912 - root - INFO - lr: 3.5015e-04 gnorm: 1.99 [ 0:08:33< 2:54:54] +[titan] 2025-06-13 12:50:06,490 - root - INFO - step: 705 loss: 26.1197 memory: 6.46GiB(27.34%) tps: 22,895 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 5.7081 global_avg_mtp_loss: 20.4116 +[titan] 2025-06-13 12:50:06,490 - root - INFO - lr: 3.5265e-04 gnorm: 1.94 [ 0:08:37< 2:54:49] +[titan] 2025-06-13 12:50:09,522 - root - INFO - step: 710 loss: 25.0837 memory: 6.46GiB(27.34%) tps: 27,020 tflops: 27.19 mfu: 8.72% global_avg_ntp_loss: 5.4594 global_avg_mtp_loss: 19.6243 +[titan] 2025-06-13 12:50:09,522 - root - INFO - lr: 3.5514e-04 gnorm: 2.00 [ 0:08:40< 2:54:32] +[titan] 2025-06-13 12:50:12,778 - root - INFO - step: 715 loss: 26.2810 memory: 6.46GiB(27.34%) tps: 25,168 tflops: 25.33 mfu: 8.12% global_avg_ntp_loss: 5.7597 
global_avg_mtp_loss: 20.5213 +[titan] 2025-06-13 12:50:12,778 - root - INFO - lr: 3.5764e-04 gnorm: 1.80 [ 0:08:43< 2:54:20] +[titan] 2025-06-13 12:50:16,281 - root - INFO - step: 720 loss: 25.5024 memory: 6.46GiB(27.34%) tps: 23,382 tflops: 23.53 mfu: 7.54% global_avg_ntp_loss: 5.5435 global_avg_mtp_loss: 19.9590 +[titan] 2025-06-13 12:50:16,282 - root - INFO - lr: 3.6014e-04 gnorm: 1.70 [ 0:08:47< 2:54:13] +[titan] 2025-06-13 12:50:19,805 - root - INFO - step: 725 loss: 25.3038 memory: 6.46GiB(27.34%) tps: 23,255 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 5.5099 global_avg_mtp_loss: 19.7939 +[titan] 2025-06-13 12:50:19,805 - root - INFO - lr: 3.6264e-04 gnorm: 1.84 [ 0:08:50< 2:54:07] +[titan] 2025-06-13 12:50:23,655 - root - INFO - step: 730 loss: 25.6549 memory: 6.46GiB(27.34%) tps: 21,280 tflops: 21.42 mfu: 6.86% global_avg_ntp_loss: 5.5240 global_avg_mtp_loss: 20.1309 +[titan] 2025-06-13 12:50:23,655 - root - INFO - lr: 3.6513e-04 gnorm: 1.70 [ 0:08:54< 2:54:07] +[titan] 2025-06-13 12:50:27,185 - root - INFO - step: 735 loss: 25.5269 memory: 6.46GiB(27.34%) tps: 23,209 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 5.5035 global_avg_mtp_loss: 20.0234 +[titan] 2025-06-13 12:50:27,185 - root - INFO - lr: 3.6763e-04 gnorm: 1.84 [ 0:08:57< 2:54:01] +[titan] 2025-06-13 12:50:30,477 - root - INFO - step: 740 loss: 25.6161 memory: 6.46GiB(27.34%) tps: 24,888 tflops: 25.05 mfu: 8.03% global_avg_ntp_loss: 5.5405 global_avg_mtp_loss: 20.0756 +[titan] 2025-06-13 12:50:30,477 - root - INFO - lr: 3.7013e-04 gnorm: 1.45 [ 0:09:01< 2:53:50] +[titan] 2025-06-13 12:50:33,879 - root - INFO - step: 745 loss: 25.3289 memory: 6.46GiB(27.34%) tps: 24,081 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 5.4707 global_avg_mtp_loss: 19.8582 +[titan] 2025-06-13 12:50:33,880 - root - INFO - lr: 3.7263e-04 gnorm: 1.62 [ 0:09:04< 2:53:42] +[titan] 2025-06-13 12:50:36,215 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:50:36,916 - root - INFO - step: 750 loss: 24.8997 memory: 6.46GiB(27.34%) tps: 26,986 tflops: 27.16 mfu: 8.70% global_avg_ntp_loss: 5.3739 global_avg_mtp_loss: 19.5258 +[titan] 2025-06-13 12:50:36,916 - root - INFO - lr: 3.7512e-04 gnorm: 1.85 [ 0:09:07< 2:53:26] +[titan] 2025-06-13 12:50:40,923 - root - INFO - step: 755 loss: 25.1343 memory: 6.46GiB(27.34%) tps: 20,446 tflops: 20.58 mfu: 6.59% global_avg_ntp_loss: 5.4576 global_avg_mtp_loss: 19.6767 +[titan] 2025-06-13 12:50:40,923 - root - INFO - lr: 3.7762e-04 gnorm: 3.07 [ 0:09:11< 2:53:29] +[titan] 2025-06-13 12:50:43,993 - root - INFO - step: 760 loss: 24.8638 memory: 6.46GiB(27.34%) tps: 26,681 tflops: 26.85 mfu: 8.61% global_avg_ntp_loss: 5.2956 global_avg_mtp_loss: 19.5682 +[titan] 2025-06-13 12:50:43,994 - root - INFO - lr: 3.8012e-04 gnorm: 1.79 [ 0:09:14< 2:53:15] +[titan] 2025-06-13 12:50:48,010 - root - INFO - step: 765 loss: 24.9258 memory: 6.46GiB(27.34%) tps: 20,395 tflops: 20.53 mfu: 6.58% global_avg_ntp_loss: 5.3106 global_avg_mtp_loss: 19.6152 +[titan] 2025-06-13 12:50:48,011 - root - INFO - lr: 3.8262e-04 gnorm: 2.05 [ 0:09:18< 2:53:18] +[titan] 2025-06-13 12:50:51,775 - root - INFO - step: 770 loss: 25.5008 memory: 6.46GiB(27.34%) tps: 21,764 tflops: 21.90 mfu: 7.02% global_avg_ntp_loss: 5.5008 global_avg_mtp_loss: 19.9999 +[titan] 2025-06-13 12:50:51,775 - root - INFO - lr: 3.8511e-04 gnorm: 1.76 [ 0:09:22< 2:53:16] +[titan] 2025-06-13 12:50:54,965 - root - INFO - step: 775 loss: 26.1613 memory: 6.46GiB(27.34%) tps: 25,682 tflops: 25.85 mfu: 8.28% global_avg_ntp_loss: 5.7067 global_avg_mtp_loss: 20.4547 +[titan] 2025-06-13 12:50:54,965 - root - INFO - lr: 3.8761e-04 gnorm: 1.79 [ 0:09:25< 2:53:04] +[titan] 2025-06-13 12:50:58,640 - root - INFO - step: 780 loss: 25.0707 memory: 6.46GiB(27.34%) tps: 22,297 tflops: 22.44 mfu: 7.19% global_avg_ntp_loss: 5.4892 global_avg_mtp_loss: 19.5814 +[titan] 2025-06-13 12:50:58,640 - root - INFO - lr: 3.9011e-04 gnorm: 3.21 [ 0:09:29< 
2:53:01] +[titan] 2025-06-13 12:51:02,500 - root - INFO - step: 785 loss: 25.4569 memory: 6.46GiB(27.34%) tps: 21,224 tflops: 21.36 mfu: 6.85% global_avg_ntp_loss: 5.5469 global_avg_mtp_loss: 19.9099 +[titan] 2025-06-13 12:51:02,503 - root - INFO - lr: 3.9261e-04 gnorm: 2.90 [ 0:09:33< 2:53:01] +[titan] 2025-06-13 12:51:06,550 - root - INFO - step: 790 loss: 25.7539 memory: 6.46GiB(27.34%) tps: 20,243 tflops: 20.37 mfu: 6.53% global_avg_ntp_loss: 5.5720 global_avg_mtp_loss: 20.1819 +[titan] 2025-06-13 12:51:06,550 - root - INFO - lr: 3.9510e-04 gnorm: 1.87 [ 0:09:37< 2:53:04] +[titan] 2025-06-13 12:51:09,902 - root - INFO - step: 795 loss: 25.8946 memory: 6.46GiB(27.34%) tps: 24,443 tflops: 24.60 mfu: 7.88% global_avg_ntp_loss: 5.5732 global_avg_mtp_loss: 20.3214 +[titan] 2025-06-13 12:51:09,902 - root - INFO - lr: 3.9760e-04 gnorm: 1.87 [ 0:09:40< 2:52:55] +[titan] 2025-06-13 12:51:12,651 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:51:13,219 - root - INFO - step: 800 loss: 25.9748 memory: 6.46GiB(27.34%) tps: 24,696 tflops: 24.85 mfu: 7.97% global_avg_ntp_loss: 5.5567 global_avg_mtp_loss: 20.4181 +[titan] 2025-06-13 12:51:13,220 - root - INFO - lr: 4.0010e-04 gnorm: 1.84 [ 0:09:44< 2:52:46] +[titan] 2025-06-13 12:51:16,648 - root - INFO - step: 805 loss: 25.4073 memory: 6.46GiB(27.34%) tps: 23,896 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 5.5091 global_avg_mtp_loss: 19.8982 +[titan] 2025-06-13 12:51:16,648 - root - INFO - lr: 4.0260e-04 gnorm: 1.80 [ 0:09:47< 2:52:38] +[titan] 2025-06-13 12:51:20,446 - root - INFO - step: 810 loss: 25.5605 memory: 6.46GiB(27.34%) tps: 21,570 tflops: 21.71 mfu: 6.96% global_avg_ntp_loss: 5.5195 global_avg_mtp_loss: 20.0411 +[titan] 2025-06-13 12:51:20,447 - root - INFO - lr: 4.0509e-04 gnorm: 1.99 [ 0:09:51< 2:52:37] +[titan] 2025-06-13 12:51:24,566 - root - INFO - step: 815 loss: 25.2237 memory: 6.46GiB(27.34%) tps: 19,887 tflops: 20.01 mfu: 6.41% global_avg_ntp_loss: 5.4078 
global_avg_mtp_loss: 19.8159 +[titan] 2025-06-13 12:51:24,566 - root - INFO - lr: 4.0759e-04 gnorm: 1.69 [ 0:09:55< 2:52:42] +[titan] 2025-06-13 12:51:27,853 - root - INFO - step: 820 loss: 25.9747 memory: 6.46GiB(27.34%) tps: 24,927 tflops: 25.09 mfu: 8.04% global_avg_ntp_loss: 5.6126 global_avg_mtp_loss: 20.3621 +[titan] 2025-06-13 12:51:27,853 - root - INFO - lr: 4.1009e-04 gnorm: 1.80 [ 0:09:58< 2:52:32] +[titan] 2025-06-13 12:51:31,227 - root - INFO - step: 825 loss: 24.9274 memory: 6.46GiB(27.34%) tps: 24,282 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 5.3403 global_avg_mtp_loss: 19.5871 +[titan] 2025-06-13 12:51:31,227 - root - INFO - lr: 4.1259e-04 gnorm: 2.54 [ 0:10:02< 2:52:23] +[titan] 2025-06-13 12:51:34,813 - root - INFO - step: 830 loss: 25.4280 memory: 6.46GiB(27.34%) tps: 22,849 tflops: 22.99 mfu: 7.37% global_avg_ntp_loss: 5.4836 global_avg_mtp_loss: 19.9444 +[titan] 2025-06-13 12:51:34,814 - root - INFO - lr: 4.1508e-04 gnorm: 1.38 [ 0:10:05< 2:52:19] +[titan] 2025-06-13 12:51:38,568 - root - INFO - step: 835 loss: 25.8277 memory: 6.46GiB(27.34%) tps: 21,818 tflops: 21.96 mfu: 7.04% global_avg_ntp_loss: 5.5469 global_avg_mtp_loss: 20.2807 +[titan] 2025-06-13 12:51:38,569 - root - INFO - lr: 4.1758e-04 gnorm: 1.54 [ 0:10:09< 2:52:17] +[titan] 2025-06-13 12:51:42,644 - root - INFO - step: 840 loss: 25.5608 memory: 6.46GiB(27.34%) tps: 20,105 tflops: 20.23 mfu: 6.48% global_avg_ntp_loss: 5.4546 global_avg_mtp_loss: 20.1061 +[titan] 2025-06-13 12:51:42,644 - root - INFO - lr: 4.2008e-04 gnorm: 1.48 [ 0:10:13< 2:52:20] +[titan] 2025-06-13 12:51:45,723 - root - INFO - step: 845 loss: 24.4729 memory: 6.46GiB(27.34%) tps: 26,608 tflops: 26.78 mfu: 8.58% global_avg_ntp_loss: 5.1891 global_avg_mtp_loss: 19.2839 +[titan] 2025-06-13 12:51:45,723 - root - INFO - lr: 4.2258e-04 gnorm: 1.67 [ 0:10:16< 2:52:07] +[titan] 2025-06-13 12:51:49,353 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:51:49,981 - root - INFO - step: 850 loss: 25.2714 memory: 6.46GiB(27.34%) tps: 19,239 tflops: 19.36 mfu: 6.21% global_avg_ntp_loss: 5.3927 global_avg_mtp_loss: 19.8787 +[titan] 2025-06-13 12:51:49,982 - root - INFO - lr: 4.2507e-04 gnorm: 1.72 [ 0:10:20< 2:52:14] +[titan] 2025-06-13 12:51:53,283 - root - INFO - step: 855 loss: 24.8126 memory: 6.46GiB(27.34%) tps: 24,817 tflops: 24.98 mfu: 8.00% global_avg_ntp_loss: 5.2890 global_avg_mtp_loss: 19.5236 +[titan] 2025-06-13 12:51:53,283 - root - INFO - lr: 4.2757e-04 gnorm: 2.26 [ 0:10:24< 2:52:04] +[titan] 2025-06-13 12:51:57,823 - root - INFO - step: 860 loss: 25.8109 memory: 6.46GiB(27.34%) tps: 18,045 tflops: 18.16 mfu: 5.82% global_avg_ntp_loss: 5.4881 global_avg_mtp_loss: 20.3229 +[titan] 2025-06-13 12:51:57,823 - root - INFO - lr: 4.3007e-04 gnorm: 1.71 [ 0:10:28< 2:52:15] +[titan] 2025-06-13 12:52:01,615 - root - INFO - step: 865 loss: 24.2112 memory: 6.46GiB(27.34%) tps: 21,606 tflops: 21.74 mfu: 6.97% global_avg_ntp_loss: 5.2076 global_avg_mtp_loss: 19.0036 +[titan] 2025-06-13 12:52:01,615 - root - INFO - lr: 4.3257e-04 gnorm: 1.87 [ 0:10:32< 2:52:14] +[titan] 2025-06-13 12:52:04,799 - root - INFO - step: 870 loss: 25.8884 memory: 6.46GiB(27.34%) tps: 25,731 tflops: 25.90 mfu: 8.30% global_avg_ntp_loss: 5.5343 global_avg_mtp_loss: 20.3540 +[titan] 2025-06-13 12:52:04,799 - root - INFO - lr: 4.3506e-04 gnorm: 1.60 [ 0:10:35< 2:52:02] +[titan] 2025-06-13 12:52:08,359 - root - INFO - step: 875 loss: 25.1152 memory: 6.46GiB(27.34%) tps: 23,017 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 5.3694 global_avg_mtp_loss: 19.7458 +[titan] 2025-06-13 12:52:08,359 - root - INFO - lr: 4.3756e-04 gnorm: 1.56 [ 0:10:39< 2:51:57] +[titan] 2025-06-13 12:52:12,296 - root - INFO - step: 880 loss: 25.3834 memory: 6.46GiB(27.34%) tps: 20,808 tflops: 20.94 mfu: 6.71% global_avg_ntp_loss: 5.4266 global_avg_mtp_loss: 19.9567 +[titan] 2025-06-13 12:52:12,296 - root - INFO - lr: 4.4006e-04 gnorm: 1.55 [ 0:10:43< 
2:51:58] +[titan] 2025-06-13 12:52:15,972 - root - INFO - step: 885 loss: 25.1855 memory: 6.46GiB(27.34%) tps: 22,288 tflops: 22.43 mfu: 7.19% global_avg_ntp_loss: 5.3809 global_avg_mtp_loss: 19.8046 +[titan] 2025-06-13 12:52:15,972 - root - INFO - lr: 4.4256e-04 gnorm: 1.44 [ 0:10:46< 2:51:55] +[titan] 2025-06-13 12:52:19,355 - root - INFO - step: 890 loss: 25.5319 memory: 6.46GiB(27.34%) tps: 24,220 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 5.4132 global_avg_mtp_loss: 20.1187 +[titan] 2025-06-13 12:52:19,355 - root - INFO - lr: 4.4505e-04 gnorm: 1.77 [ 0:10:50< 2:51:47] +[titan] 2025-06-13 12:52:22,846 - root - INFO - step: 895 loss: 25.6842 memory: 6.46GiB(27.34%) tps: 23,466 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 5.5744 global_avg_mtp_loss: 20.1098 +[titan] 2025-06-13 12:52:22,846 - root - INFO - lr: 4.4755e-04 gnorm: 3.85 [ 0:10:53< 2:51:41] +[titan] 2025-06-13 12:52:26,015 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:52:26,584 - root - INFO - step: 900 loss: 25.3594 memory: 6.46GiB(27.34%) tps: 21,919 tflops: 22.06 mfu: 7.07% global_avg_ntp_loss: 5.3886 global_avg_mtp_loss: 19.9708 +[titan] 2025-06-13 12:52:26,584 - root - INFO - lr: 4.5005e-04 gnorm: 1.40 [ 0:10:57< 2:51:38] +[titan] 2025-06-13 12:52:30,217 - root - INFO - step: 905 loss: 24.5221 memory: 6.46GiB(27.34%) tps: 22,552 tflops: 22.70 mfu: 7.27% global_avg_ntp_loss: 5.2301 global_avg_mtp_loss: 19.2920 +[titan] 2025-06-13 12:52:30,217 - root - INFO - lr: 4.5255e-04 gnorm: 1.47 [ 0:11:01< 2:51:34] +[titan] 2025-06-13 12:52:33,583 - root - INFO - step: 910 loss: 24.3697 memory: 6.46GiB(27.34%) tps: 24,342 tflops: 24.50 mfu: 7.85% global_avg_ntp_loss: 5.2208 global_avg_mtp_loss: 19.1489 +[titan] 2025-06-13 12:52:33,583 - root - INFO - lr: 4.5504e-04 gnorm: 3.43 [ 0:11:04< 2:51:26] +[titan] 2025-06-13 12:52:37,014 - root - INFO - step: 915 loss: 23.0413 memory: 6.46GiB(27.34%) tps: 23,875 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 4.8993 
global_avg_mtp_loss: 18.1420 +[titan] 2025-06-13 12:52:37,015 - root - INFO - lr: 4.5754e-04 gnorm: 3.47 [ 0:11:07< 2:51:19] +[titan] 2025-06-13 12:52:40,588 - root - INFO - step: 920 loss: 26.2696 memory: 6.46GiB(27.34%) tps: 22,926 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 5.6367 global_avg_mtp_loss: 20.6329 +[titan] 2025-06-13 12:52:40,589 - root - INFO - lr: 4.6004e-04 gnorm: 1.63 [ 0:11:11< 2:51:14] +[titan] 2025-06-13 12:52:44,191 - root - INFO - step: 925 loss: 25.1944 memory: 6.46GiB(27.34%) tps: 22,739 tflops: 22.88 mfu: 7.33% global_avg_ntp_loss: 5.3231 global_avg_mtp_loss: 19.8713 +[titan] 2025-06-13 12:52:44,192 - root - INFO - lr: 4.6254e-04 gnorm: 1.62 [ 0:11:14< 2:51:10] +[titan] 2025-06-13 12:52:48,635 - root - INFO - step: 930 loss: 25.3655 memory: 6.46GiB(27.34%) tps: 18,437 tflops: 18.55 mfu: 5.95% global_avg_ntp_loss: 5.4439 global_avg_mtp_loss: 19.9217 +[titan] 2025-06-13 12:52:48,635 - root - INFO - lr: 4.6503e-04 gnorm: 2.56 [ 0:11:19< 2:51:18] +[titan] 2025-06-13 12:52:51,933 - root - INFO - step: 935 loss: 25.3408 memory: 6.46GiB(27.34%) tps: 24,843 tflops: 25.00 mfu: 8.01% global_avg_ntp_loss: 5.3742 global_avg_mtp_loss: 19.9666 +[titan] 2025-06-13 12:52:51,933 - root - INFO - lr: 4.6753e-04 gnorm: 1.54 [ 0:11:22< 2:51:09] +[titan] 2025-06-13 12:52:55,259 - root - INFO - step: 940 loss: 24.7678 memory: 6.46GiB(27.34%) tps: 24,636 tflops: 24.79 mfu: 7.95% global_avg_ntp_loss: 5.2791 global_avg_mtp_loss: 19.4887 +[titan] 2025-06-13 12:52:55,259 - root - INFO - lr: 4.7003e-04 gnorm: 2.11 [ 0:11:26< 2:51:01] +[titan] 2025-06-13 12:52:58,711 - root - INFO - step: 945 loss: 24.6226 memory: 6.46GiB(27.34%) tps: 23,729 tflops: 23.88 mfu: 7.65% global_avg_ntp_loss: 5.2786 global_avg_mtp_loss: 19.3440 +[titan] 2025-06-13 12:52:58,712 - root - INFO - lr: 4.7253e-04 gnorm: 5.48 [ 0:11:29< 2:50:54] +[titan] 2025-06-13 12:53:01,056 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:53:01,794 - root - INFO - step: 950 loss: 26.0247 memory: 6.46GiB(27.34%) tps: 26,578 tflops: 26.75 mfu: 8.57% global_avg_ntp_loss: 5.5731 global_avg_mtp_loss: 20.4516 +[titan] 2025-06-13 12:53:01,795 - root - INFO - lr: 4.7502e-04 gnorm: 1.68 [ 0:11:32< 2:50:42] +[titan] 2025-06-13 12:53:05,153 - root - INFO - step: 955 loss: 25.3309 memory: 6.46GiB(27.34%) tps: 24,395 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 5.4042 global_avg_mtp_loss: 19.9267 +[titan] 2025-06-13 12:53:05,153 - root - INFO - lr: 4.7752e-04 gnorm: 1.32 [ 0:11:35< 2:50:34] +[titan] 2025-06-13 12:53:08,532 - root - INFO - step: 960 loss: 25.6415 memory: 6.46GiB(27.34%) tps: 24,241 tflops: 24.40 mfu: 7.82% global_avg_ntp_loss: 5.4516 global_avg_mtp_loss: 20.1899 +[titan] 2025-06-13 12:53:08,533 - root - INFO - lr: 4.8002e-04 gnorm: 1.56 [ 0:11:39< 2:50:27] +[titan] 2025-06-13 12:53:11,741 - root - INFO - step: 965 loss: 24.9355 memory: 6.46GiB(27.34%) tps: 25,534 tflops: 25.70 mfu: 8.24% global_avg_ntp_loss: 5.2855 global_avg_mtp_loss: 19.6499 +[titan] 2025-06-13 12:53:11,742 - root - INFO - lr: 4.8252e-04 gnorm: 1.38 [ 0:11:42< 2:50:17] +[titan] 2025-06-13 12:53:15,303 - root - INFO - step: 970 loss: 25.2060 memory: 6.46GiB(27.34%) tps: 23,005 tflops: 23.15 mfu: 7.42% global_avg_ntp_loss: 5.3207 global_avg_mtp_loss: 19.8853 +[titan] 2025-06-13 12:53:15,303 - root - INFO - lr: 4.8501e-04 gnorm: 1.62 [ 0:11:46< 2:50:12] +[titan] 2025-06-13 12:53:18,882 - root - INFO - step: 975 loss: 24.4849 memory: 6.46GiB(27.34%) tps: 22,887 tflops: 23.03 mfu: 7.38% global_avg_ntp_loss: 5.1854 global_avg_mtp_loss: 19.2994 +[titan] 2025-06-13 12:53:18,883 - root - INFO - lr: 4.8751e-04 gnorm: 2.37 [ 0:11:49< 2:50:08] +[titan] 2025-06-13 12:53:22,078 - root - INFO - step: 980 loss: 24.9106 memory: 6.46GiB(27.34%) tps: 25,634 tflops: 25.80 mfu: 8.27% global_avg_ntp_loss: 5.3126 global_avg_mtp_loss: 19.5981 +[titan] 2025-06-13 12:53:22,079 - root - INFO - lr: 4.9001e-04 gnorm: 1.77 [ 0:11:52< 
2:49:58] +[titan] 2025-06-13 12:53:25,394 - root - INFO - step: 985 loss: 25.5942 memory: 6.46GiB(27.34%) tps: 24,709 tflops: 24.87 mfu: 7.97% global_avg_ntp_loss: 5.4255 global_avg_mtp_loss: 20.1687 +[titan] 2025-06-13 12:53:25,395 - root - INFO - lr: 4.9251e-04 gnorm: 1.71 [ 0:11:56< 2:49:50] +[titan] 2025-06-13 12:53:28,957 - root - INFO - step: 990 loss: 24.6915 memory: 6.46GiB(27.34%) tps: 22,994 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 5.1697 global_avg_mtp_loss: 19.5218 +[titan] 2025-06-13 12:53:28,958 - root - INFO - lr: 4.9500e-04 gnorm: 1.63 [ 0:11:59< 2:49:45] +[titan] 2025-06-13 12:53:32,148 - root - INFO - step: 995 loss: 25.0676 memory: 6.46GiB(27.34%) tps: 25,678 tflops: 25.84 mfu: 8.28% global_avg_ntp_loss: 5.2821 global_avg_mtp_loss: 19.7855 +[titan] 2025-06-13 12:53:32,148 - root - INFO - lr: 4.9750e-04 gnorm: 1.23 [ 0:12:02< 2:49:35] +[titan] 2025-06-13 12:53:34,899 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:53:35,476 - root - INFO - step: 1000 loss: 24.3228 memory: 6.46GiB(27.34%) tps: 24,622 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 5.1655 global_avg_mtp_loss: 19.1573 +[titan] 2025-06-13 12:53:35,476 - root - INFO - lr: 5.0000e-04 gnorm: 1.62 [ 0:12:06< 2:49:27] +[titan] 2025-06-13 12:53:39,128 - root - INFO - step: 1005 loss: 24.7302 memory: 6.46GiB(27.34%) tps: 22,434 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 5.1902 global_avg_mtp_loss: 19.5400 +[titan] 2025-06-13 12:53:39,128 - root - INFO - lr: 5.0000e-04 gnorm: 1.47 [ 0:12:09< 2:49:24] +[titan] 2025-06-13 12:53:42,418 - root - INFO - step: 1010 loss: 23.9122 memory: 6.46GiB(27.34%) tps: 24,903 tflops: 25.06 mfu: 8.03% global_avg_ntp_loss: 4.9752 global_avg_mtp_loss: 18.9370 +[titan] 2025-06-13 12:53:42,418 - root - INFO - lr: 5.0000e-04 gnorm: 1.25 [ 0:12:13< 2:49:15] +[titan] 2025-06-13 12:53:45,854 - root - INFO - step: 1015 loss: 24.8161 memory: 6.46GiB(27.34%) tps: 23,844 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 
5.2664 global_avg_mtp_loss: 19.5496 +[titan] 2025-06-13 12:53:45,854 - root - INFO - lr: 5.0000e-04 gnorm: 2.04 [ 0:12:16< 2:49:09] +[titan] 2025-06-13 12:53:48,969 - root - INFO - step: 1020 loss: 24.7459 memory: 6.46GiB(27.34%) tps: 26,304 tflops: 26.47 mfu: 8.48% global_avg_ntp_loss: 5.1836 global_avg_mtp_loss: 19.5622 +[titan] 2025-06-13 12:53:48,969 - root - INFO - lr: 5.0000e-04 gnorm: 1.56 [ 0:12:19< 2:48:58] +[titan] 2025-06-13 12:53:51,750 - root - INFO - Dumping profiler traces at step 1024 +[titan] 2025-06-13 12:53:51,842 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 12:53:52,322 - root - INFO - step: 1025 loss: 25.2507 memory: 6.46GiB(27.34%) tps: 24,433 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 5.3905 global_avg_mtp_loss: 19.8602 +[titan] 2025-06-13 12:53:52,322 - root - INFO - lr: 5.0000e-04 gnorm: 1.70 [ 0:12:23< 2:48:51] +[titan] 2025-06-13 12:53:55,882 - root - INFO - step: 1030 loss: 25.2023 memory: 6.46GiB(27.34%) tps: 23,015 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 5.3276 global_avg_mtp_loss: 19.8747 +[titan] 2025-06-13 12:53:55,882 - root - INFO - lr: 4.9999e-04 gnorm: 1.70 [ 0:12:26< 2:48:46] +[titan] 2025-06-13 12:53:59,303 - root - INFO - step: 1035 loss: 25.0347 memory: 6.46GiB(27.34%) tps: 23,946 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 5.2813 global_avg_mtp_loss: 19.7535 +[titan] 2025-06-13 12:53:59,303 - root - INFO - lr: 4.9999e-04 gnorm: 1.96 [ 0:12:30< 2:48:40] +[titan] 2025-06-13 12:54:02,957 - root - INFO - step: 1040 loss: 24.8555 memory: 6.46GiB(27.34%) tps: 22,426 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 5.2540 global_avg_mtp_loss: 19.6015 +[titan] 2025-06-13 12:54:02,957 - root - INFO - lr: 4.9999e-04 gnorm: 1.85 [ 0:12:33< 2:48:37] +[titan] 2025-06-13 12:54:06,802 - root - INFO - step: 1045 loss: 25.5060 memory: 6.46GiB(27.34%) tps: 21,306 tflops: 21.44 mfu: 6.87% global_avg_ntp_loss: 5.3438 global_avg_mtp_loss: 20.1622 +[titan] 2025-06-13 12:54:06,802 - root - INFO - 
lr: 4.9999e-04 gnorm: 1.40 [ 0:12:37< 2:48:36] +[titan] 2025-06-13 12:54:11,500 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:54:11,942 - root - INFO - step: 1050 loss: 25.8789 memory: 6.46GiB(27.34%) tps: 15,940 tflops: 16.04 mfu: 5.14% global_avg_ntp_loss: 5.5904 global_avg_mtp_loss: 20.2885 +[titan] 2025-06-13 12:54:11,942 - root - INFO - lr: 4.9999e-04 gnorm: 2.17 [ 0:12:42< 2:48:53] +[titan] 2025-06-13 12:54:14,984 - root - INFO - step: 1055 loss: 25.0693 memory: 6.46GiB(27.34%) tps: 26,928 tflops: 27.10 mfu: 8.69% global_avg_ntp_loss: 5.3904 global_avg_mtp_loss: 19.6788 +[titan] 2025-06-13 12:54:14,985 - root - INFO - lr: 4.9998e-04 gnorm: 1.35 [ 0:12:45< 2:48:41] +[titan] 2025-06-13 12:54:18,597 - root - INFO - step: 1060 loss: 25.1242 memory: 6.46GiB(27.34%) tps: 22,682 tflops: 22.83 mfu: 7.32% global_avg_ntp_loss: 5.4836 global_avg_mtp_loss: 19.6407 +[titan] 2025-06-13 12:54:18,597 - root - INFO - lr: 4.9998e-04 gnorm: 1.56 [ 0:12:49< 2:48:37] +[titan] 2025-06-13 12:54:21,824 - root - INFO - step: 1065 loss: 24.2143 memory: 6.46GiB(27.34%) tps: 25,391 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 5.1347 global_avg_mtp_loss: 19.0796 +[titan] 2025-06-13 12:54:21,824 - root - INFO - lr: 4.9998e-04 gnorm: 1.68 [ 0:12:52< 2:48:29] +[titan] 2025-06-13 12:54:24,808 - root - INFO - step: 1070 loss: 24.5676 memory: 6.46GiB(27.34%) tps: 27,453 tflops: 27.63 mfu: 8.86% global_avg_ntp_loss: 5.1489 global_avg_mtp_loss: 19.4187 +[titan] 2025-06-13 12:54:24,808 - root - INFO - lr: 4.9997e-04 gnorm: 1.57 [ 0:12:55< 2:48:17] +[titan] 2025-06-13 12:54:28,150 - root - INFO - step: 1075 loss: 24.2036 memory: 6.46GiB(27.34%) tps: 24,518 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 5.2758 global_avg_mtp_loss: 18.9277 +[titan] 2025-06-13 12:54:28,150 - root - INFO - lr: 4.9997e-04 gnorm: 7.39 [ 0:12:58< 2:48:09] +[titan] 2025-06-13 12:54:31,652 - root - INFO - step: 1080 loss: 24.9478 memory: 6.46GiB(27.34%) tps: 23,392 tflops: 
23.54 mfu: 7.55% global_avg_ntp_loss: 5.2550 global_avg_mtp_loss: 19.6927 +[titan] 2025-06-13 12:54:31,653 - root - INFO - lr: 4.9996e-04 gnorm: 1.73 [ 0:13:02< 2:48:04] +[titan] 2025-06-13 12:54:34,984 - root - INFO - step: 1085 loss: 24.8713 memory: 6.46GiB(27.34%) tps: 24,589 tflops: 24.75 mfu: 7.93% global_avg_ntp_loss: 5.2000 global_avg_mtp_loss: 19.6713 +[titan] 2025-06-13 12:54:34,985 - root - INFO - lr: 4.9996e-04 gnorm: 1.47 [ 0:13:05< 2:47:57] +[titan] 2025-06-13 12:54:41,166 - root - INFO - step: 1090 loss: 25.2196 memory: 6.46GiB(27.34%) tps: 13,253 tflops: 13.34 mfu: 4.27% global_avg_ntp_loss: 5.2922 global_avg_mtp_loss: 19.9274 +[titan] 2025-06-13 12:54:41,166 - root - INFO - lr: 4.9995e-04 gnorm: 1.33 [ 0:13:11< 2:48:26] +[titan] 2025-06-13 12:54:44,078 - root - INFO - step: 1095 loss: 24.5654 memory: 6.46GiB(27.34%) tps: 28,136 tflops: 28.32 mfu: 9.08% global_avg_ntp_loss: 5.1220 global_avg_mtp_loss: 19.4433 +[titan] 2025-06-13 12:54:44,078 - root - INFO - lr: 4.9995e-04 gnorm: 1.53 [ 0:13:14< 2:48:13] +[titan] 2025-06-13 12:54:46,925 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:54:47,770 - root - INFO - step: 1100 loss: 24.9872 memory: 6.46GiB(27.34%) tps: 22,191 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 5.2482 global_avg_mtp_loss: 19.7390 +[titan] 2025-06-13 12:54:47,770 - root - INFO - lr: 4.9994e-04 gnorm: 1.43 [ 0:13:18< 2:48:10] +[titan] 2025-06-13 12:54:50,839 - root - INFO - step: 1105 loss: 24.4168 memory: 6.46GiB(27.34%) tps: 26,701 tflops: 26.87 mfu: 8.61% global_avg_ntp_loss: 5.0559 global_avg_mtp_loss: 19.3610 +[titan] 2025-06-13 12:54:50,839 - root - INFO - lr: 4.9994e-04 gnorm: 1.99 [ 0:13:21< 2:47:59] +[titan] 2025-06-13 12:54:54,236 - root - INFO - step: 1110 loss: 24.1458 memory: 6.46GiB(27.34%) tps: 24,114 tflops: 24.27 mfu: 7.78% global_avg_ntp_loss: 5.0339 global_avg_mtp_loss: 19.1118 +[titan] 2025-06-13 12:54:54,236 - root - INFO - lr: 4.9993e-04 gnorm: 1.51 [ 0:13:25< 2:47:53] +[titan] 2025-06-13 12:54:57,327 - root - INFO - step: 1115 loss: 24.7568 memory: 6.46GiB(27.34%) tps: 26,507 tflops: 26.68 mfu: 8.55% global_avg_ntp_loss: 5.1779 global_avg_mtp_loss: 19.5789 +[titan] 2025-06-13 12:54:57,327 - root - INFO - lr: 4.9992e-04 gnorm: 1.31 [ 0:13:28< 2:47:43] +[titan] 2025-06-13 12:55:01,192 - root - INFO - step: 1120 loss: 24.9035 memory: 6.46GiB(27.34%) tps: 21,200 tflops: 21.33 mfu: 6.84% global_avg_ntp_loss: 5.2290 global_avg_mtp_loss: 19.6745 +[titan] 2025-06-13 12:55:01,192 - root - INFO - lr: 4.9992e-04 gnorm: 1.68 [ 0:13:31< 2:47:42] +[titan] 2025-06-13 12:55:04,356 - root - INFO - step: 1125 loss: 24.2404 memory: 6.46GiB(27.34%) tps: 25,897 tflops: 26.06 mfu: 8.35% global_avg_ntp_loss: 5.0161 global_avg_mtp_loss: 19.2243 +[titan] 2025-06-13 12:55:04,356 - root - INFO - lr: 4.9991e-04 gnorm: 1.32 [ 0:13:35< 2:47:33] +[titan] 2025-06-13 12:55:08,138 - root - INFO - step: 1130 loss: 24.5071 memory: 6.46GiB(27.34%) tps: 21,658 tflops: 21.80 mfu: 6.99% global_avg_ntp_loss: 5.1470 global_avg_mtp_loss: 19.3600 +[titan] 2025-06-13 12:55:08,139 - root - INFO - lr: 4.9990e-04 gnorm: 1.34 [ 
0:13:38< 2:47:31] +[titan] 2025-06-13 12:55:11,556 - root - INFO - step: 1135 loss: 24.6129 memory: 6.46GiB(27.34%) tps: 23,970 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 5.1810 global_avg_mtp_loss: 19.4320 +[titan] 2025-06-13 12:55:11,557 - root - INFO - lr: 4.9990e-04 gnorm: 2.19 [ 0:13:42< 2:47:25] +[titan] 2025-06-13 12:55:14,590 - root - INFO - step: 1140 loss: 25.3019 memory: 6.46GiB(27.34%) tps: 27,006 tflops: 27.18 mfu: 8.71% global_avg_ntp_loss: 5.2892 global_avg_mtp_loss: 20.0128 +[titan] 2025-06-13 12:55:14,590 - root - INFO - lr: 4.9989e-04 gnorm: 1.48 [ 0:13:45< 2:47:14] +[titan] 2025-06-13 12:55:18,092 - root - INFO - step: 1145 loss: 25.3513 memory: 6.46GiB(27.34%) tps: 23,398 tflops: 23.55 mfu: 7.55% global_avg_ntp_loss: 5.2828 global_avg_mtp_loss: 20.0684 +[titan] 2025-06-13 12:55:18,092 - root - INFO - lr: 4.9988e-04 gnorm: 1.18 [ 0:13:48< 2:47:09] +[titan] 2025-06-13 12:55:20,982 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:55:21,876 - root - INFO - step: 1150 loss: 25.0567 memory: 6.46GiB(27.34%) tps: 21,650 tflops: 21.79 mfu: 6.98% global_avg_ntp_loss: 5.2250 global_avg_mtp_loss: 19.8318 +[titan] 2025-06-13 12:55:21,876 - root - INFO - lr: 4.9987e-04 gnorm: 1.25 [ 0:13:52< 2:47:07] +[titan] 2025-06-13 12:55:25,144 - root - INFO - step: 1155 loss: 24.2406 memory: 6.46GiB(27.34%) tps: 25,069 tflops: 25.23 mfu: 8.09% global_avg_ntp_loss: 5.0588 global_avg_mtp_loss: 19.1818 +[titan] 2025-06-13 12:55:25,145 - root - INFO - lr: 4.9986e-04 gnorm: 1.71 [ 0:13:55< 2:47:00] +[titan] 2025-06-13 12:55:28,617 - root - INFO - step: 1160 loss: 24.9377 memory: 6.46GiB(27.34%) tps: 23,592 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 5.2673 global_avg_mtp_loss: 19.6704 +[titan] 2025-06-13 12:55:28,617 - root - INFO - lr: 4.9985e-04 gnorm: 1.72 [ 0:13:59< 2:46:54] +[titan] 2025-06-13 12:55:32,188 - root - INFO - step: 1165 loss: 25.6381 memory: 6.46GiB(27.34%) tps: 22,946 tflops: 23.09 mfu: 7.40% 
global_avg_ntp_loss: 5.4636 global_avg_mtp_loss: 20.1745 +[titan] 2025-06-13 12:55:32,188 - root - INFO - lr: 4.9984e-04 gnorm: 1.56 [ 0:14:02< 2:46:50] +[titan] 2025-06-13 12:55:35,512 - root - INFO - step: 1170 loss: 24.4102 memory: 6.46GiB(27.34%) tps: 24,644 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 5.0978 global_avg_mtp_loss: 19.3123 +[titan] 2025-06-13 12:55:35,512 - root - INFO - lr: 4.9983e-04 gnorm: 1.52 [ 0:14:06< 2:46:43] +[titan] 2025-06-13 12:55:38,806 - root - INFO - step: 1175 loss: 24.4305 memory: 6.46GiB(27.34%) tps: 24,874 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 5.0677 global_avg_mtp_loss: 19.3628 +[titan] 2025-06-13 12:55:38,806 - root - INFO - lr: 4.9982e-04 gnorm: 1.41 [ 0:14:09< 2:46:35] +[titan] 2025-06-13 12:55:42,562 - root - INFO - step: 1180 loss: 25.0978 memory: 6.46GiB(27.34%) tps: 21,812 tflops: 21.95 mfu: 7.04% global_avg_ntp_loss: 5.2405 global_avg_mtp_loss: 19.8573 +[titan] 2025-06-13 12:55:42,563 - root - INFO - lr: 4.9981e-04 gnorm: 1.18 [ 0:14:13< 2:46:33] +[titan] 2025-06-13 12:55:46,171 - root - INFO - step: 1185 loss: 24.5848 memory: 6.46GiB(27.34%) tps: 22,700 tflops: 22.85 mfu: 7.32% global_avg_ntp_loss: 5.0909 global_avg_mtp_loss: 19.4939 +[titan] 2025-06-13 12:55:46,172 - root - INFO - lr: 4.9980e-04 gnorm: 1.20 [ 0:14:16< 2:46:30] +[titan] 2025-06-13 12:55:49,446 - root - INFO - step: 1190 loss: 24.3424 memory: 6.46GiB(27.34%) tps: 25,022 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 5.0806 global_avg_mtp_loss: 19.2618 +[titan] 2025-06-13 12:55:49,446 - root - INFO - lr: 4.9979e-04 gnorm: 1.41 [ 0:14:20< 2:46:22] +[titan] 2025-06-13 12:55:53,062 - root - INFO - step: 1195 loss: 24.7355 memory: 6.46GiB(27.34%) tps: 22,660 tflops: 22.80 mfu: 7.31% global_avg_ntp_loss: 5.2721 global_avg_mtp_loss: 19.4633 +[titan] 2025-06-13 12:55:53,062 - root - INFO - lr: 4.9978e-04 gnorm: 2.30 [ 0:14:23< 2:46:19] +[titan] 2025-06-13 12:55:55,786 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:55:56,705 - root - INFO - step: 1200 loss: 23.9237 memory: 6.46GiB(27.34%) tps: 22,489 tflops: 22.63 mfu: 7.25% global_avg_ntp_loss: 4.9236 global_avg_mtp_loss: 19.0002 +[titan] 2025-06-13 12:55:56,705 - root - INFO - lr: 4.9977e-04 gnorm: 1.70 [ 0:14:27< 2:46:15] +[titan] 2025-06-13 12:56:00,635 - root - INFO - step: 1205 loss: 24.6788 memory: 6.46GiB(27.34%) tps: 20,845 tflops: 20.98 mfu: 6.72% global_avg_ntp_loss: 5.1493 global_avg_mtp_loss: 19.5295 +[titan] 2025-06-13 12:56:00,635 - root - INFO - lr: 4.9976e-04 gnorm: 1.44 [ 0:14:31< 2:46:15] +[titan] 2025-06-13 12:56:04,196 - root - INFO - step: 1210 loss: 24.8400 memory: 6.46GiB(27.34%) tps: 23,009 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 5.2417 global_avg_mtp_loss: 19.5984 +[titan] 2025-06-13 12:56:04,196 - root - INFO - lr: 4.9975e-04 gnorm: 1.19 [ 0:14:34< 2:46:11] +[titan] 2025-06-13 12:56:07,968 - root - INFO - step: 1215 loss: 24.5471 memory: 6.46GiB(27.34%) tps: 21,719 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 5.1667 global_avg_mtp_loss: 19.3804 +[titan] 2025-06-13 12:56:07,969 - root - INFO - lr: 4.9974e-04 gnorm: 1.95 [ 0:14:38< 2:46:09] +[titan] 2025-06-13 12:56:11,958 - root - INFO - step: 1220 loss: 24.6582 memory: 6.46GiB(27.34%) tps: 20,534 tflops: 20.66 mfu: 6.62% global_avg_ntp_loss: 5.0445 global_avg_mtp_loss: 19.6138 +[titan] 2025-06-13 12:56:11,959 - root - INFO - lr: 4.9972e-04 gnorm: 1.36 [ 0:14:42< 2:46:10] +[titan] 2025-06-13 12:56:15,606 - root - INFO - step: 1225 loss: 24.3855 memory: 6.46GiB(27.34%) tps: 22,460 tflops: 22.60 mfu: 7.24% global_avg_ntp_loss: 5.0174 global_avg_mtp_loss: 19.3681 +[titan] 2025-06-13 12:56:15,606 - root - INFO - lr: 4.9971e-04 gnorm: 1.27 [ 0:14:46< 2:46:07] +[titan] 2025-06-13 12:56:19,650 - root - INFO - step: 1230 loss: 25.0589 memory: 6.46GiB(27.34%) tps: 20,258 tflops: 20.39 mfu: 6.53% global_avg_ntp_loss: 5.2000 global_avg_mtp_loss: 19.8588 +[titan] 2025-06-13 12:56:19,651 - root - INFO - lr: 4.9970e-04 gnorm: 1.26 [ 
0:14:50< 2:46:08] +[titan] 2025-06-13 12:56:23,304 - root - INFO - step: 1235 loss: 24.0008 memory: 6.46GiB(27.34%) tps: 22,430 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 4.9467 global_avg_mtp_loss: 19.0540 +[titan] 2025-06-13 12:56:23,304 - root - INFO - lr: 4.9968e-04 gnorm: 1.40 [ 0:14:54< 2:46:04] +[titan] 2025-06-13 12:56:27,070 - root - INFO - step: 1240 loss: 24.8249 memory: 6.46GiB(27.34%) tps: 21,753 tflops: 21.89 mfu: 7.02% global_avg_ntp_loss: 5.1508 global_avg_mtp_loss: 19.6742 +[titan] 2025-06-13 12:56:27,070 - root - INFO - lr: 4.9967e-04 gnorm: 1.27 [ 0:14:57< 2:46:02] +[titan] 2025-06-13 12:56:30,884 - root - INFO - step: 1245 loss: 24.3341 memory: 6.46GiB(27.34%) tps: 21,483 tflops: 21.62 mfu: 6.93% global_avg_ntp_loss: 5.0331 global_avg_mtp_loss: 19.3010 +[titan] 2025-06-13 12:56:30,884 - root - INFO - lr: 4.9966e-04 gnorm: 1.22 [ 0:15:01< 2:46:01] +[titan] 2025-06-13 12:56:33,706 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:56:34,291 - root - INFO - step: 1250 loss: 24.9879 memory: 6.46GiB(27.34%) tps: 24,043 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 5.1461 global_avg_mtp_loss: 19.8418 +[titan] 2025-06-13 12:56:34,292 - root - INFO - lr: 4.9964e-04 gnorm: 1.47 [ 0:15:05< 2:45:55] +[titan] 2025-06-13 12:56:38,346 - root - INFO - step: 1255 loss: 25.2154 memory: 6.46GiB(27.34%) tps: 20,205 tflops: 20.33 mfu: 6.52% global_avg_ntp_loss: 5.2370 global_avg_mtp_loss: 19.9784 +[titan] 2025-06-13 12:56:38,347 - root - INFO - lr: 4.9963e-04 gnorm: 1.32 [ 0:15:09< 2:45:56] +[titan] 2025-06-13 12:56:41,884 - root - INFO - step: 1260 loss: 24.1084 memory: 6.46GiB(27.34%) tps: 23,163 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 4.9597 global_avg_mtp_loss: 19.1487 +[titan] 2025-06-13 12:56:41,884 - root - INFO - lr: 4.9961e-04 gnorm: 1.24 [ 0:15:12< 2:45:52] +[titan] 2025-06-13 12:56:45,640 - root - INFO - step: 1265 loss: 24.4957 memory: 6.46GiB(27.34%) tps: 21,809 tflops: 21.95 mfu: 7.03% 
global_avg_ntp_loss: 5.0467 global_avg_mtp_loss: 19.4489 +[titan] 2025-06-13 12:56:45,640 - root - INFO - lr: 4.9960e-04 gnorm: 1.25 [ 0:15:16< 2:45:49] +[titan] 2025-06-13 12:56:49,359 - root - INFO - step: 1270 loss: 24.3512 memory: 6.46GiB(27.34%) tps: 22,033 tflops: 22.17 mfu: 7.11% global_avg_ntp_loss: 4.9826 global_avg_mtp_loss: 19.3687 +[titan] 2025-06-13 12:56:49,359 - root - INFO - lr: 4.9958e-04 gnorm: 1.28 [ 0:15:20< 2:45:47] +[titan] 2025-06-13 12:56:53,802 - root - INFO - step: 1275 loss: 24.1660 memory: 6.46GiB(27.34%) tps: 18,441 tflops: 18.56 mfu: 5.95% global_avg_ntp_loss: 4.8980 global_avg_mtp_loss: 19.2680 +[titan] 2025-06-13 12:56:53,803 - root - INFO - lr: 4.9957e-04 gnorm: 1.12 [ 0:15:24< 2:45:52] +[titan] 2025-06-13 12:56:57,395 - root - INFO - step: 1280 loss: 23.9124 memory: 6.46GiB(27.34%) tps: 22,805 tflops: 22.95 mfu: 7.36% global_avg_ntp_loss: 4.8794 global_avg_mtp_loss: 19.0330 +[titan] 2025-06-13 12:56:57,396 - root - INFO - lr: 4.9955e-04 gnorm: 1.31 [ 0:15:28< 2:45:48] +[titan] 2025-06-13 12:57:01,636 - root - INFO - step: 1285 loss: 24.2694 memory: 6.46GiB(27.34%) tps: 19,319 tflops: 19.44 mfu: 6.23% global_avg_ntp_loss: 4.9543 global_avg_mtp_loss: 19.3151 +[titan] 2025-06-13 12:57:01,637 - root - INFO - lr: 4.9954e-04 gnorm: 1.38 [ 0:15:32< 2:45:51] +[titan] 2025-06-13 12:57:04,915 - root - INFO - step: 1290 loss: 24.6126 memory: 6.46GiB(27.34%) tps: 24,990 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 5.1895 global_avg_mtp_loss: 19.4231 +[titan] 2025-06-13 12:57:04,915 - root - INFO - lr: 4.9952e-04 gnorm: 2.32 [ 0:15:35< 2:45:44] +[titan] 2025-06-13 12:57:08,881 - root - INFO - step: 1295 loss: 24.3687 memory: 6.46GiB(27.34%) tps: 20,658 tflops: 20.79 mfu: 6.66% global_avg_ntp_loss: 4.9289 global_avg_mtp_loss: 19.4398 +[titan] 2025-06-13 12:57:08,881 - root - INFO - lr: 4.9950e-04 gnorm: 1.34 [ 0:15:39< 2:45:44] +[titan] 2025-06-13 12:57:12,359 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:57:12,892 - root - INFO - step: 1300 loss: 24.0710 memory: 6.46GiB(27.34%) tps: 20,427 tflops: 20.56 mfu: 6.59% global_avg_ntp_loss: 4.9447 global_avg_mtp_loss: 19.1263 +[titan] 2025-06-13 12:57:12,892 - root - INFO - lr: 4.9949e-04 gnorm: 1.40 [ 0:15:43< 2:45:44] +[titan] 2025-06-13 12:57:16,986 - root - INFO - step: 1305 loss: 24.5722 memory: 6.46GiB(27.34%) tps: 20,014 tflops: 20.14 mfu: 6.46% global_avg_ntp_loss: 5.0928 global_avg_mtp_loss: 19.4794 +[titan] 2025-06-13 12:57:16,986 - root - INFO - lr: 4.9947e-04 gnorm: 2.43 [ 0:15:47< 2:45:45] +[titan] 2025-06-13 12:57:20,737 - root - INFO - step: 1310 loss: 23.9922 memory: 6.46GiB(27.34%) tps: 21,842 tflops: 21.98 mfu: 7.05% global_avg_ntp_loss: 4.9276 global_avg_mtp_loss: 19.0646 +[titan] 2025-06-13 12:57:20,738 - root - INFO - lr: 4.9945e-04 gnorm: 1.48 [ 0:15:51< 2:45:43] +[titan] 2025-06-13 12:57:24,280 - root - INFO - step: 1315 loss: 24.3607 memory: 6.46GiB(27.34%) tps: 23,129 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 5.0473 global_avg_mtp_loss: 19.3134 +[titan] 2025-06-13 12:57:24,280 - root - INFO - lr: 4.9943e-04 gnorm: 1.41 [ 0:15:55< 2:45:38] +[titan] 2025-06-13 12:57:27,605 - root - INFO - step: 1320 loss: 22.0976 memory: 6.46GiB(27.34%) tps: 24,639 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 4.5136 global_avg_mtp_loss: 17.5840 +[titan] 2025-06-13 12:57:27,606 - root - INFO - lr: 4.9942e-04 gnorm: 5.78 [ 0:15:58< 2:45:31] +[titan] 2025-06-13 12:57:31,572 - root - INFO - step: 1325 loss: 24.4511 memory: 6.46GiB(27.34%) tps: 20,651 tflops: 20.78 mfu: 6.66% global_avg_ntp_loss: 5.0034 global_avg_mtp_loss: 19.4477 +[titan] 2025-06-13 12:57:31,573 - root - INFO - lr: 4.9940e-04 gnorm: 1.13 [ 0:16:02< 2:45:31] +[titan] 2025-06-13 12:57:34,897 - root - INFO - step: 1330 loss: 24.5698 memory: 6.46GiB(27.34%) tps: 24,647 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 5.0131 global_avg_mtp_loss: 19.5567 +[titan] 2025-06-13 12:57:34,897 - root - INFO - lr: 4.9938e-04 gnorm: 1.18 [ 
0:16:05< 2:45:25] +[titan] 2025-06-13 12:57:38,606 - root - INFO - step: 1335 loss: 24.1607 memory: 6.46GiB(27.34%) tps: 22,087 tflops: 22.23 mfu: 7.12% global_avg_ntp_loss: 4.9264 global_avg_mtp_loss: 19.2343 +[titan] 2025-06-13 12:57:38,606 - root - INFO - lr: 4.9936e-04 gnorm: 1.35 [ 0:16:09< 2:45:22] +[titan] 2025-06-13 12:57:42,060 - root - INFO - step: 1340 loss: 23.6192 memory: 6.46GiB(27.34%) tps: 23,723 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 4.8722 global_avg_mtp_loss: 18.7470 +[titan] 2025-06-13 12:57:42,060 - root - INFO - lr: 4.9934e-04 gnorm: 1.85 [ 0:16:12< 2:45:16] +[titan] 2025-06-13 12:57:45,971 - root - INFO - step: 1345 loss: 24.4887 memory: 6.46GiB(27.34%) tps: 20,948 tflops: 21.08 mfu: 6.76% global_avg_ntp_loss: 4.9909 global_avg_mtp_loss: 19.4978 +[titan] 2025-06-13 12:57:45,971 - root - INFO - lr: 4.9932e-04 gnorm: 1.07 [ 0:16:16< 2:45:15] +[titan] 2025-06-13 12:57:49,102 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:57:50,227 - root - INFO - step: 1350 loss: 24.7803 memory: 6.46GiB(27.34%) tps: 19,251 tflops: 19.37 mfu: 6.21% global_avg_ntp_loss: 5.0825 global_avg_mtp_loss: 19.6977 +[titan] 2025-06-13 12:57:50,229 - root - INFO - lr: 4.9930e-04 gnorm: 1.35 [ 0:16:20< 2:45:18] +[titan] 2025-06-13 12:57:54,568 - root - INFO - step: 1355 loss: 23.3924 memory: 6.46GiB(27.34%) tps: 18,879 tflops: 19.00 mfu: 6.09% global_avg_ntp_loss: 4.7723 global_avg_mtp_loss: 18.6201 +[titan] 2025-06-13 12:57:54,568 - root - INFO - lr: 4.9928e-04 gnorm: 1.38 [ 0:16:25< 2:45:22] +[titan] 2025-06-13 12:57:58,382 - root - INFO - step: 1360 loss: 24.1753 memory: 6.46GiB(27.34%) tps: 21,482 tflops: 21.62 mfu: 6.93% global_avg_ntp_loss: 4.8950 global_avg_mtp_loss: 19.2803 +[titan] 2025-06-13 12:57:58,382 - root - INFO - lr: 4.9926e-04 gnorm: 1.34 [ 0:16:29< 2:45:20] +[titan] 2025-06-13 12:58:02,223 - root - INFO - step: 1365 loss: 25.1074 memory: 6.46GiB(27.34%) tps: 21,329 tflops: 21.47 mfu: 6.88% 
global_avg_ntp_loss: 5.2232 global_avg_mtp_loss: 19.8842 +[titan] 2025-06-13 12:58:02,223 - root - INFO - lr: 4.9924e-04 gnorm: 1.84 [ 0:16:32< 2:45:18] +[titan] 2025-06-13 12:58:06,250 - root - INFO - step: 1370 loss: 24.3263 memory: 6.46GiB(27.34%) tps: 20,342 tflops: 20.47 mfu: 6.56% global_avg_ntp_loss: 4.9580 global_avg_mtp_loss: 19.3683 +[titan] 2025-06-13 12:58:06,251 - root - INFO - lr: 4.9922e-04 gnorm: 1.17 [ 0:16:36< 2:45:18] +[titan] 2025-06-13 12:58:10,072 - root - INFO - step: 1375 loss: 24.2379 memory: 6.46GiB(27.34%) tps: 21,439 tflops: 21.58 mfu: 6.92% global_avg_ntp_loss: 4.9544 global_avg_mtp_loss: 19.2835 +[titan] 2025-06-13 12:58:10,072 - root - INFO - lr: 4.9920e-04 gnorm: 1.26 [ 0:16:40< 2:45:17] +[titan] 2025-06-13 12:58:14,154 - root - INFO - step: 1380 loss: 24.1822 memory: 6.46GiB(27.34%) tps: 20,069 tflops: 20.20 mfu: 6.47% global_avg_ntp_loss: 4.9155 global_avg_mtp_loss: 19.2667 +[titan] 2025-06-13 12:58:14,155 - root - INFO - lr: 4.9918e-04 gnorm: 1.15 [ 0:16:44< 2:45:17] +[titan] 2025-06-13 12:58:17,921 - root - INFO - step: 1385 loss: 24.7441 memory: 6.46GiB(27.34%) tps: 21,750 tflops: 21.89 mfu: 7.02% global_avg_ntp_loss: 5.1221 global_avg_mtp_loss: 19.6220 +[titan] 2025-06-13 12:58:17,922 - root - INFO - lr: 4.9916e-04 gnorm: 1.24 [ 0:16:48< 2:45:15] +[titan] 2025-06-13 12:58:21,944 - root - INFO - step: 1390 loss: 23.6328 memory: 6.46GiB(27.34%) tps: 20,370 tflops: 20.50 mfu: 6.57% global_avg_ntp_loss: 4.7831 global_avg_mtp_loss: 18.8497 +[titan] 2025-06-13 12:58:21,944 - root - INFO - lr: 4.9913e-04 gnorm: 1.32 [ 0:16:52< 2:45:15] +[titan] 2025-06-13 12:58:25,578 - root - INFO - step: 1395 loss: 23.6372 memory: 6.46GiB(27.34%) tps: 22,543 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 4.7683 global_avg_mtp_loss: 18.8689 +[titan] 2025-06-13 12:58:25,578 - root - INFO - lr: 4.9911e-04 gnorm: 1.34 [ 0:16:56< 2:45:11] +[titan] 2025-06-13 12:58:28,457 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:58:29,003 - root - INFO - step: 1400 loss: 24.1824 memory: 6.46GiB(27.34%) tps: 23,920 tflops: 24.07 mfu: 7.72% global_avg_ntp_loss: 4.9056 global_avg_mtp_loss: 19.2768 +[titan] 2025-06-13 12:58:29,004 - root - INFO - lr: 4.9909e-04 gnorm: 1.58 [ 0:16:59< 2:45:06] +[titan] 2025-06-13 12:58:32,921 - root - INFO - step: 1405 loss: 24.2997 memory: 6.46GiB(27.34%) tps: 20,915 tflops: 21.05 mfu: 6.75% global_avg_ntp_loss: 4.9061 global_avg_mtp_loss: 19.3936 +[titan] 2025-06-13 12:58:32,921 - root - INFO - lr: 4.9907e-04 gnorm: 1.20 [ 0:17:03< 2:45:05] +[titan] 2025-06-13 12:58:36,307 - root - INFO - step: 1410 loss: 23.1618 memory: 6.46GiB(27.34%) tps: 24,196 tflops: 24.35 mfu: 7.80% global_avg_ntp_loss: 4.7737 global_avg_mtp_loss: 18.3882 +[titan] 2025-06-13 12:58:36,307 - root - INFO - lr: 4.9904e-04 gnorm: 1.92 [ 0:17:07< 2:44:58] +[titan] 2025-06-13 12:58:39,900 - root - INFO - step: 1415 loss: 23.5015 memory: 6.46GiB(27.34%) tps: 22,804 tflops: 22.95 mfu: 7.36% global_avg_ntp_loss: 4.7815 global_avg_mtp_loss: 18.7200 +[titan] 2025-06-13 12:58:39,900 - root - INFO - lr: 4.9902e-04 gnorm: 1.36 [ 0:17:10< 2:44:54] +[titan] 2025-06-13 12:58:43,912 - root - INFO - step: 1420 loss: 22.9281 memory: 6.46GiB(27.34%) tps: 20,418 tflops: 20.55 mfu: 6.59% global_avg_ntp_loss: 4.6803 global_avg_mtp_loss: 18.2477 +[titan] 2025-06-13 12:58:43,913 - root - INFO - lr: 4.9900e-04 gnorm: 1.57 [ 0:17:14< 2:44:54] +[titan] 2025-06-13 12:58:47,655 - root - INFO - step: 1425 loss: 23.3692 memory: 6.46GiB(27.34%) tps: 21,890 tflops: 22.03 mfu: 7.06% global_avg_ntp_loss: 4.6976 global_avg_mtp_loss: 18.6716 +[titan] 2025-06-13 12:58:47,656 - root - INFO - lr: 4.9897e-04 gnorm: 1.31 [ 0:17:18< 2:44:51] +[titan] 2025-06-13 12:58:51,387 - root - INFO - step: 1430 loss: 24.0341 memory: 6.46GiB(27.34%) tps: 21,957 tflops: 22.10 mfu: 7.08% global_avg_ntp_loss: 4.8728 global_avg_mtp_loss: 19.1612 +[titan] 2025-06-13 12:58:51,387 - root - INFO - lr: 4.9895e-04 gnorm: 1.04 [ 
0:17:22< 2:44:49] +[titan] 2025-06-13 12:58:55,213 - root - INFO - step: 1435 loss: 24.5676 memory: 6.46GiB(27.34%) tps: 21,412 tflops: 21.55 mfu: 6.91% global_avg_ntp_loss: 5.0640 global_avg_mtp_loss: 19.5036 +[titan] 2025-06-13 12:58:55,213 - root - INFO - lr: 4.9892e-04 gnorm: 1.68 [ 0:17:25< 2:44:47] +[titan] 2025-06-13 12:58:59,038 - root - INFO - step: 1440 loss: 24.0538 memory: 6.46GiB(27.34%) tps: 21,420 tflops: 21.56 mfu: 6.91% global_avg_ntp_loss: 4.8423 global_avg_mtp_loss: 19.2115 +[titan] 2025-06-13 12:58:59,039 - root - INFO - lr: 4.9890e-04 gnorm: 1.17 [ 0:17:29< 2:44:45] +[titan] 2025-06-13 12:59:02,657 - root - INFO - step: 1445 loss: 24.3574 memory: 6.46GiB(27.34%) tps: 22,637 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 4.9522 global_avg_mtp_loss: 19.4052 +[titan] 2025-06-13 12:59:02,658 - root - INFO - lr: 4.9887e-04 gnorm: 1.16 [ 0:17:33< 2:44:41] +[titan] 2025-06-13 12:59:05,641 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 12:59:06,258 - root - INFO - step: 1450 loss: 23.7388 memory: 6.46GiB(27.34%) tps: 22,757 tflops: 22.90 mfu: 7.34% global_avg_ntp_loss: 4.7683 global_avg_mtp_loss: 18.9705 +[titan] 2025-06-13 12:59:06,258 - root - INFO - lr: 4.9885e-04 gnorm: 1.35 [ 0:17:36< 2:44:37] +[titan] 2025-06-13 12:59:10,576 - root - INFO - step: 1455 loss: 24.3308 memory: 6.46GiB(27.34%) tps: 18,975 tflops: 19.10 mfu: 6.12% global_avg_ntp_loss: 4.9447 global_avg_mtp_loss: 19.3861 +[titan] 2025-06-13 12:59:10,576 - root - INFO - lr: 4.9882e-04 gnorm: 1.32 [ 0:17:41< 2:44:39] +[titan] 2025-06-13 12:59:16,113 - root - INFO - step: 1460 loss: 24.3401 memory: 6.46GiB(27.34%) tps: 14,795 tflops: 14.89 mfu: 4.77% global_avg_ntp_loss: 4.9377 global_avg_mtp_loss: 19.4024 +[titan] 2025-06-13 12:59:16,114 - root - INFO - lr: 4.9880e-04 gnorm: 1.59 [ 0:17:46< 2:44:53] +[titan] 2025-06-13 12:59:19,442 - root - INFO - step: 1465 loss: 23.8236 memory: 6.46GiB(27.34%) tps: 24,616 tflops: 24.77 mfu: 7.94% 
global_avg_ntp_loss: 4.7971 global_avg_mtp_loss: 19.0264 +[titan] 2025-06-13 12:59:19,442 - root - INFO - lr: 4.9877e-04 gnorm: 1.18 [ 0:17:50< 2:44:47] +[titan] 2025-06-13 12:59:23,153 - root - INFO - step: 1470 loss: 23.5063 memory: 6.46GiB(27.34%) tps: 22,080 tflops: 22.22 mfu: 7.12% global_avg_ntp_loss: 4.7750 global_avg_mtp_loss: 18.7313 +[titan] 2025-06-13 12:59:23,153 - root - INFO - lr: 4.9874e-04 gnorm: 1.32 [ 0:17:53< 2:44:44] +[titan] 2025-06-13 12:59:27,399 - root - INFO - step: 1475 loss: 23.5457 memory: 6.46GiB(27.34%) tps: 19,296 tflops: 19.42 mfu: 6.22% global_avg_ntp_loss: 4.7487 global_avg_mtp_loss: 18.7970 +[titan] 2025-06-13 12:59:27,399 - root - INFO - lr: 4.9872e-04 gnorm: 1.15 [ 0:17:58< 2:44:45] +[titan] 2025-06-13 12:59:30,956 - root - INFO - step: 1480 loss: 23.2577 memory: 6.46GiB(27.34%) tps: 23,031 tflops: 23.18 mfu: 7.43% global_avg_ntp_loss: 4.6994 global_avg_mtp_loss: 18.5582 +[titan] 2025-06-13 12:59:30,957 - root - INFO - lr: 4.9869e-04 gnorm: 1.26 [ 0:18:01< 2:44:41] +[titan] 2025-06-13 12:59:34,406 - root - INFO - step: 1485 loss: 23.8056 memory: 6.46GiB(27.34%) tps: 23,752 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 4.8108 global_avg_mtp_loss: 18.9948 +[titan] 2025-06-13 12:59:34,406 - root - INFO - lr: 4.9866e-04 gnorm: 1.28 [ 0:18:05< 2:44:35] +[titan] 2025-06-13 12:59:38,039 - root - INFO - step: 1490 loss: 24.3061 memory: 6.46GiB(27.34%) tps: 22,552 tflops: 22.70 mfu: 7.27% global_avg_ntp_loss: 5.1051 global_avg_mtp_loss: 19.2010 +[titan] 2025-06-13 12:59:38,039 - root - INFO - lr: 4.9864e-04 gnorm: 1.59 [ 0:18:08< 2:44:31] +[titan] 2025-06-13 12:59:41,990 - root - INFO - step: 1495 loss: 24.2816 memory: 6.46GiB(27.34%) tps: 20,735 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 4.9102 global_avg_mtp_loss: 19.3713 +[titan] 2025-06-13 12:59:41,990 - root - INFO - lr: 4.9861e-04 gnorm: 0.99 [ 0:18:12< 2:44:30] +[titan] 2025-06-13 12:59:44,757 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 12:59:45,712 - root - INFO - step: 1500 loss: 24.5944 memory: 6.46GiB(27.34%) tps: 22,014 tflops: 22.15 mfu: 7.10% global_avg_ntp_loss: 4.9457 global_avg_mtp_loss: 19.6486 +[titan] 2025-06-13 12:59:45,712 - root - INFO - lr: 4.9858e-04 gnorm: 1.42 [ 0:18:16< 2:44:27] +[titan] 2025-06-13 12:59:49,504 - root - INFO - step: 1505 loss: 22.6350 memory: 6.46GiB(27.34%) tps: 21,609 tflops: 21.75 mfu: 6.97% global_avg_ntp_loss: 4.5138 global_avg_mtp_loss: 18.1212 +[titan] 2025-06-13 12:59:49,504 - root - INFO - lr: 4.9855e-04 gnorm: 1.59 [ 0:18:20< 2:44:25] +[titan] 2025-06-13 12:59:53,770 - root - INFO - step: 1510 loss: 24.2302 memory: 6.46GiB(27.34%) tps: 19,203 tflops: 19.33 mfu: 6.19% global_avg_ntp_loss: 4.8256 global_avg_mtp_loss: 19.4046 +[titan] 2025-06-13 12:59:53,771 - root - INFO - lr: 4.9852e-04 gnorm: 1.27 [ 0:18:24< 2:44:27] +[titan] 2025-06-13 12:59:57,692 - root - INFO - step: 1515 loss: 23.6119 memory: 6.46GiB(27.34%) tps: 20,892 tflops: 21.03 mfu: 6.74% global_avg_ntp_loss: 4.6664 global_avg_mtp_loss: 18.9455 +[titan] 2025-06-13 12:59:57,692 - root - INFO - lr: 4.9849e-04 gnorm: 1.31 [ 0:18:28< 2:44:25] +[titan] 2025-06-13 13:00:01,535 - root - INFO - step: 1520 loss: 24.1750 memory: 6.46GiB(27.34%) tps: 21,317 tflops: 21.45 mfu: 6.88% global_avg_ntp_loss: 4.8350 global_avg_mtp_loss: 19.3400 +[titan] 2025-06-13 13:00:01,536 - root - INFO - lr: 4.9846e-04 gnorm: 1.22 [ 0:18:32< 2:44:23] +[titan] 2025-06-13 13:00:05,845 - root - INFO - step: 1525 loss: 24.4461 memory: 6.46GiB(27.34%) tps: 19,010 tflops: 19.13 mfu: 6.13% global_avg_ntp_loss: 4.9728 global_avg_mtp_loss: 19.4733 +[titan] 2025-06-13 13:00:05,846 - root - INFO - lr: 4.9843e-04 gnorm: 1.41 [ 0:18:36< 2:44:26] +[titan] 2025-06-13 13:00:09,338 - root - INFO - step: 1530 loss: 23.4893 memory: 6.46GiB(27.34%) tps: 23,456 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 4.6902 global_avg_mtp_loss: 18.7991 +[titan] 2025-06-13 13:00:09,339 - root - INFO - lr: 4.9840e-04 gnorm: 1.84 [ 
0:18:40< 2:44:20] +[titan] 2025-06-13 13:00:13,823 - root - INFO - step: 1535 loss: 23.4258 memory: 6.46GiB(27.34%) tps: 18,268 tflops: 18.38 mfu: 5.89% global_avg_ntp_loss: 4.6432 global_avg_mtp_loss: 18.7826 +[titan] 2025-06-13 13:00:13,824 - root - INFO - lr: 4.9837e-04 gnorm: 1.40 [ 0:18:44< 2:44:24] +[titan] 2025-06-13 13:00:14,411 - root - INFO - Dumping profiler traces at step 1536 +[titan] 2025-06-13 13:00:14,512 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 13:00:16,946 - root - INFO - step: 1540 loss: 24.1590 memory: 6.46GiB(27.34%) tps: 26,239 tflops: 26.41 mfu: 8.46% global_avg_ntp_loss: 4.8542 global_avg_mtp_loss: 19.3048 +[titan] 2025-06-13 13:00:16,946 - root - INFO - lr: 4.9834e-04 gnorm: 1.40 [ 0:18:47< 2:44:16] +[titan] 2025-06-13 13:00:20,514 - root - INFO - step: 1545 loss: 23.5717 memory: 6.46GiB(27.34%) tps: 22,963 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 4.7451 global_avg_mtp_loss: 18.8267 +[titan] 2025-06-13 13:00:20,514 - root - INFO - lr: 4.9831e-04 gnorm: 1.27 [ 0:18:51< 2:44:11] +[titan] 2025-06-13 13:00:23,722 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:00:24,390 - root - INFO - step: 1550 loss: 23.0118 memory: 6.46GiB(27.34%) tps: 21,135 tflops: 21.27 mfu: 6.82% global_avg_ntp_loss: 4.5195 global_avg_mtp_loss: 18.4923 +[titan] 2025-06-13 13:00:24,390 - root - INFO - lr: 4.9828e-04 gnorm: 1.32 [ 0:18:55< 2:44:09] +[titan] 2025-06-13 13:00:27,857 - root - INFO - step: 1555 loss: 23.5225 memory: 6.46GiB(27.34%) tps: 23,628 tflops: 23.78 mfu: 7.62% global_avg_ntp_loss: 4.6736 global_avg_mtp_loss: 18.8489 +[titan] 2025-06-13 13:00:27,858 - root - INFO - lr: 4.9825e-04 gnorm: 1.58 [ 0:18:58< 2:44:04] +[titan] 2025-06-13 13:00:31,346 - root - INFO - step: 1560 loss: 23.9916 memory: 6.46GiB(27.34%) tps: 23,485 tflops: 23.63 mfu: 7.58% global_avg_ntp_loss: 4.7689 global_avg_mtp_loss: 19.2227 +[titan] 2025-06-13 13:00:31,347 - root - INFO - lr: 4.9822e-04 gnorm: 1.10 [ 0:19:02< 2:43:59] +[titan] 2025-06-13 13:00:35,099 - root - INFO - step: 1565 loss: 23.7083 memory: 6.46GiB(27.34%) tps: 21,830 tflops: 21.97 mfu: 7.04% global_avg_ntp_loss: 4.7226 global_avg_mtp_loss: 18.9857 +[titan] 2025-06-13 13:00:35,103 - root - INFO - lr: 4.9819e-04 gnorm: 1.23 [ 0:19:05< 2:43:56] +[titan] 2025-06-13 13:00:38,880 - root - INFO - step: 1570 loss: 24.2724 memory: 6.46GiB(27.34%) tps: 21,688 tflops: 21.83 mfu: 7.00% global_avg_ntp_loss: 4.8373 global_avg_mtp_loss: 19.4351 +[titan] 2025-06-13 13:00:38,881 - root - INFO - lr: 4.9816e-04 gnorm: 1.21 [ 0:19:09< 2:43:53] +[titan] 2025-06-13 13:00:42,430 - root - INFO - step: 1575 loss: 23.4931 memory: 6.46GiB(27.34%) tps: 23,080 tflops: 23.23 mfu: 7.44% global_avg_ntp_loss: 4.6753 global_avg_mtp_loss: 18.8178 +[titan] 2025-06-13 13:00:42,431 - root - INFO - lr: 4.9812e-04 gnorm: 1.21 [ 0:19:13< 2:43:49] +[titan] 2025-06-13 13:00:46,282 - root - INFO - step: 1580 loss: 23.5597 memory: 6.46GiB(27.34%) tps: 21,273 tflops: 21.41 mfu: 6.86% global_avg_ntp_loss: 4.6558 global_avg_mtp_loss: 18.9039 +[titan] 2025-06-13 13:00:46,282 - root - INFO - lr: 4.9809e-04 gnorm: 1.51 [ 
0:19:16< 2:43:47] +[titan] 2025-06-13 13:00:49,962 - root - INFO - step: 1585 loss: 23.3714 memory: 6.46GiB(27.34%) tps: 22,263 tflops: 22.40 mfu: 7.18% global_avg_ntp_loss: 4.6571 global_avg_mtp_loss: 18.7143 +[titan] 2025-06-13 13:00:49,962 - root - INFO - lr: 4.9806e-04 gnorm: 1.27 [ 0:19:20< 2:43:43] +[titan] 2025-06-13 13:00:53,919 - root - INFO - step: 1590 loss: 23.8830 memory: 6.46GiB(27.34%) tps: 20,705 tflops: 20.84 mfu: 6.68% global_avg_ntp_loss: 4.7434 global_avg_mtp_loss: 19.1397 +[titan] 2025-06-13 13:00:53,919 - root - INFO - lr: 4.9802e-04 gnorm: 0.99 [ 0:19:24< 2:43:42] +[titan] 2025-06-13 13:00:57,783 - root - INFO - step: 1595 loss: 23.5652 memory: 6.46GiB(27.34%) tps: 21,201 tflops: 21.34 mfu: 6.84% global_avg_ntp_loss: 4.6470 global_avg_mtp_loss: 18.9181 +[titan] 2025-06-13 13:00:57,784 - root - INFO - lr: 4.9799e-04 gnorm: 1.24 [ 0:19:28< 2:43:40] +[titan] 2025-06-13 13:01:00,529 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:01:01,366 - root - INFO - step: 1600 loss: 23.5192 memory: 6.46GiB(27.34%) tps: 22,869 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 4.6527 global_avg_mtp_loss: 18.8665 +[titan] 2025-06-13 13:01:01,366 - root - INFO - lr: 4.9796e-04 gnorm: 1.23 [ 0:19:32< 2:43:36] +[titan] 2025-06-13 13:01:04,723 - root - INFO - step: 1605 loss: 23.5570 memory: 6.46GiB(27.34%) tps: 24,406 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 4.6455 global_avg_mtp_loss: 18.9116 +[titan] 2025-06-13 13:01:04,723 - root - INFO - lr: 4.9792e-04 gnorm: 1.22 [ 0:19:35< 2:43:29] +[titan] 2025-06-13 13:01:08,107 - root - INFO - step: 1610 loss: 23.0277 memory: 6.46GiB(27.34%) tps: 24,213 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 4.4931 global_avg_mtp_loss: 18.5346 +[titan] 2025-06-13 13:01:08,108 - root - INFO - lr: 4.9789e-04 gnorm: 1.58 [ 0:19:38< 2:43:23] +[titan] 2025-06-13 13:01:12,121 - root - INFO - step: 1615 loss: 23.4376 memory: 6.46GiB(27.34%) tps: 20,413 tflops: 20.54 mfu: 6.58% 
global_avg_ntp_loss: 4.6801 global_avg_mtp_loss: 18.7575 +[titan] 2025-06-13 13:01:12,121 - root - INFO - lr: 4.9785e-04 gnorm: 1.14 [ 0:19:42< 2:43:23] +[titan] 2025-06-13 13:01:15,299 - root - INFO - step: 1620 loss: 23.0983 memory: 6.46GiB(27.34%) tps: 25,775 tflops: 25.94 mfu: 8.31% global_avg_ntp_loss: 4.5483 global_avg_mtp_loss: 18.5501 +[titan] 2025-06-13 13:01:15,300 - root - INFO - lr: 4.9782e-04 gnorm: 1.20 [ 0:19:46< 2:43:15] +[titan] 2025-06-13 13:01:19,206 - root - INFO - step: 1625 loss: 23.7573 memory: 6.46GiB(27.34%) tps: 20,970 tflops: 21.10 mfu: 6.76% global_avg_ntp_loss: 4.6852 global_avg_mtp_loss: 19.0721 +[titan] 2025-06-13 13:01:19,206 - root - INFO - lr: 4.9778e-04 gnorm: 1.15 [ 0:19:49< 2:43:13] +[titan] 2025-06-13 13:01:22,792 - root - INFO - step: 1630 loss: 23.5135 memory: 6.46GiB(27.34%) tps: 22,847 tflops: 22.99 mfu: 7.37% global_avg_ntp_loss: 4.6034 global_avg_mtp_loss: 18.9101 +[titan] 2025-06-13 13:01:22,792 - root - INFO - lr: 4.9775e-04 gnorm: 1.17 [ 0:19:53< 2:43:09] +[titan] 2025-06-13 13:01:30,393 - root - INFO - step: 1635 loss: 22.2736 memory: 6.46GiB(27.34%) tps: 10,779 tflops: 10.85 mfu: 3.48% global_avg_ntp_loss: 4.3532 global_avg_mtp_loss: 17.9204 +[titan] 2025-06-13 13:01:30,393 - root - INFO - lr: 4.9771e-04 gnorm: 1.46 [ 0:20:01< 2:43:38] +[titan] 2025-06-13 13:01:33,841 - root - INFO - step: 1640 loss: 23.9271 memory: 6.46GiB(27.34%) tps: 23,759 tflops: 23.91 mfu: 7.66% global_avg_ntp_loss: 4.7185 global_avg_mtp_loss: 19.2086 +[titan] 2025-06-13 13:01:33,841 - root - INFO - lr: 4.9768e-04 gnorm: 1.46 [ 0:20:04< 2:43:32] +[titan] 2025-06-13 13:01:37,812 - root - INFO - step: 1645 loss: 23.6358 memory: 6.46GiB(27.34%) tps: 20,633 tflops: 20.76 mfu: 6.66% global_avg_ntp_loss: 4.6398 global_avg_mtp_loss: 18.9959 +[titan] 2025-06-13 13:01:37,812 - root - INFO - lr: 4.9764e-04 gnorm: 1.55 [ 0:20:08< 2:43:31] +[titan] 2025-06-13 13:01:40,721 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:01:41,391 - root - INFO - step: 1650 loss: 20.4798 memory: 6.46GiB(27.34%) tps: 22,893 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 3.9236 global_avg_mtp_loss: 16.5562 +[titan] 2025-06-13 13:01:41,391 - root - INFO - lr: 4.9760e-04 gnorm: 2.76 [ 0:20:12< 2:43:26] +[titan] 2025-06-13 13:01:44,809 - root - INFO - step: 1655 loss: 22.3915 memory: 6.46GiB(27.34%) tps: 23,969 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 4.3165 global_avg_mtp_loss: 18.0749 +[titan] 2025-06-13 13:01:44,809 - root - INFO - lr: 4.9757e-04 gnorm: 1.77 [ 0:20:15< 2:43:21] +[titan] 2025-06-13 13:01:48,607 - root - INFO - step: 1660 loss: 23.0309 memory: 6.46GiB(27.34%) tps: 21,575 tflops: 21.71 mfu: 6.96% global_avg_ntp_loss: 4.5053 global_avg_mtp_loss: 18.5257 +[titan] 2025-06-13 13:01:48,607 - root - INFO - lr: 4.9753e-04 gnorm: 1.57 [ 0:20:19< 2:43:18] +[titan] 2025-06-13 13:01:52,025 - root - INFO - step: 1665 loss: 23.5227 memory: 6.46GiB(27.34%) tps: 23,965 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 4.6459 global_avg_mtp_loss: 18.8768 +[titan] 2025-06-13 13:01:52,025 - root - INFO - lr: 4.9749e-04 gnorm: 1.43 [ 0:20:22< 2:43:12] +[titan] 2025-06-13 13:01:55,570 - root - INFO - step: 1670 loss: 23.4769 memory: 6.46GiB(27.34%) tps: 23,114 tflops: 23.26 mfu: 7.46% global_avg_ntp_loss: 4.6842 global_avg_mtp_loss: 18.7926 +[titan] 2025-06-13 13:01:55,570 - root - INFO - lr: 4.9745e-04 gnorm: 1.38 [ 0:20:26< 2:43:08] +[titan] 2025-06-13 13:01:58,963 - root - INFO - step: 1675 loss: 23.6535 memory: 6.46GiB(27.34%) tps: 24,143 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 4.6949 global_avg_mtp_loss: 18.9587 +[titan] 2025-06-13 13:01:58,964 - root - INFO - lr: 4.9742e-04 gnorm: 1.03 [ 0:20:29< 2:43:02] +[titan] 2025-06-13 13:02:02,455 - root - INFO - step: 1680 loss: 23.5215 memory: 6.46GiB(27.34%) tps: 23,465 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 4.6183 global_avg_mtp_loss: 18.9033 +[titan] 2025-06-13 13:02:02,455 - root - INFO - lr: 4.9738e-04 gnorm: 1.26 [ 
0:20:33< 2:42:57] +[titan] 2025-06-13 13:02:06,037 - root - INFO - step: 1685 loss: 23.0776 memory: 6.46GiB(27.34%) tps: 22,875 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 4.6008 global_avg_mtp_loss: 18.4768 +[titan] 2025-06-13 13:02:06,037 - root - INFO - lr: 4.9734e-04 gnorm: 1.40 [ 0:20:36< 2:42:52] +[titan] 2025-06-13 13:02:09,316 - root - INFO - step: 1690 loss: 23.9220 memory: 6.46GiB(27.34%) tps: 24,987 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 4.7438 global_avg_mtp_loss: 19.1781 +[titan] 2025-06-13 13:02:09,316 - root - INFO - lr: 4.9730e-04 gnorm: 1.50 [ 0:20:40< 2:42:46] +[titan] 2025-06-13 13:02:12,872 - root - INFO - step: 1695 loss: 23.3375 memory: 6.46GiB(27.34%) tps: 23,038 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 4.6033 global_avg_mtp_loss: 18.7342 +[titan] 2025-06-13 13:02:12,872 - root - INFO - lr: 4.9726e-04 gnorm: 1.11 [ 0:20:43< 2:42:41] +[titan] 2025-06-13 13:02:15,487 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:02:16,107 - root - INFO - step: 1700 loss: 23.2280 memory: 6.46GiB(27.34%) tps: 25,324 tflops: 25.49 mfu: 8.17% global_avg_ntp_loss: 4.5583 global_avg_mtp_loss: 18.6696 +[titan] 2025-06-13 13:02:16,108 - root - INFO - lr: 4.9722e-04 gnorm: 1.42 [ 0:20:46< 2:42:34] +[titan] 2025-06-13 13:02:19,576 - root - INFO - step: 1705 loss: 23.8541 memory: 6.46GiB(27.34%) tps: 23,622 tflops: 23.77 mfu: 7.62% global_avg_ntp_loss: 4.6846 global_avg_mtp_loss: 19.1695 +[titan] 2025-06-13 13:02:19,576 - root - INFO - lr: 4.9718e-04 gnorm: 1.16 [ 0:20:50< 2:42:29] +[titan] 2025-06-13 13:02:22,905 - root - INFO - step: 1710 loss: 23.3935 memory: 6.46GiB(27.34%) tps: 24,608 tflops: 24.76 mfu: 7.94% global_avg_ntp_loss: 4.5742 global_avg_mtp_loss: 18.8194 +[titan] 2025-06-13 13:02:22,906 - root - INFO - lr: 4.9714e-04 gnorm: 1.26 [ 0:20:53< 2:42:22] +[titan] 2025-06-13 13:02:26,548 - root - INFO - step: 1715 loss: 22.7578 memory: 6.46GiB(27.34%) tps: 22,489 tflops: 22.63 mfu: 7.25% 
global_avg_ntp_loss: 4.4532 global_avg_mtp_loss: 18.3046 +[titan] 2025-06-13 13:02:26,549 - root - INFO - lr: 4.9710e-04 gnorm: 2.37 [ 0:20:57< 2:42:19] +[titan] 2025-06-13 13:02:29,901 - root - INFO - step: 1720 loss: 23.4034 memory: 6.46GiB(27.34%) tps: 24,437 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 4.5839 global_avg_mtp_loss: 18.8195 +[titan] 2025-06-13 13:02:29,901 - root - INFO - lr: 4.9706e-04 gnorm: 1.10 [ 0:21:00< 2:42:12] +[titan] 2025-06-13 13:02:33,562 - root - INFO - step: 1725 loss: 23.1685 memory: 6.46GiB(27.34%) tps: 22,379 tflops: 22.52 mfu: 7.22% global_avg_ntp_loss: 4.4775 global_avg_mtp_loss: 18.6910 +[titan] 2025-06-13 13:02:33,562 - root - INFO - lr: 4.9702e-04 gnorm: 1.13 [ 0:21:04< 2:42:09] +[titan] 2025-06-13 13:02:37,029 - root - INFO - step: 1730 loss: 22.8367 memory: 6.46GiB(27.34%) tps: 23,636 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 4.4230 global_avg_mtp_loss: 18.4137 +[titan] 2025-06-13 13:02:37,029 - root - INFO - lr: 4.9698e-04 gnorm: 1.70 [ 0:21:07< 2:42:04] +[titan] 2025-06-13 13:02:40,118 - root - INFO - step: 1735 loss: 23.8680 memory: 6.46GiB(27.34%) tps: 26,515 tflops: 26.68 mfu: 8.55% global_avg_ntp_loss: 4.6653 global_avg_mtp_loss: 19.2028 +[titan] 2025-06-13 13:02:40,119 - root - INFO - lr: 4.9694e-04 gnorm: 1.17 [ 0:21:10< 2:41:56] +[titan] 2025-06-13 13:02:43,523 - root - INFO - step: 1740 loss: 23.7798 memory: 6.46GiB(27.34%) tps: 24,062 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 4.7094 global_avg_mtp_loss: 19.0704 +[titan] 2025-06-13 13:02:43,524 - root - INFO - lr: 4.9690e-04 gnorm: 1.29 [ 0:21:14< 2:41:50] +[titan] 2025-06-13 13:02:46,966 - root - INFO - step: 1745 loss: 23.6588 memory: 6.46GiB(27.34%) tps: 23,796 tflops: 23.95 mfu: 7.68% global_avg_ntp_loss: 4.5922 global_avg_mtp_loss: 19.0667 +[titan] 2025-06-13 13:02:46,967 - root - INFO - lr: 4.9686e-04 gnorm: 1.10 [ 0:21:17< 2:41:45] +[titan] 2025-06-13 13:02:49,903 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:02:50,585 - root - INFO - step: 1750 loss: 23.7247 memory: 6.46GiB(27.34%) tps: 22,638 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 4.6229 global_avg_mtp_loss: 19.1019 +[titan] 2025-06-13 13:02:50,586 - root - INFO - lr: 4.9681e-04 gnorm: 1.29 [ 0:21:21< 2:41:41] +[titan] 2025-06-13 13:02:53,932 - root - INFO - step: 1755 loss: 22.5514 memory: 6.46GiB(27.34%) tps: 24,485 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 4.3484 global_avg_mtp_loss: 18.2030 +[titan] 2025-06-13 13:02:53,932 - root - INFO - lr: 4.9677e-04 gnorm: 1.13 [ 0:21:24< 2:41:35] +[titan] 2025-06-13 13:02:57,182 - root - INFO - step: 1760 loss: 23.2988 memory: 6.46GiB(27.34%) tps: 25,203 tflops: 25.36 mfu: 8.13% global_avg_ntp_loss: 4.6321 global_avg_mtp_loss: 18.6667 +[titan] 2025-06-13 13:02:57,183 - root - INFO - lr: 4.9673e-04 gnorm: 1.49 [ 0:21:27< 2:41:28] +[titan] 2025-06-13 13:03:00,509 - root - INFO - step: 1765 loss: 23.4459 memory: 6.46GiB(27.34%) tps: 24,632 tflops: 24.79 mfu: 7.95% global_avg_ntp_loss: 4.5553 global_avg_mtp_loss: 18.8906 +[titan] 2025-06-13 13:03:00,509 - root - INFO - lr: 4.9668e-04 gnorm: 1.48 [ 0:21:31< 2:41:22] +[titan] 2025-06-13 13:03:04,094 - root - INFO - step: 1770 loss: 22.6223 memory: 6.46GiB(27.34%) tps: 22,850 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 4.3440 global_avg_mtp_loss: 18.2782 +[titan] 2025-06-13 13:03:04,094 - root - INFO - lr: 4.9664e-04 gnorm: 1.30 [ 0:21:34< 2:41:17] +[titan] 2025-06-13 13:03:07,779 - root - INFO - step: 1775 loss: 23.3923 memory: 6.46GiB(27.34%) tps: 22,236 tflops: 22.38 mfu: 7.17% global_avg_ntp_loss: 4.5931 global_avg_mtp_loss: 18.7992 +[titan] 2025-06-13 13:03:07,779 - root - INFO - lr: 4.9660e-04 gnorm: 1.13 [ 0:21:38< 2:41:14] +[titan] 2025-06-13 13:03:11,300 - root - INFO - step: 1780 loss: 22.4598 memory: 6.46GiB(27.34%) tps: 23,264 tflops: 23.41 mfu: 7.50% global_avg_ntp_loss: 4.2901 global_avg_mtp_loss: 18.1697 +[titan] 2025-06-13 13:03:11,301 - root - INFO - lr: 4.9655e-04 gnorm: 1.96 [ 
0:21:41< 2:41:09] +[titan] 2025-06-13 13:03:14,672 - root - INFO - step: 1785 loss: 20.8575 memory: 6.46GiB(27.34%) tps: 24,299 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.9755 global_avg_mtp_loss: 16.8820 +[titan] 2025-06-13 13:03:14,672 - root - INFO - lr: 4.9651e-04 gnorm: 2.47 [ 0:21:45< 2:41:04] +[titan] 2025-06-13 13:03:17,939 - root - INFO - step: 1790 loss: 23.2250 memory: 6.46GiB(27.34%) tps: 25,078 tflops: 25.24 mfu: 8.09% global_avg_ntp_loss: 4.4891 global_avg_mtp_loss: 18.7359 +[titan] 2025-06-13 13:03:17,939 - root - INFO - lr: 4.9647e-04 gnorm: 1.28 [ 0:21:48< 2:40:57] +[titan] 2025-06-13 13:03:21,527 - root - INFO - step: 1795 loss: 23.4939 memory: 6.46GiB(27.34%) tps: 22,836 tflops: 22.98 mfu: 7.37% global_avg_ntp_loss: 4.5900 global_avg_mtp_loss: 18.9039 +[titan] 2025-06-13 13:03:21,527 - root - INFO - lr: 4.9642e-04 gnorm: 1.12 [ 0:21:52< 2:40:53] +[titan] 2025-06-13 13:03:24,097 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:03:25,104 - root - INFO - step: 1800 loss: 22.9488 memory: 6.46GiB(27.34%) tps: 22,900 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 4.4071 global_avg_mtp_loss: 18.5417 +[titan] 2025-06-13 13:03:25,105 - root - INFO - lr: 4.9638e-04 gnorm: 1.31 [ 0:21:55< 2:40:49] +[titan] 2025-06-13 13:03:28,431 - root - INFO - step: 1805 loss: 22.5889 memory: 6.46GiB(27.34%) tps: 24,628 tflops: 24.79 mfu: 7.94% global_avg_ntp_loss: 4.3103 global_avg_mtp_loss: 18.2786 +[titan] 2025-06-13 13:03:28,431 - root - INFO - lr: 4.9633e-04 gnorm: 1.71 [ 0:21:59< 2:40:43] +[titan] 2025-06-13 13:03:31,860 - root - INFO - step: 1810 loss: 23.8508 memory: 6.46GiB(27.34%) tps: 23,895 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 4.6460 global_avg_mtp_loss: 19.2047 +[titan] 2025-06-13 13:03:31,860 - root - INFO - lr: 4.9628e-04 gnorm: 1.12 [ 0:22:02< 2:40:37] +[titan] 2025-06-13 13:03:35,894 - root - INFO - step: 1815 loss: 22.9205 memory: 6.46GiB(27.34%) tps: 20,309 tflops: 20.44 mfu: 6.55% 
global_avg_ntp_loss: 4.4364 global_avg_mtp_loss: 18.4841 +[titan] 2025-06-13 13:03:35,894 - root - INFO - lr: 4.9624e-04 gnorm: 1.22 [ 0:22:06< 2:40:36] +[titan] 2025-06-13 13:03:39,359 - root - INFO - step: 1820 loss: 23.8725 memory: 6.46GiB(27.34%) tps: 23,646 tflops: 23.80 mfu: 7.63% global_avg_ntp_loss: 4.6182 global_avg_mtp_loss: 19.2544 +[titan] 2025-06-13 13:03:39,359 - root - INFO - lr: 4.9619e-04 gnorm: 1.15 [ 0:22:10< 2:40:31] +[titan] 2025-06-13 13:03:42,542 - root - INFO - step: 1825 loss: 22.6805 memory: 6.46GiB(27.34%) tps: 25,740 tflops: 25.90 mfu: 8.30% global_avg_ntp_loss: 4.3449 global_avg_mtp_loss: 18.3356 +[titan] 2025-06-13 13:03:42,542 - root - INFO - lr: 4.9615e-04 gnorm: 1.29 [ 0:22:13< 2:40:24] +[titan] 2025-06-13 13:03:46,435 - root - INFO - step: 1830 loss: 23.7000 memory: 6.46GiB(27.34%) tps: 21,043 tflops: 21.18 mfu: 6.79% global_avg_ntp_loss: 4.6319 global_avg_mtp_loss: 19.0681 +[titan] 2025-06-13 13:03:46,435 - root - INFO - lr: 4.9610e-04 gnorm: 1.12 [ 0:22:17< 2:40:22] +[titan] 2025-06-13 13:03:50,011 - root - INFO - step: 1835 loss: 22.9777 memory: 6.46GiB(27.34%) tps: 22,912 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 4.4748 global_avg_mtp_loss: 18.5030 +[titan] 2025-06-13 13:03:50,011 - root - INFO - lr: 4.9605e-04 gnorm: 1.06 [ 0:22:20< 2:40:18] +[titan] 2025-06-13 13:03:53,750 - root - INFO - step: 1840 loss: 22.6566 memory: 6.46GiB(27.34%) tps: 21,913 tflops: 22.05 mfu: 7.07% global_avg_ntp_loss: 4.3932 global_avg_mtp_loss: 18.2634 +[titan] 2025-06-13 13:03:53,750 - root - INFO - lr: 4.9601e-04 gnorm: 1.38 [ 0:22:24< 2:40:15] +[titan] 2025-06-13 13:03:57,067 - root - INFO - step: 1845 loss: 23.7066 memory: 6.46GiB(27.34%) tps: 24,694 tflops: 24.85 mfu: 7.97% global_avg_ntp_loss: 4.6582 global_avg_mtp_loss: 19.0484 +[titan] 2025-06-13 13:03:57,068 - root - INFO - lr: 4.9596e-04 gnorm: 1.23 [ 0:22:27< 2:40:09] +[titan] 2025-06-13 13:03:59,931 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:04:00,837 - root - INFO - step: 1850 loss: 21.8764 memory: 6.46GiB(27.34%) tps: 21,737 tflops: 21.88 mfu: 7.01% global_avg_ntp_loss: 4.1456 global_avg_mtp_loss: 17.7308 +[titan] 2025-06-13 13:04:00,837 - root - INFO - lr: 4.9591e-04 gnorm: 2.13 [ 0:22:31< 2:40:06] +[titan] 2025-06-13 13:04:03,957 - root - INFO - step: 1855 loss: 22.1849 memory: 6.46GiB(27.34%) tps: 26,255 tflops: 26.42 mfu: 8.47% global_avg_ntp_loss: 4.2014 global_avg_mtp_loss: 17.9835 +[titan] 2025-06-13 13:04:03,957 - root - INFO - lr: 4.9586e-04 gnorm: 1.70 [ 0:22:34< 2:39:59] +[titan] 2025-06-13 13:04:07,427 - root - INFO - step: 1860 loss: 23.3138 memory: 6.46GiB(27.34%) tps: 23,609 tflops: 23.76 mfu: 7.62% global_avg_ntp_loss: 4.5528 global_avg_mtp_loss: 18.7610 +[titan] 2025-06-13 13:04:07,428 - root - INFO - lr: 4.9581e-04 gnorm: 1.27 [ 0:22:38< 2:39:54] +[titan] 2025-06-13 13:04:10,878 - root - INFO - step: 1865 loss: 23.4136 memory: 6.46GiB(27.34%) tps: 23,745 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 4.5302 global_avg_mtp_loss: 18.8833 +[titan] 2025-06-13 13:04:10,878 - root - INFO - lr: 4.9577e-04 gnorm: 1.25 [ 0:22:41< 2:39:49] +[titan] 2025-06-13 13:04:14,628 - root - INFO - step: 1870 loss: 22.3981 memory: 6.46GiB(27.34%) tps: 21,846 tflops: 21.99 mfu: 7.05% global_avg_ntp_loss: 4.2596 global_avg_mtp_loss: 18.1385 +[titan] 2025-06-13 13:04:14,628 - root - INFO - lr: 4.9572e-04 gnorm: 1.53 [ 0:22:45< 2:39:46] +[titan] 2025-06-13 13:04:18,091 - root - INFO - step: 1875 loss: 22.8446 memory: 6.46GiB(27.34%) tps: 23,657 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 4.3867 global_avg_mtp_loss: 18.4580 +[titan] 2025-06-13 13:04:18,092 - root - INFO - lr: 4.9567e-04 gnorm: 1.17 [ 0:22:48< 2:39:41] +[titan] 2025-06-13 13:04:21,641 - root - INFO - step: 1880 loss: 22.1977 memory: 6.46GiB(27.34%) tps: 23,084 tflops: 23.23 mfu: 7.45% global_avg_ntp_loss: 4.2106 global_avg_mtp_loss: 17.9871 +[titan] 2025-06-13 13:04:21,641 - root - INFO - lr: 4.9562e-04 gnorm: 1.47 [ 
0:22:52< 2:39:37] +[titan] 2025-06-13 13:04:25,138 - root - INFO - step: 1885 loss: 21.4843 memory: 6.46GiB(27.34%) tps: 23,424 tflops: 23.57 mfu: 7.56% global_avg_ntp_loss: 4.1103 global_avg_mtp_loss: 17.3740 +[titan] 2025-06-13 13:04:25,139 - root - INFO - lr: 4.9557e-04 gnorm: 2.77 [ 0:22:55< 2:39:32] +[titan] 2025-06-13 13:04:28,886 - root - INFO - step: 1890 loss: 22.6452 memory: 6.46GiB(27.34%) tps: 21,863 tflops: 22.00 mfu: 7.05% global_avg_ntp_loss: 4.4117 global_avg_mtp_loss: 18.2335 +[titan] 2025-06-13 13:04:28,886 - root - INFO - lr: 4.9552e-04 gnorm: 1.24 [ 0:22:59< 2:39:29] +[titan] 2025-06-13 13:04:32,293 - root - INFO - step: 1895 loss: 20.6842 memory: 6.46GiB(27.34%) tps: 24,048 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.9919 global_avg_mtp_loss: 16.6924 +[titan] 2025-06-13 13:04:32,293 - root - INFO - lr: 4.9547e-04 gnorm: 3.36 [ 0:23:02< 2:39:24] +[titan] 2025-06-13 13:04:35,085 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:04:35,723 - root - INFO - step: 1900 loss: 22.4864 memory: 6.46GiB(27.34%) tps: 23,884 tflops: 24.04 mfu: 7.70% global_avg_ntp_loss: 4.3134 global_avg_mtp_loss: 18.1729 +[titan] 2025-06-13 13:04:35,724 - root - INFO - lr: 4.9542e-04 gnorm: 1.46 [ 0:23:06< 2:39:18] +[titan] 2025-06-13 13:04:39,507 - root - INFO - step: 1905 loss: 22.8804 memory: 6.46GiB(27.34%) tps: 21,655 tflops: 21.79 mfu: 6.99% global_avg_ntp_loss: 4.3639 global_avg_mtp_loss: 18.5166 +[titan] 2025-06-13 13:04:39,507 - root - INFO - lr: 4.9537e-04 gnorm: 1.16 [ 0:23:10< 2:39:16] +[titan] 2025-06-13 13:04:42,487 - root - INFO - step: 1910 loss: 23.1086 memory: 6.46GiB(27.34%) tps: 27,492 tflops: 27.67 mfu: 8.87% global_avg_ntp_loss: 4.4743 global_avg_mtp_loss: 18.6343 +[titan] 2025-06-13 13:04:42,487 - root - INFO - lr: 4.9532e-04 gnorm: 1.43 [ 0:23:13< 2:39:07] +[titan] 2025-06-13 13:04:45,860 - root - INFO - step: 1915 loss: 22.5335 memory: 6.46GiB(27.34%) tps: 24,288 tflops: 24.44 mfu: 7.83% 
global_avg_ntp_loss: 4.2751 global_avg_mtp_loss: 18.2584 +[titan] 2025-06-13 13:04:45,860 - root - INFO - lr: 4.9526e-04 gnorm: 1.35 [ 0:23:16< 2:39:02] +[titan] 2025-06-13 13:04:49,286 - root - INFO - step: 1920 loss: 23.3528 memory: 6.46GiB(27.34%) tps: 23,913 tflops: 24.07 mfu: 7.71% global_avg_ntp_loss: 4.4924 global_avg_mtp_loss: 18.8604 +[titan] 2025-06-13 13:04:49,286 - root - INFO - lr: 4.9521e-04 gnorm: 1.20 [ 0:23:19< 2:38:57] +[titan] 2025-06-13 13:04:52,490 - root - INFO - step: 1925 loss: 19.0984 memory: 6.46GiB(27.34%) tps: 25,568 tflops: 25.73 mfu: 8.25% global_avg_ntp_loss: 3.6171 global_avg_mtp_loss: 15.4813 +[titan] 2025-06-13 13:04:52,491 - root - INFO - lr: 4.9516e-04 gnorm: 2.43 [ 0:23:23< 2:38:50] +[titan] 2025-06-13 13:04:56,060 - root - INFO - step: 1930 loss: 23.4457 memory: 6.46GiB(27.34%) tps: 22,953 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 4.5527 global_avg_mtp_loss: 18.8930 +[titan] 2025-06-13 13:04:56,060 - root - INFO - lr: 4.9511e-04 gnorm: 1.18 [ 0:23:26< 2:38:46] +[titan] 2025-06-13 13:04:59,619 - root - INFO - step: 1935 loss: 22.6476 memory: 6.46GiB(27.34%) tps: 23,019 tflops: 23.17 mfu: 7.42% global_avg_ntp_loss: 4.3414 global_avg_mtp_loss: 18.3061 +[titan] 2025-06-13 13:04:59,619 - root - INFO - lr: 4.9506e-04 gnorm: 1.76 [ 0:23:30< 2:38:42] +[titan] 2025-06-13 13:05:02,994 - root - INFO - step: 1940 loss: 22.3888 memory: 6.46GiB(27.34%) tps: 24,278 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 4.2608 global_avg_mtp_loss: 18.1281 +[titan] 2025-06-13 13:05:02,994 - root - INFO - lr: 4.9500e-04 gnorm: 1.99 [ 0:23:33< 2:38:36] +[titan] 2025-06-13 13:05:06,644 - root - INFO - step: 1945 loss: 23.6025 memory: 6.46GiB(27.34%) tps: 22,446 tflops: 22.59 mfu: 7.24% global_avg_ntp_loss: 4.5266 global_avg_mtp_loss: 19.0760 +[titan] 2025-06-13 13:05:06,644 - root - INFO - lr: 4.9495e-04 gnorm: 1.21 [ 0:23:37< 2:38:33] +[titan] 2025-06-13 13:05:09,509 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:05:10,299 - root - INFO - step: 1950 loss: 23.0044 memory: 6.46GiB(27.34%) tps: 22,415 tflops: 22.56 mfu: 7.23% global_avg_ntp_loss: 4.4327 global_avg_mtp_loss: 18.5716 +[titan] 2025-06-13 13:05:10,299 - root - INFO - lr: 4.9490e-04 gnorm: 1.19 [ 0:23:40< 2:38:29] +[titan] 2025-06-13 13:05:13,636 - root - INFO - step: 1955 loss: 23.0624 memory: 6.46GiB(27.34%) tps: 24,553 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 4.3989 global_avg_mtp_loss: 18.6634 +[titan] 2025-06-13 13:05:13,636 - root - INFO - lr: 4.9484e-04 gnorm: 1.34 [ 0:23:44< 2:38:23] +[titan] 2025-06-13 13:05:17,541 - root - INFO - step: 1960 loss: 22.9510 memory: 6.46GiB(27.34%) tps: 20,982 tflops: 21.12 mfu: 6.77% global_avg_ntp_loss: 4.3739 global_avg_mtp_loss: 18.5771 +[titan] 2025-06-13 13:05:17,541 - root - INFO - lr: 4.9479e-04 gnorm: 1.21 [ 0:23:48< 2:38:21] +[titan] 2025-06-13 13:05:20,955 - root - INFO - step: 1965 loss: 22.6212 memory: 6.46GiB(27.34%) tps: 23,999 tflops: 24.15 mfu: 7.74% global_avg_ntp_loss: 4.3261 global_avg_mtp_loss: 18.2951 +[titan] 2025-06-13 13:05:20,955 - root - INFO - lr: 4.9474e-04 gnorm: 1.39 [ 0:23:51< 2:38:16] +[titan] 2025-06-13 13:05:24,558 - root - INFO - step: 1970 loss: 22.4788 memory: 6.46GiB(27.34%) tps: 22,736 tflops: 22.88 mfu: 7.33% global_avg_ntp_loss: 4.2894 global_avg_mtp_loss: 18.1893 +[titan] 2025-06-13 13:05:24,559 - root - INFO - lr: 4.9468e-04 gnorm: 1.86 [ 0:23:55< 2:38:12] +[titan] 2025-06-13 13:05:27,883 - root - INFO - step: 1975 loss: 20.1559 memory: 6.46GiB(27.34%) tps: 24,641 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 3.8132 global_avg_mtp_loss: 16.3427 +[titan] 2025-06-13 13:05:27,884 - root - INFO - lr: 4.9463e-04 gnorm: 2.04 [ 0:23:58< 2:38:07] +[titan] 2025-06-13 13:05:33,968 - root - INFO - step: 1980 loss: 23.4496 memory: 6.46GiB(27.34%) tps: 13,465 tflops: 13.55 mfu: 4.34% global_avg_ntp_loss: 4.5103 global_avg_mtp_loss: 18.9393 +[titan] 2025-06-13 13:05:33,968 - root - INFO - lr: 4.9457e-04 gnorm: 1.12 [ 
0:24:04< 2:38:19] +[titan] 2025-06-13 13:05:37,071 - root - INFO - step: 1985 loss: 22.1450 memory: 6.46GiB(27.34%) tps: 26,400 tflops: 26.57 mfu: 8.52% global_avg_ntp_loss: 4.2358 global_avg_mtp_loss: 17.9092 +[titan] 2025-06-13 13:05:37,071 - root - INFO - lr: 4.9452e-04 gnorm: 1.46 [ 0:24:07< 2:38:12] +[titan] 2025-06-13 13:05:40,438 - root - INFO - step: 1990 loss: 23.0704 memory: 6.46GiB(27.34%) tps: 24,331 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 4.3861 global_avg_mtp_loss: 18.6843 +[titan] 2025-06-13 13:05:40,439 - root - INFO - lr: 4.9446e-04 gnorm: 1.17 [ 0:24:11< 2:38:06] +[titan] 2025-06-13 13:05:43,783 - root - INFO - step: 1995 loss: 22.6923 memory: 6.46GiB(27.34%) tps: 24,494 tflops: 24.65 mfu: 7.90% global_avg_ntp_loss: 4.3427 global_avg_mtp_loss: 18.3496 +[titan] 2025-06-13 13:05:43,783 - root - INFO - lr: 4.9440e-04 gnorm: 1.11 [ 0:24:14< 2:38:01] +[titan] 2025-06-13 13:05:46,404 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:05:47,304 - root - INFO - step: 2000 loss: 23.6947 memory: 6.46GiB(27.34%) tps: 23,267 tflops: 23.42 mfu: 7.50% global_avg_ntp_loss: 4.5902 global_avg_mtp_loss: 19.1045 +[titan] 2025-06-13 13:05:47,305 - root - INFO - lr: 4.9435e-04 gnorm: 1.12 [ 0:24:17< 2:37:56] +[titan] 2025-06-13 13:05:50,796 - root - INFO - step: 2005 loss: 22.5488 memory: 6.46GiB(27.34%) tps: 23,464 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 4.3030 global_avg_mtp_loss: 18.2458 +[titan] 2025-06-13 13:05:50,796 - root - INFO - lr: 4.9429e-04 gnorm: 1.38 [ 0:24:21< 2:37:52] +[titan] 2025-06-13 13:05:54,119 - root - INFO - step: 2010 loss: 22.0874 memory: 6.46GiB(27.34%) tps: 24,659 tflops: 24.82 mfu: 7.95% global_avg_ntp_loss: 4.3780 global_avg_mtp_loss: 17.7094 +[titan] 2025-06-13 13:05:54,119 - root - INFO - lr: 4.9424e-04 gnorm: 4.55 [ 0:24:24< 2:37:46] +[titan] 2025-06-13 13:05:57,771 - root - INFO - step: 2015 loss: 23.2415 memory: 6.46GiB(27.34%) tps: 22,432 tflops: 22.58 mfu: 7.24% 
global_avg_ntp_loss: 4.4463 global_avg_mtp_loss: 18.7953 +[titan] 2025-06-13 13:05:57,771 - root - INFO - lr: 4.9418e-04 gnorm: 1.06 [ 0:24:28< 2:37:42] +[titan] 2025-06-13 13:06:01,325 - root - INFO - step: 2020 loss: 23.3478 memory: 6.46GiB(27.34%) tps: 23,050 tflops: 23.20 mfu: 7.43% global_avg_ntp_loss: 4.4960 global_avg_mtp_loss: 18.8518 +[titan] 2025-06-13 13:06:01,326 - root - INFO - lr: 4.9412e-04 gnorm: 1.20 [ 0:24:31< 2:37:38] +[titan] 2025-06-13 13:06:04,592 - root - INFO - step: 2025 loss: 22.1360 memory: 6.46GiB(27.34%) tps: 25,079 tflops: 25.24 mfu: 8.09% global_avg_ntp_loss: 4.2058 global_avg_mtp_loss: 17.9302 +[titan] 2025-06-13 13:06:04,593 - root - INFO - lr: 4.9406e-04 gnorm: 1.32 [ 0:24:35< 2:37:32] +[titan] 2025-06-13 13:06:08,194 - root - INFO - step: 2030 loss: 23.0452 memory: 6.46GiB(27.34%) tps: 22,749 tflops: 22.89 mfu: 7.34% global_avg_ntp_loss: 4.4122 global_avg_mtp_loss: 18.6330 +[titan] 2025-06-13 13:06:08,194 - root - INFO - lr: 4.9401e-04 gnorm: 1.11 [ 0:24:38< 2:37:28] +[titan] 2025-06-13 13:06:11,475 - root - INFO - step: 2035 loss: 23.2463 memory: 6.46GiB(27.34%) tps: 24,970 tflops: 25.13 mfu: 8.05% global_avg_ntp_loss: 4.4445 global_avg_mtp_loss: 18.8017 +[titan] 2025-06-13 13:06:11,475 - root - INFO - lr: 4.9395e-04 gnorm: 1.05 [ 0:24:42< 2:37:22] +[titan] 2025-06-13 13:06:15,033 - root - INFO - step: 2040 loss: 22.0325 memory: 6.46GiB(27.34%) tps: 23,024 tflops: 23.17 mfu: 7.43% global_avg_ntp_loss: 4.1895 global_avg_mtp_loss: 17.8430 +[titan] 2025-06-13 13:06:15,033 - root - INFO - lr: 4.9389e-04 gnorm: 1.65 [ 0:24:45< 2:37:18] +[titan] 2025-06-13 13:06:18,205 - root - INFO - step: 2045 loss: 23.0606 memory: 6.46GiB(27.34%) tps: 25,833 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 4.3535 global_avg_mtp_loss: 18.7071 +[titan] 2025-06-13 13:06:18,205 - root - INFO - lr: 4.9383e-04 gnorm: 1.43 [ 0:24:48< 2:37:11] +[titan] 2025-06-13 13:06:20,515 - root - INFO - Dumping profiler traces at step 2048 +[titan] 2025-06-13 13:06:20,612 
- root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 13:06:21,125 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:06:21,757 - root - INFO - step: 2050 loss: 18.9474 memory: 6.46GiB(27.34%) tps: 23,065 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.5958 global_avg_mtp_loss: 15.3516 +[titan] 2025-06-13 13:06:21,757 - root - INFO - lr: 4.9377e-04 gnorm: 2.09 [ 0:24:52< 2:37:07] +[titan] 2025-06-13 13:06:24,812 - root - INFO - step: 2055 loss: 17.7013 memory: 6.46GiB(27.34%) tps: 26,823 tflops: 26.99 mfu: 8.65% global_avg_ntp_loss: 3.2577 global_avg_mtp_loss: 14.4436 +[titan] 2025-06-13 13:06:24,812 - root - INFO - lr: 4.9371e-04 gnorm: 1.76 [ 0:24:55< 2:37:00] +[titan] 2025-06-13 13:06:28,114 - root - INFO - step: 2060 loss: 23.1452 memory: 6.46GiB(27.34%) tps: 24,816 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 4.4064 global_avg_mtp_loss: 18.7387 +[titan] 2025-06-13 13:06:28,114 - root - INFO - lr: 4.9365e-04 gnorm: 1.28 [ 0:24:58< 2:36:54] +[titan] 2025-06-13 13:06:31,374 - root - INFO - step: 2065 loss: 22.5172 memory: 6.46GiB(27.34%) tps: 25,128 tflops: 25.29 mfu: 8.11% global_avg_ntp_loss: 4.2467 global_avg_mtp_loss: 18.2705 +[titan] 2025-06-13 13:06:31,374 - root - INFO - lr: 4.9359e-04 gnorm: 1.21 [ 0:25:02< 2:36:48] +[titan] 2025-06-13 13:06:34,765 - root - INFO - step: 2070 loss: 22.9855 memory: 6.46GiB(27.34%) tps: 24,158 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 4.3464 global_avg_mtp_loss: 18.6392 +[titan] 2025-06-13 13:06:34,766 - root - INFO - lr: 4.9353e-04 gnorm: 1.13 [ 0:25:05< 2:36:43] +[titan] 2025-06-13 13:06:38,586 - root - INFO - step: 2075 loss: 22.5956 memory: 6.46GiB(27.34%) tps: 21,443 tflops: 21.58 mfu: 6.92% global_avg_ntp_loss: 4.2604 global_avg_mtp_loss: 18.3351 +[titan] 2025-06-13 13:06:38,587 - root - INFO - lr: 4.9347e-04 gnorm: 1.28 [ 0:25:09< 2:36:40] +[titan] 2025-06-13 13:06:41,515 - root - INFO - step: 2080 loss: 20.1154 memory: 6.46GiB(27.34%) tps: 
27,979 tflops: 28.16 mfu: 9.02% global_avg_ntp_loss: 3.7721 global_avg_mtp_loss: 16.3434 +[titan] 2025-06-13 13:06:41,515 - root - INFO - lr: 4.9341e-04 gnorm: 1.92 [ 0:25:12< 2:36:32] +[titan] 2025-06-13 13:06:44,735 - root - INFO - step: 2085 loss: 22.8500 memory: 6.46GiB(27.34%) tps: 25,444 tflops: 25.61 mfu: 8.21% global_avg_ntp_loss: 4.3393 global_avg_mtp_loss: 18.5108 +[titan] 2025-06-13 13:06:44,735 - root - INFO - lr: 4.9335e-04 gnorm: 1.36 [ 0:25:15< 2:36:26] +[titan] 2025-06-13 13:06:48,088 - root - INFO - step: 2090 loss: 22.9332 memory: 6.46GiB(27.34%) tps: 24,432 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 4.4505 global_avg_mtp_loss: 18.4827 +[titan] 2025-06-13 13:06:48,088 - root - INFO - lr: 4.9329e-04 gnorm: 1.27 [ 0:25:18< 2:36:21] +[titan] 2025-06-13 13:06:51,623 - root - INFO - step: 2095 loss: 20.3704 memory: 6.46GiB(27.34%) tps: 23,176 tflops: 23.32 mfu: 7.48% global_avg_ntp_loss: 3.7899 global_avg_mtp_loss: 16.5805 +[titan] 2025-06-13 13:06:51,623 - root - INFO - lr: 4.9323e-04 gnorm: 1.90 [ 0:25:22< 2:36:17] +[titan] 2025-06-13 13:06:54,480 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:06:55,073 - root - INFO - step: 2100 loss: 23.2979 memory: 6.46GiB(27.34%) tps: 23,746 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 4.4594 global_avg_mtp_loss: 18.8385 +[titan] 2025-06-13 13:06:55,076 - root - INFO - lr: 4.9317e-04 gnorm: 1.08 [ 0:25:25< 2:36:12] +[titan] 2025-06-13 13:06:58,303 - root - INFO - step: 2105 loss: 22.6096 memory: 6.46GiB(27.34%) tps: 25,384 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 4.2555 global_avg_mtp_loss: 18.3541 +[titan] 2025-06-13 13:06:58,303 - root - INFO - lr: 4.9311e-04 gnorm: 1.18 [ 0:25:28< 2:36:06] +[titan] 2025-06-13 13:07:01,879 - root - INFO - step: 2110 loss: 22.1600 memory: 6.46GiB(27.34%) tps: 22,908 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 4.1856 global_avg_mtp_loss: 17.9744 +[titan] 2025-06-13 13:07:01,879 - root - INFO - lr: 4.9304e-04 gnorm: 1.49 [ 0:25:32< 2:36:02] +[titan] 2025-06-13 13:07:06,072 - root - INFO - step: 2115 loss: 23.0542 memory: 6.46GiB(27.34%) tps: 19,539 tflops: 19.66 mfu: 6.30% global_avg_ntp_loss: 4.3705 global_avg_mtp_loss: 18.6838 +[titan] 2025-06-13 13:07:06,073 - root - INFO - lr: 4.9298e-04 gnorm: 1.24 [ 0:25:36< 2:36:02] +[titan] 2025-06-13 13:07:09,737 - root - INFO - step: 2120 loss: 20.6337 memory: 6.46GiB(27.34%) tps: 22,358 tflops: 22.50 mfu: 7.21% global_avg_ntp_loss: 3.8070 global_avg_mtp_loss: 16.8267 +[titan] 2025-06-13 13:07:09,737 - root - INFO - lr: 4.9292e-04 gnorm: 1.86 [ 0:25:40< 2:35:58] +[titan] 2025-06-13 13:07:12,829 - root - INFO - step: 2125 loss: 22.5274 memory: 6.46GiB(27.34%) tps: 26,498 tflops: 26.67 mfu: 8.55% global_avg_ntp_loss: 4.2152 global_avg_mtp_loss: 18.3121 +[titan] 2025-06-13 13:07:12,829 - root - INFO - lr: 4.9286e-04 gnorm: 1.06 [ 0:25:43< 2:35:51] +[titan] 2025-06-13 13:07:16,123 - root - INFO - step: 2130 loss: 22.9252 memory: 6.46GiB(27.34%) tps: 24,868 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 4.4483 global_avg_mtp_loss: 18.4769 +[titan] 2025-06-13 13:07:16,124 - root - INFO - lr: 4.9279e-04 gnorm: 1.82 [ 
0:25:46< 2:35:46] +[titan] 2025-06-13 13:07:19,582 - root - INFO - step: 2135 loss: 22.4222 memory: 6.46GiB(27.34%) tps: 23,684 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 4.2091 global_avg_mtp_loss: 18.2131 +[titan] 2025-06-13 13:07:19,583 - root - INFO - lr: 4.9273e-04 gnorm: 1.51 [ 0:25:50< 2:35:41] +[titan] 2025-06-13 13:07:23,061 - root - INFO - step: 2140 loss: 22.9738 memory: 6.46GiB(27.34%) tps: 23,556 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 4.3975 global_avg_mtp_loss: 18.5763 +[titan] 2025-06-13 13:07:23,061 - root - INFO - lr: 4.9267e-04 gnorm: 1.35 [ 0:25:53< 2:35:36] +[titan] 2025-06-13 13:07:26,452 - root - INFO - step: 2145 loss: 22.8697 memory: 6.46GiB(27.34%) tps: 24,155 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 4.3616 global_avg_mtp_loss: 18.5081 +[titan] 2025-06-13 13:07:26,453 - root - INFO - lr: 4.9260e-04 gnorm: 2.44 [ 0:25:57< 2:35:31] +[titan] 2025-06-13 13:07:29,095 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:07:29,781 - root - INFO - step: 2150 loss: 21.4705 memory: 6.46GiB(27.34%) tps: 24,613 tflops: 24.77 mfu: 7.94% global_avg_ntp_loss: 3.9861 global_avg_mtp_loss: 17.4845 +[titan] 2025-06-13 13:07:29,782 - root - INFO - lr: 4.9254e-04 gnorm: 1.72 [ 0:26:00< 2:35:26] +[titan] 2025-06-13 13:07:33,086 - root - INFO - step: 2155 loss: 22.6068 memory: 6.46GiB(27.34%) tps: 24,792 tflops: 24.95 mfu: 8.00% global_avg_ntp_loss: 4.2622 global_avg_mtp_loss: 18.3446 +[titan] 2025-06-13 13:07:33,086 - root - INFO - lr: 4.9247e-04 gnorm: 1.41 [ 0:26:03< 2:35:20] +[titan] 2025-06-13 13:07:36,382 - root - INFO - step: 2160 loss: 23.0058 memory: 6.46GiB(27.34%) tps: 24,859 tflops: 25.02 mfu: 8.02% global_avg_ntp_loss: 4.4004 global_avg_mtp_loss: 18.6054 +[titan] 2025-06-13 13:07:36,382 - root - INFO - lr: 4.9241e-04 gnorm: 1.19 [ 0:26:07< 2:35:15] +[titan] 2025-06-13 13:07:40,236 - root - INFO - step: 2165 loss: 22.9533 memory: 6.46GiB(27.34%) tps: 21,255 tflops: 21.39 mfu: 6.86% 
global_avg_ntp_loss: 4.3771 global_avg_mtp_loss: 18.5761 +[titan] 2025-06-13 13:07:40,237 - root - INFO - lr: 4.9234e-04 gnorm: 1.15 [ 0:26:10< 2:35:12] +[titan] 2025-06-13 13:07:43,646 - root - INFO - step: 2170 loss: 22.7327 memory: 6.46GiB(27.34%) tps: 24,029 tflops: 24.18 mfu: 7.75% global_avg_ntp_loss: 4.2851 global_avg_mtp_loss: 18.4476 +[titan] 2025-06-13 13:07:43,646 - root - INFO - lr: 4.9228e-04 gnorm: 1.22 [ 0:26:14< 2:35:07] +[titan] 2025-06-13 13:07:47,115 - root - INFO - step: 2175 loss: 21.4772 memory: 6.46GiB(27.34%) tps: 23,617 tflops: 23.77 mfu: 7.62% global_avg_ntp_loss: 4.0819 global_avg_mtp_loss: 17.3953 +[titan] 2025-06-13 13:07:47,115 - root - INFO - lr: 4.9221e-04 gnorm: 1.77 [ 0:26:17< 2:35:03] +[titan] 2025-06-13 13:07:50,718 - root - INFO - step: 2180 loss: 22.4425 memory: 6.46GiB(27.34%) tps: 22,741 tflops: 22.89 mfu: 7.34% global_avg_ntp_loss: 4.3046 global_avg_mtp_loss: 18.1380 +[titan] 2025-06-13 13:07:50,718 - root - INFO - lr: 4.9215e-04 gnorm: 1.23 [ 0:26:21< 2:34:59] +[titan] 2025-06-13 13:07:54,306 - root - INFO - step: 2185 loss: 21.9814 memory: 6.46GiB(27.34%) tps: 22,832 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 4.1159 global_avg_mtp_loss: 17.8655 +[titan] 2025-06-13 13:07:54,307 - root - INFO - lr: 4.9208e-04 gnorm: 1.28 [ 0:26:24< 2:34:55] +[titan] 2025-06-13 13:07:57,824 - root - INFO - step: 2190 loss: 21.7372 memory: 6.46GiB(27.34%) tps: 23,290 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 4.0639 global_avg_mtp_loss: 17.6734 +[titan] 2025-06-13 13:07:57,825 - root - INFO - lr: 4.9201e-04 gnorm: 1.46 [ 0:26:28< 2:34:51] +[titan] 2025-06-13 13:08:01,252 - root - INFO - step: 2195 loss: 22.9180 memory: 6.46GiB(27.34%) tps: 23,909 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 4.3688 global_avg_mtp_loss: 18.5492 +[titan] 2025-06-13 13:08:01,252 - root - INFO - lr: 4.9195e-04 gnorm: 1.13 [ 0:26:31< 2:34:46] +[titan] 2025-06-13 13:08:03,792 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:08:04,329 - root - INFO - step: 2200 loss: 22.5389 memory: 6.46GiB(27.34%) tps: 26,625 tflops: 26.79 mfu: 8.59% global_avg_ntp_loss: 4.2928 global_avg_mtp_loss: 18.2461 +[titan] 2025-06-13 13:08:04,329 - root - INFO - lr: 4.9188e-04 gnorm: 1.31 [ 0:26:34< 2:34:39] +[titan] 2025-06-13 13:08:07,880 - root - INFO - step: 2205 loss: 21.6409 memory: 6.46GiB(27.34%) tps: 23,074 tflops: 23.22 mfu: 7.44% global_avg_ntp_loss: 4.0525 global_avg_mtp_loss: 17.5884 +[titan] 2025-06-13 13:08:07,880 - root - INFO - lr: 4.9181e-04 gnorm: 1.53 [ 0:26:38< 2:34:35] +[titan] 2025-06-13 13:08:11,566 - root - INFO - step: 2210 loss: 22.9773 memory: 6.46GiB(27.34%) tps: 22,226 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 4.3561 global_avg_mtp_loss: 18.6212 +[titan] 2025-06-13 13:08:11,566 - root - INFO - lr: 4.9174e-04 gnorm: 1.12 [ 0:26:42< 2:34:32] +[titan] 2025-06-13 13:08:14,914 - root - INFO - step: 2215 loss: 22.6590 memory: 6.46GiB(27.34%) tps: 24,468 tflops: 24.62 mfu: 7.89% global_avg_ntp_loss: 4.2606 global_avg_mtp_loss: 18.3984 +[titan] 2025-06-13 13:08:14,915 - root - INFO - lr: 4.9168e-04 gnorm: 1.10 [ 0:26:45< 2:34:27] +[titan] 2025-06-13 13:08:18,302 - root - INFO - step: 2220 loss: 22.4849 memory: 6.46GiB(27.34%) tps: 24,181 tflops: 24.34 mfu: 7.80% global_avg_ntp_loss: 4.2310 global_avg_mtp_loss: 18.2538 +[titan] 2025-06-13 13:08:18,303 - root - INFO - lr: 4.9161e-04 gnorm: 1.37 [ 0:26:48< 2:34:22] +[titan] 2025-06-13 13:08:21,546 - root - INFO - step: 2225 loss: 22.2465 memory: 6.46GiB(27.34%) tps: 25,257 tflops: 25.42 mfu: 8.15% global_avg_ntp_loss: 4.1644 global_avg_mtp_loss: 18.0820 +[titan] 2025-06-13 13:08:21,547 - root - INFO - lr: 4.9154e-04 gnorm: 1.15 [ 0:26:52< 2:34:16] +[titan] 2025-06-13 13:08:25,165 - root - INFO - step: 2230 loss: 22.2635 memory: 6.46GiB(27.34%) tps: 22,644 tflops: 22.79 mfu: 7.30% global_avg_ntp_loss: 4.1866 global_avg_mtp_loss: 18.0769 +[titan] 2025-06-13 13:08:25,165 - root - INFO - lr: 4.9147e-04 gnorm: 1.38 [ 
0:26:55< 2:34:12] +[titan] 2025-06-13 13:08:28,551 - root - INFO - step: 2235 loss: 22.3020 memory: 6.46GiB(27.34%) tps: 24,194 tflops: 24.35 mfu: 7.80% global_avg_ntp_loss: 4.1955 global_avg_mtp_loss: 18.1065 +[titan] 2025-06-13 13:08:28,551 - root - INFO - lr: 4.9140e-04 gnorm: 1.20 [ 0:26:59< 2:34:07] +[titan] 2025-06-13 13:08:32,038 - root - INFO - step: 2240 loss: 23.2191 memory: 6.46GiB(27.34%) tps: 23,498 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 4.4152 global_avg_mtp_loss: 18.8038 +[titan] 2025-06-13 13:08:32,038 - root - INFO - lr: 4.9133e-04 gnorm: 1.08 [ 0:27:02< 2:34:03] +[titan] 2025-06-13 13:08:35,762 - root - INFO - step: 2245 loss: 22.2698 memory: 6.46GiB(27.34%) tps: 22,000 tflops: 22.14 mfu: 7.10% global_avg_ntp_loss: 4.2253 global_avg_mtp_loss: 18.0446 +[titan] 2025-06-13 13:08:35,762 - root - INFO - lr: 4.9126e-04 gnorm: 1.06 [ 0:27:06< 2:34:00] +[titan] 2025-06-13 13:08:38,764 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:08:39,235 - root - INFO - step: 2250 loss: 22.1860 memory: 6.46GiB(27.34%) tps: 23,586 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 4.1735 global_avg_mtp_loss: 18.0124 +[titan] 2025-06-13 13:08:39,236 - root - INFO - lr: 4.9119e-04 gnorm: 1.16 [ 0:27:09< 2:33:55] +[titan] 2025-06-13 13:08:42,443 - root - INFO - step: 2255 loss: 22.2312 memory: 6.46GiB(27.34%) tps: 25,539 tflops: 25.70 mfu: 8.24% global_avg_ntp_loss: 4.1982 global_avg_mtp_loss: 18.0331 +[titan] 2025-06-13 13:08:42,444 - root - INFO - lr: 4.9112e-04 gnorm: 1.17 [ 0:27:13< 2:33:50] +[titan] 2025-06-13 13:08:45,745 - root - INFO - step: 2260 loss: 21.5806 memory: 6.46GiB(27.34%) tps: 24,813 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 4.0433 global_avg_mtp_loss: 17.5373 +[titan] 2025-06-13 13:08:45,746 - root - INFO - lr: 4.9105e-04 gnorm: 1.34 [ 0:27:16< 2:33:44] +[titan] 2025-06-13 13:08:49,269 - root - INFO - step: 2265 loss: 20.8396 memory: 6.46GiB(27.34%) tps: 23,251 tflops: 23.40 mfu: 7.50% 
global_avg_ntp_loss: 3.8834 global_avg_mtp_loss: 16.9562 +[titan] 2025-06-13 13:08:49,269 - root - INFO - lr: 4.9098e-04 gnorm: 1.45 [ 0:27:19< 2:33:40] +[titan] 2025-06-13 13:08:52,690 - root - INFO - step: 2270 loss: 21.9710 memory: 6.46GiB(27.34%) tps: 23,948 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 4.0994 global_avg_mtp_loss: 17.8716 +[titan] 2025-06-13 13:08:52,690 - root - INFO - lr: 4.9091e-04 gnorm: 1.16 [ 0:27:23< 2:33:35] +[titan] 2025-06-13 13:08:55,801 - root - INFO - step: 2275 loss: 21.8373 memory: 6.46GiB(27.34%) tps: 26,338 tflops: 26.51 mfu: 8.50% global_avg_ntp_loss: 4.0791 global_avg_mtp_loss: 17.7582 +[titan] 2025-06-13 13:08:55,801 - root - INFO - lr: 4.9084e-04 gnorm: 1.37 [ 0:27:26< 2:33:29] +[titan] 2025-06-13 13:08:59,213 - root - INFO - step: 2280 loss: 22.5042 memory: 6.46GiB(27.34%) tps: 24,015 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 4.2409 global_avg_mtp_loss: 18.2633 +[titan] 2025-06-13 13:08:59,213 - root - INFO - lr: 4.9077e-04 gnorm: 1.00 [ 0:27:29< 2:33:24] +[titan] 2025-06-13 13:09:02,760 - root - INFO - step: 2285 loss: 21.5870 memory: 6.46GiB(27.34%) tps: 23,101 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 4.0358 global_avg_mtp_loss: 17.5512 +[titan] 2025-06-13 13:09:02,760 - root - INFO - lr: 4.9070e-04 gnorm: 1.15 [ 0:27:33< 2:33:20] +[titan] 2025-06-13 13:09:06,524 - root - INFO - step: 2290 loss: 22.1031 memory: 6.46GiB(27.34%) tps: 21,764 tflops: 21.90 mfu: 7.02% global_avg_ntp_loss: 4.1477 global_avg_mtp_loss: 17.9553 +[titan] 2025-06-13 13:09:06,524 - root - INFO - lr: 4.9063e-04 gnorm: 1.08 [ 0:27:37< 2:33:17] +[titan] 2025-06-13 13:09:10,067 - root - INFO - step: 2295 loss: 23.0596 memory: 6.46GiB(27.34%) tps: 23,126 tflops: 23.27 mfu: 7.46% global_avg_ntp_loss: 4.3167 global_avg_mtp_loss: 18.7429 +[titan] 2025-06-13 13:09:10,067 - root - INFO - lr: 4.9055e-04 gnorm: 1.20 [ 0:27:40< 2:33:13] +[titan] 2025-06-13 13:09:12,596 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:09:13,298 - root - INFO - step: 2300 loss: 22.5617 memory: 6.46GiB(27.34%) tps: 25,357 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 4.2067 global_avg_mtp_loss: 18.3550 +[titan] 2025-06-13 13:09:13,298 - root - INFO - lr: 4.9048e-04 gnorm: 1.11 [ 0:27:43< 2:33:07] +[titan] 2025-06-13 13:09:16,626 - root - INFO - step: 2305 loss: 22.4753 memory: 6.46GiB(27.34%) tps: 24,616 tflops: 24.77 mfu: 7.94% global_avg_ntp_loss: 4.1702 global_avg_mtp_loss: 18.3051 +[titan] 2025-06-13 13:09:16,627 - root - INFO - lr: 4.9041e-04 gnorm: 1.23 [ 0:27:47< 2:33:02] +[titan] 2025-06-13 13:09:20,034 - root - INFO - step: 2310 loss: 21.7151 memory: 6.46GiB(27.34%) tps: 24,045 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 4.0898 global_avg_mtp_loss: 17.6254 +[titan] 2025-06-13 13:09:20,034 - root - INFO - lr: 4.9033e-04 gnorm: 1.26 [ 0:27:50< 2:32:57] +[titan] 2025-06-13 13:09:23,241 - root - INFO - step: 2315 loss: 22.3026 memory: 6.46GiB(27.34%) tps: 25,546 tflops: 25.71 mfu: 8.24% global_avg_ntp_loss: 4.1680 global_avg_mtp_loss: 18.1346 +[titan] 2025-06-13 13:09:23,241 - root - INFO - lr: 4.9026e-04 gnorm: 1.05 [ 0:27:53< 2:32:51] +[titan] 2025-06-13 13:09:26,602 - root - INFO - step: 2320 loss: 22.8066 memory: 6.46GiB(27.34%) tps: 24,375 tflops: 24.53 mfu: 7.86% global_avg_ntp_loss: 4.3024 global_avg_mtp_loss: 18.5042 +[titan] 2025-06-13 13:09:26,602 - root - INFO - lr: 4.9019e-04 gnorm: 1.10 [ 0:27:57< 2:32:46] +[titan] 2025-06-13 13:09:30,078 - root - INFO - step: 2325 loss: 21.9769 memory: 6.46GiB(27.34%) tps: 23,573 tflops: 23.72 mfu: 7.60% global_avg_ntp_loss: 4.0900 global_avg_mtp_loss: 17.8870 +[titan] 2025-06-13 13:09:30,078 - root - INFO - lr: 4.9011e-04 gnorm: 1.10 [ 0:28:00< 2:32:42] +[titan] 2025-06-13 13:09:33,427 - root - INFO - step: 2330 loss: 22.3469 memory: 6.46GiB(27.34%) tps: 24,464 tflops: 24.62 mfu: 7.89% global_avg_ntp_loss: 4.2058 global_avg_mtp_loss: 18.1412 +[titan] 2025-06-13 13:09:33,427 - root - INFO - lr: 4.9004e-04 gnorm: 1.26 [ 
0:28:04< 2:32:37] +[titan] 2025-06-13 13:09:36,791 - root - INFO - step: 2335 loss: 22.4590 memory: 6.46GiB(27.34%) tps: 24,354 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 4.1994 global_avg_mtp_loss: 18.2596 +[titan] 2025-06-13 13:09:36,791 - root - INFO - lr: 4.8997e-04 gnorm: 1.23 [ 0:28:07< 2:32:32] +[titan] 2025-06-13 13:09:39,901 - root - INFO - step: 2340 loss: 21.8350 memory: 6.46GiB(27.34%) tps: 26,344 tflops: 26.51 mfu: 8.50% global_avg_ntp_loss: 4.0581 global_avg_mtp_loss: 17.7768 +[titan] 2025-06-13 13:09:39,901 - root - INFO - lr: 4.8989e-04 gnorm: 1.64 [ 0:28:10< 2:32:26] +[titan] 2025-06-13 13:09:43,345 - root - INFO - step: 2345 loss: 22.8476 memory: 6.46GiB(27.34%) tps: 23,792 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 4.3324 global_avg_mtp_loss: 18.5152 +[titan] 2025-06-13 13:09:43,345 - root - INFO - lr: 4.8982e-04 gnorm: 1.24 [ 0:28:13< 2:32:21] +[titan] 2025-06-13 13:09:46,215 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:09:46,941 - root - INFO - step: 2350 loss: 22.2633 memory: 6.46GiB(27.34%) tps: 22,782 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 4.1249 global_avg_mtp_loss: 18.1384 +[titan] 2025-06-13 13:09:46,941 - root - INFO - lr: 4.8974e-04 gnorm: 1.37 [ 0:28:17< 2:32:18] +[titan] 2025-06-13 13:09:50,254 - root - INFO - step: 2355 loss: 22.6910 memory: 6.46GiB(27.34%) tps: 24,731 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 4.2515 global_avg_mtp_loss: 18.4395 +[titan] 2025-06-13 13:09:50,254 - root - INFO - lr: 4.8967e-04 gnorm: 1.03 [ 0:28:20< 2:32:12] +[titan] 2025-06-13 13:09:53,571 - root - INFO - step: 2360 loss: 21.5481 memory: 6.46GiB(27.34%) tps: 24,694 tflops: 24.85 mfu: 7.97% global_avg_ntp_loss: 4.0280 global_avg_mtp_loss: 17.5202 +[titan] 2025-06-13 13:09:53,572 - root - INFO - lr: 4.8959e-04 gnorm: 1.45 [ 0:28:24< 2:32:07] +[titan] 2025-06-13 13:09:56,920 - root - INFO - step: 2365 loss: 21.4405 memory: 6.46GiB(27.34%) tps: 24,469 tflops: 24.62 mfu: 7.89% 
global_avg_ntp_loss: 3.9889 global_avg_mtp_loss: 17.4515 +[titan] 2025-06-13 13:09:56,920 - root - INFO - lr: 4.8951e-04 gnorm: 1.29 [ 0:28:27< 2:32:02] +[titan] 2025-06-13 13:10:00,107 - root - INFO - step: 2370 loss: 20.2381 memory: 6.46GiB(27.34%) tps: 25,706 tflops: 25.87 mfu: 8.29% global_avg_ntp_loss: 3.6911 global_avg_mtp_loss: 16.5470 +[titan] 2025-06-13 13:10:00,107 - root - INFO - lr: 4.8944e-04 gnorm: 1.78 [ 0:28:30< 2:31:56] +[titan] 2025-06-13 13:10:03,578 - root - INFO - step: 2375 loss: 21.3865 memory: 6.46GiB(27.34%) tps: 23,602 tflops: 23.75 mfu: 7.61% global_avg_ntp_loss: 3.9689 global_avg_mtp_loss: 17.4176 +[titan] 2025-06-13 13:10:03,579 - root - INFO - lr: 4.8936e-04 gnorm: 1.40 [ 0:28:34< 2:31:52] +[titan] 2025-06-13 13:10:06,684 - root - INFO - step: 2380 loss: 22.3991 memory: 6.46GiB(27.34%) tps: 26,382 tflops: 26.55 mfu: 8.51% global_avg_ntp_loss: 4.2228 global_avg_mtp_loss: 18.1763 +[titan] 2025-06-13 13:10:06,684 - root - INFO - lr: 4.8928e-04 gnorm: 1.16 [ 0:28:37< 2:31:46] +[titan] 2025-06-13 13:10:10,225 - root - INFO - step: 2385 loss: 21.4700 memory: 6.46GiB(27.34%) tps: 23,137 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 4.0403 global_avg_mtp_loss: 17.4297 +[titan] 2025-06-13 13:10:10,225 - root - INFO - lr: 4.8921e-04 gnorm: 2.44 [ 0:28:40< 2:31:42] +[titan] 2025-06-13 13:10:14,113 - root - INFO - step: 2390 loss: 21.3741 memory: 6.46GiB(27.34%) tps: 21,070 tflops: 21.20 mfu: 6.80% global_avg_ntp_loss: 3.9699 global_avg_mtp_loss: 17.4042 +[titan] 2025-06-13 13:10:14,114 - root - INFO - lr: 4.8913e-04 gnorm: 1.59 [ 0:28:44< 2:31:40] +[titan] 2025-06-13 13:10:17,241 - root - INFO - step: 2395 loss: 22.9161 memory: 6.46GiB(27.34%) tps: 26,193 tflops: 26.36 mfu: 8.45% global_avg_ntp_loss: 4.3572 global_avg_mtp_loss: 18.5589 +[titan] 2025-06-13 13:10:17,242 - root - INFO - lr: 4.8905e-04 gnorm: 1.21 [ 0:28:47< 2:31:33] +[titan] 2025-06-13 13:10:20,138 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:10:20,774 - root - INFO - step: 2400 loss: 22.6309 memory: 6.46GiB(27.34%) tps: 23,191 tflops: 23.34 mfu: 7.48% global_avg_ntp_loss: 4.2158 global_avg_mtp_loss: 18.4150 +[titan] 2025-06-13 13:10:20,774 - root - INFO - lr: 4.8897e-04 gnorm: 1.22 [ 0:28:51< 2:31:29] +[titan] 2025-06-13 13:10:23,959 - root - INFO - step: 2405 loss: 21.1620 memory: 6.46GiB(27.34%) tps: 25,728 tflops: 25.89 mfu: 8.30% global_avg_ntp_loss: 3.9281 global_avg_mtp_loss: 17.2340 +[titan] 2025-06-13 13:10:23,959 - root - INFO - lr: 4.8890e-04 gnorm: 1.30 [ 0:28:54< 2:31:24] +[titan] 2025-06-13 13:10:27,380 - root - INFO - step: 2410 loss: 22.5497 memory: 6.46GiB(27.34%) tps: 23,947 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 4.2352 global_avg_mtp_loss: 18.3146 +[titan] 2025-06-13 13:10:27,380 - root - INFO - lr: 4.8882e-04 gnorm: 1.11 [ 0:28:58< 2:31:19] +[titan] 2025-06-13 13:10:30,766 - root - INFO - step: 2415 loss: 21.9965 memory: 6.46GiB(27.34%) tps: 24,198 tflops: 24.35 mfu: 7.81% global_avg_ntp_loss: 4.0971 global_avg_mtp_loss: 17.8994 +[titan] 2025-06-13 13:10:30,766 - root - INFO - lr: 4.8874e-04 gnorm: 1.30 [ 0:29:01< 2:31:14] +[titan] 2025-06-13 13:10:34,109 - root - INFO - step: 2420 loss: 22.6243 memory: 6.46GiB(27.34%) tps: 24,512 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 4.2016 global_avg_mtp_loss: 18.4227 +[titan] 2025-06-13 13:10:34,109 - root - INFO - lr: 4.8866e-04 gnorm: 1.08 [ 0:29:04< 2:31:09] +[titan] 2025-06-13 13:10:37,634 - root - INFO - step: 2425 loss: 21.5304 memory: 6.46GiB(27.34%) tps: 23,238 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 4.0143 global_avg_mtp_loss: 17.5160 +[titan] 2025-06-13 13:10:37,635 - root - INFO - lr: 4.8858e-04 gnorm: 1.21 [ 0:29:08< 2:31:05] +[titan] 2025-06-13 13:10:41,078 - root - INFO - step: 2430 loss: 22.0878 memory: 6.46GiB(27.34%) tps: 23,792 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 4.1402 global_avg_mtp_loss: 17.9476 +[titan] 2025-06-13 13:10:41,078 - root - INFO - lr: 4.8850e-04 gnorm: 1.15 [ 
0:29:11< 2:31:01] +[titan] 2025-06-13 13:10:44,251 - root - INFO - step: 2435 loss: 20.3618 memory: 6.46GiB(27.34%) tps: 25,823 tflops: 25.99 mfu: 8.33% global_avg_ntp_loss: 3.7505 global_avg_mtp_loss: 16.6113 +[titan] 2025-06-13 13:10:44,251 - root - INFO - lr: 4.8842e-04 gnorm: 1.78 [ 0:29:14< 2:30:55] +[titan] 2025-06-13 13:10:47,614 - root - INFO - step: 2440 loss: 21.8739 memory: 6.46GiB(27.34%) tps: 24,362 tflops: 24.52 mfu: 7.86% global_avg_ntp_loss: 4.0769 global_avg_mtp_loss: 17.7970 +[titan] 2025-06-13 13:10:47,614 - root - INFO - lr: 4.8834e-04 gnorm: 1.12 [ 0:29:18< 2:30:50] +[titan] 2025-06-13 13:10:51,182 - root - INFO - step: 2445 loss: 22.5683 memory: 6.46GiB(27.34%) tps: 22,964 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 4.2348 global_avg_mtp_loss: 18.3336 +[titan] 2025-06-13 13:10:51,182 - root - INFO - lr: 4.8826e-04 gnorm: 1.23 [ 0:29:21< 2:30:46] +[titan] 2025-06-13 13:10:53,906 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:10:54,580 - root - INFO - step: 2450 loss: 21.6007 memory: 6.46GiB(27.34%) tps: 24,110 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 4.0247 global_avg_mtp_loss: 17.5760 +[titan] 2025-06-13 13:10:54,580 - root - INFO - lr: 4.8818e-04 gnorm: 1.20 [ 0:29:25< 2:30:42] +[titan] 2025-06-13 13:10:57,746 - root - INFO - step: 2455 loss: 22.5933 memory: 6.46GiB(27.34%) tps: 25,873 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 4.2489 global_avg_mtp_loss: 18.3443 +[titan] 2025-06-13 13:10:57,747 - root - INFO - lr: 4.8810e-04 gnorm: 1.47 [ 0:29:28< 2:30:36] +[titan] 2025-06-13 13:11:00,994 - root - INFO - step: 2460 loss: 22.0461 memory: 6.46GiB(27.34%) tps: 25,231 tflops: 25.39 mfu: 8.14% global_avg_ntp_loss: 4.0839 global_avg_mtp_loss: 17.9622 +[titan] 2025-06-13 13:11:00,994 - root - INFO - lr: 4.8802e-04 gnorm: 1.15 [ 0:29:31< 2:30:30] +[titan] 2025-06-13 13:11:04,598 - root - INFO - step: 2465 loss: 21.6921 memory: 6.46GiB(27.34%) tps: 22,730 tflops: 22.87 mfu: 7.33% 
global_avg_ntp_loss: 4.0332 global_avg_mtp_loss: 17.6589 +[titan] 2025-06-13 13:11:04,599 - root - INFO - lr: 4.8794e-04 gnorm: 1.41 [ 0:29:35< 2:30:27] +[titan] 2025-06-13 13:11:08,135 - root - INFO - step: 2470 loss: 22.1884 memory: 6.46GiB(27.34%) tps: 23,164 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 4.1922 global_avg_mtp_loss: 17.9962 +[titan] 2025-06-13 13:11:08,135 - root - INFO - lr: 4.8785e-04 gnorm: 1.14 [ 0:29:38< 2:30:23] +[titan] 2025-06-13 13:11:11,629 - root - INFO - step: 2475 loss: 22.9242 memory: 6.46GiB(27.34%) tps: 23,447 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 4.2876 global_avg_mtp_loss: 18.6365 +[titan] 2025-06-13 13:11:11,630 - root - INFO - lr: 4.8777e-04 gnorm: 1.25 [ 0:29:42< 2:30:19] +[titan] 2025-06-13 13:11:15,090 - root - INFO - step: 2480 loss: 22.1580 memory: 6.46GiB(27.34%) tps: 23,677 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 4.1540 global_avg_mtp_loss: 18.0039 +[titan] 2025-06-13 13:11:15,090 - root - INFO - lr: 4.8769e-04 gnorm: 1.13 [ 0:29:45< 2:30:14] +[titan] 2025-06-13 13:11:18,435 - root - INFO - step: 2485 loss: 21.1216 memory: 6.46GiB(27.34%) tps: 24,488 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 3.9116 global_avg_mtp_loss: 17.2101 +[titan] 2025-06-13 13:11:18,436 - root - INFO - lr: 4.8761e-04 gnorm: 1.25 [ 0:29:49< 2:30:10] +[titan] 2025-06-13 13:11:21,602 - root - INFO - step: 2490 loss: 19.8359 memory: 6.46GiB(27.34%) tps: 25,873 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 3.6569 global_avg_mtp_loss: 16.1790 +[titan] 2025-06-13 13:11:21,602 - root - INFO - lr: 4.8753e-04 gnorm: 1.55 [ 0:29:52< 2:30:04] +[titan] 2025-06-13 13:11:25,160 - root - INFO - step: 2495 loss: 21.3067 memory: 6.46GiB(27.34%) tps: 23,030 tflops: 23.18 mfu: 7.43% global_avg_ntp_loss: 3.9668 global_avg_mtp_loss: 17.3399 +[titan] 2025-06-13 13:11:25,160 - root - INFO - lr: 4.8744e-04 gnorm: 1.27 [ 0:29:55< 2:30:00] +[titan] 2025-06-13 13:11:28,041 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:11:28,669 - root - INFO - step: 2500 loss: 22.7972 memory: 6.46GiB(27.34%) tps: 23,346 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 4.3445 global_avg_mtp_loss: 18.4528 +[titan] 2025-06-13 13:11:28,669 - root - INFO - lr: 4.8736e-04 gnorm: 1.22 [ 0:29:59< 2:29:56] +[titan] 2025-06-13 13:11:31,900 - root - INFO - step: 2505 loss: 22.2089 memory: 6.46GiB(27.34%) tps: 25,354 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 4.1493 global_avg_mtp_loss: 18.0596 +[titan] 2025-06-13 13:11:31,902 - root - INFO - lr: 4.8728e-04 gnorm: 1.26 [ 0:30:02< 2:29:51] +[titan] 2025-06-13 13:11:35,266 - root - INFO - step: 2510 loss: 22.5371 memory: 6.46GiB(27.34%) tps: 24,357 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 4.2240 global_avg_mtp_loss: 18.3131 +[titan] 2025-06-13 13:11:35,266 - root - INFO - lr: 4.8719e-04 gnorm: 1.14 [ 0:30:05< 2:29:46] +[titan] 2025-06-13 13:11:38,973 - root - INFO - step: 2515 loss: 22.1834 memory: 6.46GiB(27.34%) tps: 22,098 tflops: 22.24 mfu: 7.13% global_avg_ntp_loss: 4.1176 global_avg_mtp_loss: 18.0658 +[titan] 2025-06-13 13:11:38,974 - root - INFO - lr: 4.8711e-04 gnorm: 1.22 [ 0:30:09< 2:29:43] +[titan] 2025-06-13 13:11:42,273 - root - INFO - step: 2520 loss: 22.1235 memory: 6.46GiB(27.34%) tps: 24,827 tflops: 24.99 mfu: 8.01% global_avg_ntp_loss: 4.1668 global_avg_mtp_loss: 17.9567 +[titan] 2025-06-13 13:11:42,274 - root - INFO - lr: 4.8702e-04 gnorm: 1.33 [ 0:30:12< 2:29:38] +[titan] 2025-06-13 13:11:45,704 - root - INFO - step: 2525 loss: 22.1413 memory: 6.46GiB(27.34%) tps: 23,881 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 4.1580 global_avg_mtp_loss: 17.9833 +[titan] 2025-06-13 13:11:45,705 - root - INFO - lr: 4.8694e-04 gnorm: 1.13 [ 0:30:16< 2:29:33] +[titan] 2025-06-13 13:11:49,518 - root - INFO - step: 2530 loss: 22.0743 memory: 6.46GiB(27.34%) tps: 21,484 tflops: 21.62 mfu: 6.93% global_avg_ntp_loss: 4.1274 global_avg_mtp_loss: 17.9469 +[titan] 2025-06-13 13:11:49,518 - root - INFO - lr: 4.8685e-04 gnorm: 1.31 [ 
0:30:20< 2:29:31] +[titan] 2025-06-13 13:11:52,721 - root - INFO - step: 2535 loss: 19.6467 memory: 6.46GiB(27.34%) tps: 25,575 tflops: 25.74 mfu: 8.25% global_avg_ntp_loss: 3.5839 global_avg_mtp_loss: 16.0628 +[titan] 2025-06-13 13:11:52,722 - root - INFO - lr: 4.8677e-04 gnorm: 1.59 [ 0:30:23< 2:29:25] +[titan] 2025-06-13 13:11:55,836 - root - INFO - step: 2540 loss: 22.2162 memory: 6.46GiB(27.34%) tps: 26,302 tflops: 26.47 mfu: 8.48% global_avg_ntp_loss: 4.1263 global_avg_mtp_loss: 18.0898 +[titan] 2025-06-13 13:11:55,837 - root - INFO - lr: 4.8668e-04 gnorm: 1.58 [ 0:30:26< 2:29:19] +[titan] 2025-06-13 13:11:59,353 - root - INFO - step: 2545 loss: 22.6786 memory: 6.46GiB(27.34%) tps: 23,297 tflops: 23.45 mfu: 7.51% global_avg_ntp_loss: 4.2536 global_avg_mtp_loss: 18.4250 +[titan] 2025-06-13 13:11:59,354 - root - INFO - lr: 4.8660e-04 gnorm: 1.36 [ 0:30:29< 2:29:15] +[titan] 2025-06-13 13:12:02,070 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:12:03,179 - root - INFO - step: 2550 loss: 21.0355 memory: 6.46GiB(27.34%) tps: 21,416 tflops: 21.55 mfu: 6.91% global_avg_ntp_loss: 3.8872 global_avg_mtp_loss: 17.1482 +[titan] 2025-06-13 13:12:03,179 - root - INFO - lr: 4.8651e-04 gnorm: 1.22 [ 0:30:33< 2:29:13] +[titan] 2025-06-13 13:12:06,270 - root - INFO - step: 2555 loss: 22.3841 memory: 6.46GiB(27.34%) tps: 26,505 tflops: 26.67 mfu: 8.55% global_avg_ntp_loss: 4.1367 global_avg_mtp_loss: 18.2474 +[titan] 2025-06-13 13:12:06,271 - root - INFO - lr: 4.8643e-04 gnorm: 1.02 [ 0:30:36< 2:29:07] +[titan] 2025-06-13 13:12:09,572 - root - INFO - step: 2560 loss: 21.3230 memory: 6.46GiB(27.34%) tps: 24,813 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 3.9248 global_avg_mtp_loss: 17.3982 +[titan] 2025-06-13 13:12:09,572 - root - INFO - lr: 4.8634e-04 gnorm: 1.41 [ 0:30:40< 2:29:02] +[titan] 2025-06-13 13:12:09,682 - root - INFO - Dumping profiler traces at step 2560 +[titan] 2025-06-13 13:12:09,770 - root - INFO - Finished 
dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 13:12:13,522 - root - INFO - step: 2565 loss: 22.8790 memory: 6.46GiB(27.34%) tps: 20,743 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 4.2642 global_avg_mtp_loss: 18.6147 +[titan] 2025-06-13 13:12:13,522 - root - INFO - lr: 4.8625e-04 gnorm: 0.98 [ 0:30:44< 2:29:00] +[titan] 2025-06-13 13:12:16,809 - root - INFO - step: 2570 loss: 21.4524 memory: 6.46GiB(27.34%) tps: 24,922 tflops: 25.08 mfu: 8.04% global_avg_ntp_loss: 3.9224 global_avg_mtp_loss: 17.5299 +[titan] 2025-06-13 13:12:16,809 - root - INFO - lr: 4.8616e-04 gnorm: 1.49 [ 0:30:47< 2:28:55] +[titan] 2025-06-13 13:12:20,006 - root - INFO - step: 2575 loss: 21.5804 memory: 6.46GiB(27.34%) tps: 25,631 tflops: 25.79 mfu: 8.27% global_avg_ntp_loss: 3.9508 global_avg_mtp_loss: 17.6297 +[titan] 2025-06-13 13:12:20,006 - root - INFO - lr: 4.8608e-04 gnorm: 1.34 [ 0:30:50< 2:28:49] +[titan] 2025-06-13 13:12:23,305 - root - INFO - step: 2580 loss: 21.7344 memory: 6.46GiB(27.34%) tps: 24,830 tflops: 24.99 mfu: 8.01% global_avg_ntp_loss: 4.0106 global_avg_mtp_loss: 17.7238 +[titan] 2025-06-13 13:12:23,306 - root - INFO - lr: 4.8599e-04 gnorm: 1.48 [ 0:30:53< 2:28:44] +[titan] 2025-06-13 13:12:26,761 - root - INFO - step: 2585 loss: 21.6181 memory: 6.46GiB(27.34%) tps: 23,707 tflops: 23.86 mfu: 7.65% global_avg_ntp_loss: 4.0119 global_avg_mtp_loss: 17.6062 +[titan] 2025-06-13 13:12:26,762 - root - INFO - lr: 4.8590e-04 gnorm: 1.19 [ 0:30:57< 2:28:40] +[titan] 2025-06-13 13:12:30,251 - root - INFO - step: 2590 loss: 21.1806 memory: 6.46GiB(27.34%) tps: 23,478 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.8926 global_avg_mtp_loss: 17.2880 +[titan] 2025-06-13 13:12:30,251 - root - INFO - lr: 4.8581e-04 gnorm: 1.43 [ 0:31:00< 2:28:36] +[titan] 2025-06-13 13:12:33,638 - root - INFO - step: 2595 loss: 21.8822 memory: 6.46GiB(27.34%) tps: 24,188 tflops: 24.34 mfu: 7.80% global_avg_ntp_loss: 4.0734 global_avg_mtp_loss: 17.8087 +[titan] 2025-06-13 13:12:33,639 - root 
- INFO - lr: 4.8573e-04 gnorm: 1.18 [ 0:31:04< 2:28:31] +[titan] 2025-06-13 13:12:36,576 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:12:37,140 - root - INFO - step: 2600 loss: 23.2409 memory: 6.46GiB(27.34%) tps: 23,394 tflops: 23.54 mfu: 7.55% global_avg_ntp_loss: 4.4897 global_avg_mtp_loss: 18.7511 +[titan] 2025-06-13 13:12:37,141 - root - INFO - lr: 4.8564e-04 gnorm: 1.34 [ 0:31:07< 2:28:27] +[titan] 2025-06-13 13:12:40,363 - root - INFO - step: 2605 loss: 22.2596 memory: 6.46GiB(27.34%) tps: 25,423 tflops: 25.59 mfu: 8.20% global_avg_ntp_loss: 4.1213 global_avg_mtp_loss: 18.1383 +[titan] 2025-06-13 13:12:40,363 - root - INFO - lr: 4.8555e-04 gnorm: 1.20 [ 0:31:10< 2:28:22] +[titan] 2025-06-13 13:12:44,104 - root - INFO - step: 2610 loss: 22.0141 memory: 6.46GiB(27.34%) tps: 21,898 tflops: 22.04 mfu: 7.06% global_avg_ntp_loss: 4.1149 global_avg_mtp_loss: 17.8992 +[titan] 2025-06-13 13:12:44,105 - root - INFO - lr: 4.8546e-04 gnorm: 1.28 [ 0:31:14< 2:28:19] +[titan] 2025-06-13 13:12:47,737 - root - INFO - step: 2615 loss: 21.4595 memory: 6.46GiB(27.34%) tps: 22,551 tflops: 22.70 mfu: 7.27% global_avg_ntp_loss: 3.9802 global_avg_mtp_loss: 17.4793 +[titan] 2025-06-13 13:12:47,738 - root - INFO - lr: 4.8537e-04 gnorm: 1.32 [ 0:31:18< 2:28:16] +[titan] 2025-06-13 13:12:51,254 - root - INFO - step: 2620 loss: 22.0460 memory: 6.46GiB(27.34%) tps: 23,295 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 4.1257 global_avg_mtp_loss: 17.9203 +[titan] 2025-06-13 13:12:51,255 - root - INFO - lr: 4.8528e-04 gnorm: 1.03 [ 0:31:21< 2:28:12] +[titan] 2025-06-13 13:12:54,429 - root - INFO - step: 2625 loss: 21.6495 memory: 6.46GiB(27.34%) tps: 25,809 tflops: 25.97 mfu: 8.32% global_avg_ntp_loss: 4.0797 global_avg_mtp_loss: 17.5698 +[titan] 2025-06-13 13:12:54,429 - root - INFO - lr: 4.8519e-04 gnorm: 1.56 [ 0:31:25< 2:28:06] +[titan] 2025-06-13 13:12:58,027 - root - INFO - step: 2630 loss: 22.3122 memory: 6.46GiB(27.34%) tps: 22,771 
tflops: 22.92 mfu: 7.34% global_avg_ntp_loss: 4.1804 global_avg_mtp_loss: 18.1318 +[titan] 2025-06-13 13:12:58,027 - root - INFO - lr: 4.8510e-04 gnorm: 1.14 [ 0:31:28< 2:28:03] +[titan] 2025-06-13 13:13:01,745 - root - INFO - step: 2635 loss: 22.8285 memory: 6.46GiB(27.34%) tps: 22,035 tflops: 22.18 mfu: 7.11% global_avg_ntp_loss: 4.2885 global_avg_mtp_loss: 18.5400 +[titan] 2025-06-13 13:13:01,745 - root - INFO - lr: 4.8501e-04 gnorm: 1.01 [ 0:31:32< 2:28:00] +[titan] 2025-06-13 13:13:05,606 - root - INFO - step: 2640 loss: 21.2466 memory: 6.46GiB(27.34%) tps: 21,221 tflops: 21.36 mfu: 6.84% global_avg_ntp_loss: 3.9199 global_avg_mtp_loss: 17.3267 +[titan] 2025-06-13 13:13:05,606 - root - INFO - lr: 4.8492e-04 gnorm: 1.32 [ 0:31:36< 2:27:57] +[titan] 2025-06-13 13:13:08,956 - root - INFO - step: 2645 loss: 21.7790 memory: 6.46GiB(27.34%) tps: 24,452 tflops: 24.61 mfu: 7.89% global_avg_ntp_loss: 4.0619 global_avg_mtp_loss: 17.7171 +[titan] 2025-06-13 13:13:08,957 - root - INFO - lr: 4.8483e-04 gnorm: 1.23 [ 0:31:39< 2:27:52] +[titan] 2025-06-13 13:13:11,689 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:13:12,341 - root - INFO - step: 2650 loss: 21.1615 memory: 6.46GiB(27.34%) tps: 24,207 tflops: 24.36 mfu: 7.81% global_avg_ntp_loss: 3.9043 global_avg_mtp_loss: 17.2571 +[titan] 2025-06-13 13:13:12,341 - root - INFO - lr: 4.8474e-04 gnorm: 1.20 [ 0:31:42< 2:27:48] +[titan] 2025-06-13 13:13:15,739 - root - INFO - step: 2655 loss: 21.4908 memory: 6.46GiB(27.34%) tps: 24,111 tflops: 24.27 mfu: 7.78% global_avg_ntp_loss: 3.9687 global_avg_mtp_loss: 17.5221 +[titan] 2025-06-13 13:13:15,739 - root - INFO - lr: 4.8464e-04 gnorm: 1.14 [ 0:31:46< 2:27:43] +[titan] 2025-06-13 13:13:19,163 - root - INFO - step: 2660 loss: 21.7730 memory: 6.46GiB(27.34%) tps: 23,930 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 4.0091 global_avg_mtp_loss: 17.7639 +[titan] 2025-06-13 13:13:19,163 - root - INFO - lr: 4.8455e-04 gnorm: 1.21 [ 0:31:49< 2:27:39] +[titan] 2025-06-13 13:13:22,682 - root - INFO - step: 2665 loss: 19.9339 memory: 6.46GiB(27.34%) tps: 23,280 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.7092 global_avg_mtp_loss: 16.2247 +[titan] 2025-06-13 13:13:22,682 - root - INFO - lr: 4.8446e-04 gnorm: 1.56 [ 0:31:53< 2:27:35] +[titan] 2025-06-13 13:13:26,042 - root - INFO - step: 2670 loss: 22.5282 memory: 6.46GiB(27.34%) tps: 24,385 tflops: 24.54 mfu: 7.87% global_avg_ntp_loss: 4.1381 global_avg_mtp_loss: 18.3900 +[titan] 2025-06-13 13:13:26,042 - root - INFO - lr: 4.8437e-04 gnorm: 1.19 [ 0:31:56< 2:27:31] +[titan] 2025-06-13 13:13:29,884 - root - INFO - step: 2675 loss: 21.6000 memory: 6.46GiB(27.34%) tps: 21,322 tflops: 21.46 mfu: 6.88% global_avg_ntp_loss: 3.9987 global_avg_mtp_loss: 17.6013 +[titan] 2025-06-13 13:13:29,884 - root - INFO - lr: 4.8428e-04 gnorm: 1.13 [ 0:32:00< 2:27:28] +[titan] 2025-06-13 13:13:33,368 - root - INFO - step: 2680 loss: 21.2494 memory: 6.46GiB(27.34%) tps: 23,518 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.9368 global_avg_mtp_loss: 17.3126 +[titan] 2025-06-13 13:13:33,368 - root - INFO - lr: 4.8418e-04 gnorm: 1.18 [ 
0:32:03< 2:27:24] +[titan] 2025-06-13 13:13:36,795 - root - INFO - step: 2685 loss: 21.4348 memory: 6.46GiB(27.34%) tps: 23,908 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.9778 global_avg_mtp_loss: 17.4570 +[titan] 2025-06-13 13:13:36,795 - root - INFO - lr: 4.8409e-04 gnorm: 1.19 [ 0:32:07< 2:27:20] +[titan] 2025-06-13 13:13:40,436 - root - INFO - step: 2690 loss: 22.5628 memory: 6.46GiB(27.34%) tps: 22,503 tflops: 22.65 mfu: 7.26% global_avg_ntp_loss: 4.2100 global_avg_mtp_loss: 18.3528 +[titan] 2025-06-13 13:13:40,436 - root - INFO - lr: 4.8400e-04 gnorm: 1.00 [ 0:32:11< 2:27:16] +[titan] 2025-06-13 13:13:43,835 - root - INFO - step: 2695 loss: 21.6500 memory: 6.46GiB(27.34%) tps: 24,101 tflops: 24.25 mfu: 7.77% global_avg_ntp_loss: 4.0041 global_avg_mtp_loss: 17.6459 +[titan] 2025-06-13 13:13:43,836 - root - INFO - lr: 4.8390e-04 gnorm: 1.13 [ 0:32:14< 2:27:12] +[titan] 2025-06-13 13:13:46,489 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:13:47,360 - root - INFO - step: 2700 loss: 22.8154 memory: 6.46GiB(27.34%) tps: 23,244 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 4.2371 global_avg_mtp_loss: 18.5783 +[titan] 2025-06-13 13:13:47,360 - root - INFO - lr: 4.8381e-04 gnorm: 1.21 [ 0:32:17< 2:27:08] +[titan] 2025-06-13 13:13:50,828 - root - INFO - step: 2705 loss: 21.8009 memory: 6.46GiB(27.34%) tps: 23,626 tflops: 23.78 mfu: 7.62% global_avg_ntp_loss: 4.0093 global_avg_mtp_loss: 17.7916 +[titan] 2025-06-13 13:13:50,828 - root - INFO - lr: 4.8372e-04 gnorm: 1.14 [ 0:32:21< 2:27:04] +[titan] 2025-06-13 13:13:54,514 - root - INFO - step: 2710 loss: 21.1645 memory: 6.46GiB(27.34%) tps: 22,228 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 3.8873 global_avg_mtp_loss: 17.2772 +[titan] 2025-06-13 13:13:54,514 - root - INFO - lr: 4.8362e-04 gnorm: 1.43 [ 0:32:25< 2:27:01] +[titan] 2025-06-13 13:13:58,111 - root - INFO - step: 2715 loss: 21.4965 memory: 6.46GiB(27.34%) tps: 22,775 tflops: 22.92 mfu: 7.35% 
global_avg_ntp_loss: 3.9443 global_avg_mtp_loss: 17.5521 +[titan] 2025-06-13 13:13:58,111 - root - INFO - lr: 4.8353e-04 gnorm: 1.12 [ 0:32:28< 2:26:57] +[titan] 2025-06-13 13:14:01,366 - root - INFO - step: 2720 loss: 19.6284 memory: 6.46GiB(27.34%) tps: 25,175 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 3.5841 global_avg_mtp_loss: 16.0444 +[titan] 2025-06-13 13:14:01,366 - root - INFO - lr: 4.8343e-04 gnorm: 1.60 [ 0:32:31< 2:26:52] +[titan] 2025-06-13 13:14:04,602 - root - INFO - step: 2725 loss: 22.2682 memory: 6.46GiB(27.34%) tps: 25,313 tflops: 25.47 mfu: 8.16% global_avg_ntp_loss: 4.1134 global_avg_mtp_loss: 18.1548 +[titan] 2025-06-13 13:14:04,603 - root - INFO - lr: 4.8334e-04 gnorm: 1.05 [ 0:32:35< 2:26:47] +[titan] 2025-06-13 13:14:08,018 - root - INFO - step: 2730 loss: 22.0393 memory: 6.46GiB(27.34%) tps: 23,983 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 4.0974 global_avg_mtp_loss: 17.9418 +[titan] 2025-06-13 13:14:08,019 - root - INFO - lr: 4.8324e-04 gnorm: 1.14 [ 0:32:38< 2:26:43] +[titan] 2025-06-13 13:14:11,679 - root - INFO - step: 2735 loss: 21.7462 memory: 6.46GiB(27.34%) tps: 22,384 tflops: 22.53 mfu: 7.22% global_avg_ntp_loss: 4.0032 global_avg_mtp_loss: 17.7430 +[titan] 2025-06-13 13:14:11,679 - root - INFO - lr: 4.8314e-04 gnorm: 1.05 [ 0:32:42< 2:26:39] +[titan] 2025-06-13 13:14:15,248 - root - INFO - step: 2740 loss: 22.2923 memory: 6.46GiB(27.34%) tps: 22,953 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 4.1034 global_avg_mtp_loss: 18.1889 +[titan] 2025-06-13 13:14:15,249 - root - INFO - lr: 4.8305e-04 gnorm: 1.06 [ 0:32:45< 2:26:36] +[titan] 2025-06-13 13:14:18,971 - root - INFO - step: 2745 loss: 22.0550 memory: 6.46GiB(27.34%) tps: 22,007 tflops: 22.15 mfu: 7.10% global_avg_ntp_loss: 4.0990 global_avg_mtp_loss: 17.9560 +[titan] 2025-06-13 13:14:18,971 - root - INFO - lr: 4.8295e-04 gnorm: 1.06 [ 0:32:49< 2:26:33] +[titan] 2025-06-13 13:14:21,833 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:14:22,370 - root - INFO - step: 2750 loss: 21.3295 memory: 6.46GiB(27.34%) tps: 24,102 tflops: 24.26 mfu: 7.77% global_avg_ntp_loss: 3.9266 global_avg_mtp_loss: 17.4029 +[titan] 2025-06-13 13:14:22,371 - root - INFO - lr: 4.8286e-04 gnorm: 1.13 [ 0:32:52< 2:26:28] +[titan] 2025-06-13 13:14:26,052 - root - INFO - step: 2755 loss: 22.3129 memory: 6.46GiB(27.34%) tps: 22,255 tflops: 22.40 mfu: 7.18% global_avg_ntp_loss: 4.1162 global_avg_mtp_loss: 18.1967 +[titan] 2025-06-13 13:14:26,052 - root - INFO - lr: 4.8276e-04 gnorm: 1.02 [ 0:32:56< 2:26:25] +[titan] 2025-06-13 13:14:29,660 - root - INFO - step: 2760 loss: 20.6603 memory: 6.46GiB(27.34%) tps: 22,704 tflops: 22.85 mfu: 7.32% global_avg_ntp_loss: 3.8509 global_avg_mtp_loss: 16.8094 +[titan] 2025-06-13 13:14:29,661 - root - INFO - lr: 4.8266e-04 gnorm: 1.36 [ 0:33:00< 2:26:21] +[titan] 2025-06-13 13:14:32,732 - root - INFO - step: 2765 loss: 21.4243 memory: 6.46GiB(27.34%) tps: 26,669 tflops: 26.84 mfu: 8.60% global_avg_ntp_loss: 3.9180 global_avg_mtp_loss: 17.5062 +[titan] 2025-06-13 13:14:32,733 - root - INFO - lr: 4.8256e-04 gnorm: 1.12 [ 0:33:03< 2:26:16] +[titan] 2025-06-13 13:14:36,412 - root - INFO - step: 2770 loss: 21.7113 memory: 6.46GiB(27.34%) tps: 22,265 tflops: 22.41 mfu: 7.18% global_avg_ntp_loss: 4.0206 global_avg_mtp_loss: 17.6907 +[titan] 2025-06-13 13:14:36,412 - root - INFO - lr: 4.8247e-04 gnorm: 1.23 [ 0:33:07< 2:26:12] +[titan] 2025-06-13 13:14:39,835 - root - INFO - step: 2775 loss: 21.2869 memory: 6.46GiB(27.34%) tps: 23,939 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.9411 global_avg_mtp_loss: 17.3458 +[titan] 2025-06-13 13:14:39,835 - root - INFO - lr: 4.8237e-04 gnorm: 1.12 [ 0:33:10< 2:26:08] +[titan] 2025-06-13 13:14:43,300 - root - INFO - step: 2780 loss: 22.2099 memory: 6.46GiB(27.34%) tps: 23,643 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 4.1405 global_avg_mtp_loss: 18.0694 +[titan] 2025-06-13 13:14:43,300 - root - INFO - lr: 4.8227e-04 gnorm: 1.14 [ 
0:33:13< 2:26:04] +[titan] 2025-06-13 13:14:46,365 - root - INFO - step: 2785 loss: 20.9333 memory: 6.46GiB(27.34%) tps: 26,734 tflops: 26.90 mfu: 8.62% global_avg_ntp_loss: 3.8861 global_avg_mtp_loss: 17.0472 +[titan] 2025-06-13 13:14:46,365 - root - INFO - lr: 4.8217e-04 gnorm: 1.76 [ 0:33:16< 2:25:58] +[titan] 2025-06-13 13:14:49,853 - root - INFO - step: 2790 loss: 22.0047 memory: 6.46GiB(27.34%) tps: 23,487 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 4.1139 global_avg_mtp_loss: 17.8908 +[titan] 2025-06-13 13:14:49,853 - root - INFO - lr: 4.8207e-04 gnorm: 1.34 [ 0:33:20< 2:25:54] +[titan] 2025-06-13 13:14:53,236 - root - INFO - step: 2795 loss: 21.4598 memory: 6.46GiB(27.34%) tps: 24,219 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 3.9884 global_avg_mtp_loss: 17.4713 +[titan] 2025-06-13 13:14:53,236 - root - INFO - lr: 4.8198e-04 gnorm: 1.34 [ 0:33:23< 2:25:50] +[titan] 2025-06-13 13:14:56,071 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:14:56,759 - root - INFO - step: 2800 loss: 22.5620 memory: 6.46GiB(27.34%) tps: 23,254 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 4.1653 global_avg_mtp_loss: 18.3967 +[titan] 2025-06-13 13:14:56,759 - root - INFO - lr: 4.8188e-04 gnorm: 1.04 [ 0:33:27< 2:25:46] +[titan] 2025-06-13 13:15:00,179 - root - INFO - step: 2805 loss: 22.1501 memory: 6.46GiB(27.34%) tps: 23,960 tflops: 24.11 mfu: 7.73% global_avg_ntp_loss: 4.1152 global_avg_mtp_loss: 18.0350 +[titan] 2025-06-13 13:15:00,179 - root - INFO - lr: 4.8178e-04 gnorm: 1.21 [ 0:33:30< 2:25:42] +[titan] 2025-06-13 13:15:03,330 - root - INFO - step: 2810 loss: 22.8094 memory: 6.46GiB(27.34%) tps: 25,996 tflops: 26.16 mfu: 8.39% global_avg_ntp_loss: 4.2154 global_avg_mtp_loss: 18.5940 +[titan] 2025-06-13 13:15:03,330 - root - INFO - lr: 4.8168e-04 gnorm: 1.17 [ 0:33:33< 2:25:36] +[titan] 2025-06-13 13:15:06,808 - root - INFO - step: 2815 loss: 21.5073 memory: 6.46GiB(27.34%) tps: 23,557 tflops: 23.71 mfu: 7.60% 
global_avg_ntp_loss: 3.9619 global_avg_mtp_loss: 17.5453 +[titan] 2025-06-13 13:15:06,808 - root - INFO - lr: 4.8158e-04 gnorm: 1.32 [ 0:33:37< 2:25:32] +[titan] 2025-06-13 13:15:10,194 - root - INFO - step: 2820 loss: 22.2839 memory: 6.46GiB(27.34%) tps: 24,199 tflops: 24.35 mfu: 7.81% global_avg_ntp_loss: 4.1198 global_avg_mtp_loss: 18.1641 +[titan] 2025-06-13 13:15:10,194 - root - INFO - lr: 4.8148e-04 gnorm: 1.11 [ 0:33:40< 2:25:28] +[titan] 2025-06-13 13:15:13,742 - root - INFO - step: 2825 loss: 22.3076 memory: 6.46GiB(27.34%) tps: 23,092 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 4.1223 global_avg_mtp_loss: 18.1853 +[titan] 2025-06-13 13:15:13,742 - root - INFO - lr: 4.8138e-04 gnorm: 1.07 [ 0:33:44< 2:25:24] +[titan] 2025-06-13 13:15:17,217 - root - INFO - step: 2830 loss: 20.8138 memory: 6.46GiB(27.34%) tps: 23,576 tflops: 23.73 mfu: 7.60% global_avg_ntp_loss: 3.8532 global_avg_mtp_loss: 16.9606 +[titan] 2025-06-13 13:15:17,218 - root - INFO - lr: 4.8128e-04 gnorm: 1.19 [ 0:33:47< 2:25:20] +[titan] 2025-06-13 13:15:20,522 - root - INFO - step: 2835 loss: 21.3368 memory: 6.46GiB(27.34%) tps: 24,791 tflops: 24.95 mfu: 8.00% global_avg_ntp_loss: 3.9162 global_avg_mtp_loss: 17.4207 +[titan] 2025-06-13 13:15:20,522 - root - INFO - lr: 4.8118e-04 gnorm: 1.10 [ 0:33:51< 2:25:15] +[titan] 2025-06-13 13:15:24,109 - root - INFO - step: 2840 loss: 19.8536 memory: 6.46GiB(27.34%) tps: 22,839 tflops: 22.99 mfu: 7.37% global_avg_ntp_loss: 3.5981 global_avg_mtp_loss: 16.2555 +[titan] 2025-06-13 13:15:24,110 - root - INFO - lr: 4.8107e-04 gnorm: 1.64 [ 0:33:54< 2:25:11] +[titan] 2025-06-13 13:15:27,632 - root - INFO - step: 2845 loss: 21.7893 memory: 6.46GiB(27.34%) tps: 23,256 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 4.0261 global_avg_mtp_loss: 17.7632 +[titan] 2025-06-13 13:15:27,632 - root - INFO - lr: 4.8097e-04 gnorm: 1.03 [ 0:33:58< 2:25:08] +[titan] 2025-06-13 13:15:30,214 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:15:31,448 - root - INFO - step: 2850 loss: 21.2071 memory: 6.46GiB(27.34%) tps: 21,469 tflops: 21.61 mfu: 6.92% global_avg_ntp_loss: 3.9186 global_avg_mtp_loss: 17.2885 +[titan] 2025-06-13 13:15:31,449 - root - INFO - lr: 4.8087e-04 gnorm: 1.14 [ 0:34:02< 2:25:05] +[titan] 2025-06-13 13:15:34,477 - root - INFO - step: 2855 loss: 21.8371 memory: 6.46GiB(27.34%) tps: 27,056 tflops: 27.23 mfu: 8.73% global_avg_ntp_loss: 4.0696 global_avg_mtp_loss: 17.7674 +[titan] 2025-06-13 13:15:34,477 - root - INFO - lr: 4.8077e-04 gnorm: 1.21 [ 0:34:05< 2:24:59] +[titan] 2025-06-13 13:15:37,568 - root - INFO - step: 2860 loss: 20.7309 memory: 6.46GiB(27.34%) tps: 26,502 tflops: 26.67 mfu: 8.55% global_avg_ntp_loss: 3.8229 global_avg_mtp_loss: 16.9080 +[titan] 2025-06-13 13:15:37,568 - root - INFO - lr: 4.8067e-04 gnorm: 1.52 [ 0:34:08< 2:24:53] +[titan] 2025-06-13 13:15:41,779 - root - INFO - step: 2865 loss: 22.1680 memory: 6.46GiB(27.34%) tps: 19,454 tflops: 19.58 mfu: 6.28% global_avg_ntp_loss: 4.1441 global_avg_mtp_loss: 18.0239 +[titan] 2025-06-13 13:15:41,780 - root - INFO - lr: 4.8056e-04 gnorm: 1.22 [ 0:34:12< 2:24:52] +[titan] 2025-06-13 13:15:45,288 - root - INFO - step: 2870 loss: 21.3764 memory: 6.46GiB(27.34%) tps: 23,350 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 4.0023 global_avg_mtp_loss: 17.3741 +[titan] 2025-06-13 13:15:45,289 - root - INFO - lr: 4.8046e-04 gnorm: 1.39 [ 0:34:15< 2:24:49] +[titan] 2025-06-13 13:15:48,567 - root - INFO - step: 2875 loss: 21.9961 memory: 6.46GiB(27.34%) tps: 24,987 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 4.1049 global_avg_mtp_loss: 17.8912 +[titan] 2025-06-13 13:15:48,568 - root - INFO - lr: 4.8036e-04 gnorm: 1.66 [ 0:34:19< 2:24:44] +[titan] 2025-06-13 13:15:51,925 - root - INFO - step: 2880 loss: 22.4661 memory: 6.46GiB(27.34%) tps: 24,400 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 4.1314 global_avg_mtp_loss: 18.3347 +[titan] 2025-06-13 13:15:51,925 - root - INFO - lr: 4.8026e-04 gnorm: 1.14 [ 
0:34:22< 2:24:39] +[titan] 2025-06-13 13:15:55,002 - root - INFO - step: 2885 loss: 20.8645 memory: 6.46GiB(27.34%) tps: 26,629 tflops: 26.80 mfu: 8.59% global_avg_ntp_loss: 3.8096 global_avg_mtp_loss: 17.0550 +[titan] 2025-06-13 13:15:55,002 - root - INFO - lr: 4.8015e-04 gnorm: 1.38 [ 0:34:25< 2:24:34] +[titan] 2025-06-13 13:15:58,157 - root - INFO - step: 2890 loss: 21.7259 memory: 6.46GiB(27.34%) tps: 25,969 tflops: 26.13 mfu: 8.38% global_avg_ntp_loss: 4.0273 global_avg_mtp_loss: 17.6986 +[titan] 2025-06-13 13:15:58,157 - root - INFO - lr: 4.8005e-04 gnorm: 1.18 [ 0:34:28< 2:24:28] +[titan] 2025-06-13 13:16:01,607 - root - INFO - step: 2895 loss: 21.9224 memory: 6.46GiB(27.34%) tps: 23,750 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 4.0144 global_avg_mtp_loss: 17.9080 +[titan] 2025-06-13 13:16:01,607 - root - INFO - lr: 4.7994e-04 gnorm: 1.22 [ 0:34:32< 2:24:24] +[titan] 2025-06-13 13:16:04,367 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:16:04,906 - root - INFO - step: 2900 loss: 21.8066 memory: 6.46GiB(27.34%) tps: 24,831 tflops: 24.99 mfu: 8.01% global_avg_ntp_loss: 4.0609 global_avg_mtp_loss: 17.7457 +[titan] 2025-06-13 13:16:04,906 - root - INFO - lr: 4.7984e-04 gnorm: 1.27 [ 0:34:35< 2:24:19] +[titan] 2025-06-13 13:16:08,285 - root - INFO - step: 2905 loss: 21.4669 memory: 6.46GiB(27.34%) tps: 24,245 tflops: 24.40 mfu: 7.82% global_avg_ntp_loss: 4.0075 global_avg_mtp_loss: 17.4593 +[titan] 2025-06-13 13:16:08,286 - root - INFO - lr: 4.7973e-04 gnorm: 1.11 [ 0:34:38< 2:24:15] +[titan] 2025-06-13 13:16:12,351 - root - INFO - step: 2910 loss: 21.6653 memory: 6.46GiB(27.34%) tps: 20,152 tflops: 20.28 mfu: 6.50% global_avg_ntp_loss: 4.0062 global_avg_mtp_loss: 17.6591 +[titan] 2025-06-13 13:16:12,351 - root - INFO - lr: 4.7963e-04 gnorm: 1.18 [ 0:34:42< 2:24:13] +[titan] 2025-06-13 13:16:15,731 - root - INFO - step: 2915 loss: 20.6736 memory: 6.46GiB(27.34%) tps: 24,239 tflops: 24.39 mfu: 7.82% 
global_avg_ntp_loss: 3.7997 global_avg_mtp_loss: 16.8739 +[titan] 2025-06-13 13:16:15,731 - root - INFO - lr: 4.7952e-04 gnorm: 1.52 [ 0:34:46< 2:24:09] +[titan] 2025-06-13 13:16:19,241 - root - INFO - step: 2920 loss: 21.3230 memory: 6.46GiB(27.34%) tps: 23,346 tflops: 23.49 mfu: 7.53% global_avg_ntp_loss: 3.9499 global_avg_mtp_loss: 17.3731 +[titan] 2025-06-13 13:16:19,241 - root - INFO - lr: 4.7942e-04 gnorm: 1.26 [ 0:34:49< 2:24:05] +[titan] 2025-06-13 13:16:23,024 - root - INFO - step: 2925 loss: 21.9907 memory: 6.46GiB(27.34%) tps: 21,655 tflops: 21.79 mfu: 6.98% global_avg_ntp_loss: 4.1032 global_avg_mtp_loss: 17.8875 +[titan] 2025-06-13 13:16:23,024 - root - INFO - lr: 4.7931e-04 gnorm: 1.28 [ 0:34:53< 2:24:02] +[titan] 2025-06-13 13:16:26,078 - root - INFO - step: 2930 loss: 22.3406 memory: 6.46GiB(27.34%) tps: 26,828 tflops: 27.00 mfu: 8.65% global_avg_ntp_loss: 4.1127 global_avg_mtp_loss: 18.2279 +[titan] 2025-06-13 13:16:26,078 - root - INFO - lr: 4.7921e-04 gnorm: 1.08 [ 0:34:56< 2:23:57] +[titan] 2025-06-13 13:16:29,679 - root - INFO - step: 2935 loss: 21.7976 memory: 6.46GiB(27.34%) tps: 22,751 tflops: 22.90 mfu: 7.34% global_avg_ntp_loss: 4.0140 global_avg_mtp_loss: 17.7835 +[titan] 2025-06-13 13:16:29,679 - root - INFO - lr: 4.7910e-04 gnorm: 1.14 [ 0:35:00< 2:23:53] +[titan] 2025-06-13 13:16:32,924 - root - INFO - step: 2940 loss: 20.5226 memory: 6.46GiB(27.34%) tps: 25,246 tflops: 25.41 mfu: 8.14% global_avg_ntp_loss: 3.7291 global_avg_mtp_loss: 16.7934 +[titan] 2025-06-13 13:16:32,924 - root - INFO - lr: 4.7900e-04 gnorm: 1.42 [ 0:35:03< 2:23:48] +[titan] 2025-06-13 13:16:36,380 - root - INFO - step: 2945 loss: 21.2858 memory: 6.46GiB(27.34%) tps: 23,710 tflops: 23.86 mfu: 7.65% global_avg_ntp_loss: 3.8794 global_avg_mtp_loss: 17.4064 +[titan] 2025-06-13 13:16:36,380 - root - INFO - lr: 4.7889e-04 gnorm: 1.27 [ 0:35:06< 2:23:44] +[titan] 2025-06-13 13:16:38,971 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:16:39,782 - root - INFO - step: 2950 loss: 21.2318 memory: 6.46GiB(27.34%) tps: 24,081 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.8934 global_avg_mtp_loss: 17.3384 +[titan] 2025-06-13 13:16:39,782 - root - INFO - lr: 4.7878e-04 gnorm: 1.25 [ 0:35:10< 2:23:40] +[titan] 2025-06-13 13:16:43,243 - root - INFO - step: 2955 loss: 21.6479 memory: 6.46GiB(27.34%) tps: 23,668 tflops: 23.82 mfu: 7.63% global_avg_ntp_loss: 3.9742 global_avg_mtp_loss: 17.6737 +[titan] 2025-06-13 13:16:43,244 - root - INFO - lr: 4.7867e-04 gnorm: 0.99 [ 0:35:13< 2:23:36] +[titan] 2025-06-13 13:16:46,688 - root - INFO - step: 2960 loss: 22.0552 memory: 6.46GiB(27.34%) tps: 23,788 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 4.1573 global_avg_mtp_loss: 17.8979 +[titan] 2025-06-13 13:16:46,688 - root - INFO - lr: 4.7857e-04 gnorm: 1.32 [ 0:35:17< 2:23:32] +[titan] 2025-06-13 13:16:50,262 - root - INFO - step: 2965 loss: 22.1847 memory: 6.46GiB(27.34%) tps: 22,921 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 4.1347 global_avg_mtp_loss: 18.0500 +[titan] 2025-06-13 13:16:50,262 - root - INFO - lr: 4.7846e-04 gnorm: 1.33 [ 0:35:20< 2:23:28] +[titan] 2025-06-13 13:16:53,443 - root - INFO - step: 2970 loss: 22.2102 memory: 6.46GiB(27.34%) tps: 25,757 tflops: 25.92 mfu: 8.31% global_avg_ntp_loss: 4.1375 global_avg_mtp_loss: 18.0726 +[titan] 2025-06-13 13:16:53,443 - root - INFO - lr: 4.7835e-04 gnorm: 1.25 [ 0:35:24< 2:23:23] +[titan] 2025-06-13 13:16:56,913 - root - INFO - step: 2975 loss: 22.2088 memory: 6.46GiB(27.34%) tps: 23,609 tflops: 23.76 mfu: 7.62% global_avg_ntp_loss: 4.2456 global_avg_mtp_loss: 17.9632 +[titan] 2025-06-13 13:16:56,914 - root - INFO - lr: 4.7824e-04 gnorm: 1.92 [ 0:35:27< 2:23:19] +[titan] 2025-06-13 13:17:00,461 - root - INFO - step: 2980 loss: 22.5837 memory: 6.46GiB(27.34%) tps: 23,095 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 4.2874 global_avg_mtp_loss: 18.2964 +[titan] 2025-06-13 13:17:00,462 - root - INFO - lr: 4.7814e-04 gnorm: 1.80 [ 
0:35:31< 2:23:15] +[titan] 2025-06-13 13:17:03,740 - root - INFO - step: 2985 loss: 20.0131 memory: 6.46GiB(27.34%) tps: 24,986 tflops: 25.14 mfu: 8.06% global_avg_ntp_loss: 3.6498 global_avg_mtp_loss: 16.3633 +[titan] 2025-06-13 13:17:03,741 - root - INFO - lr: 4.7803e-04 gnorm: 1.23 [ 0:35:34< 2:23:10] +[titan] 2025-06-13 13:17:07,026 - root - INFO - step: 2990 loss: 21.7771 memory: 6.46GiB(27.34%) tps: 24,935 tflops: 25.09 mfu: 8.04% global_avg_ntp_loss: 4.0236 global_avg_mtp_loss: 17.7535 +[titan] 2025-06-13 13:17:07,027 - root - INFO - lr: 4.7792e-04 gnorm: 1.16 [ 0:35:37< 2:23:06] +[titan] 2025-06-13 13:17:10,115 - root - INFO - step: 2995 loss: 21.5509 memory: 6.46GiB(27.34%) tps: 26,526 tflops: 26.69 mfu: 8.56% global_avg_ntp_loss: 3.9698 global_avg_mtp_loss: 17.5811 +[titan] 2025-06-13 13:17:10,115 - root - INFO - lr: 4.7781e-04 gnorm: 1.19 [ 0:35:40< 2:23:00] +[titan] 2025-06-13 13:17:13,120 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:17:13,807 - root - INFO - step: 3000 loss: 22.3299 memory: 6.46GiB(27.34%) tps: 22,190 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 4.1604 global_avg_mtp_loss: 18.1695 +[titan] 2025-06-13 13:17:13,808 - root - INFO - lr: 4.7770e-04 gnorm: 1.23 [ 0:35:44< 2:22:57] +[titan] 2025-06-13 13:17:17,310 - root - INFO - step: 3005 loss: 20.5894 memory: 6.46GiB(27.34%) tps: 23,394 tflops: 23.54 mfu: 7.55% global_avg_ntp_loss: 3.8180 global_avg_mtp_loss: 16.7714 +[titan] 2025-06-13 13:17:17,310 - root - INFO - lr: 4.7759e-04 gnorm: 1.24 [ 0:35:47< 2:22:53] +[titan] 2025-06-13 13:17:20,589 - root - INFO - step: 3010 loss: 21.5519 memory: 6.46GiB(27.34%) tps: 24,984 tflops: 25.14 mfu: 8.06% global_avg_ntp_loss: 3.9215 global_avg_mtp_loss: 17.6304 +[titan] 2025-06-13 13:17:20,589 - root - INFO - lr: 4.7748e-04 gnorm: 1.09 [ 0:35:51< 2:22:48] +[titan] 2025-06-13 13:17:23,995 - root - INFO - step: 3015 loss: 21.6010 memory: 6.46GiB(27.34%) tps: 24,055 tflops: 24.21 mfu: 7.76% 
global_avg_ntp_loss: 3.9591 global_avg_mtp_loss: 17.6419 +[titan] 2025-06-13 13:17:23,995 - root - INFO - lr: 4.7737e-04 gnorm: 1.17 [ 0:35:54< 2:22:44] +[titan] 2025-06-13 13:17:27,567 - root - INFO - step: 3020 loss: 21.5948 memory: 6.46GiB(27.34%) tps: 22,936 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 3.9390 global_avg_mtp_loss: 17.6558 +[titan] 2025-06-13 13:17:27,567 - root - INFO - lr: 4.7726e-04 gnorm: 1.07 [ 0:35:58< 2:22:41] +[titan] 2025-06-13 13:17:30,814 - root - INFO - step: 3025 loss: 21.6229 memory: 6.46GiB(27.34%) tps: 25,234 tflops: 25.40 mfu: 8.14% global_avg_ntp_loss: 3.9691 global_avg_mtp_loss: 17.6538 +[titan] 2025-06-13 13:17:30,814 - root - INFO - lr: 4.7715e-04 gnorm: 1.08 [ 0:36:01< 2:22:36] +[titan] 2025-06-13 13:17:34,558 - root - INFO - step: 3030 loss: 21.1818 memory: 6.46GiB(27.34%) tps: 21,881 tflops: 22.02 mfu: 7.06% global_avg_ntp_loss: 3.9489 global_avg_mtp_loss: 17.2328 +[titan] 2025-06-13 13:17:34,558 - root - INFO - lr: 4.7704e-04 gnorm: 1.10 [ 0:36:05< 2:22:33] +[titan] 2025-06-13 13:17:37,888 - root - INFO - step: 3035 loss: 21.7248 memory: 6.46GiB(27.34%) tps: 24,601 tflops: 24.76 mfu: 7.94% global_avg_ntp_loss: 4.1036 global_avg_mtp_loss: 17.6212 +[titan] 2025-06-13 13:17:37,889 - root - INFO - lr: 4.7693e-04 gnorm: 1.25 [ 0:36:08< 2:22:28] +[titan] 2025-06-13 13:17:40,916 - root - INFO - step: 3040 loss: 21.1500 memory: 6.46GiB(27.34%) tps: 27,062 tflops: 27.23 mfu: 8.73% global_avg_ntp_loss: 3.8343 global_avg_mtp_loss: 17.3157 +[titan] 2025-06-13 13:17:40,916 - root - INFO - lr: 4.7681e-04 gnorm: 1.18 [ 0:36:11< 2:22:23] +[titan] 2025-06-13 13:17:44,542 - root - INFO - step: 3045 loss: 21.4991 memory: 6.46GiB(27.34%) tps: 22,596 tflops: 22.74 mfu: 7.29% global_avg_ntp_loss: 3.9764 global_avg_mtp_loss: 17.5227 +[titan] 2025-06-13 13:17:44,542 - root - INFO - lr: 4.7670e-04 gnorm: 1.07 [ 0:36:15< 2:22:19] +[titan] 2025-06-13 13:17:47,709 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:17:48,270 - root - INFO - step: 3050 loss: 22.0340 memory: 6.46GiB(27.34%) tps: 21,979 tflops: 22.12 mfu: 7.09% global_avg_ntp_loss: 4.0619 global_avg_mtp_loss: 17.9721 +[titan] 2025-06-13 13:17:48,270 - root - INFO - lr: 4.7659e-04 gnorm: 1.14 [ 0:36:18< 2:22:16] +[titan] 2025-06-13 13:17:52,146 - root - INFO - step: 3055 loss: 20.1261 memory: 6.46GiB(27.34%) tps: 21,138 tflops: 21.27 mfu: 6.82% global_avg_ntp_loss: 3.6243 global_avg_mtp_loss: 16.5018 +[titan] 2025-06-13 13:17:52,146 - root - INFO - lr: 4.7648e-04 gnorm: 1.43 [ 0:36:22< 2:22:14] +[titan] 2025-06-13 13:17:55,633 - root - INFO - step: 3060 loss: 19.9341 memory: 6.46GiB(27.34%) tps: 23,496 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.6495 global_avg_mtp_loss: 16.2846 +[titan] 2025-06-13 13:17:55,633 - root - INFO - lr: 4.7637e-04 gnorm: 1.49 [ 0:36:26< 2:22:10] +[titan] 2025-06-13 13:17:59,351 - root - INFO - step: 3065 loss: 21.8762 memory: 6.46GiB(27.34%) tps: 22,031 tflops: 22.17 mfu: 7.11% global_avg_ntp_loss: 4.0620 global_avg_mtp_loss: 17.8142 +[titan] 2025-06-13 13:17:59,352 - root - INFO - lr: 4.7625e-04 gnorm: 1.08 [ 0:36:29< 2:22:07] +[titan] 2025-06-13 13:18:02,664 - root - INFO - step: 3070 loss: 20.0539 memory: 6.46GiB(27.34%) tps: 24,732 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 3.6828 global_avg_mtp_loss: 16.3711 +[titan] 2025-06-13 13:18:02,664 - root - INFO - lr: 4.7614e-04 gnorm: 2.12 [ 0:36:33< 2:22:02] +[titan] 2025-06-13 13:18:03,956 - root - INFO - Dumping profiler traces at step 3072 +[titan] 2025-06-13 13:18:04,045 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 13:18:05,971 - root - INFO - step: 3075 loss: 22.1376 memory: 6.46GiB(27.34%) tps: 24,775 tflops: 24.93 mfu: 7.99% global_avg_ntp_loss: 4.1251 global_avg_mtp_loss: 18.0125 +[titan] 2025-06-13 13:18:05,971 - root - INFO - lr: 4.7603e-04 gnorm: 1.17 [ 0:36:36< 2:21:58] +[titan] 2025-06-13 13:18:09,352 - root - INFO - step: 3080 loss: 21.4665 memory: 
6.46GiB(27.34%) tps: 24,237 tflops: 24.39 mfu: 7.82% global_avg_ntp_loss: 3.9782 global_avg_mtp_loss: 17.4883 +[titan] 2025-06-13 13:18:09,352 - root - INFO - lr: 4.7591e-04 gnorm: 1.21 [ 0:36:39< 2:21:53] +[titan] 2025-06-13 13:18:12,847 - root - INFO - step: 3085 loss: 20.6113 memory: 6.46GiB(27.34%) tps: 23,442 tflops: 23.59 mfu: 7.56% global_avg_ntp_loss: 3.7586 global_avg_mtp_loss: 16.8527 +[titan] 2025-06-13 13:18:12,847 - root - INFO - lr: 4.7580e-04 gnorm: 1.57 [ 0:36:43< 2:21:50] +[titan] 2025-06-13 13:18:16,253 - root - INFO - step: 3090 loss: 22.2273 memory: 6.46GiB(27.34%) tps: 24,049 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 4.0798 global_avg_mtp_loss: 18.1475 +[titan] 2025-06-13 13:18:16,254 - root - INFO - lr: 4.7569e-04 gnorm: 1.08 [ 0:36:46< 2:21:45] +[titan] 2025-06-13 13:18:19,368 - root - INFO - step: 3095 loss: 22.0034 memory: 6.46GiB(27.34%) tps: 26,308 tflops: 26.48 mfu: 8.49% global_avg_ntp_loss: 4.0653 global_avg_mtp_loss: 17.9380 +[titan] 2025-06-13 13:18:19,368 - root - INFO - lr: 4.7557e-04 gnorm: 1.00 [ 0:36:49< 2:21:40] +[titan] 2025-06-13 13:18:22,086 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:18:22,783 - root - INFO - step: 3100 loss: 20.6099 memory: 6.46GiB(27.34%) tps: 23,993 tflops: 24.15 mfu: 7.74% global_avg_ntp_loss: 3.7772 global_avg_mtp_loss: 16.8328 +[titan] 2025-06-13 13:18:22,783 - root - INFO - lr: 4.7546e-04 gnorm: 1.53 [ 0:36:53< 2:21:36] +[titan] 2025-06-13 13:18:26,259 - root - INFO - step: 3105 loss: 22.1661 memory: 6.46GiB(27.34%) tps: 23,569 tflops: 23.72 mfu: 7.60% global_avg_ntp_loss: 4.0547 global_avg_mtp_loss: 18.1114 +[titan] 2025-06-13 13:18:26,259 - root - INFO - lr: 4.7534e-04 gnorm: 1.05 [ 0:36:56< 2:21:32] +[titan] 2025-06-13 13:18:29,417 - root - INFO - step: 3110 loss: 20.6170 memory: 6.46GiB(27.34%) tps: 25,941 tflops: 26.11 mfu: 8.37% global_avg_ntp_loss: 3.7475 global_avg_mtp_loss: 16.8696 +[titan] 2025-06-13 13:18:29,418 - root - INFO - lr: 4.7523e-04 gnorm: 1.20 [ 0:36:59< 2:21:27] +[titan] 2025-06-13 13:18:32,993 - root - INFO - step: 3115 loss: 20.3625 memory: 6.46GiB(27.34%) tps: 22,915 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.7152 global_avg_mtp_loss: 16.6472 +[titan] 2025-06-13 13:18:32,993 - root - INFO - lr: 4.7511e-04 gnorm: 1.39 [ 0:37:03< 2:21:23] +[titan] 2025-06-13 13:18:36,163 - root - INFO - step: 3120 loss: 21.3534 memory: 6.46GiB(27.34%) tps: 25,839 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.9484 global_avg_mtp_loss: 17.4050 +[titan] 2025-06-13 13:18:36,164 - root - INFO - lr: 4.7500e-04 gnorm: 1.09 [ 0:37:06< 2:21:18] +[titan] 2025-06-13 13:18:39,796 - root - INFO - step: 3125 loss: 20.7949 memory: 6.46GiB(27.34%) tps: 22,556 tflops: 22.70 mfu: 7.28% global_avg_ntp_loss: 3.8171 global_avg_mtp_loss: 16.9779 +[titan] 2025-06-13 13:18:39,796 - root - INFO - lr: 4.7488e-04 gnorm: 1.09 [ 0:37:10< 2:21:15] +[titan] 2025-06-13 13:18:43,168 - root - INFO - step: 3130 loss: 21.0324 memory: 6.46GiB(27.34%) tps: 24,296 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.9035 global_avg_mtp_loss: 17.1289 +[titan] 2025-06-13 13:18:43,168 - root - INFO - lr: 4.7476e-04 gnorm: 1.28 [ 
0:37:13< 2:21:11] +[titan] 2025-06-13 13:18:46,661 - root - INFO - step: 3135 loss: 21.9564 memory: 6.46GiB(27.34%) tps: 23,453 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 4.0430 global_avg_mtp_loss: 17.9134 +[titan] 2025-06-13 13:18:46,662 - root - INFO - lr: 4.7465e-04 gnorm: 1.05 [ 0:37:17< 2:21:07] +[titan] 2025-06-13 13:18:50,035 - root - INFO - step: 3140 loss: 21.3840 memory: 6.46GiB(27.34%) tps: 24,289 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.9054 global_avg_mtp_loss: 17.4785 +[titan] 2025-06-13 13:18:50,035 - root - INFO - lr: 4.7453e-04 gnorm: 1.20 [ 0:37:20< 2:21:02] +[titan] 2025-06-13 13:18:53,398 - root - INFO - step: 3145 loss: 21.6781 memory: 6.46GiB(27.34%) tps: 24,359 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 3.9659 global_avg_mtp_loss: 17.7121 +[titan] 2025-06-13 13:18:53,398 - root - INFO - lr: 4.7441e-04 gnorm: 1.14 [ 0:37:23< 2:20:58] +[titan] 2025-06-13 13:18:56,229 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:18:56,986 - root - INFO - step: 3150 loss: 22.3516 memory: 6.46GiB(27.34%) tps: 22,832 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 4.1974 global_avg_mtp_loss: 18.1542 +[titan] 2025-06-13 13:18:56,987 - root - INFO - lr: 4.7430e-04 gnorm: 1.06 [ 0:37:27< 2:20:55] +[titan] 2025-06-13 13:19:00,187 - root - INFO - step: 3155 loss: 21.2637 memory: 6.46GiB(27.34%) tps: 25,602 tflops: 25.77 mfu: 8.26% global_avg_ntp_loss: 3.9292 global_avg_mtp_loss: 17.3344 +[titan] 2025-06-13 13:19:00,187 - root - INFO - lr: 4.7418e-04 gnorm: 1.25 [ 0:37:30< 2:20:50] +[titan] 2025-06-13 13:19:03,628 - root - INFO - step: 3160 loss: 18.9421 memory: 6.46GiB(27.34%) tps: 23,805 tflops: 23.96 mfu: 7.68% global_avg_ntp_loss: 3.4713 global_avg_mtp_loss: 15.4708 +[titan] 2025-06-13 13:19:03,628 - root - INFO - lr: 4.7406e-04 gnorm: 1.72 [ 0:37:34< 2:20:46] +[titan] 2025-06-13 13:19:06,726 - root - INFO - step: 3165 loss: 19.4243 memory: 6.46GiB(27.34%) tps: 26,448 tflops: 26.62 mfu: 8.53% 
global_avg_ntp_loss: 3.4955 global_avg_mtp_loss: 15.9288 +[titan] 2025-06-13 13:19:06,726 - root - INFO - lr: 4.7395e-04 gnorm: 1.43 [ 0:37:37< 2:20:40] +[titan] 2025-06-13 13:19:10,520 - root - INFO - step: 3170 loss: 21.6701 memory: 6.46GiB(27.34%) tps: 21,595 tflops: 21.73 mfu: 6.97% global_avg_ntp_loss: 4.0270 global_avg_mtp_loss: 17.6430 +[titan] 2025-06-13 13:19:10,520 - root - INFO - lr: 4.7383e-04 gnorm: 1.24 [ 0:37:41< 2:20:38] +[titan] 2025-06-13 13:19:14,071 - root - INFO - step: 3175 loss: 16.4522 memory: 6.46GiB(27.34%) tps: 23,070 tflops: 23.22 mfu: 7.44% global_avg_ntp_loss: 2.9521 global_avg_mtp_loss: 13.5000 +[titan] 2025-06-13 13:19:14,071 - root - INFO - lr: 4.7371e-04 gnorm: 1.69 [ 0:37:44< 2:20:34] +[titan] 2025-06-13 13:19:17,496 - root - INFO - step: 3180 loss: 21.3711 memory: 6.46GiB(27.34%) tps: 23,923 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 3.9792 global_avg_mtp_loss: 17.3920 +[titan] 2025-06-13 13:19:17,496 - root - INFO - lr: 4.7359e-04 gnorm: 1.15 [ 0:37:48< 2:20:30] +[titan] 2025-06-13 13:19:21,211 - root - INFO - step: 3185 loss: 21.9082 memory: 6.46GiB(27.34%) tps: 22,051 tflops: 22.19 mfu: 7.11% global_avg_ntp_loss: 4.0391 global_avg_mtp_loss: 17.8691 +[titan] 2025-06-13 13:19:21,212 - root - INFO - lr: 4.7347e-04 gnorm: 1.05 [ 0:37:51< 2:20:27] +[titan] 2025-06-13 13:19:25,169 - root - INFO - step: 3190 loss: 18.1363 memory: 6.46GiB(27.34%) tps: 20,701 tflops: 20.83 mfu: 6.68% global_avg_ntp_loss: 3.3742 global_avg_mtp_loss: 14.7622 +[titan] 2025-06-13 13:19:25,169 - root - INFO - lr: 4.7335e-04 gnorm: 9.75 [ 0:37:55< 2:20:25] +[titan] 2025-06-13 13:19:28,484 - root - INFO - step: 3195 loss: 20.5527 memory: 6.46GiB(27.34%) tps: 24,717 tflops: 24.87 mfu: 7.97% global_avg_ntp_loss: 3.7506 global_avg_mtp_loss: 16.8021 +[titan] 2025-06-13 13:19:28,484 - root - INFO - lr: 4.7323e-04 gnorm: 1.29 [ 0:37:59< 2:20:20] +[titan] 2025-06-13 13:19:31,411 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:19:32,816 - root - INFO - step: 3200 loss: 21.7838 memory: 6.46GiB(27.34%) tps: 18,913 tflops: 19.03 mfu: 6.10% global_avg_ntp_loss: 3.9924 global_avg_mtp_loss: 17.7914 +[titan] 2025-06-13 13:19:32,816 - root - INFO - lr: 4.7311e-04 gnorm: 1.21 [ 0:38:03< 2:20:19] +[titan] 2025-06-13 13:19:35,747 - root - INFO - step: 3205 loss: 21.2475 memory: 6.46GiB(27.34%) tps: 27,955 tflops: 28.13 mfu: 9.02% global_avg_ntp_loss: 3.9623 global_avg_mtp_loss: 17.2851 +[titan] 2025-06-13 13:19:35,747 - root - INFO - lr: 4.7299e-04 gnorm: 1.24 [ 0:38:06< 2:20:14] +[titan] 2025-06-13 13:19:39,061 - root - INFO - step: 3210 loss: 22.2099 memory: 6.46GiB(27.34%) tps: 24,722 tflops: 24.88 mfu: 7.97% global_avg_ntp_loss: 4.0693 global_avg_mtp_loss: 18.1406 +[titan] 2025-06-13 13:19:39,061 - root - INFO - lr: 4.7287e-04 gnorm: 1.07 [ 0:38:09< 2:20:09] +[titan] 2025-06-13 13:19:42,483 - root - INFO - step: 3215 loss: 21.3006 memory: 6.46GiB(27.34%) tps: 23,940 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.9261 global_avg_mtp_loss: 17.3746 +[titan] 2025-06-13 13:19:42,483 - root - INFO - lr: 4.7275e-04 gnorm: 1.31 [ 0:38:13< 2:20:05] +[titan] 2025-06-13 13:19:46,297 - root - INFO - step: 3220 loss: 22.1718 memory: 6.46GiB(27.34%) tps: 21,480 tflops: 21.62 mfu: 6.93% global_avg_ntp_loss: 4.0987 global_avg_mtp_loss: 18.0731 +[titan] 2025-06-13 13:19:46,297 - root - INFO - lr: 4.7263e-04 gnorm: 2.39 [ 0:38:16< 2:20:02] +[titan] 2025-06-13 13:19:49,888 - root - INFO - step: 3225 loss: 21.2578 memory: 6.46GiB(27.34%) tps: 22,815 tflops: 22.96 mfu: 7.36% global_avg_ntp_loss: 3.8832 global_avg_mtp_loss: 17.3746 +[titan] 2025-06-13 13:19:49,889 - root - INFO - lr: 4.7251e-04 gnorm: 1.39 [ 0:38:20< 2:19:59] +[titan] 2025-06-13 13:19:53,313 - root - INFO - step: 3230 loss: 21.6397 memory: 6.46GiB(27.34%) tps: 23,927 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 4.0123 global_avg_mtp_loss: 17.6274 +[titan] 2025-06-13 13:19:53,313 - root - INFO - lr: 4.7239e-04 gnorm: 1.16 [ 
0:38:23< 2:19:55] +[titan] 2025-06-13 13:19:56,885 - root - INFO - step: 3235 loss: 21.6065 memory: 6.46GiB(27.34%) tps: 22,935 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 3.9937 global_avg_mtp_loss: 17.6128 +[titan] 2025-06-13 13:19:56,885 - root - INFO - lr: 4.7227e-04 gnorm: 1.12 [ 0:38:27< 2:19:51] +[titan] 2025-06-13 13:20:00,411 - root - INFO - step: 3240 loss: 22.3065 memory: 6.46GiB(27.34%) tps: 23,234 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 4.1444 global_avg_mtp_loss: 18.1622 +[titan] 2025-06-13 13:20:00,411 - root - INFO - lr: 4.7215e-04 gnorm: 1.07 [ 0:38:30< 2:19:47] +[titan] 2025-06-13 13:20:03,803 - root - INFO - step: 3245 loss: 21.1909 memory: 6.46GiB(27.34%) tps: 24,157 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 3.8622 global_avg_mtp_loss: 17.3287 +[titan] 2025-06-13 13:20:03,803 - root - INFO - lr: 4.7203e-04 gnorm: 1.14 [ 0:38:34< 2:19:43] +[titan] 2025-06-13 13:20:06,565 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:20:07,514 - root - INFO - step: 3250 loss: 22.4584 memory: 6.46GiB(27.34%) tps: 22,074 tflops: 22.22 mfu: 7.12% global_avg_ntp_loss: 4.1582 global_avg_mtp_loss: 18.3002 +[titan] 2025-06-13 13:20:07,515 - root - INFO - lr: 4.7190e-04 gnorm: 1.05 [ 0:38:38< 2:19:40] +[titan] 2025-06-13 13:20:10,522 - root - INFO - step: 3255 loss: 21.0291 memory: 6.46GiB(27.34%) tps: 27,237 tflops: 27.41 mfu: 8.79% global_avg_ntp_loss: 3.8111 global_avg_mtp_loss: 17.2180 +[titan] 2025-06-13 13:20:10,523 - root - INFO - lr: 4.7178e-04 gnorm: 1.13 [ 0:38:41< 2:19:35] +[titan] 2025-06-13 13:20:13,791 - root - INFO - step: 3260 loss: 20.0583 memory: 6.46GiB(27.34%) tps: 25,067 tflops: 25.23 mfu: 8.09% global_avg_ntp_loss: 3.6499 global_avg_mtp_loss: 16.4084 +[titan] 2025-06-13 13:20:13,791 - root - INFO - lr: 4.7166e-04 gnorm: 1.72 [ 0:38:44< 2:19:30] +[titan] 2025-06-13 13:20:17,308 - root - INFO - step: 3265 loss: 21.8348 memory: 6.46GiB(27.34%) tps: 23,293 tflops: 23.44 mfu: 7.51% 
global_avg_ntp_loss: 3.9583 global_avg_mtp_loss: 17.8765 +[titan] 2025-06-13 13:20:17,308 - root - INFO - lr: 4.7154e-04 gnorm: 1.11 [ 0:38:47< 2:19:26] +[titan] 2025-06-13 13:20:20,475 - root - INFO - step: 3270 loss: 21.9281 memory: 6.46GiB(27.34%) tps: 25,875 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 4.0373 global_avg_mtp_loss: 17.8908 +[titan] 2025-06-13 13:20:20,475 - root - INFO - lr: 4.7141e-04 gnorm: 1.11 [ 0:38:51< 2:19:21] +[titan] 2025-06-13 13:20:23,964 - root - INFO - step: 3275 loss: 19.4543 memory: 6.46GiB(27.34%) tps: 23,480 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.5729 global_avg_mtp_loss: 15.8813 +[titan] 2025-06-13 13:20:23,964 - root - INFO - lr: 4.7129e-04 gnorm: 1.79 [ 0:38:54< 2:19:17] +[titan] 2025-06-13 13:20:27,252 - root - INFO - step: 3280 loss: 21.1614 memory: 6.46GiB(27.34%) tps: 24,921 tflops: 25.08 mfu: 8.04% global_avg_ntp_loss: 3.8590 global_avg_mtp_loss: 17.3024 +[titan] 2025-06-13 13:20:27,252 - root - INFO - lr: 4.7117e-04 gnorm: 1.38 [ 0:38:57< 2:19:13] +[titan] 2025-06-13 13:20:30,977 - root - INFO - step: 3285 loss: 20.9455 memory: 6.46GiB(27.34%) tps: 21,991 tflops: 22.13 mfu: 7.09% global_avg_ntp_loss: 3.8361 global_avg_mtp_loss: 17.1094 +[titan] 2025-06-13 13:20:30,978 - root - INFO - lr: 4.7104e-04 gnorm: 1.19 [ 0:39:01< 2:19:10] +[titan] 2025-06-13 13:20:34,038 - root - INFO - step: 3290 loss: 20.2751 memory: 6.46GiB(27.34%) tps: 26,773 tflops: 26.94 mfu: 8.64% global_avg_ntp_loss: 3.6647 global_avg_mtp_loss: 16.6104 +[titan] 2025-06-13 13:20:34,038 - root - INFO - lr: 4.7092e-04 gnorm: 1.23 [ 0:39:04< 2:19:04] +[titan] 2025-06-13 13:20:37,478 - root - INFO - step: 3295 loss: 21.2251 memory: 6.46GiB(27.34%) tps: 23,816 tflops: 23.97 mfu: 7.68% global_avg_ntp_loss: 3.8962 global_avg_mtp_loss: 17.3289 +[titan] 2025-06-13 13:20:37,478 - root - INFO - lr: 4.7080e-04 gnorm: 1.21 [ 0:39:08< 2:19:00] +[titan] 2025-06-13 13:20:40,547 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:20:41,236 - root - INFO - step: 3300 loss: 20.1953 memory: 6.46GiB(27.34%) tps: 21,798 tflops: 21.94 mfu: 7.03% global_avg_ntp_loss: 3.6596 global_avg_mtp_loss: 16.5357 +[titan] 2025-06-13 13:20:41,236 - root - INFO - lr: 4.7067e-04 gnorm: 1.19 [ 0:39:11< 2:18:58] +[titan] 2025-06-13 13:20:44,835 - root - INFO - step: 3305 loss: 21.8007 memory: 6.46GiB(27.34%) tps: 22,763 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.9988 global_avg_mtp_loss: 17.8019 +[titan] 2025-06-13 13:20:44,836 - root - INFO - lr: 4.7055e-04 gnorm: 1.14 [ 0:39:15< 2:18:54] +[titan] 2025-06-13 13:20:48,423 - root - INFO - step: 3310 loss: 20.8808 memory: 6.46GiB(27.34%) tps: 22,837 tflops: 22.98 mfu: 7.37% global_avg_ntp_loss: 3.8276 global_avg_mtp_loss: 17.0532 +[titan] 2025-06-13 13:20:48,423 - root - INFO - lr: 4.7042e-04 gnorm: 1.16 [ 0:39:18< 2:18:51] +[titan] 2025-06-13 13:20:51,938 - root - INFO - step: 3315 loss: 21.9364 memory: 6.46GiB(27.34%) tps: 23,311 tflops: 23.46 mfu: 7.52% global_avg_ntp_loss: 4.0033 global_avg_mtp_loss: 17.9331 +[titan] 2025-06-13 13:20:51,938 - root - INFO - lr: 4.7030e-04 gnorm: 0.98 [ 0:39:22< 2:18:47] +[titan] 2025-06-13 13:20:55,494 - root - INFO - step: 3320 loss: 20.1908 memory: 6.46GiB(27.34%) tps: 23,039 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.6752 global_avg_mtp_loss: 16.5155 +[titan] 2025-06-13 13:20:55,494 - root - INFO - lr: 4.7017e-04 gnorm: 1.24 [ 0:39:26< 2:18:43] +[titan] 2025-06-13 13:20:58,943 - root - INFO - step: 3325 loss: 20.7491 memory: 6.46GiB(27.34%) tps: 23,754 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 3.8465 global_avg_mtp_loss: 16.9025 +[titan] 2025-06-13 13:20:58,943 - root - INFO - lr: 4.7004e-04 gnorm: 1.15 [ 0:39:29< 2:18:39] +[titan] 2025-06-13 13:21:02,389 - root - INFO - step: 3330 loss: 22.0801 memory: 6.46GiB(27.34%) tps: 23,774 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 4.0425 global_avg_mtp_loss: 18.0377 +[titan] 2025-06-13 13:21:02,390 - root - INFO - lr: 4.6992e-04 gnorm: 1.07 [ 
0:39:32< 2:18:35] +[titan] 2025-06-13 13:21:05,852 - root - INFO - step: 3335 loss: 17.2911 memory: 6.46GiB(27.34%) tps: 23,661 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.1326 global_avg_mtp_loss: 14.1585 +[titan] 2025-06-13 13:21:05,852 - root - INFO - lr: 4.6979e-04 gnorm: 2.07 [ 0:39:36< 2:18:32] +[titan] 2025-06-13 13:21:09,203 - root - INFO - step: 3340 loss: 21.7774 memory: 6.46GiB(27.34%) tps: 24,450 tflops: 24.61 mfu: 7.89% global_avg_ntp_loss: 4.0052 global_avg_mtp_loss: 17.7722 +[titan] 2025-06-13 13:21:09,203 - root - INFO - lr: 4.6967e-04 gnorm: 1.15 [ 0:39:39< 2:18:27] +[titan] 2025-06-13 13:21:12,597 - root - INFO - step: 3345 loss: 21.3017 memory: 6.46GiB(27.34%) tps: 24,140 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.9029 global_avg_mtp_loss: 17.3988 +[titan] 2025-06-13 13:21:12,597 - root - INFO - lr: 4.6954e-04 gnorm: 1.15 [ 0:39:43< 2:18:23] +[titan] 2025-06-13 13:21:15,489 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:21:16,156 - root - INFO - step: 3350 loss: 21.4014 memory: 6.46GiB(27.34%) tps: 23,020 tflops: 23.17 mfu: 7.43% global_avg_ntp_loss: 3.9059 global_avg_mtp_loss: 17.4955 +[titan] 2025-06-13 13:21:16,156 - root - INFO - lr: 4.6941e-04 gnorm: 1.02 [ 0:39:46< 2:18:19] +[titan] 2025-06-13 13:21:19,528 - root - INFO - step: 3355 loss: 21.7401 memory: 6.46GiB(27.34%) tps: 24,297 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.9228 global_avg_mtp_loss: 17.8173 +[titan] 2025-06-13 13:21:19,528 - root - INFO - lr: 4.6929e-04 gnorm: 1.08 [ 0:39:50< 2:18:15] +[titan] 2025-06-13 13:21:22,868 - root - INFO - step: 3360 loss: 19.9438 memory: 6.46GiB(27.34%) tps: 24,529 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.6439 global_avg_mtp_loss: 16.2999 +[titan] 2025-06-13 13:21:22,868 - root - INFO - lr: 4.6916e-04 gnorm: 1.27 [ 0:39:53< 2:18:11] +[titan] 2025-06-13 13:21:26,365 - root - INFO - step: 3365 loss: 21.7911 memory: 6.46GiB(27.34%) tps: 23,430 tflops: 23.58 mfu: 7.56% 
global_avg_ntp_loss: 4.0387 global_avg_mtp_loss: 17.7524 +[titan] 2025-06-13 13:21:26,365 - root - INFO - lr: 4.6903e-04 gnorm: 1.06 [ 0:39:56< 2:18:07] +[titan] 2025-06-13 13:21:29,919 - root - INFO - step: 3370 loss: 21.4419 memory: 6.46GiB(27.34%) tps: 23,054 tflops: 23.20 mfu: 7.44% global_avg_ntp_loss: 3.9243 global_avg_mtp_loss: 17.5175 +[titan] 2025-06-13 13:21:29,919 - root - INFO - lr: 4.6890e-04 gnorm: 1.16 [ 0:40:00< 2:18:04] +[titan] 2025-06-13 13:21:33,383 - root - INFO - step: 3375 loss: 22.0313 memory: 6.46GiB(27.34%) tps: 23,648 tflops: 23.80 mfu: 7.63% global_avg_ntp_loss: 4.0902 global_avg_mtp_loss: 17.9411 +[titan] 2025-06-13 13:21:33,384 - root - INFO - lr: 4.6877e-04 gnorm: 1.06 [ 0:40:03< 2:18:00] +[titan] 2025-06-13 13:21:36,810 - root - INFO - step: 3380 loss: 21.2263 memory: 6.46GiB(27.34%) tps: 23,911 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.8828 global_avg_mtp_loss: 17.3435 +[titan] 2025-06-13 13:21:36,810 - root - INFO - lr: 4.6865e-04 gnorm: 1.43 [ 0:40:07< 2:17:56] +[titan] 2025-06-13 13:21:40,270 - root - INFO - step: 3385 loss: 18.5462 memory: 6.46GiB(27.34%) tps: 23,679 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 3.3668 global_avg_mtp_loss: 15.1795 +[titan] 2025-06-13 13:21:40,270 - root - INFO - lr: 4.6852e-04 gnorm: 1.63 [ 0:40:10< 2:17:52] +[titan] 2025-06-13 13:21:43,598 - root - INFO - step: 3390 loss: 21.3113 memory: 6.46GiB(27.34%) tps: 24,619 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 3.8450 global_avg_mtp_loss: 17.4662 +[titan] 2025-06-13 13:21:43,598 - root - INFO - lr: 4.6839e-04 gnorm: 1.32 [ 0:40:14< 2:17:47] +[titan] 2025-06-13 13:21:47,181 - root - INFO - step: 3395 loss: 20.8971 memory: 6.46GiB(27.34%) tps: 22,861 tflops: 23.01 mfu: 7.37% global_avg_ntp_loss: 3.8611 global_avg_mtp_loss: 17.0360 +[titan] 2025-06-13 13:21:47,182 - root - INFO - lr: 4.6826e-04 gnorm: 1.17 [ 0:40:17< 2:17:44] +[titan] 2025-06-13 13:21:50,111 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:21:50,707 - root - INFO - step: 3400 loss: 18.5469 memory: 6.46GiB(27.34%) tps: 23,236 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 3.3245 global_avg_mtp_loss: 15.2224 +[titan] 2025-06-13 13:21:50,708 - root - INFO - lr: 4.6813e-04 gnorm: 1.85 [ 0:40:21< 2:17:40] +[titan] 2025-06-13 13:21:54,347 - root - INFO - step: 3405 loss: 21.6638 memory: 6.46GiB(27.34%) tps: 22,512 tflops: 22.66 mfu: 7.26% global_avg_ntp_loss: 3.9887 global_avg_mtp_loss: 17.6751 +[titan] 2025-06-13 13:21:54,347 - root - INFO - lr: 4.6800e-04 gnorm: 1.22 [ 0:40:24< 2:17:37] +[titan] 2025-06-13 13:21:57,987 - root - INFO - step: 3410 loss: 21.2711 memory: 6.46GiB(27.34%) tps: 22,506 tflops: 22.65 mfu: 7.26% global_avg_ntp_loss: 3.8850 global_avg_mtp_loss: 17.3861 +[titan] 2025-06-13 13:21:57,988 - root - INFO - lr: 4.6787e-04 gnorm: 1.14 [ 0:40:28< 2:17:34] +[titan] 2025-06-13 13:22:02,150 - root - INFO - step: 3415 loss: 21.4362 memory: 6.46GiB(27.34%) tps: 19,680 tflops: 19.81 mfu: 6.35% global_avg_ntp_loss: 3.9069 global_avg_mtp_loss: 17.5293 +[titan] 2025-06-13 13:22:02,151 - root - INFO - lr: 4.6774e-04 gnorm: 1.10 [ 0:40:32< 2:17:32] +[titan] 2025-06-13 13:22:05,463 - root - INFO - step: 3420 loss: 21.0255 memory: 6.46GiB(27.34%) tps: 24,730 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 3.8054 global_avg_mtp_loss: 17.2201 +[titan] 2025-06-13 13:22:05,464 - root - INFO - lr: 4.6761e-04 gnorm: 1.15 [ 0:40:35< 2:17:28] +[titan] 2025-06-13 13:22:08,685 - root - INFO - step: 3425 loss: 19.8203 memory: 6.46GiB(27.34%) tps: 25,429 tflops: 25.59 mfu: 8.20% global_avg_ntp_loss: 3.5902 global_avg_mtp_loss: 16.2301 +[titan] 2025-06-13 13:22:08,686 - root - INFO - lr: 4.6748e-04 gnorm: 1.58 [ 0:40:39< 2:17:23] +[titan] 2025-06-13 13:22:12,244 - root - INFO - step: 3430 loss: 21.2081 memory: 6.46GiB(27.34%) tps: 23,022 tflops: 23.17 mfu: 7.43% global_avg_ntp_loss: 3.8579 global_avg_mtp_loss: 17.3502 +[titan] 2025-06-13 13:22:12,244 - root - INFO - lr: 4.6735e-04 gnorm: 1.07 [ 
0:40:42< 2:17:19] +[titan] 2025-06-13 13:22:15,647 - root - INFO - step: 3435 loss: 21.0949 memory: 6.46GiB(27.34%) tps: 24,074 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.7959 global_avg_mtp_loss: 17.2990 +[titan] 2025-06-13 13:22:15,648 - root - INFO - lr: 4.6722e-04 gnorm: 1.08 [ 0:40:46< 2:17:15] +[titan] 2025-06-13 13:22:19,042 - root - INFO - step: 3440 loss: 21.3879 memory: 6.46GiB(27.34%) tps: 24,136 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.8853 global_avg_mtp_loss: 17.5026 +[titan] 2025-06-13 13:22:19,042 - root - INFO - lr: 4.6709e-04 gnorm: 1.09 [ 0:40:49< 2:17:11] +[titan] 2025-06-13 13:22:22,291 - root - INFO - step: 3445 loss: 21.2699 memory: 6.46GiB(27.34%) tps: 25,213 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.8446 global_avg_mtp_loss: 17.4253 +[titan] 2025-06-13 13:22:22,292 - root - INFO - lr: 4.6695e-04 gnorm: 1.03 [ 0:40:52< 2:17:07] +[titan] 2025-06-13 13:22:25,151 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:22:25,900 - root - INFO - step: 3450 loss: 21.4831 memory: 6.46GiB(27.34%) tps: 22,702 tflops: 22.85 mfu: 7.32% global_avg_ntp_loss: 3.8916 global_avg_mtp_loss: 17.5915 +[titan] 2025-06-13 13:22:25,901 - root - INFO - lr: 4.6682e-04 gnorm: 1.13 [ 0:40:56< 2:17:03] +[titan] 2025-06-13 13:22:29,381 - root - INFO - step: 3455 loss: 21.2342 memory: 6.46GiB(27.34%) tps: 23,542 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.8590 global_avg_mtp_loss: 17.3752 +[titan] 2025-06-13 13:22:29,381 - root - INFO - lr: 4.6669e-04 gnorm: 1.08 [ 0:40:59< 2:16:59] +[titan] 2025-06-13 13:22:33,050 - root - INFO - step: 3460 loss: 18.9620 memory: 6.46GiB(27.34%) tps: 22,329 tflops: 22.47 mfu: 7.20% global_avg_ntp_loss: 3.4355 global_avg_mtp_loss: 15.5266 +[titan] 2025-06-13 13:22:33,050 - root - INFO - lr: 4.6656e-04 gnorm: 1.36 [ 0:41:03< 2:16:56] +[titan] 2025-06-13 13:22:36,470 - root - INFO - step: 3465 loss: 22.1993 memory: 6.46GiB(27.34%) tps: 23,955 tflops: 24.11 mfu: 7.73% 
global_avg_ntp_loss: 4.0505 global_avg_mtp_loss: 18.1488 +[titan] 2025-06-13 13:22:36,470 - root - INFO - lr: 4.6643e-04 gnorm: 1.00 [ 0:41:06< 2:16:52] +[titan] 2025-06-13 13:22:40,011 - root - INFO - step: 3470 loss: 21.2419 memory: 6.46GiB(27.34%) tps: 23,133 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.9154 global_avg_mtp_loss: 17.3265 +[titan] 2025-06-13 13:22:40,012 - root - INFO - lr: 4.6629e-04 gnorm: 1.13 [ 0:41:10< 2:16:49] +[titan] 2025-06-13 13:22:43,829 - root - INFO - step: 3475 loss: 21.4869 memory: 6.46GiB(27.34%) tps: 21,461 tflops: 21.60 mfu: 6.92% global_avg_ntp_loss: 3.9143 global_avg_mtp_loss: 17.5726 +[titan] 2025-06-13 13:22:43,829 - root - INFO - lr: 4.6616e-04 gnorm: 1.08 [ 0:41:14< 2:16:46] +[titan] 2025-06-13 13:22:47,094 - root - INFO - step: 3480 loss: 21.3281 memory: 6.46GiB(27.34%) tps: 25,095 tflops: 25.26 mfu: 8.09% global_avg_ntp_loss: 3.8889 global_avg_mtp_loss: 17.4392 +[titan] 2025-06-13 13:22:47,094 - root - INFO - lr: 4.6603e-04 gnorm: 1.19 [ 0:41:17< 2:16:41] +[titan] 2025-06-13 13:22:50,627 - root - INFO - step: 3485 loss: 20.1891 memory: 6.46GiB(27.34%) tps: 23,191 tflops: 23.34 mfu: 7.48% global_avg_ntp_loss: 3.6217 global_avg_mtp_loss: 16.5674 +[titan] 2025-06-13 13:22:50,627 - root - INFO - lr: 4.6589e-04 gnorm: 1.31 [ 0:41:21< 2:16:38] +[titan] 2025-06-13 13:22:54,110 - root - INFO - step: 3490 loss: 21.3985 memory: 6.46GiB(27.34%) tps: 23,521 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.8674 global_avg_mtp_loss: 17.5311 +[titan] 2025-06-13 13:22:54,110 - root - INFO - lr: 4.6576e-04 gnorm: 1.11 [ 0:41:24< 2:16:34] +[titan] 2025-06-13 13:22:57,761 - root - INFO - step: 3495 loss: 21.4147 memory: 6.46GiB(27.34%) tps: 22,439 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 3.9010 global_avg_mtp_loss: 17.5138 +[titan] 2025-06-13 13:22:57,762 - root - INFO - lr: 4.6563e-04 gnorm: 1.09 [ 0:41:28< 2:16:31] +[titan] 2025-06-13 13:23:00,794 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:23:01,679 - root - INFO - step: 3500 loss: 21.0958 memory: 6.46GiB(27.34%) tps: 20,915 tflops: 21.05 mfu: 6.75% global_avg_ntp_loss: 3.8028 global_avg_mtp_loss: 17.2931 +[titan] 2025-06-13 13:23:01,679 - root - INFO - lr: 4.6549e-04 gnorm: 1.05 [ 0:41:32< 2:16:28] +[titan] 2025-06-13 13:23:04,825 - root - INFO - step: 3505 loss: 21.3860 memory: 6.46GiB(27.34%) tps: 26,044 tflops: 26.21 mfu: 8.40% global_avg_ntp_loss: 3.8997 global_avg_mtp_loss: 17.4864 +[titan] 2025-06-13 13:23:04,825 - root - INFO - lr: 4.6536e-04 gnorm: 1.18 [ 0:41:35< 2:16:23] +[titan] 2025-06-13 13:23:08,388 - root - INFO - step: 3510 loss: 21.3020 memory: 6.46GiB(27.34%) tps: 22,996 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.8509 global_avg_mtp_loss: 17.4511 +[titan] 2025-06-13 13:23:08,388 - root - INFO - lr: 4.6522e-04 gnorm: 1.09 [ 0:41:38< 2:16:20] +[titan] 2025-06-13 13:23:11,652 - root - INFO - step: 3515 loss: 21.6394 memory: 6.46GiB(27.34%) tps: 25,100 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 3.9222 global_avg_mtp_loss: 17.7173 +[titan] 2025-06-13 13:23:11,652 - root - INFO - lr: 4.6509e-04 gnorm: 1.05 [ 0:41:42< 2:16:15] +[titan] 2025-06-13 13:23:15,000 - root - INFO - step: 3520 loss: 21.4479 memory: 6.46GiB(27.34%) tps: 24,467 tflops: 24.62 mfu: 7.89% global_avg_ntp_loss: 3.8390 global_avg_mtp_loss: 17.6089 +[titan] 2025-06-13 13:23:15,001 - root - INFO - lr: 4.6495e-04 gnorm: 1.12 [ 0:41:45< 2:16:11] +[titan] 2025-06-13 13:23:18,433 - root - INFO - step: 3525 loss: 20.8658 memory: 6.46GiB(27.34%) tps: 23,869 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 3.7873 global_avg_mtp_loss: 17.0784 +[titan] 2025-06-13 13:23:18,433 - root - INFO - lr: 4.6482e-04 gnorm: 1.15 [ 0:41:48< 2:16:07] +[titan] 2025-06-13 13:23:22,089 - root - INFO - step: 3530 loss: 21.2280 memory: 6.46GiB(27.34%) tps: 22,410 tflops: 22.55 mfu: 7.23% global_avg_ntp_loss: 3.8905 global_avg_mtp_loss: 17.3375 +[titan] 2025-06-13 13:23:22,089 - root - INFO - lr: 4.6468e-04 gnorm: 1.15 [ 
0:41:52< 2:16:04] +[titan] 2025-06-13 13:23:25,290 - root - INFO - step: 3535 loss: 19.5307 memory: 6.46GiB(27.34%) tps: 25,595 tflops: 25.76 mfu: 8.26% global_avg_ntp_loss: 3.5588 global_avg_mtp_loss: 15.9719 +[titan] 2025-06-13 13:23:25,290 - root - INFO - lr: 4.6454e-04 gnorm: 1.58 [ 0:41:55< 2:15:59] +[titan] 2025-06-13 13:23:28,473 - root - INFO - step: 3540 loss: 19.8591 memory: 6.46GiB(27.34%) tps: 25,739 tflops: 25.90 mfu: 8.30% global_avg_ntp_loss: 3.6024 global_avg_mtp_loss: 16.2566 +[titan] 2025-06-13 13:23:28,473 - root - INFO - lr: 4.6441e-04 gnorm: 1.38 [ 0:41:58< 2:15:54] +[titan] 2025-06-13 13:23:31,936 - root - INFO - step: 3545 loss: 21.5155 memory: 6.46GiB(27.34%) tps: 23,659 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.9027 global_avg_mtp_loss: 17.6129 +[titan] 2025-06-13 13:23:31,936 - root - INFO - lr: 4.6427e-04 gnorm: 1.14 [ 0:42:02< 2:15:50] +[titan] 2025-06-13 13:23:34,565 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:23:35,284 - root - INFO - step: 3550 loss: 22.1286 memory: 6.46GiB(27.34%) tps: 24,473 tflops: 24.63 mfu: 7.89% global_avg_ntp_loss: 4.0316 global_avg_mtp_loss: 18.0971 +[titan] 2025-06-13 13:23:35,284 - root - INFO - lr: 4.6414e-04 gnorm: 1.11 [ 0:42:05< 2:15:46] +[titan] 2025-06-13 13:23:38,611 - root - INFO - step: 3555 loss: 21.3350 memory: 6.46GiB(27.34%) tps: 24,623 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 3.8915 global_avg_mtp_loss: 17.4435 +[titan] 2025-06-13 13:23:38,611 - root - INFO - lr: 4.6400e-04 gnorm: 1.05 [ 0:42:09< 2:15:42] +[titan] 2025-06-13 13:23:42,180 - root - INFO - step: 3560 loss: 20.6971 memory: 6.46GiB(27.34%) tps: 22,959 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 3.7988 global_avg_mtp_loss: 16.8982 +[titan] 2025-06-13 13:23:42,180 - root - INFO - lr: 4.6386e-04 gnorm: 1.27 [ 0:42:12< 2:15:38] +[titan] 2025-06-13 13:23:45,641 - root - INFO - step: 3565 loss: 21.9063 memory: 6.46GiB(27.34%) tps: 23,672 tflops: 23.82 mfu: 7.64% 
global_avg_ntp_loss: 3.9770 global_avg_mtp_loss: 17.9292 +[titan] 2025-06-13 13:23:45,641 - root - INFO - lr: 4.6372e-04 gnorm: 1.05 [ 0:42:16< 2:15:34] +[titan] 2025-06-13 13:23:48,898 - root - INFO - step: 3570 loss: 21.8186 memory: 6.46GiB(27.34%) tps: 25,153 tflops: 25.31 mfu: 8.11% global_avg_ntp_loss: 3.9534 global_avg_mtp_loss: 17.8651 +[titan] 2025-06-13 13:23:48,898 - root - INFO - lr: 4.6359e-04 gnorm: 1.04 [ 0:42:19< 2:15:30] +[titan] 2025-06-13 13:23:53,551 - root - INFO - step: 3575 loss: 20.8071 memory: 6.46GiB(27.34%) tps: 17,607 tflops: 17.72 mfu: 5.68% global_avg_ntp_loss: 3.7745 global_avg_mtp_loss: 17.0326 +[titan] 2025-06-13 13:23:53,552 - root - INFO - lr: 4.6345e-04 gnorm: 1.20 [ 0:42:24< 2:15:30] +[titan] 2025-06-13 13:23:57,271 - root - INFO - step: 3580 loss: 21.6079 memory: 6.46GiB(27.34%) tps: 22,026 tflops: 22.17 mfu: 7.10% global_avg_ntp_loss: 3.9615 global_avg_mtp_loss: 17.6464 +[titan] 2025-06-13 13:23:57,271 - root - INFO - lr: 4.6331e-04 gnorm: 1.09 [ 0:42:27< 2:15:27] +[titan] 2025-06-13 13:24:00,136 - root - INFO - Dumping profiler traces at step 3584 +[titan] 2025-06-13 13:24:00,217 - root - INFO - Finished dumping profiler traces in 0.08 seconds +[titan] 2025-06-13 13:24:00,878 - root - INFO - step: 3585 loss: 22.2256 memory: 6.46GiB(27.34%) tps: 22,712 tflops: 22.86 mfu: 7.33% global_avg_ntp_loss: 4.0552 global_avg_mtp_loss: 18.1704 +[titan] 2025-06-13 13:24:00,879 - root - INFO - lr: 4.6317e-04 gnorm: 1.06 [ 0:42:31< 2:15:23] +[titan] 2025-06-13 13:24:04,540 - root - INFO - step: 3590 loss: 21.4941 memory: 6.46GiB(27.34%) tps: 22,377 tflops: 22.52 mfu: 7.22% global_avg_ntp_loss: 3.8956 global_avg_mtp_loss: 17.5984 +[titan] 2025-06-13 13:24:04,540 - root - INFO - lr: 4.6303e-04 gnorm: 1.14 [ 0:42:35< 2:15:20] +[titan] 2025-06-13 13:24:07,997 - root - INFO - step: 3595 loss: 20.7771 memory: 6.46GiB(27.34%) tps: 23,696 tflops: 23.85 mfu: 7.64% global_avg_ntp_loss: 3.7677 global_avg_mtp_loss: 17.0094 +[titan] 2025-06-13 
13:24:07,998 - root - INFO - lr: 4.6290e-04 gnorm: 1.38 [ 0:42:38< 2:15:16] +[titan] 2025-06-13 13:24:10,862 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:24:11,438 - root - INFO - step: 3600 loss: 18.7542 memory: 6.46GiB(27.34%) tps: 23,813 tflops: 23.96 mfu: 7.68% global_avg_ntp_loss: 3.3648 global_avg_mtp_loss: 15.3894 +[titan] 2025-06-13 13:24:11,438 - root - INFO - lr: 4.6276e-04 gnorm: 2.25 [ 0:42:41< 2:15:12] +[titan] 2025-06-13 13:24:14,896 - root - INFO - step: 3605 loss: 20.2761 memory: 6.46GiB(27.34%) tps: 23,689 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 3.6612 global_avg_mtp_loss: 16.6150 +[titan] 2025-06-13 13:24:14,897 - root - INFO - lr: 4.6262e-04 gnorm: 1.46 [ 0:42:45< 2:15:08] +[titan] 2025-06-13 13:24:18,300 - root - INFO - step: 3610 loss: 21.2246 memory: 6.46GiB(27.34%) tps: 24,074 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.8270 global_avg_mtp_loss: 17.3976 +[titan] 2025-06-13 13:24:18,300 - root - INFO - lr: 4.6248e-04 gnorm: 1.18 [ 0:42:48< 2:15:04] +[titan] 2025-06-13 13:24:21,780 - root - INFO - step: 3615 loss: 19.5046 memory: 6.46GiB(27.34%) tps: 23,538 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.5346 global_avg_mtp_loss: 15.9700 +[titan] 2025-06-13 13:24:21,781 - root - INFO - lr: 4.6234e-04 gnorm: 1.46 [ 0:42:52< 2:15:01] +[titan] 2025-06-13 13:24:25,414 - root - INFO - step: 3620 loss: 20.2362 memory: 6.46GiB(27.34%) tps: 22,549 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.6491 global_avg_mtp_loss: 16.5871 +[titan] 2025-06-13 13:24:25,414 - root - INFO - lr: 4.6220e-04 gnorm: 1.66 [ 0:42:55< 2:14:57] +[titan] 2025-06-13 13:24:28,657 - root - INFO - step: 3625 loss: 21.2872 memory: 6.46GiB(27.34%) tps: 25,265 tflops: 25.43 mfu: 8.15% global_avg_ntp_loss: 3.8731 global_avg_mtp_loss: 17.4140 +[titan] 2025-06-13 13:24:28,657 - root - INFO - lr: 4.6206e-04 gnorm: 1.17 [ 0:42:59< 2:14:53] +[titan] 2025-06-13 13:24:32,191 - root - INFO - step: 3630 loss: 21.0631 memory: 
6.46GiB(27.34%) tps: 23,180 tflops: 23.33 mfu: 7.48% global_avg_ntp_loss: 3.8128 global_avg_mtp_loss: 17.2503 +[titan] 2025-06-13 13:24:32,192 - root - INFO - lr: 4.6192e-04 gnorm: 1.16 [ 0:43:02< 2:14:49] +[titan] 2025-06-13 13:24:35,670 - root - INFO - step: 3635 loss: 20.4813 memory: 6.46GiB(27.34%) tps: 23,548 tflops: 23.70 mfu: 7.60% global_avg_ntp_loss: 3.7149 global_avg_mtp_loss: 16.7664 +[titan] 2025-06-13 13:24:35,671 - root - INFO - lr: 4.6178e-04 gnorm: 1.12 [ 0:43:06< 2:14:45] +[titan] 2025-06-13 13:24:39,191 - root - INFO - step: 3640 loss: 20.7495 memory: 6.46GiB(27.34%) tps: 23,274 tflops: 23.42 mfu: 7.51% global_avg_ntp_loss: 3.7464 global_avg_mtp_loss: 17.0030 +[titan] 2025-06-13 13:24:39,191 - root - INFO - lr: 4.6164e-04 gnorm: 1.24 [ 0:43:09< 2:14:42] +[titan] 2025-06-13 13:24:42,527 - root - INFO - step: 3645 loss: 20.7404 memory: 6.46GiB(27.34%) tps: 24,559 tflops: 24.72 mfu: 7.92% global_avg_ntp_loss: 3.7355 global_avg_mtp_loss: 17.0050 +[titan] 2025-06-13 13:24:42,527 - root - INFO - lr: 4.6149e-04 gnorm: 1.12 [ 0:43:13< 2:14:37] +[titan] 2025-06-13 13:24:45,459 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:24:46,104 - root - INFO - step: 3650 loss: 20.8939 memory: 6.46GiB(27.34%) tps: 22,904 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.7633 global_avg_mtp_loss: 17.1306 +[titan] 2025-06-13 13:24:46,104 - root - INFO - lr: 4.6135e-04 gnorm: 1.13 [ 0:43:16< 2:14:34] +[titan] 2025-06-13 13:24:50,551 - root - INFO - step: 3655 loss: 21.5864 memory: 6.46GiB(27.34%) tps: 18,421 tflops: 18.54 mfu: 5.94% global_avg_ntp_loss: 3.9083 global_avg_mtp_loss: 17.6781 +[titan] 2025-06-13 13:24:50,552 - root - INFO - lr: 4.6121e-04 gnorm: 1.39 [ 0:43:21< 2:14:33] +[titan] 2025-06-13 13:24:53,542 - root - INFO - step: 3660 loss: 21.8356 memory: 6.46GiB(27.34%) tps: 27,396 tflops: 27.57 mfu: 8.84% global_avg_ntp_loss: 3.9394 global_avg_mtp_loss: 17.8962 +[titan] 2025-06-13 13:24:53,542 - root - INFO - lr: 4.6107e-04 gnorm: 1.17 [ 0:43:24< 2:14:28] +[titan] 2025-06-13 13:24:56,933 - root - INFO - step: 3665 loss: 21.9707 memory: 6.46GiB(27.34%) tps: 24,160 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 4.0146 global_avg_mtp_loss: 17.9560 +[titan] 2025-06-13 13:24:56,933 - root - INFO - lr: 4.6093e-04 gnorm: 1.11 [ 0:43:27< 2:14:24] +[titan] 2025-06-13 13:25:00,045 - root - INFO - step: 3670 loss: 21.2031 memory: 6.46GiB(27.34%) tps: 26,330 tflops: 26.50 mfu: 8.49% global_avg_ntp_loss: 3.8568 global_avg_mtp_loss: 17.3463 +[titan] 2025-06-13 13:25:00,045 - root - INFO - lr: 4.6079e-04 gnorm: 1.42 [ 0:43:30< 2:14:19] +[titan] 2025-06-13 13:25:03,425 - root - INFO - step: 3675 loss: 20.8608 memory: 6.46GiB(27.34%) tps: 24,239 tflops: 24.39 mfu: 7.82% global_avg_ntp_loss: 3.8007 global_avg_mtp_loss: 17.0601 +[titan] 2025-06-13 13:25:03,425 - root - INFO - lr: 4.6064e-04 gnorm: 1.12 [ 0:43:33< 2:14:15] +[titan] 2025-06-13 13:25:07,247 - root - INFO - step: 3680 loss: 20.5152 memory: 6.46GiB(27.34%) tps: 21,438 tflops: 21.57 mfu: 6.91% global_avg_ntp_loss: 3.7384 global_avg_mtp_loss: 16.7768 +[titan] 2025-06-13 13:25:07,247 - root - INFO - lr: 4.6050e-04 gnorm: 1.32 [ 
0:43:37< 2:14:12] +[titan] 2025-06-13 13:25:10,656 - root - INFO - step: 3685 loss: 21.6840 memory: 6.46GiB(27.34%) tps: 24,029 tflops: 24.18 mfu: 7.75% global_avg_ntp_loss: 3.9729 global_avg_mtp_loss: 17.7111 +[titan] 2025-06-13 13:25:10,657 - root - INFO - lr: 4.6036e-04 gnorm: 1.24 [ 0:43:41< 2:14:08] +[titan] 2025-06-13 13:25:13,883 - root - INFO - step: 3690 loss: 21.1733 memory: 6.46GiB(27.34%) tps: 25,389 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 3.8575 global_avg_mtp_loss: 17.3158 +[titan] 2025-06-13 13:25:13,884 - root - INFO - lr: 4.6021e-04 gnorm: 1.19 [ 0:43:44< 2:14:03] +[titan] 2025-06-13 13:25:18,016 - root - INFO - step: 3695 loss: 21.7982 memory: 6.46GiB(27.34%) tps: 19,827 tflops: 19.95 mfu: 6.40% global_avg_ntp_loss: 3.9931 global_avg_mtp_loss: 17.8051 +[titan] 2025-06-13 13:25:18,016 - root - INFO - lr: 4.6007e-04 gnorm: 1.08 [ 0:43:48< 2:14:02] +[titan] 2025-06-13 13:25:20,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:25:21,641 - root - INFO - step: 3700 loss: 21.2467 memory: 6.46GiB(27.34%) tps: 22,601 tflops: 22.75 mfu: 7.29% global_avg_ntp_loss: 3.8539 global_avg_mtp_loss: 17.3928 +[titan] 2025-06-13 13:25:21,641 - root - INFO - lr: 4.5993e-04 gnorm: 1.39 [ 0:43:52< 2:13:58] +[titan] 2025-06-13 13:25:24,635 - root - INFO - step: 3705 loss: 20.8086 memory: 6.46GiB(27.34%) tps: 27,361 tflops: 27.54 mfu: 8.83% global_avg_ntp_loss: 3.7735 global_avg_mtp_loss: 17.0351 +[titan] 2025-06-13 13:25:24,635 - root - INFO - lr: 4.5978e-04 gnorm: 1.14 [ 0:43:55< 2:13:53] +[titan] 2025-06-13 13:25:27,850 - root - INFO - step: 3710 loss: 22.0380 memory: 6.46GiB(27.34%) tps: 25,487 tflops: 25.65 mfu: 8.22% global_avg_ntp_loss: 4.0436 global_avg_mtp_loss: 17.9944 +[titan] 2025-06-13 13:25:27,850 - root - INFO - lr: 4.5964e-04 gnorm: 1.03 [ 0:43:58< 2:13:48] +[titan] 2025-06-13 13:25:31,404 - root - INFO - step: 3715 loss: 21.5719 memory: 6.46GiB(27.34%) tps: 23,051 tflops: 23.20 mfu: 7.44% 
global_avg_ntp_loss: 3.9155 global_avg_mtp_loss: 17.6563 +[titan] 2025-06-13 13:25:31,404 - root - INFO - lr: 4.5949e-04 gnorm: 1.06 [ 0:44:01< 2:13:45] +[titan] 2025-06-13 13:25:34,741 - root - INFO - step: 3720 loss: 20.3053 memory: 6.46GiB(27.34%) tps: 24,552 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.7551 global_avg_mtp_loss: 16.5503 +[titan] 2025-06-13 13:25:34,741 - root - INFO - lr: 4.5935e-04 gnorm: 1.20 [ 0:44:05< 2:13:41] +[titan] 2025-06-13 13:25:37,857 - root - INFO - step: 3725 loss: 18.7156 memory: 6.46GiB(27.34%) tps: 26,295 tflops: 26.46 mfu: 8.48% global_avg_ntp_loss: 3.3436 global_avg_mtp_loss: 15.3720 +[titan] 2025-06-13 13:25:37,857 - root - INFO - lr: 4.5921e-04 gnorm: 1.68 [ 0:44:08< 2:13:36] +[titan] 2025-06-13 13:25:41,536 - root - INFO - step: 3730 loss: 20.5463 memory: 6.46GiB(27.34%) tps: 22,269 tflops: 22.41 mfu: 7.18% global_avg_ntp_loss: 3.7134 global_avg_mtp_loss: 16.8328 +[titan] 2025-06-13 13:25:41,536 - root - INFO - lr: 4.5906e-04 gnorm: 1.22 [ 0:44:12< 2:13:32] +[titan] 2025-06-13 13:25:44,963 - root - INFO - step: 3735 loss: 21.2768 memory: 6.46GiB(27.34%) tps: 23,906 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.8781 global_avg_mtp_loss: 17.3987 +[titan] 2025-06-13 13:25:44,963 - root - INFO - lr: 4.5892e-04 gnorm: 1.07 [ 0:44:15< 2:13:29] +[titan] 2025-06-13 13:25:48,707 - root - INFO - step: 3740 loss: 20.7666 memory: 6.46GiB(27.34%) tps: 21,883 tflops: 22.02 mfu: 7.06% global_avg_ntp_loss: 3.7317 global_avg_mtp_loss: 17.0349 +[titan] 2025-06-13 13:25:48,707 - root - INFO - lr: 4.5877e-04 gnorm: 1.12 [ 0:44:19< 2:13:26] +[titan] 2025-06-13 13:25:52,265 - root - INFO - step: 3745 loss: 21.3760 memory: 6.46GiB(27.34%) tps: 23,026 tflops: 23.17 mfu: 7.43% global_avg_ntp_loss: 3.8548 global_avg_mtp_loss: 17.5212 +[titan] 2025-06-13 13:25:52,266 - root - INFO - lr: 4.5862e-04 gnorm: 1.06 [ 0:44:22< 2:13:22] +[titan] 2025-06-13 13:25:55,093 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:25:55,864 - root - INFO - step: 3750 loss: 21.3511 memory: 6.46GiB(27.34%) tps: 22,766 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.8884 global_avg_mtp_loss: 17.4627 +[titan] 2025-06-13 13:25:55,864 - root - INFO - lr: 4.5848e-04 gnorm: 1.11 [ 0:44:26< 2:13:19] +[titan] 2025-06-13 13:25:59,035 - root - INFO - step: 3755 loss: 17.3953 memory: 6.46GiB(27.34%) tps: 25,840 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.1922 global_avg_mtp_loss: 14.2031 +[titan] 2025-06-13 13:25:59,035 - root - INFO - lr: 4.5833e-04 gnorm: 1.08 [ 0:44:29< 2:13:14] +[titan] 2025-06-13 13:26:02,649 - root - INFO - step: 3760 loss: 22.0016 memory: 6.46GiB(27.34%) tps: 22,670 tflops: 22.81 mfu: 7.31% global_avg_ntp_loss: 3.9573 global_avg_mtp_loss: 18.0443 +[titan] 2025-06-13 13:26:02,649 - root - INFO - lr: 4.5819e-04 gnorm: 1.17 [ 0:44:33< 2:13:11] +[titan] 2025-06-13 13:26:06,282 - root - INFO - step: 3765 loss: 21.1294 memory: 6.46GiB(27.34%) tps: 22,551 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.8196 global_avg_mtp_loss: 17.3097 +[titan] 2025-06-13 13:26:06,282 - root - INFO - lr: 4.5804e-04 gnorm: 1.13 [ 0:44:36< 2:13:07] +[titan] 2025-06-13 13:26:09,755 - root - INFO - step: 3770 loss: 21.0678 memory: 6.46GiB(27.34%) tps: 23,592 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.8140 global_avg_mtp_loss: 17.2538 +[titan] 2025-06-13 13:26:09,755 - root - INFO - lr: 4.5789e-04 gnorm: 1.02 [ 0:44:40< 2:13:03] +[titan] 2025-06-13 13:26:13,248 - root - INFO - step: 3775 loss: 20.6523 memory: 6.46GiB(27.34%) tps: 23,451 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.7233 global_avg_mtp_loss: 16.9290 +[titan] 2025-06-13 13:26:13,249 - root - INFO - lr: 4.5774e-04 gnorm: 1.30 [ 0:44:43< 2:13:00] +[titan] 2025-06-13 13:26:19,101 - root - INFO - step: 3780 loss: 21.5443 memory: 6.46GiB(27.34%) tps: 13,999 tflops: 14.09 mfu: 4.52% global_avg_ntp_loss: 4.0344 global_avg_mtp_loss: 17.5099 +[titan] 2025-06-13 13:26:19,101 - root - INFO - lr: 4.5760e-04 gnorm: 1.46 [ 
0:44:49< 2:13:03] +[titan] 2025-06-13 13:26:22,588 - root - INFO - step: 3785 loss: 21.8333 memory: 6.46GiB(27.34%) tps: 23,491 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 3.9952 global_avg_mtp_loss: 17.8381 +[titan] 2025-06-13 13:26:22,589 - root - INFO - lr: 4.5745e-04 gnorm: 1.09 [ 0:44:53< 2:12:59] +[titan] 2025-06-13 13:26:25,963 - root - INFO - step: 3790 loss: 21.4718 memory: 6.46GiB(27.34%) tps: 24,280 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.8541 global_avg_mtp_loss: 17.6177 +[titan] 2025-06-13 13:26:25,963 - root - INFO - lr: 4.5730e-04 gnorm: 0.97 [ 0:44:56< 2:12:55] +[titan] 2025-06-13 13:26:28,928 - root - INFO - step: 3795 loss: 22.0940 memory: 6.46GiB(27.34%) tps: 27,629 tflops: 27.81 mfu: 8.91% global_avg_ntp_loss: 3.9946 global_avg_mtp_loss: 18.0994 +[titan] 2025-06-13 13:26:28,928 - root - INFO - lr: 4.5715e-04 gnorm: 1.05 [ 0:44:59< 2:12:50] +[titan] 2025-06-13 13:26:31,911 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:26:32,447 - root - INFO - step: 3800 loss: 20.7050 memory: 6.46GiB(27.34%) tps: 23,279 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.8038 global_avg_mtp_loss: 16.9012 +[titan] 2025-06-13 13:26:32,448 - root - INFO - lr: 4.5701e-04 gnorm: 1.97 [ 0:45:02< 2:12:46] +[titan] 2025-06-13 13:26:35,665 - root - INFO - step: 3805 loss: 21.5749 memory: 6.46GiB(27.34%) tps: 25,468 tflops: 25.63 mfu: 8.21% global_avg_ntp_loss: 3.9755 global_avg_mtp_loss: 17.5993 +[titan] 2025-06-13 13:26:35,665 - root - INFO - lr: 4.5686e-04 gnorm: 1.19 [ 0:45:06< 2:12:42] +[titan] 2025-06-13 13:26:38,759 - root - INFO - step: 3810 loss: 21.5801 memory: 6.46GiB(27.34%) tps: 26,473 tflops: 26.64 mfu: 8.54% global_avg_ntp_loss: 3.9871 global_avg_mtp_loss: 17.5930 +[titan] 2025-06-13 13:26:38,760 - root - INFO - lr: 4.5671e-04 gnorm: 1.33 [ 0:45:09< 2:12:37] +[titan] 2025-06-13 13:26:42,313 - root - INFO - step: 3815 loss: 21.1018 memory: 6.46GiB(27.34%) tps: 23,060 tflops: 23.21 mfu: 7.44% 
global_avg_ntp_loss: 3.8235 global_avg_mtp_loss: 17.2783 +[titan] 2025-06-13 13:26:42,313 - root - INFO - lr: 4.5656e-04 gnorm: 1.07 [ 0:45:12< 2:12:33] +[titan] 2025-06-13 13:26:45,920 - root - INFO - step: 3820 loss: 20.7508 memory: 6.46GiB(27.34%) tps: 22,713 tflops: 22.86 mfu: 7.33% global_avg_ntp_loss: 3.7468 global_avg_mtp_loss: 17.0039 +[titan] 2025-06-13 13:26:45,920 - root - INFO - lr: 4.5641e-04 gnorm: 1.02 [ 0:45:16< 2:12:30] +[titan] 2025-06-13 13:26:49,349 - root - INFO - step: 3825 loss: 20.9424 memory: 6.46GiB(27.34%) tps: 23,887 tflops: 24.04 mfu: 7.71% global_avg_ntp_loss: 3.8699 global_avg_mtp_loss: 17.0725 +[titan] 2025-06-13 13:26:49,350 - root - INFO - lr: 4.5626e-04 gnorm: 1.27 [ 0:45:19< 2:12:26] +[titan] 2025-06-13 13:26:52,724 - root - INFO - step: 3830 loss: 21.5684 memory: 6.46GiB(27.34%) tps: 24,276 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.9068 global_avg_mtp_loss: 17.6616 +[titan] 2025-06-13 13:26:52,725 - root - INFO - lr: 4.5611e-04 gnorm: 1.08 [ 0:45:23< 2:12:22] +[titan] 2025-06-13 13:26:56,098 - root - INFO - step: 3835 loss: 19.6037 memory: 6.46GiB(27.34%) tps: 24,285 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.5407 global_avg_mtp_loss: 16.0630 +[titan] 2025-06-13 13:26:56,098 - root - INFO - lr: 4.5596e-04 gnorm: 1.26 [ 0:45:26< 2:12:18] +[titan] 2025-06-13 13:26:59,601 - root - INFO - step: 3840 loss: 20.4492 memory: 6.46GiB(27.34%) tps: 23,388 tflops: 23.54 mfu: 7.54% global_avg_ntp_loss: 3.6982 global_avg_mtp_loss: 16.7510 +[titan] 2025-06-13 13:26:59,602 - root - INFO - lr: 4.5581e-04 gnorm: 1.80 [ 0:45:30< 2:12:14] +[titan] 2025-06-13 13:27:03,245 - root - INFO - step: 3845 loss: 20.8924 memory: 6.46GiB(27.34%) tps: 22,483 tflops: 22.63 mfu: 7.25% global_avg_ntp_loss: 3.7405 global_avg_mtp_loss: 17.1520 +[titan] 2025-06-13 13:27:03,246 - root - INFO - lr: 4.5566e-04 gnorm: 1.24 [ 0:45:33< 2:12:11] +[titan] 2025-06-13 13:27:05,670 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:27:06,333 - root - INFO - step: 3850 loss: 21.0751 memory: 6.46GiB(27.34%) tps: 26,535 tflops: 26.70 mfu: 8.56% global_avg_ntp_loss: 3.8090 global_avg_mtp_loss: 17.2661 +[titan] 2025-06-13 13:27:06,333 - root - INFO - lr: 4.5551e-04 gnorm: 1.15 [ 0:45:36< 2:12:06] +[titan] 2025-06-13 13:27:09,804 - root - INFO - step: 3855 loss: 19.7669 memory: 6.46GiB(27.34%) tps: 23,606 tflops: 23.76 mfu: 7.61% global_avg_ntp_loss: 3.5444 global_avg_mtp_loss: 16.2224 +[titan] 2025-06-13 13:27:09,804 - root - INFO - lr: 4.5536e-04 gnorm: 1.40 [ 0:45:40< 2:12:02] +[titan] 2025-06-13 13:27:13,345 - root - INFO - step: 3860 loss: 21.0703 memory: 6.46GiB(27.34%) tps: 23,133 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.8060 global_avg_mtp_loss: 17.2642 +[titan] 2025-06-13 13:27:13,346 - root - INFO - lr: 4.5521e-04 gnorm: 1.22 [ 0:45:43< 2:11:58] +[titan] 2025-06-13 13:27:16,811 - root - INFO - step: 3865 loss: 21.1169 memory: 6.46GiB(27.34%) tps: 23,644 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 3.9946 global_avg_mtp_loss: 17.1223 +[titan] 2025-06-13 13:27:16,811 - root - INFO - lr: 4.5506e-04 gnorm: 1.69 [ 0:45:47< 2:11:54] +[titan] 2025-06-13 13:27:20,338 - root - INFO - step: 3870 loss: 21.3868 memory: 6.46GiB(27.34%) tps: 23,227 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 3.8613 global_avg_mtp_loss: 17.5255 +[titan] 2025-06-13 13:27:20,338 - root - INFO - lr: 4.5490e-04 gnorm: 1.16 [ 0:45:50< 2:11:51] +[titan] 2025-06-13 13:27:23,644 - root - INFO - step: 3875 loss: 21.7581 memory: 6.46GiB(27.34%) tps: 24,779 tflops: 24.94 mfu: 7.99% global_avg_ntp_loss: 3.9025 global_avg_mtp_loss: 17.8555 +[titan] 2025-06-13 13:27:23,645 - root - INFO - lr: 4.5475e-04 gnorm: 0.94 [ 0:45:54< 2:11:47] +[titan] 2025-06-13 13:27:27,106 - root - INFO - step: 3880 loss: 21.6706 memory: 6.46GiB(27.34%) tps: 23,670 tflops: 23.82 mfu: 7.63% global_avg_ntp_loss: 3.9484 global_avg_mtp_loss: 17.7222 +[titan] 2025-06-13 13:27:27,106 - root - INFO - lr: 4.5460e-04 gnorm: 1.06 [ 
0:45:57< 2:11:43] +[titan] 2025-06-13 13:27:30,461 - root - INFO - step: 3885 loss: 21.5805 memory: 6.46GiB(27.34%) tps: 24,419 tflops: 24.58 mfu: 7.88% global_avg_ntp_loss: 3.8553 global_avg_mtp_loss: 17.7252 +[titan] 2025-06-13 13:27:30,461 - root - INFO - lr: 4.5445e-04 gnorm: 1.30 [ 0:46:00< 2:11:39] +[titan] 2025-06-13 13:27:34,037 - root - INFO - step: 3890 loss: 20.4234 memory: 6.46GiB(27.34%) tps: 22,912 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.6365 global_avg_mtp_loss: 16.7870 +[titan] 2025-06-13 13:27:34,037 - root - INFO - lr: 4.5430e-04 gnorm: 1.22 [ 0:46:04< 2:11:35] +[titan] 2025-06-13 13:27:37,374 - root - INFO - step: 3895 loss: 20.3405 memory: 6.46GiB(27.34%) tps: 24,551 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.5730 global_avg_mtp_loss: 16.7675 +[titan] 2025-06-13 13:27:37,374 - root - INFO - lr: 4.5414e-04 gnorm: 1.25 [ 0:46:07< 2:11:31] +[titan] 2025-06-13 13:27:40,296 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:27:41,003 - root - INFO - step: 3900 loss: 21.3993 memory: 6.46GiB(27.34%) tps: 22,576 tflops: 22.72 mfu: 7.28% global_avg_ntp_loss: 3.8484 global_avg_mtp_loss: 17.5508 +[titan] 2025-06-13 13:27:41,003 - root - INFO - lr: 4.5399e-04 gnorm: 1.25 [ 0:46:11< 2:11:28] +[titan] 2025-06-13 13:27:44,394 - root - INFO - step: 3905 loss: 19.8990 memory: 6.46GiB(27.34%) tps: 24,161 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 3.6230 global_avg_mtp_loss: 16.2760 +[titan] 2025-06-13 13:27:44,394 - root - INFO - lr: 4.5384e-04 gnorm: 1.13 [ 0:46:14< 2:11:24] +[titan] 2025-06-13 13:27:47,968 - root - INFO - step: 3910 loss: 20.7124 memory: 6.46GiB(27.34%) tps: 22,924 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 3.7154 global_avg_mtp_loss: 16.9970 +[titan] 2025-06-13 13:27:47,968 - root - INFO - lr: 4.5368e-04 gnorm: 1.14 [ 0:46:18< 2:11:20] +[titan] 2025-06-13 13:27:51,439 - root - INFO - step: 3915 loss: 20.0835 memory: 6.46GiB(27.34%) tps: 23,603 tflops: 23.75 mfu: 7.61% 
global_avg_ntp_loss: 3.5779 global_avg_mtp_loss: 16.5056 +[titan] 2025-06-13 13:27:51,439 - root - INFO - lr: 4.5353e-04 gnorm: 1.97 [ 0:46:21< 2:11:16] +[titan] 2025-06-13 13:27:54,962 - root - INFO - step: 3920 loss: 21.5828 memory: 6.46GiB(27.34%) tps: 23,258 tflops: 23.41 mfu: 7.50% global_avg_ntp_loss: 3.9256 global_avg_mtp_loss: 17.6571 +[titan] 2025-06-13 13:27:54,962 - root - INFO - lr: 4.5338e-04 gnorm: 1.07 [ 0:46:25< 2:11:13] +[titan] 2025-06-13 13:27:58,716 - root - INFO - step: 3925 loss: 21.3776 memory: 6.46GiB(27.34%) tps: 21,820 tflops: 21.96 mfu: 7.04% global_avg_ntp_loss: 3.8736 global_avg_mtp_loss: 17.5040 +[titan] 2025-06-13 13:27:58,717 - root - INFO - lr: 4.5322e-04 gnorm: 1.01 [ 0:46:29< 2:11:10] +[titan] 2025-06-13 13:28:02,108 - root - INFO - step: 3930 loss: 19.4804 memory: 6.46GiB(27.34%) tps: 24,159 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 3.5329 global_avg_mtp_loss: 15.9475 +[titan] 2025-06-13 13:28:02,108 - root - INFO - lr: 4.5307e-04 gnorm: 1.48 [ 0:46:32< 2:11:06] +[titan] 2025-06-13 13:28:05,832 - root - INFO - step: 3935 loss: 20.3600 memory: 6.46GiB(27.34%) tps: 22,000 tflops: 22.14 mfu: 7.10% global_avg_ntp_loss: 3.6983 global_avg_mtp_loss: 16.6617 +[titan] 2025-06-13 13:28:05,832 - root - INFO - lr: 4.5292e-04 gnorm: 1.53 [ 0:46:36< 2:11:03] +[titan] 2025-06-13 13:28:09,057 - root - INFO - step: 3940 loss: 20.3293 memory: 6.46GiB(27.34%) tps: 25,403 tflops: 25.56 mfu: 8.19% global_avg_ntp_loss: 3.6963 global_avg_mtp_loss: 16.6330 +[titan] 2025-06-13 13:28:09,058 - root - INFO - lr: 4.5276e-04 gnorm: 1.17 [ 0:46:39< 2:10:58] +[titan] 2025-06-13 13:28:12,322 - root - INFO - step: 3945 loss: 19.9797 memory: 6.46GiB(27.34%) tps: 25,098 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 3.5965 global_avg_mtp_loss: 16.3832 +[titan] 2025-06-13 13:28:12,322 - root - INFO - lr: 4.5261e-04 gnorm: 1.32 [ 0:46:42< 2:10:54] +[titan] 2025-06-13 13:28:14,867 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:28:15,451 - root - INFO - step: 3950 loss: 21.1523 memory: 6.46GiB(27.34%) tps: 26,181 tflops: 26.35 mfu: 8.44% global_avg_ntp_loss: 3.8058 global_avg_mtp_loss: 17.3465 +[titan] 2025-06-13 13:28:15,451 - root - INFO - lr: 4.5245e-04 gnorm: 1.09 [ 0:46:45< 2:10:49] +[titan] 2025-06-13 13:28:19,670 - root - INFO - step: 3955 loss: 19.4963 memory: 6.46GiB(27.34%) tps: 19,420 tflops: 19.54 mfu: 6.26% global_avg_ntp_loss: 3.4920 global_avg_mtp_loss: 16.0043 +[titan] 2025-06-13 13:28:19,670 - root - INFO - lr: 4.5230e-04 gnorm: 1.22 [ 0:46:50< 2:10:47] +[titan] 2025-06-13 13:28:23,046 - root - INFO - step: 3960 loss: 21.4535 memory: 6.46GiB(27.34%) tps: 24,266 tflops: 24.42 mfu: 7.83% global_avg_ntp_loss: 3.8778 global_avg_mtp_loss: 17.5757 +[titan] 2025-06-13 13:28:23,047 - root - INFO - lr: 4.5214e-04 gnorm: 1.15 [ 0:46:53< 2:10:43] +[titan] 2025-06-13 13:28:26,863 - root - INFO - step: 3965 loss: 20.5897 memory: 6.46GiB(27.34%) tps: 21,468 tflops: 21.60 mfu: 6.92% global_avg_ntp_loss: 3.6870 global_avg_mtp_loss: 16.9027 +[titan] 2025-06-13 13:28:26,863 - root - INFO - lr: 4.5198e-04 gnorm: 1.39 [ 0:46:57< 2:10:40] +[titan] 2025-06-13 13:28:30,400 - root - INFO - step: 3970 loss: 19.8476 memory: 6.46GiB(27.34%) tps: 23,163 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.5804 global_avg_mtp_loss: 16.2672 +[titan] 2025-06-13 13:28:30,400 - root - INFO - lr: 4.5183e-04 gnorm: 1.50 [ 0:47:00< 2:10:37] +[titan] 2025-06-13 13:28:33,791 - root - INFO - step: 3975 loss: 20.7157 memory: 6.46GiB(27.34%) tps: 24,161 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 3.7084 global_avg_mtp_loss: 17.0074 +[titan] 2025-06-13 13:28:33,791 - root - INFO - lr: 4.5167e-04 gnorm: 1.08 [ 0:47:04< 2:10:33] +[titan] 2025-06-13 13:28:37,570 - root - INFO - step: 3980 loss: 21.0444 memory: 6.46GiB(27.34%) tps: 21,676 tflops: 21.81 mfu: 6.99% global_avg_ntp_loss: 3.8274 global_avg_mtp_loss: 17.2170 +[titan] 2025-06-13 13:28:37,570 - root - INFO - lr: 4.5152e-04 gnorm: 1.01 [ 
0:47:08< 2:10:30] +[titan] 2025-06-13 13:28:40,979 - root - INFO - step: 3985 loss: 21.0697 memory: 6.46GiB(27.34%) tps: 24,033 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.8255 global_avg_mtp_loss: 17.2442 +[titan] 2025-06-13 13:28:40,979 - root - INFO - lr: 4.5136e-04 gnorm: 1.11 [ 0:47:11< 2:10:26] +[titan] 2025-06-13 13:28:44,105 - root - INFO - step: 3990 loss: 22.0601 memory: 6.46GiB(27.34%) tps: 26,212 tflops: 26.38 mfu: 8.45% global_avg_ntp_loss: 4.0798 global_avg_mtp_loss: 17.9803 +[titan] 2025-06-13 13:28:44,105 - root - INFO - lr: 4.5120e-04 gnorm: 1.16 [ 0:47:14< 2:10:21] +[titan] 2025-06-13 13:28:47,826 - root - INFO - step: 3995 loss: 21.2415 memory: 6.46GiB(27.34%) tps: 22,017 tflops: 22.16 mfu: 7.10% global_avg_ntp_loss: 3.8343 global_avg_mtp_loss: 17.4072 +[titan] 2025-06-13 13:28:47,827 - root - INFO - lr: 4.5104e-04 gnorm: 1.20 [ 0:47:18< 2:10:18] +[titan] 2025-06-13 13:28:50,653 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:28:51,352 - root - INFO - step: 4000 loss: 20.8727 memory: 6.46GiB(27.34%) tps: 23,237 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.8221 global_avg_mtp_loss: 17.0506 +[titan] 2025-06-13 13:28:51,353 - root - INFO - lr: 4.5089e-04 gnorm: 1.07 [ 0:47:21< 2:10:15] +[titan] 2025-06-13 13:28:54,991 - root - INFO - step: 4005 loss: 19.9424 memory: 6.46GiB(27.34%) tps: 22,514 tflops: 22.66 mfu: 7.26% global_avg_ntp_loss: 3.6134 global_avg_mtp_loss: 16.3290 +[titan] 2025-06-13 13:28:54,992 - root - INFO - lr: 4.5073e-04 gnorm: 1.95 [ 0:47:25< 2:10:11] +[titan] 2025-06-13 13:28:58,253 - root - INFO - step: 4010 loss: 21.4018 memory: 6.46GiB(27.34%) tps: 25,116 tflops: 25.28 mfu: 8.10% global_avg_ntp_loss: 3.8585 global_avg_mtp_loss: 17.5433 +[titan] 2025-06-13 13:28:58,254 - root - INFO - lr: 4.5057e-04 gnorm: 1.13 [ 0:47:28< 2:10:07] +[titan] 2025-06-13 13:29:02,146 - root - INFO - step: 4015 loss: 21.0489 memory: 6.46GiB(27.34%) tps: 21,047 tflops: 21.18 mfu: 6.79% 
global_avg_ntp_loss: 3.8104 global_avg_mtp_loss: 17.2385 +[titan] 2025-06-13 13:29:02,146 - root - INFO - lr: 4.5041e-04 gnorm: 1.17 [ 0:47:32< 2:10:04] +[titan] 2025-06-13 13:29:05,542 - root - INFO - step: 4020 loss: 20.5958 memory: 6.46GiB(27.34%) tps: 24,125 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.6742 global_avg_mtp_loss: 16.9216 +[titan] 2025-06-13 13:29:05,542 - root - INFO - lr: 4.5026e-04 gnorm: 1.19 [ 0:47:36< 2:10:00] +[titan] 2025-06-13 13:29:08,946 - root - INFO - step: 4025 loss: 21.2907 memory: 6.46GiB(27.34%) tps: 24,073 tflops: 24.23 mfu: 7.76% global_avg_ntp_loss: 3.9282 global_avg_mtp_loss: 17.3625 +[titan] 2025-06-13 13:29:08,946 - root - INFO - lr: 4.5010e-04 gnorm: 1.39 [ 0:47:39< 2:09:56] +[titan] 2025-06-13 13:29:12,147 - root - INFO - step: 4030 loss: 20.5277 memory: 6.46GiB(27.34%) tps: 25,594 tflops: 25.76 mfu: 8.26% global_avg_ntp_loss: 3.6776 global_avg_mtp_loss: 16.8501 +[titan] 2025-06-13 13:29:12,147 - root - INFO - lr: 4.4994e-04 gnorm: 1.32 [ 0:47:42< 2:09:52] +[titan] 2025-06-13 13:29:15,550 - root - INFO - step: 4035 loss: 20.0739 memory: 6.46GiB(27.34%) tps: 24,076 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.6360 global_avg_mtp_loss: 16.4379 +[titan] 2025-06-13 13:29:15,550 - root - INFO - lr: 4.4978e-04 gnorm: 1.30 [ 0:47:46< 2:09:48] +[titan] 2025-06-13 13:29:18,965 - root - INFO - step: 4040 loss: 21.3873 memory: 6.46GiB(27.34%) tps: 23,989 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 4.0063 global_avg_mtp_loss: 17.3810 +[titan] 2025-06-13 13:29:18,965 - root - INFO - lr: 4.4962e-04 gnorm: 2.48 [ 0:47:49< 2:09:44] +[titan] 2025-06-13 13:29:22,635 - root - INFO - step: 4045 loss: 21.2670 memory: 6.46GiB(27.34%) tps: 22,325 tflops: 22.47 mfu: 7.20% global_avg_ntp_loss: 3.8144 global_avg_mtp_loss: 17.4526 +[titan] 2025-06-13 13:29:22,635 - root - INFO - lr: 4.4946e-04 gnorm: 1.05 [ 0:47:53< 2:09:41] +[titan] 2025-06-13 13:29:25,397 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:29:26,133 - root - INFO - step: 4050 loss: 21.2014 memory: 6.46GiB(27.34%) tps: 23,418 tflops: 23.57 mfu: 7.55% global_avg_ntp_loss: 3.8783 global_avg_mtp_loss: 17.3231 +[titan] 2025-06-13 13:29:26,134 - root - INFO - lr: 4.4930e-04 gnorm: 1.15 [ 0:47:56< 2:09:37] +[titan] 2025-06-13 13:29:29,219 - root - INFO - step: 4055 loss: 21.2328 memory: 6.46GiB(27.34%) tps: 26,554 tflops: 26.72 mfu: 8.57% global_avg_ntp_loss: 3.8595 global_avg_mtp_loss: 17.3733 +[titan] 2025-06-13 13:29:29,219 - root - INFO - lr: 4.4914e-04 gnorm: 1.13 [ 0:47:59< 2:09:32] +[titan] 2025-06-13 13:29:32,553 - root - INFO - step: 4060 loss: 21.0101 memory: 6.46GiB(27.34%) tps: 24,577 tflops: 24.73 mfu: 7.93% global_avg_ntp_loss: 3.7748 global_avg_mtp_loss: 17.2353 +[titan] 2025-06-13 13:29:32,553 - root - INFO - lr: 4.4898e-04 gnorm: 1.15 [ 0:48:03< 2:09:28] +[titan] 2025-06-13 13:29:36,226 - root - INFO - step: 4065 loss: 19.8445 memory: 6.46GiB(27.34%) tps: 22,302 tflops: 22.44 mfu: 7.19% global_avg_ntp_loss: 3.5432 global_avg_mtp_loss: 16.3013 +[titan] 2025-06-13 13:29:36,227 - root - INFO - lr: 4.4882e-04 gnorm: 1.17 [ 0:48:06< 2:09:25] +[titan] 2025-06-13 13:29:39,688 - root - INFO - step: 4070 loss: 18.7849 memory: 6.46GiB(27.34%) tps: 23,665 tflops: 23.82 mfu: 7.63% global_avg_ntp_loss: 3.3828 global_avg_mtp_loss: 15.4021 +[titan] 2025-06-13 13:29:39,689 - root - INFO - lr: 4.4866e-04 gnorm: 1.33 [ 0:48:10< 2:09:21] +[titan] 2025-06-13 13:29:42,819 - root - INFO - step: 4075 loss: 21.1399 memory: 6.46GiB(27.34%) tps: 26,172 tflops: 26.34 mfu: 8.44% global_avg_ntp_loss: 3.9104 global_avg_mtp_loss: 17.2294 +[titan] 2025-06-13 13:29:42,819 - root - INFO - lr: 4.4850e-04 gnorm: 1.25 [ 0:48:13< 2:09:16] +[titan] 2025-06-13 13:29:45,916 - root - INFO - step: 4080 loss: 21.3853 memory: 6.46GiB(27.34%) tps: 26,455 tflops: 26.62 mfu: 8.53% global_avg_ntp_loss: 3.8874 global_avg_mtp_loss: 17.4980 +[titan] 2025-06-13 13:29:45,916 - root - INFO - lr: 4.4834e-04 gnorm: 1.51 [ 
0:48:16< 2:09:12] +[titan] 2025-06-13 13:29:49,357 - root - INFO - step: 4085 loss: 21.0120 memory: 6.46GiB(27.34%) tps: 23,810 tflops: 23.96 mfu: 7.68% global_avg_ntp_loss: 3.8022 global_avg_mtp_loss: 17.2098 +[titan] 2025-06-13 13:29:49,357 - root - INFO - lr: 4.4818e-04 gnorm: 1.58 [ 0:48:19< 2:09:08] +[titan] 2025-06-13 13:29:52,785 - root - INFO - step: 4090 loss: 20.7246 memory: 6.46GiB(27.34%) tps: 23,901 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.7176 global_avg_mtp_loss: 17.0070 +[titan] 2025-06-13 13:29:52,785 - root - INFO - lr: 4.4802e-04 gnorm: 1.13 [ 0:48:23< 2:09:04] +[titan] 2025-06-13 13:29:55,967 - root - INFO - step: 4095 loss: 19.8929 memory: 6.46GiB(27.34%) tps: 25,749 tflops: 25.91 mfu: 8.31% global_avg_ntp_loss: 3.5597 global_avg_mtp_loss: 16.3333 +[titan] 2025-06-13 13:29:55,967 - root - INFO - lr: 4.4786e-04 gnorm: 1.43 [ 0:48:26< 2:08:59] +[titan] 2025-06-13 13:29:56,880 - root - INFO - Dumping profiler traces at step 4096 +[titan] 2025-06-13 13:29:56,970 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 13:29:58,822 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:29:59,432 - root - INFO - step: 4100 loss: 21.6300 memory: 6.46GiB(27.34%) tps: 23,643 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 3.8998 global_avg_mtp_loss: 17.7303 +[titan] 2025-06-13 13:29:59,432 - root - INFO - lr: 4.4769e-04 gnorm: 1.03 [ 0:48:29< 2:08:56] +[titan] 2025-06-13 13:30:03,007 - root - INFO - step: 4105 loss: 20.4243 memory: 6.46GiB(27.34%) tps: 22,916 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.6383 global_avg_mtp_loss: 16.7860 +[titan] 2025-06-13 13:30:03,007 - root - INFO - lr: 4.4753e-04 gnorm: 1.06 [ 0:48:33< 2:08:52] +[titan] 2025-06-13 13:30:06,490 - root - INFO - step: 4110 loss: 20.1935 memory: 6.46GiB(27.34%) tps: 23,520 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.6211 global_avg_mtp_loss: 16.5724 +[titan] 2025-06-13 13:30:06,491 - root - INFO - lr: 4.4737e-04 gnorm: 1.02 [ 0:48:36< 2:08:48] +[titan] 2025-06-13 13:30:10,001 - root - INFO - step: 4115 loss: 20.9547 memory: 6.46GiB(27.34%) tps: 23,334 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.7810 global_avg_mtp_loss: 17.1737 +[titan] 2025-06-13 13:30:10,002 - root - INFO - lr: 4.4721e-04 gnorm: 1.05 [ 0:48:40< 2:08:45] +[titan] 2025-06-13 13:30:14,005 - root - INFO - step: 4120 loss: 21.0463 memory: 6.46GiB(27.34%) tps: 20,466 tflops: 20.60 mfu: 6.60% global_avg_ntp_loss: 3.8164 global_avg_mtp_loss: 17.2299 +[titan] 2025-06-13 13:30:14,005 - root - INFO - lr: 4.4705e-04 gnorm: 1.03 [ 0:48:44< 2:08:42] +[titan] 2025-06-13 13:30:17,253 - root - INFO - step: 4125 loss: 21.0251 memory: 6.46GiB(27.34%) tps: 25,224 tflops: 25.39 mfu: 8.14% global_avg_ntp_loss: 3.7665 global_avg_mtp_loss: 17.2586 +[titan] 2025-06-13 13:30:17,253 - root - INFO - lr: 4.4688e-04 gnorm: 1.02 [ 0:48:47< 2:08:38] +[titan] 2025-06-13 13:30:20,814 - root - INFO - step: 4130 loss: 20.8042 memory: 6.46GiB(27.34%) tps: 23,010 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 3.7538 global_avg_mtp_loss: 17.0504 +[titan] 2025-06-13 13:30:20,814 - root - INFO - lr: 4.4672e-04 gnorm: 1.07 [ 
0:48:51< 2:08:35] +[titan] 2025-06-13 13:30:23,968 - root - INFO - step: 4135 loss: 19.2834 memory: 6.46GiB(27.34%) tps: 25,979 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.4053 global_avg_mtp_loss: 15.8780 +[titan] 2025-06-13 13:30:23,968 - root - INFO - lr: 4.4656e-04 gnorm: 1.47 [ 0:48:54< 2:08:30] +[titan] 2025-06-13 13:30:27,465 - root - INFO - step: 4140 loss: 19.4503 memory: 6.46GiB(27.34%) tps: 23,429 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.4856 global_avg_mtp_loss: 15.9647 +[titan] 2025-06-13 13:30:27,465 - root - INFO - lr: 4.4639e-04 gnorm: 1.13 [ 0:48:57< 2:08:26] +[titan] 2025-06-13 13:30:30,759 - root - INFO - step: 4145 loss: 21.5336 memory: 6.46GiB(27.34%) tps: 24,874 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 3.9495 global_avg_mtp_loss: 17.5841 +[titan] 2025-06-13 13:30:30,759 - root - INFO - lr: 4.4623e-04 gnorm: 1.14 [ 0:49:01< 2:08:22] +[titan] 2025-06-13 13:30:33,459 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:30:34,002 - root - INFO - step: 4150 loss: 20.9195 memory: 6.46GiB(27.34%) tps: 25,261 tflops: 25.42 mfu: 8.15% global_avg_ntp_loss: 3.7639 global_avg_mtp_loss: 17.1556 +[titan] 2025-06-13 13:30:34,003 - root - INFO - lr: 4.4607e-04 gnorm: 1.26 [ 0:49:04< 2:08:18] +[titan] 2025-06-13 13:30:37,412 - root - INFO - step: 4155 loss: 20.2623 memory: 6.46GiB(27.34%) tps: 24,032 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.6296 global_avg_mtp_loss: 16.6327 +[titan] 2025-06-13 13:30:37,412 - root - INFO - lr: 4.4590e-04 gnorm: 1.06 [ 0:49:07< 2:08:14] +[titan] 2025-06-13 13:30:41,010 - root - INFO - step: 4160 loss: 20.4926 memory: 6.46GiB(27.34%) tps: 22,766 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.7118 global_avg_mtp_loss: 16.7808 +[titan] 2025-06-13 13:30:41,011 - root - INFO - lr: 4.4574e-04 gnorm: 1.10 [ 0:49:11< 2:08:10] +[titan] 2025-06-13 13:30:45,320 - root - INFO - step: 4165 loss: 19.2047 memory: 6.46GiB(27.34%) tps: 19,008 tflops: 19.13 mfu: 6.13% 
global_avg_ntp_loss: 3.4807 global_avg_mtp_loss: 15.7240 +[titan] 2025-06-13 13:30:45,321 - root - INFO - lr: 4.4557e-04 gnorm: 1.20 [ 0:49:15< 2:08:09] +[titan] 2025-06-13 13:30:48,176 - root - INFO - step: 4170 loss: 19.4461 memory: 6.46GiB(27.34%) tps: 28,690 tflops: 28.87 mfu: 9.25% global_avg_ntp_loss: 3.4918 global_avg_mtp_loss: 15.9544 +[titan] 2025-06-13 13:30:48,177 - root - INFO - lr: 4.4541e-04 gnorm: 1.86 [ 0:49:18< 2:08:03] +[titan] 2025-06-13 13:30:51,261 - root - INFO - step: 4175 loss: 20.4619 memory: 6.46GiB(27.34%) tps: 26,562 tflops: 26.73 mfu: 8.57% global_avg_ntp_loss: 3.6553 global_avg_mtp_loss: 16.8065 +[titan] 2025-06-13 13:30:51,261 - root - INFO - lr: 4.4524e-04 gnorm: 0.97 [ 0:49:21< 2:07:59] +[titan] 2025-06-13 13:30:54,831 - root - INFO - step: 4180 loss: 20.6012 memory: 6.46GiB(27.34%) tps: 22,946 tflops: 23.09 mfu: 7.40% global_avg_ntp_loss: 3.7034 global_avg_mtp_loss: 16.8978 +[titan] 2025-06-13 13:30:54,832 - root - INFO - lr: 4.4508e-04 gnorm: 1.16 [ 0:49:25< 2:07:55] +[titan] 2025-06-13 13:30:58,353 - root - INFO - step: 4185 loss: 20.6606 memory: 6.46GiB(27.34%) tps: 23,267 tflops: 23.42 mfu: 7.50% global_avg_ntp_loss: 3.8315 global_avg_mtp_loss: 16.8291 +[titan] 2025-06-13 13:30:58,353 - root - INFO - lr: 4.4491e-04 gnorm: 1.47 [ 0:49:28< 2:07:52] +[titan] 2025-06-13 13:31:01,975 - root - INFO - step: 4190 loss: 20.8770 memory: 6.46GiB(27.34%) tps: 22,618 tflops: 22.76 mfu: 7.30% global_avg_ntp_loss: 3.7464 global_avg_mtp_loss: 17.1306 +[titan] 2025-06-13 13:31:01,975 - root - INFO - lr: 4.4475e-04 gnorm: 1.12 [ 0:49:32< 2:07:48] +[titan] 2025-06-13 13:31:05,403 - root - INFO - step: 4195 loss: 20.5474 memory: 6.46GiB(27.34%) tps: 23,899 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.7315 global_avg_mtp_loss: 16.8159 +[titan] 2025-06-13 13:31:05,403 - root - INFO - lr: 4.4458e-04 gnorm: 1.29 [ 0:49:35< 2:07:44] +[titan] 2025-06-13 13:31:07,905 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:31:08,759 - root - INFO - step: 4200 loss: 20.2185 memory: 6.46GiB(27.34%) tps: 24,410 tflops: 24.57 mfu: 7.87% global_avg_ntp_loss: 3.6241 global_avg_mtp_loss: 16.5944 +[titan] 2025-06-13 13:31:08,760 - root - INFO - lr: 4.4442e-04 gnorm: 1.19 [ 0:49:39< 2:07:40] +[titan] 2025-06-13 13:31:12,083 - root - INFO - step: 4205 loss: 21.1647 memory: 6.46GiB(27.34%) tps: 24,654 tflops: 24.81 mfu: 7.95% global_avg_ntp_loss: 3.7898 global_avg_mtp_loss: 17.3749 +[titan] 2025-06-13 13:31:12,083 - root - INFO - lr: 4.4425e-04 gnorm: 1.07 [ 0:49:42< 2:07:36] +[titan] 2025-06-13 13:31:15,514 - root - INFO - step: 4210 loss: 21.2725 memory: 6.46GiB(27.34%) tps: 23,879 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.8735 global_avg_mtp_loss: 17.3990 +[titan] 2025-06-13 13:31:15,514 - root - INFO - lr: 4.4408e-04 gnorm: 1.16 [ 0:49:45< 2:07:32] +[titan] 2025-06-13 13:31:19,218 - root - INFO - step: 4215 loss: 21.2711 memory: 6.46GiB(27.34%) tps: 22,119 tflops: 22.26 mfu: 7.13% global_avg_ntp_loss: 3.8179 global_avg_mtp_loss: 17.4532 +[titan] 2025-06-13 13:31:19,218 - root - INFO - lr: 4.4392e-04 gnorm: 1.35 [ 0:49:49< 2:07:29] +[titan] 2025-06-13 13:31:22,708 - root - INFO - step: 4220 loss: 20.4964 memory: 6.46GiB(27.34%) tps: 23,471 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 3.6990 global_avg_mtp_loss: 16.7975 +[titan] 2025-06-13 13:31:22,709 - root - INFO - lr: 4.4375e-04 gnorm: 1.11 [ 0:49:53< 2:07:26] +[titan] 2025-06-13 13:31:26,243 - root - INFO - step: 4225 loss: 20.2020 memory: 6.46GiB(27.34%) tps: 23,179 tflops: 23.33 mfu: 7.48% global_avg_ntp_loss: 3.5923 global_avg_mtp_loss: 16.6097 +[titan] 2025-06-13 13:31:26,244 - root - INFO - lr: 4.4358e-04 gnorm: 1.02 [ 0:49:56< 2:07:22] +[titan] 2025-06-13 13:31:29,734 - root - INFO - step: 4230 loss: 20.8820 memory: 6.46GiB(27.34%) tps: 23,472 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 3.7538 global_avg_mtp_loss: 17.1282 +[titan] 2025-06-13 13:31:29,734 - root - INFO - lr: 4.4341e-04 gnorm: 1.12 [ 
0:50:00< 2:07:18] +[titan] 2025-06-13 13:31:33,098 - root - INFO - step: 4235 loss: 20.2502 memory: 6.46GiB(27.34%) tps: 24,353 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 3.5981 global_avg_mtp_loss: 16.6521 +[titan] 2025-06-13 13:31:33,098 - root - INFO - lr: 4.4325e-04 gnorm: 1.17 [ 0:50:03< 2:07:14] +[titan] 2025-06-13 13:31:36,353 - root - INFO - step: 4240 loss: 21.3383 memory: 6.46GiB(27.34%) tps: 25,174 tflops: 25.33 mfu: 8.12% global_avg_ntp_loss: 3.8738 global_avg_mtp_loss: 17.4645 +[titan] 2025-06-13 13:31:36,353 - root - INFO - lr: 4.4308e-04 gnorm: 1.07 [ 0:50:06< 2:07:10] +[titan] 2025-06-13 13:31:39,936 - root - INFO - step: 4245 loss: 20.4774 memory: 6.46GiB(27.34%) tps: 22,867 tflops: 23.01 mfu: 7.38% global_avg_ntp_loss: 3.6522 global_avg_mtp_loss: 16.8252 +[titan] 2025-06-13 13:31:39,936 - root - INFO - lr: 4.4291e-04 gnorm: 1.19 [ 0:50:10< 2:07:07] +[titan] 2025-06-13 13:31:42,678 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:31:43,245 - root - INFO - step: 4250 loss: 18.1181 memory: 6.46GiB(27.34%) tps: 24,754 tflops: 24.91 mfu: 7.98% global_avg_ntp_loss: 3.1919 global_avg_mtp_loss: 14.9262 +[titan] 2025-06-13 13:31:43,246 - root - INFO - lr: 4.4274e-04 gnorm: 1.78 [ 0:50:13< 2:07:02] +[titan] 2025-06-13 13:31:46,516 - root - INFO - step: 4255 loss: 21.0676 memory: 6.46GiB(27.34%) tps: 25,047 tflops: 25.21 mfu: 8.08% global_avg_ntp_loss: 3.7795 global_avg_mtp_loss: 17.2881 +[titan] 2025-06-13 13:31:46,517 - root - INFO - lr: 4.4258e-04 gnorm: 1.04 [ 0:50:16< 2:06:58] +[titan] 2025-06-13 13:31:49,758 - root - INFO - step: 4260 loss: 21.2598 memory: 6.46GiB(27.34%) tps: 25,274 tflops: 25.43 mfu: 8.15% global_avg_ntp_loss: 3.8778 global_avg_mtp_loss: 17.3820 +[titan] 2025-06-13 13:31:49,759 - root - INFO - lr: 4.4241e-04 gnorm: 1.04 [ 0:50:20< 2:06:54] +[titan] 2025-06-13 13:31:53,379 - root - INFO - step: 4265 loss: 19.8103 memory: 6.46GiB(27.34%) tps: 22,631 tflops: 22.77 mfu: 7.30% 
global_avg_ntp_loss: 3.5054 global_avg_mtp_loss: 16.3049 +[titan] 2025-06-13 13:31:53,379 - root - INFO - lr: 4.4224e-04 gnorm: 1.33 [ 0:50:23< 2:06:50] +[titan] 2025-06-13 13:31:56,497 - root - INFO - step: 4270 loss: 21.1754 memory: 6.46GiB(27.34%) tps: 26,277 tflops: 26.44 mfu: 8.48% global_avg_ntp_loss: 3.8140 global_avg_mtp_loss: 17.3614 +[titan] 2025-06-13 13:31:56,497 - root - INFO - lr: 4.4207e-04 gnorm: 1.06 [ 0:50:26< 2:06:46] +[titan] 2025-06-13 13:31:59,601 - root - INFO - step: 4275 loss: 19.9982 memory: 6.46GiB(27.34%) tps: 26,396 tflops: 26.56 mfu: 8.51% global_avg_ntp_loss: 3.6747 global_avg_mtp_loss: 16.3236 +[titan] 2025-06-13 13:31:59,601 - root - INFO - lr: 4.4190e-04 gnorm: 1.64 [ 0:50:30< 2:06:41] +[titan] 2025-06-13 13:32:03,097 - root - INFO - step: 4280 loss: 17.6013 memory: 6.46GiB(27.34%) tps: 23,433 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.1968 global_avg_mtp_loss: 14.4045 +[titan] 2025-06-13 13:32:03,097 - root - INFO - lr: 4.4173e-04 gnorm: 1.68 [ 0:50:33< 2:06:38] +[titan] 2025-06-13 13:32:06,230 - root - INFO - step: 4285 loss: 20.7667 memory: 6.46GiB(27.34%) tps: 26,146 tflops: 26.31 mfu: 8.43% global_avg_ntp_loss: 3.7428 global_avg_mtp_loss: 17.0239 +[titan] 2025-06-13 13:32:06,231 - root - INFO - lr: 4.4156e-04 gnorm: 1.05 [ 0:50:36< 2:06:33] +[titan] 2025-06-13 13:32:10,069 - root - INFO - step: 4290 loss: 21.1171 memory: 6.46GiB(27.34%) tps: 21,345 tflops: 21.48 mfu: 6.88% global_avg_ntp_loss: 3.8267 global_avg_mtp_loss: 17.2904 +[titan] 2025-06-13 13:32:10,069 - root - INFO - lr: 4.4139e-04 gnorm: 1.12 [ 0:50:40< 2:06:30] +[titan] 2025-06-13 13:32:13,240 - root - INFO - step: 4295 loss: 19.6834 memory: 6.46GiB(27.34%) tps: 25,835 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.5496 global_avg_mtp_loss: 16.1337 +[titan] 2025-06-13 13:32:13,240 - root - INFO - lr: 4.4122e-04 gnorm: 1.20 [ 0:50:43< 2:06:26] +[titan] 2025-06-13 13:32:16,105 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:32:16,872 - root - INFO - step: 4300 loss: 19.6061 memory: 6.46GiB(27.34%) tps: 22,559 tflops: 22.70 mfu: 7.28% global_avg_ntp_loss: 3.5042 global_avg_mtp_loss: 16.1019 +[titan] 2025-06-13 13:32:16,872 - root - INFO - lr: 4.4105e-04 gnorm: 1.05 [ 0:50:47< 2:06:22] +[titan] 2025-06-13 13:32:20,293 - root - INFO - step: 4305 loss: 21.1002 memory: 6.46GiB(27.34%) tps: 23,945 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 3.8052 global_avg_mtp_loss: 17.2950 +[titan] 2025-06-13 13:32:20,294 - root - INFO - lr: 4.4088e-04 gnorm: 1.12 [ 0:50:50< 2:06:19] +[titan] 2025-06-13 13:32:23,907 - root - INFO - step: 4310 loss: 20.3288 memory: 6.46GiB(27.34%) tps: 22,675 tflops: 22.82 mfu: 7.31% global_avg_ntp_loss: 3.6195 global_avg_mtp_loss: 16.7093 +[titan] 2025-06-13 13:32:23,907 - root - INFO - lr: 4.4071e-04 gnorm: 1.38 [ 0:50:54< 2:06:15] +[titan] 2025-06-13 13:32:27,122 - root - INFO - step: 4315 loss: 20.3120 memory: 6.46GiB(27.34%) tps: 25,481 tflops: 25.64 mfu: 8.22% global_avg_ntp_loss: 3.6302 global_avg_mtp_loss: 16.6818 +[titan] 2025-06-13 13:32:27,122 - root - INFO - lr: 4.4054e-04 gnorm: 1.26 [ 0:50:57< 2:06:11] +[titan] 2025-06-13 13:32:30,685 - root - INFO - step: 4320 loss: 20.5734 memory: 6.46GiB(27.34%) tps: 22,994 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.6578 global_avg_mtp_loss: 16.9156 +[titan] 2025-06-13 13:32:30,685 - root - INFO - lr: 4.4037e-04 gnorm: 1.09 [ 0:51:01< 2:06:07] +[titan] 2025-06-13 13:32:33,907 - root - INFO - step: 4325 loss: 19.5547 memory: 6.46GiB(27.34%) tps: 25,431 tflops: 25.59 mfu: 8.20% global_avg_ntp_loss: 3.5359 global_avg_mtp_loss: 16.0188 +[titan] 2025-06-13 13:32:33,907 - root - INFO - lr: 4.4020e-04 gnorm: 2.07 [ 0:51:04< 2:06:03] +[titan] 2025-06-13 13:32:37,491 - root - INFO - step: 4330 loss: 20.4227 memory: 6.46GiB(27.34%) tps: 22,857 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.6602 global_avg_mtp_loss: 16.7624 +[titan] 2025-06-13 13:32:37,491 - root - INFO - lr: 4.4002e-04 gnorm: 1.05 [ 
0:51:07< 2:06:00] +[titan] 2025-06-13 13:32:40,788 - root - INFO - step: 4335 loss: 20.4211 memory: 6.46GiB(27.34%) tps: 24,850 tflops: 25.01 mfu: 8.02% global_avg_ntp_loss: 3.6366 global_avg_mtp_loss: 16.7844 +[titan] 2025-06-13 13:32:40,789 - root - INFO - lr: 4.3985e-04 gnorm: 1.24 [ 0:51:11< 2:05:55] +[titan] 2025-06-13 13:32:43,874 - root - INFO - step: 4340 loss: 20.4663 memory: 6.46GiB(27.34%) tps: 26,549 tflops: 26.72 mfu: 8.56% global_avg_ntp_loss: 3.7004 global_avg_mtp_loss: 16.7659 +[titan] 2025-06-13 13:32:43,874 - root - INFO - lr: 4.3968e-04 gnorm: 1.24 [ 0:51:14< 2:05:51] +[titan] 2025-06-13 13:32:47,531 - root - INFO - step: 4345 loss: 21.7339 memory: 6.46GiB(27.34%) tps: 22,402 tflops: 22.54 mfu: 7.23% global_avg_ntp_loss: 3.9758 global_avg_mtp_loss: 17.7581 +[titan] 2025-06-13 13:32:47,532 - root - INFO - lr: 4.3951e-04 gnorm: 1.06 [ 0:51:17< 2:05:47] +[titan] 2025-06-13 13:32:50,399 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:32:51,059 - root - INFO - step: 4350 loss: 18.8139 memory: 6.46GiB(27.34%) tps: 23,227 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 3.4140 global_avg_mtp_loss: 15.3999 +[titan] 2025-06-13 13:32:51,059 - root - INFO - lr: 4.3934e-04 gnorm: 1.63 [ 0:51:21< 2:05:44] +[titan] 2025-06-13 13:32:54,559 - root - INFO - step: 4355 loss: 20.6672 memory: 6.46GiB(27.34%) tps: 23,404 tflops: 23.55 mfu: 7.55% global_avg_ntp_loss: 3.7150 global_avg_mtp_loss: 16.9523 +[titan] 2025-06-13 13:32:54,560 - root - INFO - lr: 4.3916e-04 gnorm: 1.18 [ 0:51:25< 2:05:40] +[titan] 2025-06-13 13:32:58,010 - root - INFO - step: 4360 loss: 21.0043 memory: 6.46GiB(27.34%) tps: 23,747 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 3.8127 global_avg_mtp_loss: 17.1916 +[titan] 2025-06-13 13:32:58,010 - root - INFO - lr: 4.3899e-04 gnorm: 1.14 [ 0:51:28< 2:05:36] +[titan] 2025-06-13 13:33:01,479 - root - INFO - step: 4365 loss: 20.9517 memory: 6.46GiB(27.34%) tps: 23,618 tflops: 23.77 mfu: 7.62% 
global_avg_ntp_loss: 3.7508 global_avg_mtp_loss: 17.2009 +[titan] 2025-06-13 13:33:01,479 - root - INFO - lr: 4.3882e-04 gnorm: 1.04 [ 0:51:31< 2:05:33] +[titan] 2025-06-13 13:33:04,565 - root - INFO - step: 4370 loss: 20.3606 memory: 6.46GiB(27.34%) tps: 26,543 tflops: 26.71 mfu: 8.56% global_avg_ntp_loss: 3.6344 global_avg_mtp_loss: 16.7262 +[titan] 2025-06-13 13:33:04,566 - root - INFO - lr: 4.3865e-04 gnorm: 1.11 [ 0:51:35< 2:05:28] +[titan] 2025-06-13 13:33:08,111 - root - INFO - step: 4375 loss: 20.2412 memory: 6.46GiB(27.34%) tps: 23,108 tflops: 23.26 mfu: 7.45% global_avg_ntp_loss: 3.6085 global_avg_mtp_loss: 16.6327 +[titan] 2025-06-13 13:33:08,111 - root - INFO - lr: 4.3847e-04 gnorm: 1.11 [ 0:51:38< 2:05:25] +[titan] 2025-06-13 13:33:11,494 - root - INFO - step: 4380 loss: 21.6758 memory: 6.46GiB(27.34%) tps: 24,220 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 3.8958 global_avg_mtp_loss: 17.7800 +[titan] 2025-06-13 13:33:11,494 - root - INFO - lr: 4.3830e-04 gnorm: 0.98 [ 0:51:41< 2:05:21] +[titan] 2025-06-13 13:33:15,278 - root - INFO - step: 4385 loss: 20.1975 memory: 6.46GiB(27.34%) tps: 21,653 tflops: 21.79 mfu: 6.98% global_avg_ntp_loss: 3.6134 global_avg_mtp_loss: 16.5841 +[titan] 2025-06-13 13:33:15,278 - root - INFO - lr: 4.3812e-04 gnorm: 1.41 [ 0:51:45< 2:05:18] +[titan] 2025-06-13 13:33:18,493 - root - INFO - step: 4390 loss: 20.6090 memory: 6.46GiB(27.34%) tps: 25,476 tflops: 25.64 mfu: 8.22% global_avg_ntp_loss: 3.7207 global_avg_mtp_loss: 16.8883 +[titan] 2025-06-13 13:33:18,494 - root - INFO - lr: 4.3795e-04 gnorm: 1.11 [ 0:51:48< 2:05:13] +[titan] 2025-06-13 13:33:22,275 - root - INFO - step: 4395 loss: 21.0722 memory: 6.46GiB(27.34%) tps: 21,668 tflops: 21.81 mfu: 6.99% global_avg_ntp_loss: 3.7576 global_avg_mtp_loss: 17.3146 +[titan] 2025-06-13 13:33:22,275 - root - INFO - lr: 4.3778e-04 gnorm: 0.96 [ 0:51:52< 2:05:10] +[titan] 2025-06-13 13:33:25,081 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:33:25,978 - root - INFO - step: 4400 loss: 19.8050 memory: 6.46GiB(27.34%) tps: 22,122 tflops: 22.26 mfu: 7.14% global_avg_ntp_loss: 3.6298 global_avg_mtp_loss: 16.1752 +[titan] 2025-06-13 13:33:25,979 - root - INFO - lr: 4.3760e-04 gnorm: 1.18 [ 0:51:56< 2:05:07] +[titan] 2025-06-13 13:33:29,242 - root - INFO - step: 4405 loss: 17.4428 memory: 6.46GiB(27.34%) tps: 25,106 tflops: 25.27 mfu: 8.10% global_avg_ntp_loss: 3.1392 global_avg_mtp_loss: 14.3037 +[titan] 2025-06-13 13:33:29,242 - root - INFO - lr: 4.3743e-04 gnorm: 1.81 [ 0:51:59< 2:05:03] +[titan] 2025-06-13 13:33:32,653 - root - INFO - step: 4410 loss: 21.2488 memory: 6.46GiB(27.34%) tps: 24,017 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.7885 global_avg_mtp_loss: 17.4603 +[titan] 2025-06-13 13:33:32,653 - root - INFO - lr: 4.3725e-04 gnorm: 1.16 [ 0:52:03< 2:04:59] +[titan] 2025-06-13 13:33:36,162 - root - INFO - step: 4415 loss: 18.5108 memory: 6.46GiB(27.34%) tps: 23,349 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 3.2708 global_avg_mtp_loss: 15.2400 +[titan] 2025-06-13 13:33:36,163 - root - INFO - lr: 4.3708e-04 gnorm: 1.92 [ 0:52:06< 2:04:56] +[titan] 2025-06-13 13:33:39,463 - root - INFO - step: 4420 loss: 21.2149 memory: 6.46GiB(27.34%) tps: 24,820 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 3.8367 global_avg_mtp_loss: 17.3783 +[titan] 2025-06-13 13:33:39,464 - root - INFO - lr: 4.3690e-04 gnorm: 1.10 [ 0:52:09< 2:04:51] +[titan] 2025-06-13 13:33:42,775 - root - INFO - step: 4425 loss: 20.0581 memory: 6.46GiB(27.34%) tps: 24,736 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 3.6272 global_avg_mtp_loss: 16.4309 +[titan] 2025-06-13 13:33:42,776 - root - INFO - lr: 4.3673e-04 gnorm: 1.49 [ 0:52:13< 2:04:47] +[titan] 2025-06-13 13:33:45,914 - root - INFO - step: 4430 loss: 21.3821 memory: 6.46GiB(27.34%) tps: 26,106 tflops: 26.27 mfu: 8.42% global_avg_ntp_loss: 3.8368 global_avg_mtp_loss: 17.5453 +[titan] 2025-06-13 13:33:45,914 - root - INFO - lr: 4.3655e-04 gnorm: 1.05 [ 
0:52:16< 2:04:43] +[titan] 2025-06-13 13:33:49,040 - root - INFO - step: 4435 loss: 19.9289 memory: 6.46GiB(27.34%) tps: 26,203 tflops: 26.37 mfu: 8.45% global_avg_ntp_loss: 3.5314 global_avg_mtp_loss: 16.3976 +[titan] 2025-06-13 13:33:49,041 - root - INFO - lr: 4.3638e-04 gnorm: 1.37 [ 0:52:19< 2:04:38] +[titan] 2025-06-13 13:33:52,273 - root - INFO - step: 4440 loss: 20.6607 memory: 6.46GiB(27.34%) tps: 25,345 tflops: 25.51 mfu: 8.18% global_avg_ntp_loss: 3.7360 global_avg_mtp_loss: 16.9247 +[titan] 2025-06-13 13:33:52,273 - root - INFO - lr: 4.3620e-04 gnorm: 1.06 [ 0:52:22< 2:04:34] +[titan] 2025-06-13 13:33:55,659 - root - INFO - step: 4445 loss: 20.9680 memory: 6.46GiB(27.34%) tps: 24,194 tflops: 24.35 mfu: 7.80% global_avg_ntp_loss: 3.7518 global_avg_mtp_loss: 17.2162 +[titan] 2025-06-13 13:33:55,660 - root - INFO - lr: 4.3602e-04 gnorm: 1.10 [ 0:52:26< 2:04:30] +[titan] 2025-06-13 13:33:58,547 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:33:59,149 - root - INFO - step: 4450 loss: 20.4111 memory: 6.46GiB(27.34%) tps: 23,477 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.6635 global_avg_mtp_loss: 16.7476 +[titan] 2025-06-13 13:33:59,150 - root - INFO - lr: 4.3585e-04 gnorm: 1.34 [ 0:52:29< 2:04:26] +[titan] 2025-06-13 13:34:02,660 - root - INFO - step: 4455 loss: 19.7321 memory: 6.46GiB(27.34%) tps: 23,338 tflops: 23.49 mfu: 7.53% global_avg_ntp_loss: 3.5201 global_avg_mtp_loss: 16.2121 +[titan] 2025-06-13 13:34:02,660 - root - INFO - lr: 4.3567e-04 gnorm: 1.24 [ 0:52:33< 2:04:23] +[titan] 2025-06-13 13:34:06,175 - root - INFO - step: 4460 loss: 20.9574 memory: 6.46GiB(27.34%) tps: 23,306 tflops: 23.45 mfu: 7.52% global_avg_ntp_loss: 3.8050 global_avg_mtp_loss: 17.1523 +[titan] 2025-06-13 13:34:06,176 - root - INFO - lr: 4.3549e-04 gnorm: 1.12 [ 0:52:36< 2:04:19] +[titan] 2025-06-13 13:34:09,737 - root - INFO - step: 4465 loss: 21.4699 memory: 6.46GiB(27.34%) tps: 23,004 tflops: 23.15 mfu: 7.42% 
global_avg_ntp_loss: 3.8581 global_avg_mtp_loss: 17.6119 +[titan] 2025-06-13 13:34:09,737 - root - INFO - lr: 4.3532e-04 gnorm: 1.12 [ 0:52:40< 2:04:16] +[titan] 2025-06-13 13:34:13,285 - root - INFO - step: 4470 loss: 21.2834 memory: 6.46GiB(27.34%) tps: 23,089 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 3.8743 global_avg_mtp_loss: 17.4091 +[titan] 2025-06-13 13:34:13,286 - root - INFO - lr: 4.3514e-04 gnorm: 1.11 [ 0:52:43< 2:04:12] +[titan] 2025-06-13 13:34:17,191 - root - INFO - step: 4475 loss: 21.0547 memory: 6.46GiB(27.34%) tps: 20,977 tflops: 21.11 mfu: 6.77% global_avg_ntp_loss: 3.8356 global_avg_mtp_loss: 17.2191 +[titan] 2025-06-13 13:34:17,191 - root - INFO - lr: 4.3496e-04 gnorm: 1.18 [ 0:52:47< 2:04:10] +[titan] 2025-06-13 13:34:20,156 - root - INFO - step: 4480 loss: 21.4497 memory: 6.46GiB(27.34%) tps: 27,631 tflops: 27.81 mfu: 8.91% global_avg_ntp_loss: 3.8088 global_avg_mtp_loss: 17.6409 +[titan] 2025-06-13 13:34:20,157 - root - INFO - lr: 4.3478e-04 gnorm: 1.06 [ 0:52:50< 2:04:05] +[titan] 2025-06-13 13:34:23,957 - root - INFO - step: 4485 loss: 20.8915 memory: 6.46GiB(27.34%) tps: 21,556 tflops: 21.69 mfu: 6.95% global_avg_ntp_loss: 3.7577 global_avg_mtp_loss: 17.1337 +[titan] 2025-06-13 13:34:23,957 - root - INFO - lr: 4.3461e-04 gnorm: 1.08 [ 0:52:54< 2:04:02] +[titan] 2025-06-13 13:34:27,506 - root - INFO - step: 4490 loss: 21.6437 memory: 6.46GiB(27.34%) tps: 23,083 tflops: 23.23 mfu: 7.45% global_avg_ntp_loss: 3.9440 global_avg_mtp_loss: 17.6997 +[titan] 2025-06-13 13:34:27,507 - root - INFO - lr: 4.3443e-04 gnorm: 1.17 [ 0:52:57< 2:03:58] +[titan] 2025-06-13 13:34:30,854 - root - INFO - step: 4495 loss: 21.1035 memory: 6.46GiB(27.34%) tps: 24,475 tflops: 24.63 mfu: 7.89% global_avg_ntp_loss: 3.8038 global_avg_mtp_loss: 17.2998 +[titan] 2025-06-13 13:34:30,854 - root - INFO - lr: 4.3425e-04 gnorm: 0.97 [ 0:53:01< 2:03:54] +[titan] 2025-06-13 13:34:33,597 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:34:34,283 - root - INFO - step: 4500 loss: 21.1872 memory: 6.46GiB(27.34%) tps: 23,893 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.7727 global_avg_mtp_loss: 17.4144 +[titan] 2025-06-13 13:34:34,283 - root - INFO - lr: 4.3407e-04 gnorm: 1.10 [ 0:53:04< 2:03:50] +[titan] 2025-06-13 13:34:37,664 - root - INFO - step: 4505 loss: 20.7901 memory: 6.46GiB(27.34%) tps: 24,235 tflops: 24.39 mfu: 7.82% global_avg_ntp_loss: 3.7587 global_avg_mtp_loss: 17.0313 +[titan] 2025-06-13 13:34:37,664 - root - INFO - lr: 4.3389e-04 gnorm: 1.09 [ 0:53:08< 2:03:47] +[titan] 2025-06-13 13:34:41,117 - root - INFO - step: 4510 loss: 21.3517 memory: 6.46GiB(27.34%) tps: 23,726 tflops: 23.88 mfu: 7.65% global_avg_ntp_loss: 3.8222 global_avg_mtp_loss: 17.5295 +[titan] 2025-06-13 13:34:41,117 - root - INFO - lr: 4.3371e-04 gnorm: 1.04 [ 0:53:11< 2:03:43] +[titan] 2025-06-13 13:34:44,528 - root - INFO - step: 4515 loss: 20.9340 memory: 6.46GiB(27.34%) tps: 24,018 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.7389 global_avg_mtp_loss: 17.1952 +[titan] 2025-06-13 13:34:44,528 - root - INFO - lr: 4.3354e-04 gnorm: 1.20 [ 0:53:14< 2:03:39] +[titan] 2025-06-13 13:34:47,633 - root - INFO - step: 4520 loss: 20.9607 memory: 6.46GiB(27.34%) tps: 26,391 tflops: 26.56 mfu: 8.51% global_avg_ntp_loss: 3.7720 global_avg_mtp_loss: 17.1887 +[titan] 2025-06-13 13:34:47,633 - root - INFO - lr: 4.3336e-04 gnorm: 1.18 [ 0:53:18< 2:03:34] +[titan] 2025-06-13 13:34:51,091 - root - INFO - step: 4525 loss: 21.1229 memory: 6.46GiB(27.34%) tps: 23,689 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 3.7877 global_avg_mtp_loss: 17.3352 +[titan] 2025-06-13 13:34:51,091 - root - INFO - lr: 4.3318e-04 gnorm: 1.20 [ 0:53:21< 2:03:31] +[titan] 2025-06-13 13:34:54,509 - root - INFO - step: 4530 loss: 21.2346 memory: 6.46GiB(27.34%) tps: 23,972 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.8229 global_avg_mtp_loss: 17.4117 +[titan] 2025-06-13 13:34:54,509 - root - INFO - lr: 4.3300e-04 gnorm: 1.14 [ 
0:53:24< 2:03:27] +[titan] 2025-06-13 13:34:57,693 - root - INFO - step: 4535 loss: 20.5791 memory: 6.46GiB(27.34%) tps: 25,734 tflops: 25.90 mfu: 8.30% global_avg_ntp_loss: 3.7097 global_avg_mtp_loss: 16.8694 +[titan] 2025-06-13 13:34:57,693 - root - INFO - lr: 4.3282e-04 gnorm: 1.16 [ 0:53:28< 2:03:23] +[titan] 2025-06-13 13:35:00,887 - root - INFO - step: 4540 loss: 19.8076 memory: 6.46GiB(27.34%) tps: 25,654 tflops: 25.82 mfu: 8.27% global_avg_ntp_loss: 3.5622 global_avg_mtp_loss: 16.2454 +[titan] 2025-06-13 13:35:00,887 - root - INFO - lr: 4.3264e-04 gnorm: 1.60 [ 0:53:31< 2:03:18] +[titan] 2025-06-13 13:35:04,610 - root - INFO - step: 4545 loss: 20.3641 memory: 6.46GiB(27.34%) tps: 22,004 tflops: 22.14 mfu: 7.10% global_avg_ntp_loss: 3.6577 global_avg_mtp_loss: 16.7064 +[titan] 2025-06-13 13:35:04,610 - root - INFO - lr: 4.3246e-04 gnorm: 1.31 [ 0:53:35< 2:03:15] +[titan] 2025-06-13 13:35:07,285 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:35:07,887 - root - INFO - step: 4550 loss: 20.2023 memory: 6.46GiB(27.34%) tps: 25,002 tflops: 25.16 mfu: 8.06% global_avg_ntp_loss: 3.6195 global_avg_mtp_loss: 16.5827 +[titan] 2025-06-13 13:35:07,887 - root - INFO - lr: 4.3228e-04 gnorm: 1.12 [ 0:53:38< 2:03:11] +[titan] 2025-06-13 13:35:11,393 - root - INFO - step: 4555 loss: 21.0250 memory: 6.46GiB(27.34%) tps: 23,367 tflops: 23.52 mfu: 7.54% global_avg_ntp_loss: 3.7661 global_avg_mtp_loss: 17.2588 +[titan] 2025-06-13 13:35:11,394 - root - INFO - lr: 4.3210e-04 gnorm: 1.08 [ 0:53:41< 2:03:07] +[titan] 2025-06-13 13:35:14,827 - root - INFO - step: 4560 loss: 20.5147 memory: 6.46GiB(27.34%) tps: 23,858 tflops: 24.01 mfu: 7.70% global_avg_ntp_loss: 3.6768 global_avg_mtp_loss: 16.8379 +[titan] 2025-06-13 13:35:14,828 - root - INFO - lr: 4.3192e-04 gnorm: 1.10 [ 0:53:45< 2:03:04] +[titan] 2025-06-13 13:35:18,257 - root - INFO - step: 4565 loss: 19.0499 memory: 6.46GiB(27.34%) tps: 23,887 tflops: 24.04 mfu: 7.70% 
global_avg_ntp_loss: 3.3791 global_avg_mtp_loss: 15.6708 +[titan] 2025-06-13 13:35:18,258 - root - INFO - lr: 4.3173e-04 gnorm: 1.45 [ 0:53:48< 2:03:00] +[titan] 2025-06-13 13:35:22,148 - root - INFO - step: 4570 loss: 17.8803 memory: 6.46GiB(27.34%) tps: 21,060 tflops: 21.19 mfu: 6.79% global_avg_ntp_loss: 3.1468 global_avg_mtp_loss: 14.7335 +[titan] 2025-06-13 13:35:22,148 - root - INFO - lr: 4.3155e-04 gnorm: 2.36 [ 0:53:52< 2:02:57] +[titan] 2025-06-13 13:35:25,559 - root - INFO - step: 4575 loss: 20.4305 memory: 6.46GiB(27.34%) tps: 24,018 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.6458 global_avg_mtp_loss: 16.7847 +[titan] 2025-06-13 13:35:25,559 - root - INFO - lr: 4.3137e-04 gnorm: 1.10 [ 0:53:55< 2:02:53] +[titan] 2025-06-13 13:35:28,809 - root - INFO - step: 4580 loss: 20.0977 memory: 6.46GiB(27.34%) tps: 25,206 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.5872 global_avg_mtp_loss: 16.5105 +[titan] 2025-06-13 13:35:28,810 - root - INFO - lr: 4.3119e-04 gnorm: 1.12 [ 0:53:59< 2:02:49] +[titan] 2025-06-13 13:35:32,208 - root - INFO - step: 4585 loss: 19.8245 memory: 6.46GiB(27.34%) tps: 24,107 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 3.5471 global_avg_mtp_loss: 16.2774 +[titan] 2025-06-13 13:35:32,208 - root - INFO - lr: 4.3101e-04 gnorm: 1.31 [ 0:54:02< 2:02:45] +[titan] 2025-06-13 13:35:35,530 - root - INFO - step: 4590 loss: 20.5342 memory: 6.46GiB(27.34%) tps: 24,661 tflops: 24.82 mfu: 7.95% global_avg_ntp_loss: 3.6611 global_avg_mtp_loss: 16.8730 +[titan] 2025-06-13 13:35:35,531 - root - INFO - lr: 4.3083e-04 gnorm: 1.16 [ 0:54:05< 2:02:41] +[titan] 2025-06-13 13:35:39,031 - root - INFO - step: 4595 loss: 21.4488 memory: 6.46GiB(27.34%) tps: 23,406 tflops: 23.56 mfu: 7.55% global_avg_ntp_loss: 3.8941 global_avg_mtp_loss: 17.5547 +[titan] 2025-06-13 13:35:39,031 - root - INFO - lr: 4.3064e-04 gnorm: 1.22 [ 0:54:09< 2:02:38] +[titan] 2025-06-13 13:35:41,663 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:35:42,387 - root - INFO - step: 4600 loss: 20.9301 memory: 6.46GiB(27.34%) tps: 24,410 tflops: 24.57 mfu: 7.87% global_avg_ntp_loss: 3.7660 global_avg_mtp_loss: 17.1641 +[titan] 2025-06-13 13:35:42,387 - root - INFO - lr: 4.3046e-04 gnorm: 1.19 [ 0:54:12< 2:02:34] +[titan] 2025-06-13 13:35:45,905 - root - INFO - step: 4605 loss: 20.9775 memory: 6.46GiB(27.34%) tps: 23,287 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.7361 global_avg_mtp_loss: 17.2414 +[titan] 2025-06-13 13:35:45,906 - root - INFO - lr: 4.3028e-04 gnorm: 1.08 [ 0:54:16< 2:02:30] +[titan] 2025-06-13 13:35:47,999 - root - INFO - Dumping profiler traces at step 4608 +[titan] 2025-06-13 13:35:48,093 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 13:35:49,180 - root - INFO - step: 4610 loss: 18.5742 memory: 6.46GiB(27.34%) tps: 25,020 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 3.2637 global_avg_mtp_loss: 15.3105 +[titan] 2025-06-13 13:35:49,180 - root - INFO - lr: 4.3010e-04 gnorm: 1.88 [ 0:54:19< 2:02:26] +[titan] 2025-06-13 13:35:53,209 - root - INFO - step: 4615 loss: 20.9267 memory: 6.46GiB(27.34%) tps: 20,332 tflops: 20.46 mfu: 6.56% global_avg_ntp_loss: 3.7766 global_avg_mtp_loss: 17.1501 +[titan] 2025-06-13 13:35:53,210 - root - INFO - lr: 4.2991e-04 gnorm: 1.15 [ 0:54:23< 2:02:24] +[titan] 2025-06-13 13:35:56,348 - root - INFO - step: 4620 loss: 20.9560 memory: 6.46GiB(27.34%) tps: 26,104 tflops: 26.27 mfu: 8.42% global_avg_ntp_loss: 3.8402 global_avg_mtp_loss: 17.1158 +[titan] 2025-06-13 13:35:56,349 - root - INFO - lr: 4.2973e-04 gnorm: 1.18 [ 0:54:26< 2:02:19] +[titan] 2025-06-13 13:36:00,173 - root - INFO - step: 4625 loss: 21.5551 memory: 6.46GiB(27.34%) tps: 21,423 tflops: 21.56 mfu: 6.91% global_avg_ntp_loss: 3.9671 global_avg_mtp_loss: 17.5880 +[titan] 2025-06-13 13:36:00,173 - root - INFO - lr: 4.2955e-04 gnorm: 1.48 [ 0:54:30< 2:02:16] +[titan] 2025-06-13 13:36:03,485 - root - INFO - step: 4630 loss: 21.4914 memory: 
6.46GiB(27.34%) tps: 24,733 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 3.8924 global_avg_mtp_loss: 17.5990 +[titan] 2025-06-13 13:36:03,486 - root - INFO - lr: 4.2936e-04 gnorm: 1.07 [ 0:54:33< 2:02:12] +[titan] 2025-06-13 13:36:07,002 - root - INFO - step: 4635 loss: 21.5114 memory: 6.46GiB(27.34%) tps: 23,299 tflops: 23.45 mfu: 7.52% global_avg_ntp_loss: 3.8452 global_avg_mtp_loss: 17.6662 +[titan] 2025-06-13 13:36:07,002 - root - INFO - lr: 4.2918e-04 gnorm: 1.13 [ 0:54:37< 2:02:09] +[titan] 2025-06-13 13:36:10,688 - root - INFO - step: 4640 loss: 21.0387 memory: 6.46GiB(27.34%) tps: 22,229 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 3.7364 global_avg_mtp_loss: 17.3023 +[titan] 2025-06-13 13:36:10,688 - root - INFO - lr: 4.2900e-04 gnorm: 1.19 [ 0:54:41< 2:02:05] +[titan] 2025-06-13 13:36:14,161 - root - INFO - step: 4645 loss: 19.9888 memory: 6.46GiB(27.34%) tps: 23,590 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.6104 global_avg_mtp_loss: 16.3784 +[titan] 2025-06-13 13:36:14,161 - root - INFO - lr: 4.2881e-04 gnorm: 1.07 [ 0:54:44< 2:02:02] +[titan] 2025-06-13 13:36:16,827 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:36:17,438 - root - INFO - step: 4650 loss: 21.1165 memory: 6.46GiB(27.34%) tps: 24,999 tflops: 25.16 mfu: 8.06% global_avg_ntp_loss: 3.7824 global_avg_mtp_loss: 17.3341 +[titan] 2025-06-13 13:36:17,438 - root - INFO - lr: 4.2863e-04 gnorm: 1.05 [ 0:54:47< 2:01:58] +[titan] 2025-06-13 13:36:21,226 - root - INFO - step: 4655 loss: 21.4795 memory: 6.46GiB(27.34%) tps: 21,631 tflops: 21.77 mfu: 6.98% global_avg_ntp_loss: 3.9025 global_avg_mtp_loss: 17.5770 +[titan] 2025-06-13 13:36:21,226 - root - INFO - lr: 4.2844e-04 gnorm: 1.08 [ 0:54:51< 2:01:55] +[titan] 2025-06-13 13:36:24,663 - root - INFO - step: 4660 loss: 20.8997 memory: 6.46GiB(27.34%) tps: 23,837 tflops: 23.99 mfu: 7.69% global_avg_ntp_loss: 3.7361 global_avg_mtp_loss: 17.1635 +[titan] 2025-06-13 13:36:24,663 - root - INFO - lr: 4.2826e-04 gnorm: 1.40 [ 0:54:55< 2:01:51] +[titan] 2025-06-13 13:36:28,086 - root - INFO - step: 4665 loss: 20.4319 memory: 6.46GiB(27.34%) tps: 23,936 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.6911 global_avg_mtp_loss: 16.7408 +[titan] 2025-06-13 13:36:28,086 - root - INFO - lr: 4.2807e-04 gnorm: 1.14 [ 0:54:58< 2:01:47] +[titan] 2025-06-13 13:36:31,494 - root - INFO - step: 4670 loss: 21.2503 memory: 6.46GiB(27.34%) tps: 24,038 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.8003 global_avg_mtp_loss: 17.4500 +[titan] 2025-06-13 13:36:31,495 - root - INFO - lr: 4.2789e-04 gnorm: 1.01 [ 0:55:01< 2:01:43] +[titan] 2025-06-13 13:36:35,039 - root - INFO - step: 4675 loss: 20.9289 memory: 6.46GiB(27.34%) tps: 23,115 tflops: 23.26 mfu: 7.46% global_avg_ntp_loss: 3.7653 global_avg_mtp_loss: 17.1636 +[titan] 2025-06-13 13:36:35,039 - root - INFO - lr: 4.2770e-04 gnorm: 1.03 [ 0:55:05< 2:01:40] +[titan] 2025-06-13 13:36:38,644 - root - INFO - step: 4680 loss: 20.2896 memory: 6.46GiB(27.34%) tps: 22,727 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.5840 global_avg_mtp_loss: 16.7055 +[titan] 2025-06-13 13:36:38,644 - root - INFO - lr: 4.2752e-04 gnorm: 1.16 [ 
0:55:09< 2:01:36] +[titan] 2025-06-13 13:36:41,682 - root - INFO - step: 4685 loss: 21.6732 memory: 6.46GiB(27.34%) tps: 26,971 tflops: 27.14 mfu: 8.70% global_avg_ntp_loss: 3.8972 global_avg_mtp_loss: 17.7760 +[titan] 2025-06-13 13:36:41,682 - root - INFO - lr: 4.2733e-04 gnorm: 1.39 [ 0:55:12< 2:01:32] +[titan] 2025-06-13 13:36:45,003 - root - INFO - step: 4690 loss: 18.3570 memory: 6.46GiB(27.34%) tps: 24,667 tflops: 24.82 mfu: 7.96% global_avg_ntp_loss: 3.3126 global_avg_mtp_loss: 15.0444 +[titan] 2025-06-13 13:36:45,003 - root - INFO - lr: 4.2715e-04 gnorm: 1.88 [ 0:55:15< 2:01:28] +[titan] 2025-06-13 13:36:48,047 - root - INFO - step: 4695 loss: 19.4354 memory: 6.46GiB(27.34%) tps: 26,916 tflops: 27.09 mfu: 8.68% global_avg_ntp_loss: 3.4342 global_avg_mtp_loss: 16.0012 +[titan] 2025-06-13 13:36:48,047 - root - INFO - lr: 4.2696e-04 gnorm: 1.56 [ 0:55:18< 2:01:23] +[titan] 2025-06-13 13:36:50,965 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:36:51,664 - root - INFO - step: 4700 loss: 21.3490 memory: 6.46GiB(27.34%) tps: 22,651 tflops: 22.80 mfu: 7.31% global_avg_ntp_loss: 3.8159 global_avg_mtp_loss: 17.5330 +[titan] 2025-06-13 13:36:51,664 - root - INFO - lr: 4.2677e-04 gnorm: 1.02 [ 0:55:22< 2:01:20] +[titan] 2025-06-13 13:36:55,165 - root - INFO - step: 4705 loss: 20.3137 memory: 6.46GiB(27.34%) tps: 23,402 tflops: 23.55 mfu: 7.55% global_avg_ntp_loss: 3.6134 global_avg_mtp_loss: 16.7003 +[titan] 2025-06-13 13:36:55,165 - root - INFO - lr: 4.2659e-04 gnorm: 1.21 [ 0:55:25< 2:01:16] +[titan] 2025-06-13 13:36:58,596 - root - INFO - step: 4710 loss: 21.7457 memory: 6.46GiB(27.34%) tps: 23,880 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.9175 global_avg_mtp_loss: 17.8282 +[titan] 2025-06-13 13:36:58,596 - root - INFO - lr: 4.2640e-04 gnorm: 1.23 [ 0:55:29< 2:01:12] +[titan] 2025-06-13 13:37:02,005 - root - INFO - step: 4715 loss: 20.9745 memory: 6.46GiB(27.34%) tps: 24,035 tflops: 24.19 mfu: 7.75% 
global_avg_ntp_loss: 3.7514 global_avg_mtp_loss: 17.2231 +[titan] 2025-06-13 13:37:02,005 - root - INFO - lr: 4.2621e-04 gnorm: 1.10 [ 0:55:32< 2:01:09] +[titan] 2025-06-13 13:37:05,395 - root - INFO - step: 4720 loss: 21.3250 memory: 6.46GiB(27.34%) tps: 24,167 tflops: 24.32 mfu: 7.80% global_avg_ntp_loss: 3.7861 global_avg_mtp_loss: 17.5390 +[titan] 2025-06-13 13:37:05,396 - root - INFO - lr: 4.2603e-04 gnorm: 1.66 [ 0:55:35< 2:01:05] +[titan] 2025-06-13 13:37:08,932 - root - INFO - step: 4725 loss: 21.2465 memory: 6.46GiB(27.34%) tps: 23,164 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.8592 global_avg_mtp_loss: 17.3873 +[titan] 2025-06-13 13:37:08,932 - root - INFO - lr: 4.2584e-04 gnorm: 1.51 [ 0:55:39< 2:01:01] +[titan] 2025-06-13 13:37:12,368 - root - INFO - step: 4730 loss: 20.9299 memory: 6.46GiB(27.34%) tps: 23,849 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.7362 global_avg_mtp_loss: 17.1937 +[titan] 2025-06-13 13:37:12,368 - root - INFO - lr: 4.2565e-04 gnorm: 1.15 [ 0:55:42< 2:00:57] +[titan] 2025-06-13 13:37:15,918 - root - INFO - step: 4735 loss: 21.3835 memory: 6.46GiB(27.34%) tps: 23,072 tflops: 23.22 mfu: 7.44% global_avg_ntp_loss: 3.8605 global_avg_mtp_loss: 17.5229 +[titan] 2025-06-13 13:37:15,919 - root - INFO - lr: 4.2546e-04 gnorm: 1.05 [ 0:55:46< 2:00:54] +[titan] 2025-06-13 13:37:19,158 - root - INFO - step: 4740 loss: 21.1144 memory: 6.46GiB(27.34%) tps: 25,290 tflops: 25.45 mfu: 8.16% global_avg_ntp_loss: 3.8061 global_avg_mtp_loss: 17.3082 +[titan] 2025-06-13 13:37:19,158 - root - INFO - lr: 4.2528e-04 gnorm: 1.11 [ 0:55:49< 2:00:50] +[titan] 2025-06-13 13:37:22,581 - root - INFO - step: 4745 loss: 21.1495 memory: 6.46GiB(27.34%) tps: 23,937 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.7760 global_avg_mtp_loss: 17.3735 +[titan] 2025-06-13 13:37:22,581 - root - INFO - lr: 4.2509e-04 gnorm: 1.12 [ 0:55:52< 2:00:46] +[titan] 2025-06-13 13:37:25,270 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:37:26,038 - root - INFO - step: 4750 loss: 20.6621 memory: 6.46GiB(27.34%) tps: 23,697 tflops: 23.85 mfu: 7.64% global_avg_ntp_loss: 3.6986 global_avg_mtp_loss: 16.9635 +[titan] 2025-06-13 13:37:26,038 - root - INFO - lr: 4.2490e-04 gnorm: 0.96 [ 0:55:56< 2:00:42] +[titan] 2025-06-13 13:37:29,826 - root - INFO - step: 4755 loss: 19.9743 memory: 6.46GiB(27.34%) tps: 21,628 tflops: 21.77 mfu: 6.98% global_avg_ntp_loss: 3.5623 global_avg_mtp_loss: 16.4120 +[titan] 2025-06-13 13:37:29,826 - root - INFO - lr: 4.2471e-04 gnorm: 1.21 [ 0:56:00< 2:00:39] +[titan] 2025-06-13 13:37:33,193 - root - INFO - step: 4760 loss: 21.0072 memory: 6.46GiB(27.34%) tps: 24,338 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 3.7777 global_avg_mtp_loss: 17.2295 +[titan] 2025-06-13 13:37:33,193 - root - INFO - lr: 4.2452e-04 gnorm: 1.34 [ 0:56:03< 2:00:35] +[titan] 2025-06-13 13:37:36,516 - root - INFO - step: 4765 loss: 20.7170 memory: 6.46GiB(27.34%) tps: 24,650 tflops: 24.81 mfu: 7.95% global_avg_ntp_loss: 3.6984 global_avg_mtp_loss: 17.0186 +[titan] 2025-06-13 13:37:36,517 - root - INFO - lr: 4.2433e-04 gnorm: 1.17 [ 0:56:06< 2:00:31] +[titan] 2025-06-13 13:37:39,915 - root - INFO - step: 4770 loss: 20.3253 memory: 6.46GiB(27.34%) tps: 24,108 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 3.6104 global_avg_mtp_loss: 16.7149 +[titan] 2025-06-13 13:37:39,915 - root - INFO - lr: 4.2415e-04 gnorm: 1.14 [ 0:56:10< 2:00:28] +[titan] 2025-06-13 13:37:43,254 - root - INFO - step: 4775 loss: 20.5843 memory: 6.46GiB(27.34%) tps: 24,539 tflops: 24.70 mfu: 7.92% global_avg_ntp_loss: 3.7206 global_avg_mtp_loss: 16.8637 +[titan] 2025-06-13 13:37:43,254 - root - INFO - lr: 4.2396e-04 gnorm: 1.26 [ 0:56:13< 2:00:24] +[titan] 2025-06-13 13:37:46,660 - root - INFO - step: 4780 loss: 20.5734 memory: 6.46GiB(27.34%) tps: 24,055 tflops: 24.21 mfu: 7.76% global_avg_ntp_loss: 3.6693 global_avg_mtp_loss: 16.9041 +[titan] 2025-06-13 13:37:46,660 - root - INFO - lr: 4.2377e-04 gnorm: 1.07 [ 
0:56:17< 2:00:20] +[titan] 2025-06-13 13:37:50,106 - root - INFO - step: 4785 loss: 21.1621 memory: 6.46GiB(27.34%) tps: 23,772 tflops: 23.92 mfu: 7.67% global_avg_ntp_loss: 3.9198 global_avg_mtp_loss: 17.2423 +[titan] 2025-06-13 13:37:50,106 - root - INFO - lr: 4.2358e-04 gnorm: 1.68 [ 0:56:20< 2:00:16] +[titan] 2025-06-13 13:37:53,287 - root - INFO - step: 4790 loss: 19.1667 memory: 6.46GiB(27.34%) tps: 25,758 tflops: 25.92 mfu: 8.31% global_avg_ntp_loss: 3.3715 global_avg_mtp_loss: 15.7952 +[titan] 2025-06-13 13:37:53,287 - root - INFO - lr: 4.2339e-04 gnorm: 1.33 [ 0:56:23< 2:00:12] +[titan] 2025-06-13 13:37:56,511 - root - INFO - step: 4795 loss: 16.5408 memory: 6.46GiB(27.34%) tps: 25,415 tflops: 25.58 mfu: 8.20% global_avg_ntp_loss: 2.9725 global_avg_mtp_loss: 13.5683 +[titan] 2025-06-13 13:37:56,511 - root - INFO - lr: 4.2320e-04 gnorm: 1.46 [ 0:56:26< 2:00:08] +[titan] 2025-06-13 13:37:59,236 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:37:59,832 - root - INFO - step: 4800 loss: 20.5551 memory: 6.46GiB(27.34%) tps: 24,669 tflops: 24.83 mfu: 7.96% global_avg_ntp_loss: 3.6770 global_avg_mtp_loss: 16.8781 +[titan] 2025-06-13 13:37:59,832 - root - INFO - lr: 4.2301e-04 gnorm: 1.12 [ 0:56:30< 2:00:04] +[titan] 2025-06-13 13:38:03,302 - root - INFO - step: 4805 loss: 18.8540 memory: 6.46GiB(27.34%) tps: 23,606 tflops: 23.76 mfu: 7.61% global_avg_ntp_loss: 3.3112 global_avg_mtp_loss: 15.5428 +[titan] 2025-06-13 13:38:03,303 - root - INFO - lr: 4.2282e-04 gnorm: 1.98 [ 0:56:33< 2:00:00] +[titan] 2025-06-13 13:38:06,672 - root - INFO - step: 4810 loss: 20.1212 memory: 6.46GiB(27.34%) tps: 24,320 tflops: 24.47 mfu: 7.84% global_avg_ntp_loss: 3.5566 global_avg_mtp_loss: 16.5646 +[titan] 2025-06-13 13:38:06,672 - root - INFO - lr: 4.2263e-04 gnorm: 1.19 [ 0:56:37< 1:59:56] +[titan] 2025-06-13 13:38:10,022 - root - INFO - step: 4815 loss: 20.8555 memory: 6.46GiB(27.34%) tps: 24,452 tflops: 24.61 mfu: 7.89% 
global_avg_ntp_loss: 3.7230 global_avg_mtp_loss: 17.1325 +[titan] 2025-06-13 13:38:10,022 - root - INFO - lr: 4.2244e-04 gnorm: 1.05 [ 0:56:40< 1:59:52] +[titan] 2025-06-13 13:38:13,589 - root - INFO - step: 4820 loss: 20.7643 memory: 6.46GiB(27.34%) tps: 22,968 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 3.7427 global_avg_mtp_loss: 17.0216 +[titan] 2025-06-13 13:38:13,590 - root - INFO - lr: 4.2225e-04 gnorm: 1.16 [ 0:56:43< 1:59:49] +[titan] 2025-06-13 13:38:17,188 - root - INFO - step: 4825 loss: 19.5681 memory: 6.46GiB(27.34%) tps: 22,766 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.4927 global_avg_mtp_loss: 16.0754 +[titan] 2025-06-13 13:38:17,189 - root - INFO - lr: 4.2206e-04 gnorm: 1.15 [ 0:56:47< 1:59:45] +[titan] 2025-06-13 13:38:20,606 - root - INFO - step: 4830 loss: 20.3435 memory: 6.46GiB(27.34%) tps: 23,969 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.6583 global_avg_mtp_loss: 16.6852 +[titan] 2025-06-13 13:38:20,607 - root - INFO - lr: 4.2186e-04 gnorm: 1.18 [ 0:56:51< 1:59:42] +[titan] 2025-06-13 13:38:24,075 - root - INFO - step: 4835 loss: 19.9356 memory: 6.46GiB(27.34%) tps: 23,620 tflops: 23.77 mfu: 7.62% global_avg_ntp_loss: 3.5799 global_avg_mtp_loss: 16.3558 +[titan] 2025-06-13 13:38:24,076 - root - INFO - lr: 4.2167e-04 gnorm: 1.16 [ 0:56:54< 1:59:38] +[titan] 2025-06-13 13:38:27,473 - root - INFO - step: 4840 loss: 20.5109 memory: 6.46GiB(27.34%) tps: 24,110 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 3.6659 global_avg_mtp_loss: 16.8450 +[titan] 2025-06-13 13:38:27,474 - root - INFO - lr: 4.2148e-04 gnorm: 1.37 [ 0:56:57< 1:59:34] +[titan] 2025-06-13 13:38:31,123 - root - INFO - step: 4845 loss: 21.3832 memory: 6.46GiB(27.34%) tps: 22,447 tflops: 22.59 mfu: 7.24% global_avg_ntp_loss: 3.8532 global_avg_mtp_loss: 17.5300 +[titan] 2025-06-13 13:38:31,124 - root - INFO - lr: 4.2129e-04 gnorm: 1.07 [ 0:57:01< 1:59:31] +[titan] 2025-06-13 13:38:33,833 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:38:34,399 - root - INFO - step: 4850 loss: 19.4808 memory: 6.46GiB(27.34%) tps: 25,013 tflops: 25.17 mfu: 8.07% global_avg_ntp_loss: 3.4788 global_avg_mtp_loss: 16.0020 +[titan] 2025-06-13 13:38:34,399 - root - INFO - lr: 4.2110e-04 gnorm: 1.45 [ 0:57:04< 1:59:27] +[titan] 2025-06-13 13:38:37,761 - root - INFO - step: 4855 loss: 19.9186 memory: 6.46GiB(27.34%) tps: 24,370 tflops: 24.53 mfu: 7.86% global_avg_ntp_loss: 3.5928 global_avg_mtp_loss: 16.3259 +[titan] 2025-06-13 13:38:37,761 - root - INFO - lr: 4.2091e-04 gnorm: 1.26 [ 0:57:08< 1:59:23] +[titan] 2025-06-13 13:38:41,291 - root - INFO - step: 4860 loss: 20.7666 memory: 6.46GiB(27.34%) tps: 23,208 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.7081 global_avg_mtp_loss: 17.0585 +[titan] 2025-06-13 13:38:41,291 - root - INFO - lr: 4.2071e-04 gnorm: 1.24 [ 0:57:11< 1:59:19] +[titan] 2025-06-13 13:38:44,526 - root - INFO - step: 4865 loss: 20.1320 memory: 6.46GiB(27.34%) tps: 25,324 tflops: 25.49 mfu: 8.17% global_avg_ntp_loss: 3.6057 global_avg_mtp_loss: 16.5264 +[titan] 2025-06-13 13:38:44,526 - root - INFO - lr: 4.2052e-04 gnorm: 1.07 [ 0:57:14< 1:59:15] +[titan] 2025-06-13 13:38:48,011 - root - INFO - step: 4870 loss: 21.5287 memory: 6.46GiB(27.34%) tps: 23,511 tflops: 23.66 mfu: 7.58% global_avg_ntp_loss: 3.9065 global_avg_mtp_loss: 17.6222 +[titan] 2025-06-13 13:38:48,011 - root - INFO - lr: 4.2033e-04 gnorm: 1.06 [ 0:57:18< 1:59:12] +[titan] 2025-06-13 13:38:51,880 - root - INFO - step: 4875 loss: 20.7712 memory: 6.46GiB(27.34%) tps: 21,174 tflops: 21.31 mfu: 6.83% global_avg_ntp_loss: 3.7243 global_avg_mtp_loss: 17.0469 +[titan] 2025-06-13 13:38:51,880 - root - INFO - lr: 4.2014e-04 gnorm: 1.16 [ 0:57:22< 1:59:09] +[titan] 2025-06-13 13:38:55,206 - root - INFO - step: 4880 loss: 20.7474 memory: 6.46GiB(27.34%) tps: 24,631 tflops: 24.79 mfu: 7.94% global_avg_ntp_loss: 3.6726 global_avg_mtp_loss: 17.0748 +[titan] 2025-06-13 13:38:55,207 - root - INFO - lr: 4.1994e-04 gnorm: 1.09 [ 
0:57:25< 1:59:05] +[titan] 2025-06-13 13:38:58,841 - root - INFO - step: 4885 loss: 21.7620 memory: 6.46GiB(27.34%) tps: 22,538 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.9192 global_avg_mtp_loss: 17.8429 +[titan] 2025-06-13 13:38:58,842 - root - INFO - lr: 4.1975e-04 gnorm: 1.07 [ 0:57:29< 1:59:02] +[titan] 2025-06-13 13:39:02,395 - root - INFO - step: 4890 loss: 19.0198 memory: 6.46GiB(27.34%) tps: 23,053 tflops: 23.20 mfu: 7.44% global_avg_ntp_loss: 3.3478 global_avg_mtp_loss: 15.6720 +[titan] 2025-06-13 13:39:02,396 - root - INFO - lr: 4.1956e-04 gnorm: 1.27 [ 0:57:32< 1:58:58] +[titan] 2025-06-13 13:39:05,382 - root - INFO - step: 4895 loss: 20.3056 memory: 6.46GiB(27.34%) tps: 27,431 tflops: 27.61 mfu: 8.85% global_avg_ntp_loss: 3.6452 global_avg_mtp_loss: 16.6604 +[titan] 2025-06-13 13:39:05,382 - root - INFO - lr: 4.1936e-04 gnorm: 1.45 [ 0:57:35< 1:58:53] +[titan] 2025-06-13 13:39:08,026 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:39:08,670 - root - INFO - step: 4900 loss: 20.7822 memory: 6.46GiB(27.34%) tps: 24,923 tflops: 25.08 mfu: 8.04% global_avg_ntp_loss: 3.6905 global_avg_mtp_loss: 17.0918 +[titan] 2025-06-13 13:39:08,670 - root - INFO - lr: 4.1917e-04 gnorm: 1.29 [ 0:57:39< 1:58:49] +[titan] 2025-06-13 13:39:12,102 - root - INFO - step: 4905 loss: 19.6818 memory: 6.46GiB(27.34%) tps: 23,871 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 3.4988 global_avg_mtp_loss: 16.1830 +[titan] 2025-06-13 13:39:12,102 - root - INFO - lr: 4.1898e-04 gnorm: 1.31 [ 0:57:42< 1:58:46] +[titan] 2025-06-13 13:39:15,619 - root - INFO - step: 4910 loss: 17.1969 memory: 6.46GiB(27.34%) tps: 23,297 tflops: 23.45 mfu: 7.51% global_avg_ntp_loss: 3.0014 global_avg_mtp_loss: 14.1955 +[titan] 2025-06-13 13:39:15,619 - root - INFO - lr: 4.1878e-04 gnorm: 1.79 [ 0:57:46< 1:58:42] +[titan] 2025-06-13 13:39:19,244 - root - INFO - step: 4915 loss: 20.9762 memory: 6.46GiB(27.34%) tps: 22,596 tflops: 22.74 mfu: 7.29% 
global_avg_ntp_loss: 3.7558 global_avg_mtp_loss: 17.2204 +[titan] 2025-06-13 13:39:19,245 - root - INFO - lr: 4.1859e-04 gnorm: 1.12 [ 0:57:49< 1:58:39] +[titan] 2025-06-13 13:39:22,859 - root - INFO - step: 4920 loss: 20.6816 memory: 6.46GiB(27.34%) tps: 22,665 tflops: 22.81 mfu: 7.31% global_avg_ntp_loss: 3.6743 global_avg_mtp_loss: 17.0073 +[titan] 2025-06-13 13:39:22,859 - root - INFO - lr: 4.1839e-04 gnorm: 1.27 [ 0:57:53< 1:58:35] +[titan] 2025-06-13 13:39:26,443 - root - INFO - step: 4925 loss: 19.3448 memory: 6.46GiB(27.34%) tps: 22,859 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.4283 global_avg_mtp_loss: 15.9166 +[titan] 2025-06-13 13:39:26,444 - root - INFO - lr: 4.1820e-04 gnorm: 1.25 [ 0:57:56< 1:58:32] +[titan] 2025-06-13 13:39:29,916 - root - INFO - step: 4930 loss: 20.5614 memory: 6.46GiB(27.34%) tps: 23,590 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.7105 global_avg_mtp_loss: 16.8509 +[titan] 2025-06-13 13:39:29,917 - root - INFO - lr: 4.1800e-04 gnorm: 1.19 [ 0:58:00< 1:58:28] +[titan] 2025-06-13 13:39:34,743 - root - INFO - step: 4935 loss: 21.8360 memory: 6.46GiB(27.34%) tps: 16,976 tflops: 17.08 mfu: 5.48% global_avg_ntp_loss: 4.0262 global_avg_mtp_loss: 17.8098 +[titan] 2025-06-13 13:39:34,743 - root - INFO - lr: 4.1781e-04 gnorm: 1.16 [ 0:58:05< 1:58:27] +[titan] 2025-06-13 13:39:38,481 - root - INFO - step: 4940 loss: 20.6036 memory: 6.46GiB(27.34%) tps: 21,919 tflops: 22.06 mfu: 7.07% global_avg_ntp_loss: 3.6810 global_avg_mtp_loss: 16.9226 +[titan] 2025-06-13 13:39:38,481 - root - INFO - lr: 4.1761e-04 gnorm: 1.26 [ 0:58:08< 1:58:24] +[titan] 2025-06-13 13:39:41,988 - root - INFO - step: 4945 loss: 20.2460 memory: 6.46GiB(27.34%) tps: 23,356 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 3.5923 global_avg_mtp_loss: 16.6538 +[titan] 2025-06-13 13:39:41,989 - root - INFO - lr: 4.1742e-04 gnorm: 1.10 [ 0:58:12< 1:58:21] +[titan] 2025-06-13 13:39:44,747 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:39:45,661 - root - INFO - step: 4950 loss: 21.2239 memory: 6.46GiB(27.34%) tps: 22,308 tflops: 22.45 mfu: 7.20% global_avg_ntp_loss: 3.7926 global_avg_mtp_loss: 17.4314 +[titan] 2025-06-13 13:39:45,662 - root - INFO - lr: 4.1722e-04 gnorm: 0.93 [ 0:58:16< 1:58:18] +[titan] 2025-06-13 13:39:48,906 - root - INFO - step: 4955 loss: 19.9913 memory: 6.46GiB(27.34%) tps: 25,249 tflops: 25.41 mfu: 8.14% global_avg_ntp_loss: 3.5997 global_avg_mtp_loss: 16.3916 +[titan] 2025-06-13 13:39:48,907 - root - INFO - lr: 4.1703e-04 gnorm: 1.22 [ 0:58:19< 1:58:13] +[titan] 2025-06-13 13:39:52,400 - root - INFO - step: 4960 loss: 19.9360 memory: 6.46GiB(27.34%) tps: 23,449 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.5286 global_avg_mtp_loss: 16.4074 +[titan] 2025-06-13 13:39:52,401 - root - INFO - lr: 4.1683e-04 gnorm: 1.30 [ 0:58:22< 1:58:10] +[titan] 2025-06-13 13:39:55,793 - root - INFO - step: 4965 loss: 19.5128 memory: 6.46GiB(27.34%) tps: 24,147 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.4400 global_avg_mtp_loss: 16.0729 +[titan] 2025-06-13 13:39:55,794 - root - INFO - lr: 4.1663e-04 gnorm: 1.27 [ 0:58:26< 1:58:06] +[titan] 2025-06-13 13:39:59,517 - root - INFO - step: 4970 loss: 20.1826 memory: 6.46GiB(27.34%) tps: 22,003 tflops: 22.14 mfu: 7.10% global_avg_ntp_loss: 3.6235 global_avg_mtp_loss: 16.5591 +[titan] 2025-06-13 13:39:59,517 - root - INFO - lr: 4.1644e-04 gnorm: 1.23 [ 0:58:29< 1:58:03] +[titan] 2025-06-13 13:40:02,926 - root - INFO - step: 4975 loss: 18.7316 memory: 6.46GiB(27.34%) tps: 24,030 tflops: 24.18 mfu: 7.75% global_avg_ntp_loss: 3.3548 global_avg_mtp_loss: 15.3768 +[titan] 2025-06-13 13:40:02,927 - root - INFO - lr: 4.1624e-04 gnorm: 1.50 [ 0:58:33< 1:57:59] +[titan] 2025-06-13 13:40:06,526 - root - INFO - step: 4980 loss: 20.9619 memory: 6.46GiB(27.34%) tps: 22,764 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.7564 global_avg_mtp_loss: 17.2055 +[titan] 2025-06-13 13:40:06,526 - root - INFO - lr: 4.1605e-04 gnorm: 1.05 [ 
0:58:36< 1:57:56] +[titan] 2025-06-13 13:40:09,807 - root - INFO - step: 4985 loss: 18.6729 memory: 6.46GiB(27.34%) tps: 24,971 tflops: 25.13 mfu: 8.05% global_avg_ntp_loss: 3.5049 global_avg_mtp_loss: 15.1680 +[titan] 2025-06-13 13:40:09,807 - root - INFO - lr: 4.1585e-04 gnorm: 1.98 [ 0:58:40< 1:57:52] +[titan] 2025-06-13 13:40:13,561 - root - INFO - step: 4990 loss: 19.0505 memory: 6.46GiB(27.34%) tps: 21,824 tflops: 21.96 mfu: 7.04% global_avg_ntp_loss: 3.3977 global_avg_mtp_loss: 15.6528 +[titan] 2025-06-13 13:40:13,561 - root - INFO - lr: 4.1565e-04 gnorm: 1.32 [ 0:58:43< 1:57:49] +[titan] 2025-06-13 13:40:17,091 - root - INFO - step: 4995 loss: 19.9575 memory: 6.46GiB(27.34%) tps: 23,211 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.6227 global_avg_mtp_loss: 16.3348 +[titan] 2025-06-13 13:40:17,091 - root - INFO - lr: 4.1545e-04 gnorm: 1.70 [ 0:58:47< 1:57:45] +[titan] 2025-06-13 13:40:20,132 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:40:21,085 - root - INFO - step: 5000 loss: 19.7599 memory: 6.46GiB(27.34%) tps: 20,511 tflops: 20.64 mfu: 6.62% global_avg_ntp_loss: 3.5047 global_avg_mtp_loss: 16.2552 +[titan] 2025-06-13 13:40:21,086 - root - INFO - lr: 4.1526e-04 gnorm: 1.38 [ 0:58:51< 1:57:42] +[titan] 2025-06-13 13:40:21,086 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-06-13 13:40:22,365 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds. +[titan] 2025-06-13 13:40:22,365 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 1.28 seconds. 
+[titan] 2025-06-13 13:40:25,643 - root - INFO - step: 5005 loss: 20.0620 memory: 6.46GiB(27.34%) tps: 17,976 tflops: 18.09 mfu: 5.80% global_avg_ntp_loss: 3.5444 global_avg_mtp_loss: 16.5177 +[titan] 2025-06-13 13:40:25,643 - root - INFO - lr: 4.1506e-04 gnorm: 1.26 [ 0:58:56< 1:57:41] +[titan] 2025-06-13 13:40:28,893 - root - INFO - step: 5010 loss: 20.5230 memory: 6.46GiB(27.34%) tps: 25,210 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.6015 global_avg_mtp_loss: 16.9216 +[titan] 2025-06-13 13:40:28,893 - root - INFO - lr: 4.1486e-04 gnorm: 1.40 [ 0:58:59< 1:57:37] +[titan] 2025-06-13 13:40:32,196 - root - INFO - step: 5015 loss: 19.8586 memory: 6.46GiB(27.34%) tps: 24,808 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 3.5551 global_avg_mtp_loss: 16.3036 +[titan] 2025-06-13 13:40:32,196 - root - INFO - lr: 4.1466e-04 gnorm: 1.33 [ 0:59:02< 1:57:33] +[titan] 2025-06-13 13:40:35,503 - root - INFO - step: 5020 loss: 19.9195 memory: 6.46GiB(27.34%) tps: 24,775 tflops: 24.93 mfu: 7.99% global_avg_ntp_loss: 3.5350 global_avg_mtp_loss: 16.3845 +[titan] 2025-06-13 13:40:35,503 - root - INFO - lr: 4.1447e-04 gnorm: 1.22 [ 0:59:05< 1:57:29] +[titan] 2025-06-13 13:40:38,965 - root - INFO - step: 5025 loss: 21.3214 memory: 6.46GiB(27.34%) tps: 23,661 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.8131 global_avg_mtp_loss: 17.5083 +[titan] 2025-06-13 13:40:38,966 - root - INFO - lr: 4.1427e-04 gnorm: 1.05 [ 0:59:09< 1:57:25] +[titan] 2025-06-13 13:40:42,670 - root - INFO - step: 5030 loss: 20.1700 memory: 6.46GiB(27.34%) tps: 22,118 tflops: 22.26 mfu: 7.13% global_avg_ntp_loss: 3.5901 global_avg_mtp_loss: 16.5799 +[titan] 2025-06-13 13:40:42,670 - root - INFO - lr: 4.1407e-04 gnorm: 1.10 [ 0:59:13< 1:57:22] +[titan] 2025-06-13 13:40:45,819 - root - INFO - step: 5035 loss: 21.2607 memory: 6.46GiB(27.34%) tps: 26,019 tflops: 26.18 mfu: 8.39% global_avg_ntp_loss: 3.7887 global_avg_mtp_loss: 17.4720 +[titan] 2025-06-13 13:40:45,819 - root - INFO - lr: 4.1387e-04 gnorm: 1.20 [ 
0:59:16< 1:57:18] +[titan] 2025-06-13 13:40:49,819 - root - INFO - step: 5040 loss: 20.2190 memory: 6.46GiB(27.34%) tps: 20,480 tflops: 20.61 mfu: 6.61% global_avg_ntp_loss: 3.5872 global_avg_mtp_loss: 16.6317 +[titan] 2025-06-13 13:40:49,820 - root - INFO - lr: 4.1367e-04 gnorm: 1.10 [ 0:59:20< 1:57:15] +[titan] 2025-06-13 13:40:53,090 - root - INFO - step: 5045 loss: 19.7298 memory: 6.46GiB(27.34%) tps: 25,049 tflops: 25.21 mfu: 8.08% global_avg_ntp_loss: 3.4919 global_avg_mtp_loss: 16.2378 +[titan] 2025-06-13 13:40:53,090 - root - INFO - lr: 4.1347e-04 gnorm: 1.13 [ 0:59:23< 1:57:11] +[titan] 2025-06-13 13:40:55,696 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:40:56,466 - root - INFO - step: 5050 loss: 20.5370 memory: 6.46GiB(27.34%) tps: 24,268 tflops: 24.42 mfu: 7.83% global_avg_ntp_loss: 3.7116 global_avg_mtp_loss: 16.8254 +[titan] 2025-06-13 13:40:56,467 - root - INFO - lr: 4.1327e-04 gnorm: 1.21 [ 0:59:26< 1:57:07] +[titan] 2025-06-13 13:41:00,200 - root - INFO - step: 5055 loss: 20.3314 memory: 6.46GiB(27.34%) tps: 21,945 tflops: 22.08 mfu: 7.08% global_avg_ntp_loss: 3.6379 global_avg_mtp_loss: 16.6935 +[titan] 2025-06-13 13:41:00,200 - root - INFO - lr: 4.1308e-04 gnorm: 1.31 [ 0:59:30< 1:57:04] +[titan] 2025-06-13 13:41:03,546 - root - INFO - step: 5060 loss: 21.8892 memory: 6.46GiB(27.34%) tps: 24,483 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 3.9043 global_avg_mtp_loss: 17.9848 +[titan] 2025-06-13 13:41:03,547 - root - INFO - lr: 4.1288e-04 gnorm: 1.35 [ 0:59:33< 1:57:00] +[titan] 2025-06-13 13:41:07,467 - root - INFO - step: 5065 loss: 20.5444 memory: 6.46GiB(27.34%) tps: 20,898 tflops: 21.03 mfu: 6.74% global_avg_ntp_loss: 3.6230 global_avg_mtp_loss: 16.9214 +[titan] 2025-06-13 13:41:07,467 - root - INFO - lr: 4.1268e-04 gnorm: 1.23 [ 0:59:37< 1:56:57] +[titan] 2025-06-13 13:41:11,109 - root - INFO - step: 5070 loss: 21.0748 memory: 6.46GiB(27.34%) tps: 22,499 tflops: 22.64 mfu: 7.26% 
global_avg_ntp_loss: 3.7829 global_avg_mtp_loss: 17.2919 +[titan] 2025-06-13 13:41:11,109 - root - INFO - lr: 4.1248e-04 gnorm: 1.09 [ 0:59:41< 1:56:54] +[titan] 2025-06-13 13:41:14,705 - root - INFO - step: 5075 loss: 19.4855 memory: 6.46GiB(27.34%) tps: 22,779 tflops: 22.92 mfu: 7.35% global_avg_ntp_loss: 3.4593 global_avg_mtp_loss: 16.0263 +[titan] 2025-06-13 13:41:14,706 - root - INFO - lr: 4.1228e-04 gnorm: 1.44 [ 0:59:45< 1:56:51] +[titan] 2025-06-13 13:41:18,188 - root - INFO - step: 5080 loss: 20.1023 memory: 6.46GiB(27.34%) tps: 23,527 tflops: 23.68 mfu: 7.59% global_avg_ntp_loss: 3.5490 global_avg_mtp_loss: 16.5533 +[titan] 2025-06-13 13:41:18,188 - root - INFO - lr: 4.1208e-04 gnorm: 1.08 [ 0:59:48< 1:56:47] +[titan] 2025-06-13 13:41:21,846 - root - INFO - step: 5085 loss: 21.1143 memory: 6.46GiB(27.34%) tps: 22,397 tflops: 22.54 mfu: 7.22% global_avg_ntp_loss: 3.7887 global_avg_mtp_loss: 17.3256 +[titan] 2025-06-13 13:41:21,846 - root - INFO - lr: 4.1188e-04 gnorm: 1.14 [ 0:59:52< 1:56:44] +[titan] 2025-06-13 13:41:25,439 - root - INFO - step: 5090 loss: 21.6007 memory: 6.46GiB(27.34%) tps: 22,801 tflops: 22.95 mfu: 7.35% global_avg_ntp_loss: 3.8989 global_avg_mtp_loss: 17.7018 +[titan] 2025-06-13 13:41:25,440 - root - INFO - lr: 4.1168e-04 gnorm: 1.04 [ 0:59:55< 1:56:40] +[titan] 2025-06-13 13:41:29,287 - root - INFO - step: 5095 loss: 20.0666 memory: 6.46GiB(27.34%) tps: 21,297 tflops: 21.43 mfu: 6.87% global_avg_ntp_loss: 3.6159 global_avg_mtp_loss: 16.4507 +[titan] 2025-06-13 13:41:29,287 - root - INFO - lr: 4.1148e-04 gnorm: 1.11 [ 0:59:59< 1:56:37] +[titan] 2025-06-13 13:41:32,078 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:41:32,712 - root - INFO - step: 5100 loss: 21.0782 memory: 6.46GiB(27.34%) tps: 23,920 tflops: 24.07 mfu: 7.72% global_avg_ntp_loss: 3.7758 global_avg_mtp_loss: 17.3024 +[titan] 2025-06-13 13:41:32,713 - root - INFO - lr: 4.1127e-04 gnorm: 1.04 [ 1:00:03< 1:56:34] +[titan] 2025-06-13 13:41:36,064 - root - INFO - step: 5105 loss: 19.9828 memory: 6.46GiB(27.34%) tps: 24,448 tflops: 24.60 mfu: 7.89% global_avg_ntp_loss: 3.5614 global_avg_mtp_loss: 16.4214 +[titan] 2025-06-13 13:41:36,064 - root - INFO - lr: 4.1107e-04 gnorm: 1.23 [ 1:00:06< 1:56:30] +[titan] 2025-06-13 13:41:39,701 - root - INFO - step: 5110 loss: 20.7564 memory: 6.46GiB(27.34%) tps: 22,525 tflops: 22.67 mfu: 7.27% global_avg_ntp_loss: 3.7007 global_avg_mtp_loss: 17.0557 +[titan] 2025-06-13 13:41:39,701 - root - INFO - lr: 4.1087e-04 gnorm: 1.12 [ 1:00:10< 1:56:27] +[titan] 2025-06-13 13:41:43,328 - root - INFO - step: 5115 loss: 19.5509 memory: 6.46GiB(27.34%) tps: 22,589 tflops: 22.73 mfu: 7.29% global_avg_ntp_loss: 3.4650 global_avg_mtp_loss: 16.0859 +[titan] 2025-06-13 13:41:43,328 - root - INFO - lr: 4.1067e-04 gnorm: 1.14 [ 1:00:13< 1:56:23] +[titan] 2025-06-13 13:41:46,630 - root - INFO - step: 5120 loss: 20.6872 memory: 6.46GiB(27.34%) tps: 24,813 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 3.6712 global_avg_mtp_loss: 17.0160 +[titan] 2025-06-13 13:41:46,631 - root - INFO - lr: 4.1047e-04 gnorm: 1.13 [ 1:00:16< 1:56:19] +[titan] 2025-06-13 13:41:46,761 - root - INFO - Dumping profiler traces at step 5120 +[titan] 2025-06-13 13:41:46,857 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 13:41:50,190 - root - INFO - step: 5125 loss: 21.7946 memory: 6.46GiB(27.34%) tps: 23,017 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 4.0159 global_avg_mtp_loss: 17.7787 +[titan] 2025-06-13 13:41:50,190 - root - INFO - lr: 4.1027e-04 gnorm: 1.30 [ 1:00:20< 1:56:16] +[titan] 2025-06-13 13:41:53,545 - root - INFO - step: 5130 loss: 19.8216 memory: 
6.46GiB(27.34%) tps: 24,418 tflops: 24.57 mfu: 7.88% global_avg_ntp_loss: 3.5428 global_avg_mtp_loss: 16.2788 +[titan] 2025-06-13 13:41:53,545 - root - INFO - lr: 4.1007e-04 gnorm: 1.42 [ 1:00:23< 1:56:12] +[titan] 2025-06-13 13:41:57,140 - root - INFO - step: 5135 loss: 20.1172 memory: 6.46GiB(27.34%) tps: 22,794 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.5188 global_avg_mtp_loss: 16.5984 +[titan] 2025-06-13 13:41:57,140 - root - INFO - lr: 4.0986e-04 gnorm: 1.06 [ 1:00:27< 1:56:08] +[titan] 2025-06-13 13:42:00,904 - root - INFO - step: 5140 loss: 20.8329 memory: 6.46GiB(27.34%) tps: 21,762 tflops: 21.90 mfu: 7.02% global_avg_ntp_loss: 3.6746 global_avg_mtp_loss: 17.1583 +[titan] 2025-06-13 13:42:00,905 - root - INFO - lr: 4.0966e-04 gnorm: 1.05 [ 1:00:31< 1:56:05] +[titan] 2025-06-13 13:42:04,279 - root - INFO - step: 5145 loss: 16.6713 memory: 6.46GiB(27.34%) tps: 24,276 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 2.9964 global_avg_mtp_loss: 13.6749 +[titan] 2025-06-13 13:42:04,280 - root - INFO - lr: 4.0946e-04 gnorm: 1.34 [ 1:00:34< 1:56:01] +[titan] 2025-06-13 13:42:07,465 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:42:07,909 - root - INFO - step: 5150 loss: 21.2448 memory: 6.46GiB(27.34%) tps: 22,573 tflops: 22.72 mfu: 7.28% global_avg_ntp_loss: 3.9262 global_avg_mtp_loss: 17.3186 +[titan] 2025-06-13 13:42:07,909 - root - INFO - lr: 4.0926e-04 gnorm: 1.76 [ 1:00:38< 1:55:58] +[titan] 2025-06-13 13:42:11,262 - root - INFO - step: 5155 loss: 21.0015 memory: 6.46GiB(27.34%) tps: 24,439 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.7469 global_avg_mtp_loss: 17.2546 +[titan] 2025-06-13 13:42:11,262 - root - INFO - lr: 4.0905e-04 gnorm: 1.19 [ 1:00:41< 1:55:54] +[titan] 2025-06-13 13:42:14,312 - root - INFO - step: 5160 loss: 20.4612 memory: 6.46GiB(27.34%) tps: 26,861 tflops: 27.03 mfu: 8.66% global_avg_ntp_loss: 3.7162 global_avg_mtp_loss: 16.7450 +[titan] 2025-06-13 13:42:14,312 - root - INFO - lr: 4.0885e-04 gnorm: 1.35 [ 1:00:44< 1:55:50] +[titan] 2025-06-13 13:42:17,563 - root - INFO - step: 5165 loss: 18.0510 memory: 6.46GiB(27.34%) tps: 25,201 tflops: 25.36 mfu: 8.13% global_avg_ntp_loss: 3.1593 global_avg_mtp_loss: 14.8917 +[titan] 2025-06-13 13:42:17,563 - root - INFO - lr: 4.0865e-04 gnorm: 1.23 [ 1:00:47< 1:55:46] +[titan] 2025-06-13 13:42:21,052 - root - INFO - step: 5170 loss: 19.7744 memory: 6.46GiB(27.34%) tps: 23,484 tflops: 23.63 mfu: 7.58% global_avg_ntp_loss: 3.5603 global_avg_mtp_loss: 16.2141 +[titan] 2025-06-13 13:42:21,052 - root - INFO - lr: 4.0845e-04 gnorm: 1.14 [ 1:00:51< 1:55:42] +[titan] 2025-06-13 13:42:24,928 - root - INFO - step: 5175 loss: 20.3058 memory: 6.46GiB(27.34%) tps: 21,138 tflops: 21.27 mfu: 6.82% global_avg_ntp_loss: 3.5971 global_avg_mtp_loss: 16.7086 +[titan] 2025-06-13 13:42:24,928 - root - INFO - lr: 4.0824e-04 gnorm: 1.07 [ 1:00:55< 1:55:39] +[titan] 2025-06-13 13:42:28,507 - root - INFO - step: 5180 loss: 20.2228 memory: 6.46GiB(27.34%) tps: 22,890 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 3.6290 global_avg_mtp_loss: 16.5938 +[titan] 2025-06-13 13:42:28,507 - root - INFO - lr: 4.0804e-04 gnorm: 1.18 [ 
1:00:58< 1:55:36] +[titan] 2025-06-13 13:42:32,375 - root - INFO - step: 5185 loss: 19.8839 memory: 6.46GiB(27.34%) tps: 21,179 tflops: 21.31 mfu: 6.83% global_avg_ntp_loss: 3.5271 global_avg_mtp_loss: 16.3568 +[titan] 2025-06-13 13:42:32,376 - root - INFO - lr: 4.0784e-04 gnorm: 1.13 [ 1:01:02< 1:55:33] +[titan] 2025-06-13 13:42:35,520 - root - INFO - step: 5190 loss: 18.4873 memory: 6.46GiB(27.34%) tps: 26,052 tflops: 26.22 mfu: 8.40% global_avg_ntp_loss: 3.2234 global_avg_mtp_loss: 15.2638 +[titan] 2025-06-13 13:42:35,521 - root - INFO - lr: 4.0763e-04 gnorm: 1.26 [ 1:01:05< 1:55:29] +[titan] 2025-06-13 13:42:38,933 - root - INFO - step: 5195 loss: 20.8652 memory: 6.46GiB(27.34%) tps: 24,011 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.6946 global_avg_mtp_loss: 17.1706 +[titan] 2025-06-13 13:42:38,933 - root - INFO - lr: 4.0743e-04 gnorm: 1.36 [ 1:01:09< 1:55:25] +[titan] 2025-06-13 13:42:41,823 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:42:42,343 - root - INFO - step: 5200 loss: 20.6995 memory: 6.46GiB(27.34%) tps: 24,025 tflops: 24.18 mfu: 7.75% global_avg_ntp_loss: 3.6684 global_avg_mtp_loss: 17.0311 +[titan] 2025-06-13 13:42:42,343 - root - INFO - lr: 4.0722e-04 gnorm: 1.08 [ 1:01:12< 1:55:21] +[titan] 2025-06-13 13:42:45,567 - root - INFO - step: 5205 loss: 20.0290 memory: 6.46GiB(27.34%) tps: 25,414 tflops: 25.58 mfu: 8.20% global_avg_ntp_loss: 3.5472 global_avg_mtp_loss: 16.4817 +[titan] 2025-06-13 13:42:45,567 - root - INFO - lr: 4.0702e-04 gnorm: 1.03 [ 1:01:15< 1:55:17] +[titan] 2025-06-13 13:42:48,660 - root - INFO - step: 5210 loss: 19.8998 memory: 6.46GiB(27.34%) tps: 26,486 tflops: 26.65 mfu: 8.54% global_avg_ntp_loss: 3.5244 global_avg_mtp_loss: 16.3755 +[titan] 2025-06-13 13:42:48,661 - root - INFO - lr: 4.0681e-04 gnorm: 1.12 [ 1:01:19< 1:55:13] +[titan] 2025-06-13 13:42:52,092 - root - INFO - step: 5215 loss: 20.9277 memory: 6.46GiB(27.34%) tps: 23,873 tflops: 24.03 mfu: 7.70% 
global_avg_ntp_loss: 3.7394 global_avg_mtp_loss: 17.1883 +[titan] 2025-06-13 13:42:52,093 - root - INFO - lr: 4.0661e-04 gnorm: 1.15 [ 1:01:22< 1:55:09] +[titan] 2025-06-13 13:42:55,671 - root - INFO - step: 5220 loss: 20.6882 memory: 6.46GiB(27.34%) tps: 22,893 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 3.6580 global_avg_mtp_loss: 17.0302 +[titan] 2025-06-13 13:42:55,672 - root - INFO - lr: 4.0640e-04 gnorm: 1.15 [ 1:01:26< 1:55:06] +[titan] 2025-06-13 13:42:58,608 - root - INFO - step: 5225 loss: 19.4552 memory: 6.46GiB(27.34%) tps: 27,905 tflops: 28.08 mfu: 9.00% global_avg_ntp_loss: 3.4453 global_avg_mtp_loss: 16.0099 +[titan] 2025-06-13 13:42:58,608 - root - INFO - lr: 4.0620e-04 gnorm: 1.30 [ 1:01:28< 1:55:01] +[titan] 2025-06-13 13:43:01,694 - root - INFO - step: 5230 loss: 20.2478 memory: 6.46GiB(27.34%) tps: 26,547 tflops: 26.72 mfu: 8.56% global_avg_ntp_loss: 3.6680 global_avg_mtp_loss: 16.5798 +[titan] 2025-06-13 13:43:01,695 - root - INFO - lr: 4.0599e-04 gnorm: 1.25 [ 1:01:32< 1:54:57] +[titan] 2025-06-13 13:43:05,227 - root - INFO - step: 5235 loss: 20.9857 memory: 6.46GiB(27.34%) tps: 23,195 tflops: 23.34 mfu: 7.48% global_avg_ntp_loss: 3.7874 global_avg_mtp_loss: 17.1982 +[titan] 2025-06-13 13:43:05,227 - root - INFO - lr: 4.0579e-04 gnorm: 1.16 [ 1:01:35< 1:54:53] +[titan] 2025-06-13 13:43:09,125 - root - INFO - step: 5240 loss: 20.3712 memory: 6.46GiB(27.34%) tps: 21,014 tflops: 21.15 mfu: 6.78% global_avg_ntp_loss: 3.6527 global_avg_mtp_loss: 16.7185 +[titan] 2025-06-13 13:43:09,126 - root - INFO - lr: 4.0558e-04 gnorm: 1.18 [ 1:01:39< 1:54:50] +[titan] 2025-06-13 13:43:12,726 - root - INFO - step: 5245 loss: 21.1312 memory: 6.46GiB(27.34%) tps: 22,753 tflops: 22.90 mfu: 7.34% global_avg_ntp_loss: 3.7638 global_avg_mtp_loss: 17.3674 +[titan] 2025-06-13 13:43:12,726 - root - INFO - lr: 4.0538e-04 gnorm: 1.02 [ 1:01:43< 1:54:47] +[titan] 2025-06-13 13:43:15,318 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:43:16,223 - root - INFO - step: 5250 loss: 20.5451 memory: 6.46GiB(27.34%) tps: 23,431 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.6547 global_avg_mtp_loss: 16.8904 +[titan] 2025-06-13 13:43:16,223 - root - INFO - lr: 4.0517e-04 gnorm: 1.28 [ 1:01:46< 1:54:43] +[titan] 2025-06-13 13:43:19,973 - root - INFO - step: 5255 loss: 21.4791 memory: 6.46GiB(27.34%) tps: 21,844 tflops: 21.98 mfu: 7.05% global_avg_ntp_loss: 3.8232 global_avg_mtp_loss: 17.6560 +[titan] 2025-06-13 13:43:19,974 - root - INFO - lr: 4.0497e-04 gnorm: 1.12 [ 1:01:50< 1:54:40] +[titan] 2025-06-13 13:43:23,329 - root - INFO - step: 5260 loss: 18.2442 memory: 6.46GiB(27.34%) tps: 24,416 tflops: 24.57 mfu: 7.88% global_avg_ntp_loss: 3.2495 global_avg_mtp_loss: 14.9947 +[titan] 2025-06-13 13:43:23,330 - root - INFO - lr: 4.0476e-04 gnorm: 1.91 [ 1:01:53< 1:54:36] +[titan] 2025-06-13 13:43:26,852 - root - INFO - step: 5265 loss: 21.3231 memory: 6.46GiB(27.34%) tps: 23,255 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 3.7912 global_avg_mtp_loss: 17.5318 +[titan] 2025-06-13 13:43:26,853 - root - INFO - lr: 4.0455e-04 gnorm: 1.24 [ 1:01:57< 1:54:33] +[titan] 2025-06-13 13:43:30,322 - root - INFO - step: 5270 loss: 20.8194 memory: 6.46GiB(27.34%) tps: 23,613 tflops: 23.76 mfu: 7.62% global_avg_ntp_loss: 3.7240 global_avg_mtp_loss: 17.0954 +[titan] 2025-06-13 13:43:30,322 - root - INFO - lr: 4.0435e-04 gnorm: 1.35 [ 1:02:00< 1:54:29] +[titan] 2025-06-13 13:43:34,527 - root - INFO - step: 5275 loss: 20.2807 memory: 6.46GiB(27.34%) tps: 19,483 tflops: 19.61 mfu: 6.28% global_avg_ntp_loss: 3.6530 global_avg_mtp_loss: 16.6277 +[titan] 2025-06-13 13:43:34,527 - root - INFO - lr: 4.0414e-04 gnorm: 1.57 [ 1:02:04< 1:54:27] +[titan] 2025-06-13 13:43:37,908 - root - INFO - step: 5280 loss: 19.5049 memory: 6.46GiB(27.34%) tps: 24,233 tflops: 24.39 mfu: 7.82% global_avg_ntp_loss: 3.5089 global_avg_mtp_loss: 15.9961 +[titan] 2025-06-13 13:43:37,908 - root - INFO - lr: 4.0393e-04 gnorm: 1.30 [ 
1:02:08< 1:54:23] +[titan] 2025-06-13 13:43:41,560 - root - INFO - step: 5285 loss: 20.6577 memory: 6.46GiB(27.34%) tps: 22,433 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 3.6721 global_avg_mtp_loss: 16.9856 +[titan] 2025-06-13 13:43:41,561 - root - INFO - lr: 4.0373e-04 gnorm: 1.23 [ 1:02:11< 1:54:20] +[titan] 2025-06-13 13:43:45,222 - root - INFO - step: 5290 loss: 19.6953 memory: 6.46GiB(27.34%) tps: 22,378 tflops: 22.52 mfu: 7.22% global_avg_ntp_loss: 3.5145 global_avg_mtp_loss: 16.1808 +[titan] 2025-06-13 13:43:45,222 - root - INFO - lr: 4.0352e-04 gnorm: 1.20 [ 1:02:15< 1:54:16] +[titan] 2025-06-13 13:43:48,559 - root - INFO - step: 5295 loss: 20.0516 memory: 6.46GiB(27.34%) tps: 24,545 tflops: 24.70 mfu: 7.92% global_avg_ntp_loss: 3.4836 global_avg_mtp_loss: 16.5680 +[titan] 2025-06-13 13:43:48,560 - root - INFO - lr: 4.0331e-04 gnorm: 1.32 [ 1:02:18< 1:54:12] +[titan] 2025-06-13 13:43:51,292 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:43:51,836 - root - INFO - step: 5300 loss: 19.2229 memory: 6.46GiB(27.34%) tps: 25,002 tflops: 25.16 mfu: 8.06% global_avg_ntp_loss: 3.3689 global_avg_mtp_loss: 15.8540 +[titan] 2025-06-13 13:43:51,837 - root - INFO - lr: 4.0311e-04 gnorm: 1.40 [ 1:02:22< 1:54:08] +[titan] 2025-06-13 13:43:55,525 - root - INFO - step: 5305 loss: 20.4740 memory: 6.46GiB(27.34%) tps: 22,212 tflops: 22.35 mfu: 7.16% global_avg_ntp_loss: 3.5903 global_avg_mtp_loss: 16.8837 +[titan] 2025-06-13 13:43:55,525 - root - INFO - lr: 4.0290e-04 gnorm: 1.06 [ 1:02:25< 1:54:05] +[titan] 2025-06-13 13:43:58,806 - root - INFO - step: 5310 loss: 20.9225 memory: 6.46GiB(27.34%) tps: 24,969 tflops: 25.13 mfu: 8.05% global_avg_ntp_loss: 3.7039 global_avg_mtp_loss: 17.2187 +[titan] 2025-06-13 13:43:58,806 - root - INFO - lr: 4.0269e-04 gnorm: 1.14 [ 1:02:29< 1:54:01] +[titan] 2025-06-13 13:44:02,231 - root - INFO - step: 5315 loss: 20.9951 memory: 6.46GiB(27.34%) tps: 23,921 tflops: 24.07 mfu: 7.72% 
global_avg_ntp_loss: 3.7014 global_avg_mtp_loss: 17.2937 +[titan] 2025-06-13 13:44:02,231 - root - INFO - lr: 4.0248e-04 gnorm: 1.20 [ 1:02:32< 1:53:57] +[titan] 2025-06-13 13:44:05,398 - root - INFO - step: 5320 loss: 18.9567 memory: 6.46GiB(27.34%) tps: 25,868 tflops: 26.03 mfu: 8.34% global_avg_ntp_loss: 3.4025 global_avg_mtp_loss: 15.5543 +[titan] 2025-06-13 13:44:05,398 - root - INFO - lr: 4.0227e-04 gnorm: 1.32 [ 1:02:35< 1:53:53] +[titan] 2025-06-13 13:44:09,019 - root - INFO - step: 5325 loss: 20.0602 memory: 6.46GiB(27.34%) tps: 22,624 tflops: 22.77 mfu: 7.30% global_avg_ntp_loss: 3.5640 global_avg_mtp_loss: 16.4961 +[titan] 2025-06-13 13:44:09,020 - root - INFO - lr: 4.0207e-04 gnorm: 1.13 [ 1:02:39< 1:53:50] +[titan] 2025-06-13 13:44:12,582 - root - INFO - step: 5330 loss: 20.3320 memory: 6.46GiB(27.34%) tps: 22,998 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.5793 global_avg_mtp_loss: 16.7527 +[titan] 2025-06-13 13:44:12,582 - root - INFO - lr: 4.0186e-04 gnorm: 1.23 [ 1:02:42< 1:53:46] +[titan] 2025-06-13 13:44:16,137 - root - INFO - step: 5335 loss: 21.4925 memory: 6.46GiB(27.34%) tps: 23,048 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.8784 global_avg_mtp_loss: 17.6140 +[titan] 2025-06-13 13:44:16,137 - root - INFO - lr: 4.0165e-04 gnorm: 1.07 [ 1:02:46< 1:53:43] +[titan] 2025-06-13 13:44:19,316 - root - INFO - step: 5340 loss: 20.8993 memory: 6.46GiB(27.34%) tps: 25,769 tflops: 25.93 mfu: 8.31% global_avg_ntp_loss: 3.7031 global_avg_mtp_loss: 17.1962 +[titan] 2025-06-13 13:44:19,316 - root - INFO - lr: 4.0144e-04 gnorm: 1.14 [ 1:02:49< 1:53:39] +[titan] 2025-06-13 13:44:22,905 - root - INFO - step: 5345 loss: 20.1072 memory: 6.46GiB(27.34%) tps: 22,828 tflops: 22.97 mfu: 7.36% global_avg_ntp_loss: 3.5510 global_avg_mtp_loss: 16.5561 +[titan] 2025-06-13 13:44:22,906 - root - INFO - lr: 4.0123e-04 gnorm: 1.14 [ 1:02:53< 1:53:35] +[titan] 2025-06-13 13:44:25,716 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:44:26,475 - root - INFO - step: 5350 loss: 20.0810 memory: 6.46GiB(27.34%) tps: 22,954 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.5669 global_avg_mtp_loss: 16.5141 +[titan] 2025-06-13 13:44:26,475 - root - INFO - lr: 4.0102e-04 gnorm: 1.17 [ 1:02:56< 1:53:32] +[titan] 2025-06-13 13:44:29,882 - root - INFO - step: 5355 loss: 21.0716 memory: 6.46GiB(27.34%) tps: 24,048 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.8078 global_avg_mtp_loss: 17.2638 +[titan] 2025-06-13 13:44:29,882 - root - INFO - lr: 4.0081e-04 gnorm: 1.21 [ 1:03:00< 1:53:28] +[titan] 2025-06-13 13:44:33,261 - root - INFO - step: 5360 loss: 20.7055 memory: 6.46GiB(27.34%) tps: 24,246 tflops: 24.40 mfu: 7.82% global_avg_ntp_loss: 3.6491 global_avg_mtp_loss: 17.0564 +[titan] 2025-06-13 13:44:33,261 - root - INFO - lr: 4.0060e-04 gnorm: 1.10 [ 1:03:03< 1:53:24] +[titan] 2025-06-13 13:44:36,855 - root - INFO - step: 5365 loss: 20.1677 memory: 6.46GiB(27.34%) tps: 22,795 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.5236 global_avg_mtp_loss: 16.6441 +[titan] 2025-06-13 13:44:36,855 - root - INFO - lr: 4.0039e-04 gnorm: 1.25 [ 1:03:07< 1:53:21] +[titan] 2025-06-13 13:44:40,460 - root - INFO - step: 5370 loss: 20.2802 memory: 6.46GiB(27.34%) tps: 22,725 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.5748 global_avg_mtp_loss: 16.7054 +[titan] 2025-06-13 13:44:40,460 - root - INFO - lr: 4.0018e-04 gnorm: 1.11 [ 1:03:10< 1:53:18] +[titan] 2025-06-13 13:44:44,114 - root - INFO - step: 5375 loss: 21.2207 memory: 6.46GiB(27.34%) tps: 22,419 tflops: 22.56 mfu: 7.23% global_avg_ntp_loss: 3.7227 global_avg_mtp_loss: 17.4981 +[titan] 2025-06-13 13:44:44,115 - root - INFO - lr: 3.9997e-04 gnorm: 1.15 [ 1:03:14< 1:53:14] +[titan] 2025-06-13 13:44:47,226 - root - INFO - step: 5380 loss: 19.6526 memory: 6.46GiB(27.34%) tps: 26,330 tflops: 26.50 mfu: 8.49% global_avg_ntp_loss: 3.4258 global_avg_mtp_loss: 16.2269 +[titan] 2025-06-13 13:44:47,226 - root - INFO - lr: 3.9976e-04 gnorm: 2.16 [ 
1:03:17< 1:53:10] +[titan] 2025-06-13 13:44:50,752 - root - INFO - step: 5385 loss: 19.8570 memory: 6.46GiB(27.34%) tps: 23,237 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.5435 global_avg_mtp_loss: 16.3135 +[titan] 2025-06-13 13:44:50,752 - root - INFO - lr: 3.9955e-04 gnorm: 1.28 [ 1:03:21< 1:53:06] +[titan] 2025-06-13 13:44:54,026 - root - INFO - step: 5390 loss: 18.5574 memory: 6.46GiB(27.34%) tps: 25,026 tflops: 25.19 mfu: 8.07% global_avg_ntp_loss: 3.2647 global_avg_mtp_loss: 15.2928 +[titan] 2025-06-13 13:44:54,026 - root - INFO - lr: 3.9934e-04 gnorm: 2.01 [ 1:03:24< 1:53:02] +[titan] 2025-06-13 13:44:57,546 - root - INFO - step: 5395 loss: 20.3136 memory: 6.46GiB(27.34%) tps: 23,274 tflops: 23.42 mfu: 7.51% global_avg_ntp_loss: 3.6298 global_avg_mtp_loss: 16.6838 +[titan] 2025-06-13 13:44:57,546 - root - INFO - lr: 3.9913e-04 gnorm: 1.55 [ 1:03:27< 1:52:59] +[titan] 2025-06-13 13:45:00,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:45:00,939 - root - INFO - step: 5400 loss: 21.2209 memory: 6.46GiB(27.34%) tps: 24,149 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.7900 global_avg_mtp_loss: 17.4309 +[titan] 2025-06-13 13:45:00,939 - root - INFO - lr: 3.9892e-04 gnorm: 1.06 [ 1:03:31< 1:52:55] +[titan] 2025-06-13 13:45:04,318 - root - INFO - step: 5405 loss: 20.4522 memory: 6.46GiB(27.34%) tps: 24,244 tflops: 24.40 mfu: 7.82% global_avg_ntp_loss: 3.6238 global_avg_mtp_loss: 16.8284 +[titan] 2025-06-13 13:45:04,318 - root - INFO - lr: 3.9871e-04 gnorm: 1.18 [ 1:03:34< 1:52:51] +[titan] 2025-06-13 13:45:07,486 - root - INFO - step: 5410 loss: 21.0858 memory: 6.46GiB(27.34%) tps: 25,863 tflops: 26.03 mfu: 8.34% global_avg_ntp_loss: 3.7448 global_avg_mtp_loss: 17.3410 +[titan] 2025-06-13 13:45:07,486 - root - INFO - lr: 3.9850e-04 gnorm: 1.10 [ 1:03:37< 1:52:47] +[titan] 2025-06-13 13:45:10,960 - root - INFO - step: 5415 loss: 20.3197 memory: 6.46GiB(27.34%) tps: 23,581 tflops: 23.73 mfu: 7.61% 
global_avg_ntp_loss: 3.6370 global_avg_mtp_loss: 16.6827 +[titan] 2025-06-13 13:45:10,961 - root - INFO - lr: 3.9829e-04 gnorm: 1.32 [ 1:03:41< 1:52:44] +[titan] 2025-06-13 13:45:14,424 - root - INFO - step: 5420 loss: 19.9950 memory: 6.46GiB(27.34%) tps: 23,656 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.5842 global_avg_mtp_loss: 16.4108 +[titan] 2025-06-13 13:45:14,424 - root - INFO - lr: 3.9808e-04 gnorm: 1.39 [ 1:03:44< 1:52:40] +[titan] 2025-06-13 13:45:17,790 - root - INFO - step: 5425 loss: 20.5364 memory: 6.46GiB(27.34%) tps: 24,341 tflops: 24.50 mfu: 7.85% global_avg_ntp_loss: 3.6309 global_avg_mtp_loss: 16.9055 +[titan] 2025-06-13 13:45:17,790 - root - INFO - lr: 3.9787e-04 gnorm: 1.24 [ 1:03:48< 1:52:36] +[titan] 2025-06-13 13:45:21,372 - root - INFO - step: 5430 loss: 21.7382 memory: 6.46GiB(27.34%) tps: 22,873 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 3.9195 global_avg_mtp_loss: 17.8187 +[titan] 2025-06-13 13:45:21,372 - root - INFO - lr: 3.9766e-04 gnorm: 1.04 [ 1:03:51< 1:52:33] +[titan] 2025-06-13 13:45:24,919 - root - INFO - step: 5435 loss: 20.3838 memory: 6.46GiB(27.34%) tps: 23,097 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 3.6635 global_avg_mtp_loss: 16.7203 +[titan] 2025-06-13 13:45:24,920 - root - INFO - lr: 3.9744e-04 gnorm: 1.19 [ 1:03:55< 1:52:29] +[titan] 2025-06-13 13:45:28,194 - root - INFO - step: 5440 loss: 21.5603 memory: 6.46GiB(27.34%) tps: 25,019 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 3.8470 global_avg_mtp_loss: 17.7133 +[titan] 2025-06-13 13:45:28,194 - root - INFO - lr: 3.9723e-04 gnorm: 1.08 [ 1:03:58< 1:52:25] +[titan] 2025-06-13 13:45:31,790 - root - INFO - step: 5445 loss: 21.2635 memory: 6.46GiB(27.34%) tps: 22,784 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.8069 global_avg_mtp_loss: 17.4565 +[titan] 2025-06-13 13:45:31,790 - root - INFO - lr: 3.9702e-04 gnorm: 1.05 [ 1:04:02< 1:52:22] +[titan] 2025-06-13 13:45:34,360 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:45:35,006 - root - INFO - step: 5450 loss: 20.2524 memory: 6.46GiB(27.34%) tps: 25,479 tflops: 25.64 mfu: 8.22% global_avg_ntp_loss: 3.5649 global_avg_mtp_loss: 16.6875 +[titan] 2025-06-13 13:45:35,006 - root - INFO - lr: 3.9681e-04 gnorm: 1.11 [ 1:04:05< 1:52:18] +[titan] 2025-06-13 13:45:38,085 - root - INFO - step: 5455 loss: 20.1903 memory: 6.46GiB(27.34%) tps: 26,610 tflops: 26.78 mfu: 8.58% global_avg_ntp_loss: 3.5908 global_avg_mtp_loss: 16.5995 +[titan] 2025-06-13 13:45:38,085 - root - INFO - lr: 3.9660e-04 gnorm: 1.34 [ 1:04:08< 1:52:13] +[titan] 2025-06-13 13:45:41,629 - root - INFO - step: 5460 loss: 19.2958 memory: 6.46GiB(27.34%) tps: 23,117 tflops: 23.26 mfu: 7.46% global_avg_ntp_loss: 3.4427 global_avg_mtp_loss: 15.8531 +[titan] 2025-06-13 13:45:41,629 - root - INFO - lr: 3.9638e-04 gnorm: 1.20 [ 1:04:11< 1:52:10] +[titan] 2025-06-13 13:45:44,907 - root - INFO - step: 5465 loss: 21.0281 memory: 6.46GiB(27.34%) tps: 24,993 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 3.7577 global_avg_mtp_loss: 17.2704 +[titan] 2025-06-13 13:45:44,908 - root - INFO - lr: 3.9617e-04 gnorm: 1.16 [ 1:04:15< 1:52:06] +[titan] 2025-06-13 13:45:48,321 - root - INFO - step: 5470 loss: 20.6653 memory: 6.46GiB(27.34%) tps: 24,001 tflops: 24.15 mfu: 7.74% global_avg_ntp_loss: 3.6556 global_avg_mtp_loss: 17.0098 +[titan] 2025-06-13 13:45:48,321 - root - INFO - lr: 3.9596e-04 gnorm: 1.13 [ 1:04:18< 1:52:02] +[titan] 2025-06-13 13:45:54,216 - root - INFO - step: 5475 loss: 21.5199 memory: 6.46GiB(27.34%) tps: 13,897 tflops: 13.99 mfu: 4.48% global_avg_ntp_loss: 3.8473 global_avg_mtp_loss: 17.6725 +[titan] 2025-06-13 13:45:54,216 - root - INFO - lr: 3.9575e-04 gnorm: 1.21 [ 1:04:24< 1:52:03] +[titan] 2025-06-13 13:45:57,224 - root - INFO - step: 5480 loss: 18.4502 memory: 6.46GiB(27.34%) tps: 27,242 tflops: 27.42 mfu: 8.79% global_avg_ntp_loss: 3.3686 global_avg_mtp_loss: 15.0816 +[titan] 2025-06-13 13:45:57,225 - root - INFO - lr: 3.9553e-04 gnorm: 2.78 [ 
1:04:27< 1:51:58] +[titan] 2025-06-13 13:46:00,438 - root - INFO - step: 5485 loss: 18.2763 memory: 6.46GiB(27.34%) tps: 25,498 tflops: 25.66 mfu: 8.22% global_avg_ntp_loss: 3.3190 global_avg_mtp_loss: 14.9572 +[titan] 2025-06-13 13:46:00,438 - root - INFO - lr: 3.9532e-04 gnorm: 1.93 [ 1:04:30< 1:51:54] +[titan] 2025-06-13 13:46:03,639 - root - INFO - step: 5490 loss: 19.7445 memory: 6.46GiB(27.34%) tps: 25,594 tflops: 25.76 mfu: 8.26% global_avg_ntp_loss: 3.5250 global_avg_mtp_loss: 16.2195 +[titan] 2025-06-13 13:46:03,639 - root - INFO - lr: 3.9511e-04 gnorm: 1.13 [ 1:04:33< 1:51:50] +[titan] 2025-06-13 13:46:07,498 - root - INFO - step: 5495 loss: 21.1656 memory: 6.46GiB(27.34%) tps: 21,227 tflops: 21.36 mfu: 6.85% global_avg_ntp_loss: 3.7820 global_avg_mtp_loss: 17.3837 +[titan] 2025-06-13 13:46:07,499 - root - INFO - lr: 3.9489e-04 gnorm: 1.17 [ 1:04:37< 1:51:47] +[titan] 2025-06-13 13:46:10,216 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:46:10,799 - root - INFO - step: 5500 loss: 20.0114 memory: 6.46GiB(27.34%) tps: 24,819 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 3.6294 global_avg_mtp_loss: 16.3821 +[titan] 2025-06-13 13:46:10,800 - root - INFO - lr: 3.9468e-04 gnorm: 1.30 [ 1:04:41< 1:51:43] +[titan] 2025-06-13 13:46:14,285 - root - INFO - step: 5505 loss: 20.5393 memory: 6.46GiB(27.34%) tps: 23,505 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.6653 global_avg_mtp_loss: 16.8741 +[titan] 2025-06-13 13:46:14,285 - root - INFO - lr: 3.9446e-04 gnorm: 1.16 [ 1:04:44< 1:51:40] +[titan] 2025-06-13 13:46:17,956 - root - INFO - step: 5510 loss: 21.0443 memory: 6.46GiB(27.34%) tps: 22,320 tflops: 22.46 mfu: 7.20% global_avg_ntp_loss: 3.7777 global_avg_mtp_loss: 17.2665 +[titan] 2025-06-13 13:46:17,956 - root - INFO - lr: 3.9425e-04 gnorm: 1.02 [ 1:04:48< 1:51:36] +[titan] 2025-06-13 13:46:21,356 - root - INFO - step: 5515 loss: 21.0219 memory: 6.46GiB(27.34%) tps: 24,094 tflops: 24.25 mfu: 7.77% 
global_avg_ntp_loss: 3.7661 global_avg_mtp_loss: 17.2558 +[titan] 2025-06-13 13:46:21,357 - root - INFO - lr: 3.9404e-04 gnorm: 1.11 [ 1:04:51< 1:51:33] +[titan] 2025-06-13 13:46:24,950 - root - INFO - step: 5520 loss: 20.6384 memory: 6.46GiB(27.34%) tps: 22,796 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.6574 global_avg_mtp_loss: 16.9810 +[titan] 2025-06-13 13:46:24,951 - root - INFO - lr: 3.9382e-04 gnorm: 1.29 [ 1:04:55< 1:51:29] +[titan] 2025-06-13 13:46:28,332 - root - INFO - step: 5525 loss: 20.1395 memory: 6.46GiB(27.34%) tps: 24,227 tflops: 24.38 mfu: 7.81% global_avg_ntp_loss: 3.5845 global_avg_mtp_loss: 16.5550 +[titan] 2025-06-13 13:46:28,333 - root - INFO - lr: 3.9361e-04 gnorm: 1.29 [ 1:04:58< 1:51:25] +[titan] 2025-06-13 13:46:32,022 - root - INFO - step: 5530 loss: 18.9329 memory: 6.46GiB(27.34%) tps: 22,202 tflops: 22.34 mfu: 7.16% global_avg_ntp_loss: 3.3403 global_avg_mtp_loss: 15.5927 +[titan] 2025-06-13 13:46:32,023 - root - INFO - lr: 3.9339e-04 gnorm: 1.30 [ 1:05:02< 1:51:22] +[titan] 2025-06-13 13:46:35,540 - root - INFO - step: 5535 loss: 20.3869 memory: 6.46GiB(27.34%) tps: 23,289 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.6014 global_avg_mtp_loss: 16.7854 +[titan] 2025-06-13 13:46:35,541 - root - INFO - lr: 3.9318e-04 gnorm: 1.20 [ 1:05:05< 1:51:19] +[titan] 2025-06-13 13:46:38,608 - root - INFO - step: 5540 loss: 20.8898 memory: 6.46GiB(27.34%) tps: 26,711 tflops: 26.88 mfu: 8.62% global_avg_ntp_loss: 3.7237 global_avg_mtp_loss: 17.1661 +[titan] 2025-06-13 13:46:38,608 - root - INFO - lr: 3.9296e-04 gnorm: 1.06 [ 1:05:08< 1:51:14] +[titan] 2025-06-13 13:46:42,150 - root - INFO - step: 5545 loss: 20.5929 memory: 6.46GiB(27.34%) tps: 23,132 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.6327 global_avg_mtp_loss: 16.9602 +[titan] 2025-06-13 13:46:42,150 - root - INFO - lr: 3.9275e-04 gnorm: 1.19 [ 1:05:12< 1:51:11] +[titan] 2025-06-13 13:46:45,063 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:46:45,709 - root - INFO - step: 5550 loss: 14.6176 memory: 6.46GiB(27.34%) tps: 23,016 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 2.5951 global_avg_mtp_loss: 12.0224 +[titan] 2025-06-13 13:46:45,710 - root - INFO - lr: 3.9253e-04 gnorm: 1.29 [ 1:05:16< 1:51:07] +[titan] 2025-06-13 13:46:48,851 - root - INFO - step: 5555 loss: 20.0991 memory: 6.46GiB(27.34%) tps: 26,081 tflops: 26.25 mfu: 8.41% global_avg_ntp_loss: 3.5455 global_avg_mtp_loss: 16.5535 +[titan] 2025-06-13 13:46:48,851 - root - INFO - lr: 3.9232e-04 gnorm: 1.16 [ 1:05:19< 1:51:03] +[titan] 2025-06-13 13:46:52,565 - root - INFO - step: 5560 loss: 18.8795 memory: 6.46GiB(27.34%) tps: 22,055 tflops: 22.20 mfu: 7.11% global_avg_ntp_loss: 3.3373 global_avg_mtp_loss: 15.5421 +[titan] 2025-06-13 13:46:52,566 - root - INFO - lr: 3.9210e-04 gnorm: 1.75 [ 1:05:22< 1:51:00] +[titan] 2025-06-13 13:46:55,683 - root - INFO - step: 5565 loss: 19.8824 memory: 6.46GiB(27.34%) tps: 26,280 tflops: 26.45 mfu: 8.48% global_avg_ntp_loss: 3.4725 global_avg_mtp_loss: 16.4099 +[titan] 2025-06-13 13:46:55,684 - root - INFO - lr: 3.9189e-04 gnorm: 1.36 [ 1:05:26< 1:50:56] +[titan] 2025-06-13 13:46:59,289 - root - INFO - step: 5570 loss: 20.3868 memory: 6.46GiB(27.34%) tps: 22,724 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.6577 global_avg_mtp_loss: 16.7291 +[titan] 2025-06-13 13:46:59,290 - root - INFO - lr: 3.9167e-04 gnorm: 1.14 [ 1:05:29< 1:50:52] +[titan] 2025-06-13 13:47:02,980 - root - INFO - step: 5575 loss: 20.7957 memory: 6.46GiB(27.34%) tps: 22,200 tflops: 22.34 mfu: 7.16% global_avg_ntp_loss: 3.6436 global_avg_mtp_loss: 17.1521 +[titan] 2025-06-13 13:47:02,980 - root - INFO - lr: 3.9146e-04 gnorm: 1.21 [ 1:05:33< 1:50:49] +[titan] 2025-06-13 13:47:06,585 - root - INFO - step: 5580 loss: 20.7881 memory: 6.46GiB(27.34%) tps: 22,725 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.6887 global_avg_mtp_loss: 17.0994 +[titan] 2025-06-13 13:47:06,585 - root - INFO - lr: 3.9124e-04 gnorm: 1.21 [ 
1:05:36< 1:50:46] +[titan] 2025-06-13 13:47:09,738 - root - INFO - step: 5585 loss: 20.6750 memory: 6.46GiB(27.34%) tps: 25,983 tflops: 26.15 mfu: 8.38% global_avg_ntp_loss: 3.6410 global_avg_mtp_loss: 17.0340 +[titan] 2025-06-13 13:47:09,739 - root - INFO - lr: 3.9102e-04 gnorm: 1.10 [ 1:05:40< 1:50:42] +[titan] 2025-06-13 13:47:14,510 - root - INFO - step: 5590 loss: 19.9765 memory: 6.46GiB(27.34%) tps: 17,171 tflops: 17.28 mfu: 5.54% global_avg_ntp_loss: 3.5089 global_avg_mtp_loss: 16.4676 +[titan] 2025-06-13 13:47:14,510 - root - INFO - lr: 3.9081e-04 gnorm: 1.25 [ 1:05:44< 1:50:40] +[titan] 2025-06-13 13:47:17,969 - root - INFO - step: 5595 loss: 20.6038 memory: 6.46GiB(27.34%) tps: 23,681 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 3.6076 global_avg_mtp_loss: 16.9962 +[titan] 2025-06-13 13:47:17,970 - root - INFO - lr: 3.9059e-04 gnorm: 1.19 [ 1:05:48< 1:50:36] +[titan] 2025-06-13 13:47:20,646 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:47:21,302 - root - INFO - step: 5600 loss: 19.5790 memory: 6.46GiB(27.34%) tps: 24,581 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.4438 global_avg_mtp_loss: 16.1352 +[titan] 2025-06-13 13:47:21,302 - root - INFO - lr: 3.9037e-04 gnorm: 1.28 [ 1:05:51< 1:50:33] +[titan] 2025-06-13 13:47:25,107 - root - INFO - step: 5605 loss: 20.9763 memory: 6.46GiB(27.34%) tps: 21,530 tflops: 21.67 mfu: 6.94% global_avg_ntp_loss: 3.7259 global_avg_mtp_loss: 17.2505 +[titan] 2025-06-13 13:47:25,108 - root - INFO - lr: 3.9016e-04 gnorm: 1.23 [ 1:05:55< 1:50:30] +[titan] 2025-06-13 13:47:28,633 - root - INFO - step: 5610 loss: 20.6376 memory: 6.46GiB(27.34%) tps: 23,240 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.6773 global_avg_mtp_loss: 16.9603 +[titan] 2025-06-13 13:47:28,633 - root - INFO - lr: 3.8994e-04 gnorm: 1.11 [ 1:05:58< 1:50:26] +[titan] 2025-06-13 13:47:32,101 - root - INFO - step: 5615 loss: 18.0921 memory: 6.46GiB(27.34%) tps: 23,622 tflops: 23.77 mfu: 7.62% 
global_avg_ntp_loss: 3.1827 global_avg_mtp_loss: 14.9094 +[titan] 2025-06-13 13:47:32,102 - root - INFO - lr: 3.8972e-04 gnorm: 1.49 [ 1:06:02< 1:50:22] +[titan] 2025-06-13 13:47:35,607 - root - INFO - step: 5620 loss: 21.1085 memory: 6.46GiB(27.34%) tps: 23,372 tflops: 23.52 mfu: 7.54% global_avg_ntp_loss: 3.7296 global_avg_mtp_loss: 17.3789 +[titan] 2025-06-13 13:47:35,607 - root - INFO - lr: 3.8951e-04 gnorm: 1.16 [ 1:06:05< 1:50:19] +[titan] 2025-06-13 13:47:39,013 - root - INFO - step: 5625 loss: 20.4705 memory: 6.46GiB(27.34%) tps: 24,056 tflops: 24.21 mfu: 7.76% global_avg_ntp_loss: 3.6168 global_avg_mtp_loss: 16.8537 +[titan] 2025-06-13 13:47:39,013 - root - INFO - lr: 3.8929e-04 gnorm: 1.20 [ 1:06:09< 1:50:15] +[titan] 2025-06-13 13:47:42,699 - root - INFO - step: 5630 loss: 19.0426 memory: 6.46GiB(27.34%) tps: 22,225 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 3.3510 global_avg_mtp_loss: 15.6917 +[titan] 2025-06-13 13:47:42,700 - root - INFO - lr: 3.8907e-04 gnorm: 1.49 [ 1:06:13< 1:50:12] +[titan] 2025-06-13 13:47:44,407 - root - INFO - Dumping profiler traces at step 5632 +[titan] 2025-06-13 13:47:44,511 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 13:47:46,312 - root - INFO - step: 5635 loss: 20.4391 memory: 6.46GiB(27.34%) tps: 22,678 tflops: 22.82 mfu: 7.31% global_avg_ntp_loss: 3.5464 global_avg_mtp_loss: 16.8927 +[titan] 2025-06-13 13:47:46,312 - root - INFO - lr: 3.8885e-04 gnorm: 1.40 [ 1:06:16< 1:50:08] +[titan] 2025-06-13 13:47:49,914 - root - INFO - step: 5640 loss: 18.5111 memory: 6.46GiB(27.34%) tps: 22,744 tflops: 22.89 mfu: 7.34% global_avg_ntp_loss: 3.2568 global_avg_mtp_loss: 15.2543 +[titan] 2025-06-13 13:47:49,915 - root - INFO - lr: 3.8863e-04 gnorm: 1.74 [ 1:06:20< 1:50:05] +[titan] 2025-06-13 13:47:53,493 - root - INFO - step: 5645 loss: 17.8855 memory: 6.46GiB(27.34%) tps: 22,895 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 3.2361 global_avg_mtp_loss: 14.6494 +[titan] 2025-06-13 
13:47:53,493 - root - INFO - lr: 3.8842e-04 gnorm: 1.74 [ 1:06:23< 1:50:02] +[titan] 2025-06-13 13:47:56,746 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:47:57,286 - root - INFO - step: 5650 loss: 20.3923 memory: 6.46GiB(27.34%) tps: 21,597 tflops: 21.74 mfu: 6.97% global_avg_ntp_loss: 3.6292 global_avg_mtp_loss: 16.7630 +[titan] 2025-06-13 13:47:57,286 - root - INFO - lr: 3.8820e-04 gnorm: 1.15 [ 1:06:27< 1:49:58] +[titan] 2025-06-13 13:48:00,931 - root - INFO - step: 5655 loss: 20.1828 memory: 6.46GiB(27.34%) tps: 22,477 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.5585 global_avg_mtp_loss: 16.6243 +[titan] 2025-06-13 13:48:00,931 - root - INFO - lr: 3.8798e-04 gnorm: 1.34 [ 1:06:31< 1:49:55] +[titan] 2025-06-13 13:48:04,837 - root - INFO - step: 5660 loss: 20.8829 memory: 6.46GiB(27.34%) tps: 20,973 tflops: 21.11 mfu: 6.77% global_avg_ntp_loss: 3.7507 global_avg_mtp_loss: 17.1321 +[titan] 2025-06-13 13:48:04,838 - root - INFO - lr: 3.8776e-04 gnorm: 1.20 [ 1:06:35< 1:49:52] +[titan] 2025-06-13 13:48:08,368 - root - INFO - step: 5665 loss: 20.2161 memory: 6.46GiB(27.34%) tps: 23,210 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.5706 global_avg_mtp_loss: 16.6455 +[titan] 2025-06-13 13:48:08,368 - root - INFO - lr: 3.8754e-04 gnorm: 1.15 [ 1:06:38< 1:49:49] +[titan] 2025-06-13 13:48:11,943 - root - INFO - step: 5670 loss: 20.2730 memory: 6.46GiB(27.34%) tps: 22,916 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.5817 global_avg_mtp_loss: 16.6913 +[titan] 2025-06-13 13:48:11,943 - root - INFO - lr: 3.8733e-04 gnorm: 1.08 [ 1:06:42< 1:49:45] +[titan] 2025-06-13 13:48:15,427 - root - INFO - step: 5675 loss: 19.8868 memory: 6.46GiB(27.34%) tps: 23,515 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.5664 global_avg_mtp_loss: 16.3203 +[titan] 2025-06-13 13:48:15,427 - root - INFO - lr: 3.8711e-04 gnorm: 1.63 [ 1:06:45< 1:49:42] +[titan] 2025-06-13 13:48:19,139 - root - INFO - step: 5680 loss: 19.1387 memory: 
6.46GiB(27.34%) tps: 22,069 tflops: 22.21 mfu: 7.12% global_avg_ntp_loss: 3.3324 global_avg_mtp_loss: 15.8063 +[titan] 2025-06-13 13:48:19,140 - root - INFO - lr: 3.8689e-04 gnorm: 1.20 [ 1:06:49< 1:49:38] +[titan] 2025-06-13 13:48:22,367 - root - INFO - step: 5685 loss: 19.7103 memory: 6.46GiB(27.34%) tps: 25,383 tflops: 25.54 mfu: 8.19% global_avg_ntp_loss: 3.4810 global_avg_mtp_loss: 16.2293 +[titan] 2025-06-13 13:48:22,368 - root - INFO - lr: 3.8667e-04 gnorm: 1.38 [ 1:06:52< 1:49:34] +[titan] 2025-06-13 13:48:25,731 - root - INFO - step: 5690 loss: 20.9660 memory: 6.46GiB(27.34%) tps: 24,354 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 3.7500 global_avg_mtp_loss: 17.2159 +[titan] 2025-06-13 13:48:25,732 - root - INFO - lr: 3.8645e-04 gnorm: 1.23 [ 1:06:56< 1:49:31] +[titan] 2025-06-13 13:48:29,215 - root - INFO - step: 5695 loss: 20.7446 memory: 6.46GiB(27.34%) tps: 23,519 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.6811 global_avg_mtp_loss: 17.0635 +[titan] 2025-06-13 13:48:29,215 - root - INFO - lr: 3.8623e-04 gnorm: 1.12 [ 1:06:59< 1:49:27] +[titan] 2025-06-13 13:48:32,052 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:48:32,653 - root - INFO - step: 5700 loss: 19.8177 memory: 6.46GiB(27.34%) tps: 23,830 tflops: 23.98 mfu: 7.69% global_avg_ntp_loss: 3.4785 global_avg_mtp_loss: 16.3392 +[titan] 2025-06-13 13:48:32,653 - root - INFO - lr: 3.8601e-04 gnorm: 1.30 [ 1:07:02< 1:49:23] +[titan] 2025-06-13 13:48:35,975 - root - INFO - step: 5705 loss: 20.1652 memory: 6.46GiB(27.34%) tps: 24,664 tflops: 24.82 mfu: 7.96% global_avg_ntp_loss: 3.5596 global_avg_mtp_loss: 16.6056 +[titan] 2025-06-13 13:48:35,975 - root - INFO - lr: 3.8579e-04 gnorm: 1.29 [ 1:07:06< 1:49:19] +[titan] 2025-06-13 13:48:39,442 - root - INFO - step: 5710 loss: 21.7442 memory: 6.46GiB(27.34%) tps: 23,632 tflops: 23.78 mfu: 7.62% global_avg_ntp_loss: 3.9506 global_avg_mtp_loss: 17.7935 +[titan] 2025-06-13 13:48:39,442 - root - INFO - lr: 3.8557e-04 gnorm: 1.43 [ 1:07:09< 1:49:16] +[titan] 2025-06-13 13:48:42,750 - root - INFO - step: 5715 loss: 19.5130 memory: 6.46GiB(27.34%) tps: 24,767 tflops: 24.92 mfu: 7.99% global_avg_ntp_loss: 3.4251 global_avg_mtp_loss: 16.0880 +[titan] 2025-06-13 13:48:42,750 - root - INFO - lr: 3.8535e-04 gnorm: 1.39 [ 1:07:13< 1:49:12] +[titan] 2025-06-13 13:48:46,136 - root - INFO - step: 5720 loss: 20.8704 memory: 6.46GiB(27.34%) tps: 24,195 tflops: 24.35 mfu: 7.80% global_avg_ntp_loss: 3.7445 global_avg_mtp_loss: 17.1259 +[titan] 2025-06-13 13:48:46,136 - root - INFO - lr: 3.8513e-04 gnorm: 1.19 [ 1:07:16< 1:49:08] +[titan] 2025-06-13 13:48:49,548 - root - INFO - step: 5725 loss: 19.3068 memory: 6.46GiB(27.34%) tps: 24,014 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.4335 global_avg_mtp_loss: 15.8733 +[titan] 2025-06-13 13:48:49,548 - root - INFO - lr: 3.8491e-04 gnorm: 1.14 [ 1:07:19< 1:49:04] +[titan] 2025-06-13 13:48:53,028 - root - INFO - step: 5730 loss: 19.0339 memory: 6.46GiB(27.34%) tps: 23,539 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.3365 global_avg_mtp_loss: 15.6974 +[titan] 2025-06-13 13:48:53,029 - root - INFO - lr: 3.8469e-04 gnorm: 1.17 [ 
1:07:23< 1:49:01] +[titan] 2025-06-13 13:48:56,704 - root - INFO - step: 5735 loss: 20.7526 memory: 6.46GiB(27.34%) tps: 22,287 tflops: 22.43 mfu: 7.19% global_avg_ntp_loss: 3.7052 global_avg_mtp_loss: 17.0474 +[titan] 2025-06-13 13:48:56,705 - root - INFO - lr: 3.8447e-04 gnorm: 1.09 [ 1:07:27< 1:48:58] +[titan] 2025-06-13 13:49:00,299 - root - INFO - step: 5740 loss: 20.4564 memory: 6.46GiB(27.34%) tps: 22,794 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.6296 global_avg_mtp_loss: 16.8268 +[titan] 2025-06-13 13:49:00,299 - root - INFO - lr: 3.8425e-04 gnorm: 1.20 [ 1:07:30< 1:48:54] +[titan] 2025-06-13 13:49:03,711 - root - INFO - step: 5745 loss: 21.1304 memory: 6.46GiB(27.34%) tps: 24,009 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.7432 global_avg_mtp_loss: 17.3871 +[titan] 2025-06-13 13:49:03,712 - root - INFO - lr: 3.8403e-04 gnorm: 1.16 [ 1:07:34< 1:48:50] +[titan] 2025-06-13 13:49:06,354 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:49:07,236 - root - INFO - step: 5750 loss: 21.2316 memory: 6.46GiB(27.34%) tps: 23,244 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.7565 global_avg_mtp_loss: 17.4751 +[titan] 2025-06-13 13:49:07,236 - root - INFO - lr: 3.8381e-04 gnorm: 1.16 [ 1:07:37< 1:48:47] +[titan] 2025-06-13 13:49:10,531 - root - INFO - step: 5755 loss: 19.8367 memory: 6.46GiB(27.34%) tps: 24,863 tflops: 25.02 mfu: 8.02% global_avg_ntp_loss: 3.5198 global_avg_mtp_loss: 16.3169 +[titan] 2025-06-13 13:49:10,532 - root - INFO - lr: 3.8359e-04 gnorm: 1.34 [ 1:07:40< 1:48:43] +[titan] 2025-06-13 13:49:14,481 - root - INFO - step: 5760 loss: 20.6142 memory: 6.46GiB(27.34%) tps: 20,745 tflops: 20.88 mfu: 6.69% global_avg_ntp_loss: 3.6592 global_avg_mtp_loss: 16.9550 +[titan] 2025-06-13 13:49:14,481 - root - INFO - lr: 3.8337e-04 gnorm: 0.99 [ 1:07:44< 1:48:40] +[titan] 2025-06-13 13:49:17,803 - root - INFO - step: 5765 loss: 19.1371 memory: 6.46GiB(27.34%) tps: 24,661 tflops: 24.82 mfu: 7.95% 
global_avg_ntp_loss: 3.3625 global_avg_mtp_loss: 15.7746 +[titan] 2025-06-13 13:49:17,803 - root - INFO - lr: 3.8314e-04 gnorm: 1.16 [ 1:07:48< 1:48:36] +[titan] 2025-06-13 13:49:21,581 - root - INFO - step: 5770 loss: 20.2912 memory: 6.46GiB(27.34%) tps: 21,686 tflops: 21.82 mfu: 6.99% global_avg_ntp_loss: 3.5538 global_avg_mtp_loss: 16.7374 +[titan] 2025-06-13 13:49:21,581 - root - INFO - lr: 3.8292e-04 gnorm: 1.26 [ 1:07:51< 1:48:33] +[titan] 2025-06-13 13:49:24,926 - root - INFO - step: 5775 loss: 20.1736 memory: 6.46GiB(27.34%) tps: 24,494 tflops: 24.65 mfu: 7.90% global_avg_ntp_loss: 3.5246 global_avg_mtp_loss: 16.6490 +[titan] 2025-06-13 13:49:24,926 - root - INFO - lr: 3.8270e-04 gnorm: 1.12 [ 1:07:55< 1:48:29] +[titan] 2025-06-13 13:49:28,210 - root - INFO - step: 5780 loss: 18.5182 memory: 6.46GiB(27.34%) tps: 24,947 tflops: 25.11 mfu: 8.05% global_avg_ntp_loss: 3.2440 global_avg_mtp_loss: 15.2742 +[titan] 2025-06-13 13:49:28,210 - root - INFO - lr: 3.8248e-04 gnorm: 1.28 [ 1:07:58< 1:48:25] +[titan] 2025-06-13 13:49:31,788 - root - INFO - step: 5785 loss: 20.2154 memory: 6.46GiB(27.34%) tps: 22,896 tflops: 23.04 mfu: 7.39% global_avg_ntp_loss: 3.5597 global_avg_mtp_loss: 16.6557 +[titan] 2025-06-13 13:49:31,789 - root - INFO - lr: 3.8226e-04 gnorm: 1.22 [ 1:08:02< 1:48:22] +[titan] 2025-06-13 13:49:34,933 - root - INFO - step: 5790 loss: 21.0168 memory: 6.46GiB(27.34%) tps: 26,051 tflops: 26.22 mfu: 8.40% global_avg_ntp_loss: 3.7478 global_avg_mtp_loss: 17.2690 +[titan] 2025-06-13 13:49:34,934 - root - INFO - lr: 3.8204e-04 gnorm: 1.11 [ 1:08:05< 1:48:18] +[titan] 2025-06-13 13:49:38,379 - root - INFO - step: 5795 loss: 19.0733 memory: 6.46GiB(27.34%) tps: 23,779 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 3.3449 global_avg_mtp_loss: 15.7284 +[titan] 2025-06-13 13:49:38,380 - root - INFO - lr: 3.8181e-04 gnorm: 1.19 [ 1:08:08< 1:48:14] +[titan] 2025-06-13 13:49:41,295 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:49:42,251 - root - INFO - step: 5800 loss: 19.9637 memory: 6.46GiB(27.34%) tps: 21,161 tflops: 21.30 mfu: 6.83% global_avg_ntp_loss: 3.5429 global_avg_mtp_loss: 16.4208 +[titan] 2025-06-13 13:49:42,251 - root - INFO - lr: 3.8159e-04 gnorm: 1.26 [ 1:08:12< 1:48:11] +[titan] 2025-06-13 13:49:45,331 - root - INFO - step: 5805 loss: 19.7448 memory: 6.46GiB(27.34%) tps: 26,602 tflops: 26.77 mfu: 8.58% global_avg_ntp_loss: 3.4980 global_avg_mtp_loss: 16.2468 +[titan] 2025-06-13 13:49:45,331 - root - INFO - lr: 3.8137e-04 gnorm: 1.26 [ 1:08:15< 1:48:07] +[titan] 2025-06-13 13:49:48,875 - root - INFO - step: 5810 loss: 20.0951 memory: 6.46GiB(27.34%) tps: 23,118 tflops: 23.27 mfu: 7.46% global_avg_ntp_loss: 3.6027 global_avg_mtp_loss: 16.4924 +[titan] 2025-06-13 13:49:48,875 - root - INFO - lr: 3.8115e-04 gnorm: 1.23 [ 1:08:19< 1:48:03] +[titan] 2025-06-13 13:49:52,612 - root - INFO - step: 5815 loss: 20.0935 memory: 6.46GiB(27.34%) tps: 21,924 tflops: 22.06 mfu: 7.07% global_avg_ntp_loss: 3.5764 global_avg_mtp_loss: 16.5171 +[titan] 2025-06-13 13:49:52,612 - root - INFO - lr: 3.8092e-04 gnorm: 1.34 [ 1:08:22< 1:48:00] +[titan] 2025-06-13 13:49:55,786 - root - INFO - step: 5820 loss: 20.7037 memory: 6.46GiB(27.34%) tps: 25,814 tflops: 25.98 mfu: 8.33% global_avg_ntp_loss: 3.7097 global_avg_mtp_loss: 16.9940 +[titan] 2025-06-13 13:49:55,786 - root - INFO - lr: 3.8070e-04 gnorm: 1.04 [ 1:08:26< 1:47:56] +[titan] 2025-06-13 13:49:59,283 - root - INFO - step: 5825 loss: 18.9699 memory: 6.46GiB(27.34%) tps: 23,427 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.3180 global_avg_mtp_loss: 15.6519 +[titan] 2025-06-13 13:49:59,283 - root - INFO - lr: 3.8048e-04 gnorm: 1.21 [ 1:08:29< 1:47:53] +[titan] 2025-06-13 13:50:02,603 - root - INFO - step: 5830 loss: 20.1429 memory: 6.46GiB(27.34%) tps: 24,675 tflops: 24.83 mfu: 7.96% global_avg_ntp_loss: 3.6019 global_avg_mtp_loss: 16.5409 +[titan] 2025-06-13 13:50:02,603 - root - INFO - lr: 3.8025e-04 gnorm: 1.30 [ 
1:08:32< 1:47:49] +[titan] 2025-06-13 13:50:05,804 - root - INFO - step: 5835 loss: 19.1001 memory: 6.46GiB(27.34%) tps: 25,595 tflops: 25.76 mfu: 8.26% global_avg_ntp_loss: 3.3224 global_avg_mtp_loss: 15.7777 +[titan] 2025-06-13 13:50:05,805 - root - INFO - lr: 3.8003e-04 gnorm: 1.26 [ 1:08:36< 1:47:45] +[titan] 2025-06-13 13:50:09,301 - root - INFO - step: 5840 loss: 20.0130 memory: 6.46GiB(27.34%) tps: 23,428 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.5397 global_avg_mtp_loss: 16.4732 +[titan] 2025-06-13 13:50:09,302 - root - INFO - lr: 3.7981e-04 gnorm: 1.15 [ 1:08:39< 1:47:41] +[titan] 2025-06-13 13:50:12,790 - root - INFO - step: 5845 loss: 20.3110 memory: 6.46GiB(27.34%) tps: 23,490 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 3.5839 global_avg_mtp_loss: 16.7272 +[titan] 2025-06-13 13:50:12,790 - root - INFO - lr: 3.7959e-04 gnorm: 1.09 [ 1:08:43< 1:47:37] +[titan] 2025-06-13 13:50:15,570 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:50:16,236 - root - INFO - step: 5850 loss: 20.9191 memory: 6.46GiB(27.34%) tps: 23,775 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 3.6970 global_avg_mtp_loss: 17.2221 +[titan] 2025-06-13 13:50:16,236 - root - INFO - lr: 3.7936e-04 gnorm: 1.12 [ 1:08:46< 1:47:34] +[titan] 2025-06-13 13:50:19,773 - root - INFO - step: 5855 loss: 19.6325 memory: 6.46GiB(27.34%) tps: 23,165 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.4445 global_avg_mtp_loss: 16.1880 +[titan] 2025-06-13 13:50:19,773 - root - INFO - lr: 3.7914e-04 gnorm: 1.26 [ 1:08:50< 1:47:30] +[titan] 2025-06-13 13:50:23,405 - root - INFO - step: 5860 loss: 18.5919 memory: 6.46GiB(27.34%) tps: 22,554 tflops: 22.70 mfu: 7.27% global_avg_ntp_loss: 3.2235 global_avg_mtp_loss: 15.3684 +[titan] 2025-06-13 13:50:23,406 - root - INFO - lr: 3.7891e-04 gnorm: 1.67 [ 1:08:53< 1:47:27] +[titan] 2025-06-13 13:50:26,924 - root - INFO - step: 5865 loss: 21.1382 memory: 6.46GiB(27.34%) tps: 23,288 tflops: 23.44 mfu: 7.51% 
global_avg_ntp_loss: 3.7715 global_avg_mtp_loss: 17.3667 +[titan] 2025-06-13 13:50:26,924 - root - INFO - lr: 3.7869e-04 gnorm: 1.26 [ 1:08:57< 1:47:23] +[titan] 2025-06-13 13:50:30,183 - root - INFO - step: 5870 loss: 19.6861 memory: 6.46GiB(27.34%) tps: 25,132 tflops: 25.29 mfu: 8.11% global_avg_ntp_loss: 3.4236 global_avg_mtp_loss: 16.2625 +[titan] 2025-06-13 13:50:30,184 - root - INFO - lr: 3.7847e-04 gnorm: 1.29 [ 1:09:00< 1:47:19] +[titan] 2025-06-13 13:50:33,695 - root - INFO - step: 5875 loss: 21.0527 memory: 6.46GiB(27.34%) tps: 23,330 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.7548 global_avg_mtp_loss: 17.2979 +[titan] 2025-06-13 13:50:33,696 - root - INFO - lr: 3.7824e-04 gnorm: 1.07 [ 1:09:03< 1:47:16] +[titan] 2025-06-13 13:50:37,055 - root - INFO - step: 5880 loss: 20.0838 memory: 6.46GiB(27.34%) tps: 24,387 tflops: 24.54 mfu: 7.87% global_avg_ntp_loss: 3.5649 global_avg_mtp_loss: 16.5189 +[titan] 2025-06-13 13:50:37,055 - root - INFO - lr: 3.7802e-04 gnorm: 1.10 [ 1:09:07< 1:47:12] +[titan] 2025-06-13 13:50:40,294 - root - INFO - step: 5885 loss: 19.7791 memory: 6.46GiB(27.34%) tps: 25,298 tflops: 25.46 mfu: 8.16% global_avg_ntp_loss: 3.4859 global_avg_mtp_loss: 16.2932 +[titan] 2025-06-13 13:50:40,294 - root - INFO - lr: 3.7779e-04 gnorm: 1.16 [ 1:09:10< 1:47:08] +[titan] 2025-06-13 13:50:43,772 - root - INFO - step: 5890 loss: 20.0966 memory: 6.46GiB(27.34%) tps: 23,554 tflops: 23.70 mfu: 7.60% global_avg_ntp_loss: 3.5665 global_avg_mtp_loss: 16.5301 +[titan] 2025-06-13 13:50:43,772 - root - INFO - lr: 3.7757e-04 gnorm: 1.08 [ 1:09:14< 1:47:05] +[titan] 2025-06-13 13:50:47,564 - root - INFO - step: 5895 loss: 17.0882 memory: 6.46GiB(27.34%) tps: 21,607 tflops: 21.74 mfu: 6.97% global_avg_ntp_loss: 3.0500 global_avg_mtp_loss: 14.0382 +[titan] 2025-06-13 13:50:47,564 - root - INFO - lr: 3.7734e-04 gnorm: 1.41 [ 1:09:17< 1:47:01] +[titan] 2025-06-13 13:50:50,282 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:50:50,836 - root - INFO - step: 5900 loss: 20.2507 memory: 6.46GiB(27.34%) tps: 25,042 tflops: 25.20 mfu: 8.08% global_avg_ntp_loss: 3.7316 global_avg_mtp_loss: 16.5190 +[titan] 2025-06-13 13:50:50,836 - root - INFO - lr: 3.7712e-04 gnorm: 1.52 [ 1:09:21< 1:46:58] +[titan] 2025-06-13 13:50:54,182 - root - INFO - step: 5905 loss: 18.1148 memory: 6.46GiB(27.34%) tps: 24,482 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 3.1544 global_avg_mtp_loss: 14.9604 +[titan] 2025-06-13 13:50:54,183 - root - INFO - lr: 3.7689e-04 gnorm: 1.44 [ 1:09:24< 1:46:54] +[titan] 2025-06-13 13:50:57,627 - root - INFO - step: 5910 loss: 19.8359 memory: 6.46GiB(27.34%) tps: 23,783 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 3.4854 global_avg_mtp_loss: 16.3505 +[titan] 2025-06-13 13:50:57,628 - root - INFO - lr: 3.7667e-04 gnorm: 1.22 [ 1:09:27< 1:46:50] +[titan] 2025-06-13 13:51:01,245 - root - INFO - step: 5915 loss: 19.9376 memory: 6.46GiB(27.34%) tps: 22,647 tflops: 22.79 mfu: 7.30% global_avg_ntp_loss: 3.5425 global_avg_mtp_loss: 16.3951 +[titan] 2025-06-13 13:51:01,245 - root - INFO - lr: 3.7644e-04 gnorm: 1.14 [ 1:09:31< 1:46:47] +[titan] 2025-06-13 13:51:04,642 - root - INFO - step: 5920 loss: 19.7339 memory: 6.46GiB(27.34%) tps: 24,118 tflops: 24.27 mfu: 7.78% global_avg_ntp_loss: 3.4353 global_avg_mtp_loss: 16.2986 +[titan] 2025-06-13 13:51:04,642 - root - INFO - lr: 3.7622e-04 gnorm: 1.30 [ 1:09:34< 1:46:43] +[titan] 2025-06-13 13:51:08,145 - root - INFO - step: 5925 loss: 21.2127 memory: 6.46GiB(27.34%) tps: 23,389 tflops: 23.54 mfu: 7.54% global_avg_ntp_loss: 3.7693 global_avg_mtp_loss: 17.4434 +[titan] 2025-06-13 13:51:08,145 - root - INFO - lr: 3.7599e-04 gnorm: 1.09 [ 1:09:38< 1:46:39] +[titan] 2025-06-13 13:51:11,613 - root - INFO - step: 5930 loss: 20.8129 memory: 6.46GiB(27.34%) tps: 23,625 tflops: 23.78 mfu: 7.62% global_avg_ntp_loss: 3.6723 global_avg_mtp_loss: 17.1406 +[titan] 2025-06-13 13:51:11,614 - root - INFO - lr: 3.7577e-04 gnorm: 1.10 [ 
1:09:41< 1:46:36] +[titan] 2025-06-13 13:51:15,157 - root - INFO - step: 5935 loss: 20.6867 memory: 6.46GiB(27.34%) tps: 23,118 tflops: 23.27 mfu: 7.46% global_avg_ntp_loss: 3.6619 global_avg_mtp_loss: 17.0248 +[titan] 2025-06-13 13:51:15,158 - root - INFO - lr: 3.7554e-04 gnorm: 1.12 [ 1:09:45< 1:46:32] +[titan] 2025-06-13 13:51:19,312 - root - INFO - step: 5940 loss: 20.0128 memory: 6.46GiB(27.34%) tps: 19,719 tflops: 19.84 mfu: 6.36% global_avg_ntp_loss: 3.5433 global_avg_mtp_loss: 16.4695 +[titan] 2025-06-13 13:51:19,312 - root - INFO - lr: 3.7532e-04 gnorm: 1.44 [ 1:09:49< 1:46:30] +[titan] 2025-06-13 13:51:22,631 - root - INFO - step: 5945 loss: 20.0341 memory: 6.46GiB(27.34%) tps: 24,685 tflops: 24.84 mfu: 7.96% global_avg_ntp_loss: 3.5229 global_avg_mtp_loss: 16.5112 +[titan] 2025-06-13 13:51:22,631 - root - INFO - lr: 3.7509e-04 gnorm: 1.24 [ 1:09:52< 1:46:26] +[titan] 2025-06-13 13:51:25,609 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:51:26,209 - root - INFO - step: 5950 loss: 18.4301 memory: 6.46GiB(27.34%) tps: 22,901 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.2059 global_avg_mtp_loss: 15.2242 +[titan] 2025-06-13 13:51:26,209 - root - INFO - lr: 3.7486e-04 gnorm: 1.40 [ 1:09:56< 1:46:22] +[titan] 2025-06-13 13:51:29,616 - root - INFO - step: 5955 loss: 20.8239 memory: 6.46GiB(27.34%) tps: 24,048 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.6929 global_avg_mtp_loss: 17.1310 +[titan] 2025-06-13 13:51:29,616 - root - INFO - lr: 3.7464e-04 gnorm: 1.05 [ 1:09:59< 1:46:19] +[titan] 2025-06-13 13:51:32,991 - root - INFO - step: 5960 loss: 20.3127 memory: 6.46GiB(27.34%) tps: 24,273 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.5908 global_avg_mtp_loss: 16.7220 +[titan] 2025-06-13 13:51:32,991 - root - INFO - lr: 3.7441e-04 gnorm: 1.06 [ 1:10:03< 1:46:15] +[titan] 2025-06-13 13:51:36,435 - root - INFO - step: 5965 loss: 21.0494 memory: 6.46GiB(27.34%) tps: 23,788 tflops: 23.94 mfu: 7.67% 
global_avg_ntp_loss: 3.7445 global_avg_mtp_loss: 17.3049 +[titan] 2025-06-13 13:51:36,435 - root - INFO - lr: 3.7418e-04 gnorm: 1.29 [ 1:10:06< 1:46:11] +[titan] 2025-06-13 13:51:39,850 - root - INFO - step: 5970 loss: 20.2789 memory: 6.46GiB(27.34%) tps: 23,990 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 3.5319 global_avg_mtp_loss: 16.7470 +[titan] 2025-06-13 13:51:39,851 - root - INFO - lr: 3.7396e-04 gnorm: 1.32 [ 1:10:10< 1:46:08] +[titan] 2025-06-13 13:51:43,285 - root - INFO - step: 5975 loss: 21.3667 memory: 6.46GiB(27.34%) tps: 23,856 tflops: 24.01 mfu: 7.69% global_avg_ntp_loss: 3.8234 global_avg_mtp_loss: 17.5432 +[titan] 2025-06-13 13:51:43,286 - root - INFO - lr: 3.7373e-04 gnorm: 1.12 [ 1:10:13< 1:46:04] +[titan] 2025-06-13 13:51:46,514 - root - INFO - step: 5980 loss: 19.8212 memory: 6.46GiB(27.34%) tps: 25,379 tflops: 25.54 mfu: 8.19% global_avg_ntp_loss: 3.5108 global_avg_mtp_loss: 16.3103 +[titan] 2025-06-13 13:51:46,514 - root - INFO - lr: 3.7350e-04 gnorm: 1.42 [ 1:10:16< 1:46:00] +[titan] 2025-06-13 13:51:49,810 - root - INFO - step: 5985 loss: 19.6530 memory: 6.46GiB(27.34%) tps: 24,857 tflops: 25.01 mfu: 8.02% global_avg_ntp_loss: 3.4359 global_avg_mtp_loss: 16.2171 +[titan] 2025-06-13 13:51:49,810 - root - INFO - lr: 3.7328e-04 gnorm: 1.22 [ 1:10:20< 1:45:56] +[titan] 2025-06-13 13:51:53,347 - root - INFO - step: 5990 loss: 18.9423 memory: 6.46GiB(27.34%) tps: 23,159 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.3118 global_avg_mtp_loss: 15.6306 +[titan] 2025-06-13 13:51:53,348 - root - INFO - lr: 3.7305e-04 gnorm: 1.39 [ 1:10:23< 1:45:53] +[titan] 2025-06-13 13:51:57,118 - root - INFO - step: 5995 loss: 20.4429 memory: 6.46GiB(27.34%) tps: 21,726 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 3.6516 global_avg_mtp_loss: 16.7913 +[titan] 2025-06-13 13:51:57,119 - root - INFO - lr: 3.7282e-04 gnorm: 1.11 [ 1:10:27< 1:45:49] +[titan] 2025-06-13 13:51:59,893 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:52:00,581 - root - INFO - step: 6000 loss: 19.9522 memory: 6.46GiB(27.34%) tps: 23,664 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.4812 global_avg_mtp_loss: 16.4710 +[titan] 2025-06-13 13:52:00,581 - root - INFO - lr: 3.7259e-04 gnorm: 1.35 [ 1:10:30< 1:45:46] +[titan] 2025-06-13 13:52:04,256 - root - INFO - step: 6005 loss: 21.0963 memory: 6.46GiB(27.34%) tps: 22,290 tflops: 22.43 mfu: 7.19% global_avg_ntp_loss: 3.7573 global_avg_mtp_loss: 17.3390 +[titan] 2025-06-13 13:52:04,256 - root - INFO - lr: 3.7237e-04 gnorm: 1.18 [ 1:10:34< 1:45:42] +[titan] 2025-06-13 13:52:07,438 - root - INFO - step: 6010 loss: 18.8675 memory: 6.46GiB(27.34%) tps: 25,752 tflops: 25.92 mfu: 8.31% global_avg_ntp_loss: 3.3394 global_avg_mtp_loss: 15.5281 +[titan] 2025-06-13 13:52:07,438 - root - INFO - lr: 3.7214e-04 gnorm: 1.39 [ 1:10:37< 1:45:38] +[titan] 2025-06-13 13:52:10,880 - root - INFO - step: 6015 loss: 21.2500 memory: 6.46GiB(27.34%) tps: 23,798 tflops: 23.95 mfu: 7.68% global_avg_ntp_loss: 3.8356 global_avg_mtp_loss: 17.4144 +[titan] 2025-06-13 13:52:10,881 - root - INFO - lr: 3.7191e-04 gnorm: 1.50 [ 1:10:41< 1:45:35] +[titan] 2025-06-13 13:52:14,364 - root - INFO - step: 6020 loss: 19.4744 memory: 6.46GiB(27.34%) tps: 23,520 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.4590 global_avg_mtp_loss: 16.0155 +[titan] 2025-06-13 13:52:14,364 - root - INFO - lr: 3.7168e-04 gnorm: 1.13 [ 1:10:44< 1:45:31] +[titan] 2025-06-13 13:52:18,226 - root - INFO - step: 6025 loss: 20.2776 memory: 6.46GiB(27.34%) tps: 21,213 tflops: 21.35 mfu: 6.84% global_avg_ntp_loss: 3.5685 global_avg_mtp_loss: 16.7091 +[titan] 2025-06-13 13:52:18,227 - root - INFO - lr: 3.7146e-04 gnorm: 1.13 [ 1:10:48< 1:45:28] +[titan] 2025-06-13 13:52:21,987 - root - INFO - step: 6030 loss: 19.8720 memory: 6.46GiB(27.34%) tps: 21,784 tflops: 21.92 mfu: 7.03% global_avg_ntp_loss: 3.5408 global_avg_mtp_loss: 16.3313 +[titan] 2025-06-13 13:52:21,987 - root - INFO - lr: 3.7123e-04 gnorm: 1.39 [ 
1:10:52< 1:45:25] +[titan] 2025-06-13 13:52:25,523 - root - INFO - step: 6035 loss: 20.0146 memory: 6.46GiB(27.34%) tps: 23,174 tflops: 23.32 mfu: 7.47% global_avg_ntp_loss: 3.5099 global_avg_mtp_loss: 16.5046 +[titan] 2025-06-13 13:52:25,523 - root - INFO - lr: 3.7100e-04 gnorm: 1.18 [ 1:10:55< 1:45:21] +[titan] 2025-06-13 13:52:29,034 - root - INFO - step: 6040 loss: 20.2663 memory: 6.46GiB(27.34%) tps: 23,333 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.5760 global_avg_mtp_loss: 16.6903 +[titan] 2025-06-13 13:52:29,034 - root - INFO - lr: 3.7077e-04 gnorm: 1.20 [ 1:10:59< 1:45:18] +[titan] 2025-06-13 13:52:32,667 - root - INFO - step: 6045 loss: 19.8225 memory: 6.46GiB(27.34%) tps: 22,549 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.4659 global_avg_mtp_loss: 16.3566 +[titan] 2025-06-13 13:52:32,668 - root - INFO - lr: 3.7054e-04 gnorm: 1.07 [ 1:11:02< 1:45:15] +[titan] 2025-06-13 13:52:35,644 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:52:36,532 - root - INFO - step: 6050 loss: 20.8509 memory: 6.46GiB(27.34%) tps: 21,202 tflops: 21.34 mfu: 6.84% global_avg_ntp_loss: 3.6780 global_avg_mtp_loss: 17.1729 +[titan] 2025-06-13 13:52:36,532 - root - INFO - lr: 3.7031e-04 gnorm: 1.10 [ 1:11:06< 1:45:12] +[titan] 2025-06-13 13:52:40,302 - root - INFO - step: 6055 loss: 19.0253 memory: 6.46GiB(27.34%) tps: 21,727 tflops: 21.87 mfu: 7.01% global_avg_ntp_loss: 3.3209 global_avg_mtp_loss: 15.7045 +[titan] 2025-06-13 13:52:40,303 - root - INFO - lr: 3.7009e-04 gnorm: 1.14 [ 1:11:10< 1:45:08] +[titan] 2025-06-13 13:52:43,888 - root - INFO - step: 6060 loss: 19.3048 memory: 6.46GiB(27.34%) tps: 22,850 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.3826 global_avg_mtp_loss: 15.9221 +[titan] 2025-06-13 13:52:43,888 - root - INFO - lr: 3.6986e-04 gnorm: 1.04 [ 1:11:14< 1:45:05] +[titan] 2025-06-13 13:52:47,089 - root - INFO - step: 6065 loss: 21.3395 memory: 6.46GiB(27.34%) tps: 25,600 tflops: 25.76 mfu: 8.26% 
global_avg_ntp_loss: 3.7536 global_avg_mtp_loss: 17.5859 +[titan] 2025-06-13 13:52:47,089 - root - INFO - lr: 3.6963e-04 gnorm: 1.11 [ 1:11:17< 1:45:01] +[titan] 2025-06-13 13:52:50,477 - root - INFO - step: 6070 loss: 20.2625 memory: 6.46GiB(27.34%) tps: 24,179 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.5437 global_avg_mtp_loss: 16.7187 +[titan] 2025-06-13 13:52:50,477 - root - INFO - lr: 3.6940e-04 gnorm: 1.29 [ 1:11:20< 1:44:57] +[titan] 2025-06-13 13:52:53,732 - root - INFO - step: 6075 loss: 20.2130 memory: 6.46GiB(27.34%) tps: 25,169 tflops: 25.33 mfu: 8.12% global_avg_ntp_loss: 3.5986 global_avg_mtp_loss: 16.6144 +[titan] 2025-06-13 13:52:53,733 - root - INFO - lr: 3.6917e-04 gnorm: 1.06 [ 1:11:24< 1:44:53] +[titan] 2025-06-13 13:52:57,374 - root - INFO - step: 6080 loss: 21.0209 memory: 6.46GiB(27.34%) tps: 22,500 tflops: 22.64 mfu: 7.26% global_avg_ntp_loss: 3.7272 global_avg_mtp_loss: 17.2938 +[titan] 2025-06-13 13:52:57,374 - root - INFO - lr: 3.6894e-04 gnorm: 1.31 [ 1:11:27< 1:44:50] +[titan] 2025-06-13 13:53:01,057 - root - INFO - step: 6085 loss: 18.8225 memory: 6.46GiB(27.34%) tps: 22,241 tflops: 22.38 mfu: 7.17% global_avg_ntp_loss: 3.3263 global_avg_mtp_loss: 15.4962 +[titan] 2025-06-13 13:53:01,058 - root - INFO - lr: 3.6871e-04 gnorm: 1.92 [ 1:11:31< 1:44:47] +[titan] 2025-06-13 13:53:04,650 - root - INFO - step: 6090 loss: 19.9396 memory: 6.46GiB(27.34%) tps: 22,806 tflops: 22.95 mfu: 7.36% global_avg_ntp_loss: 3.4606 global_avg_mtp_loss: 16.4791 +[titan] 2025-06-13 13:53:04,650 - root - INFO - lr: 3.6848e-04 gnorm: 1.22 [ 1:11:34< 1:44:43] +[titan] 2025-06-13 13:53:08,598 - root - INFO - step: 6095 loss: 20.3460 memory: 6.46GiB(27.34%) tps: 20,751 tflops: 20.88 mfu: 6.69% global_avg_ntp_loss: 3.7159 global_avg_mtp_loss: 16.6301 +[titan] 2025-06-13 13:53:08,598 - root - INFO - lr: 3.6825e-04 gnorm: 1.34 [ 1:11:38< 1:44:40] +[titan] 2025-06-13 13:53:11,916 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:53:12,397 - root - INFO - step: 6100 loss: 20.2650 memory: 6.46GiB(27.34%) tps: 21,564 tflops: 21.70 mfu: 6.96% global_avg_ntp_loss: 3.5672 global_avg_mtp_loss: 16.6978 +[titan] 2025-06-13 13:53:12,398 - root - INFO - lr: 3.6802e-04 gnorm: 1.05 [ 1:11:42< 1:44:37] +[titan] 2025-06-13 13:53:16,180 - root - INFO - step: 6105 loss: 19.8535 memory: 6.46GiB(27.34%) tps: 21,658 tflops: 21.80 mfu: 6.99% global_avg_ntp_loss: 3.4790 global_avg_mtp_loss: 16.3745 +[titan] 2025-06-13 13:53:16,181 - root - INFO - lr: 3.6779e-04 gnorm: 1.13 [ 1:11:46< 1:44:34] +[titan] 2025-06-13 13:53:19,938 - root - INFO - step: 6110 loss: 20.8695 memory: 6.46GiB(27.34%) tps: 21,805 tflops: 21.94 mfu: 7.03% global_avg_ntp_loss: 3.7221 global_avg_mtp_loss: 17.1474 +[titan] 2025-06-13 13:53:19,938 - root - INFO - lr: 3.6756e-04 gnorm: 1.02 [ 1:11:50< 1:44:31] +[titan] 2025-06-13 13:53:23,219 - root - INFO - step: 6115 loss: 20.0740 memory: 6.46GiB(27.34%) tps: 24,971 tflops: 25.13 mfu: 8.05% global_avg_ntp_loss: 3.5206 global_avg_mtp_loss: 16.5534 +[titan] 2025-06-13 13:53:23,219 - root - INFO - lr: 3.6733e-04 gnorm: 1.12 [ 1:11:53< 1:44:27] +[titan] 2025-06-13 13:53:27,530 - root - INFO - step: 6120 loss: 20.0891 memory: 6.46GiB(27.34%) tps: 19,004 tflops: 19.12 mfu: 6.13% global_avg_ntp_loss: 3.5441 global_avg_mtp_loss: 16.5450 +[titan] 2025-06-13 13:53:27,530 - root - INFO - lr: 3.6710e-04 gnorm: 1.18 [ 1:11:57< 1:44:25] +[titan] 2025-06-13 13:53:31,373 - root - INFO - step: 6125 loss: 20.5361 memory: 6.46GiB(27.34%) tps: 21,317 tflops: 21.45 mfu: 6.88% global_avg_ntp_loss: 3.6318 global_avg_mtp_loss: 16.9043 +[titan] 2025-06-13 13:53:31,374 - root - INFO - lr: 3.6687e-04 gnorm: 1.15 [ 1:12:01< 1:44:21] +[titan] 2025-06-13 13:53:34,544 - root - INFO - step: 6130 loss: 20.0684 memory: 6.46GiB(27.34%) tps: 25,837 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.5347 global_avg_mtp_loss: 16.5337 +[titan] 2025-06-13 13:53:34,545 - root - INFO - lr: 3.6664e-04 gnorm: 1.12 [ 
1:12:04< 1:44:17] +[titan] 2025-06-13 13:53:38,203 - root - INFO - step: 6135 loss: 20.5625 memory: 6.46GiB(27.34%) tps: 22,394 tflops: 22.54 mfu: 7.22% global_avg_ntp_loss: 3.6385 global_avg_mtp_loss: 16.9240 +[titan] 2025-06-13 13:53:38,204 - root - INFO - lr: 3.6641e-04 gnorm: 1.09 [ 1:12:08< 1:44:14] +[titan] 2025-06-13 13:53:42,196 - root - INFO - step: 6140 loss: 20.5485 memory: 6.46GiB(27.34%) tps: 20,519 tflops: 20.65 mfu: 6.62% global_avg_ntp_loss: 3.6373 global_avg_mtp_loss: 16.9112 +[titan] 2025-06-13 13:53:42,196 - root - INFO - lr: 3.6618e-04 gnorm: 1.18 [ 1:12:12< 1:44:11] +[titan] 2025-06-13 13:53:45,304 - root - INFO - Dumping profiler traces at step 6144 +[titan] 2025-06-13 13:53:45,403 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 13:53:46,165 - root - INFO - step: 6145 loss: 20.4297 memory: 6.46GiB(27.34%) tps: 20,645 tflops: 20.78 mfu: 6.66% global_avg_ntp_loss: 3.6180 global_avg_mtp_loss: 16.8116 +[titan] 2025-06-13 13:53:46,165 - root - INFO - lr: 3.6595e-04 gnorm: 1.10 [ 1:12:16< 1:44:08] +[titan] 2025-06-13 13:53:49,054 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:53:50,032 - root - INFO - step: 6150 loss: 19.6238 memory: 6.46GiB(27.34%) tps: 21,184 tflops: 21.32 mfu: 6.83% global_avg_ntp_loss: 3.4736 global_avg_mtp_loss: 16.1501 +[titan] 2025-06-13 13:53:50,033 - root - INFO - lr: 3.6572e-04 gnorm: 1.33 [ 1:12:20< 1:44:05] +[titan] 2025-06-13 13:53:53,307 - root - INFO - step: 6155 loss: 20.0850 memory: 6.46GiB(27.34%) tps: 25,018 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 3.5110 global_avg_mtp_loss: 16.5740 +[titan] 2025-06-13 13:53:53,308 - root - INFO - lr: 3.6549e-04 gnorm: 1.26 [ 1:12:23< 1:44:01] +[titan] 2025-06-13 13:53:56,773 - root - INFO - step: 6160 loss: 21.5110 memory: 6.46GiB(27.34%) tps: 23,643 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 3.8718 global_avg_mtp_loss: 17.6393 +[titan] 2025-06-13 13:53:56,773 - root - INFO - lr: 3.6525e-04 gnorm: 1.12 [ 1:12:27< 1:43:58] +[titan] 2025-06-13 13:54:00,258 - root - INFO - step: 6165 loss: 20.8296 memory: 6.46GiB(27.34%) tps: 23,510 tflops: 23.66 mfu: 7.58% global_avg_ntp_loss: 3.6930 global_avg_mtp_loss: 17.1366 +[titan] 2025-06-13 13:54:00,258 - root - INFO - lr: 3.6502e-04 gnorm: 1.07 [ 1:12:30< 1:43:54] +[titan] 2025-06-13 13:54:03,823 - root - INFO - step: 6170 loss: 18.2596 memory: 6.46GiB(27.34%) tps: 22,979 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.1919 global_avg_mtp_loss: 15.0677 +[titan] 2025-06-13 13:54:03,823 - root - INFO - lr: 3.6479e-04 gnorm: 1.24 [ 1:12:34< 1:43:51] +[titan] 2025-06-13 13:54:07,456 - root - INFO - step: 6175 loss: 19.3431 memory: 6.46GiB(27.34%) tps: 22,555 tflops: 22.70 mfu: 7.28% global_avg_ntp_loss: 3.3613 global_avg_mtp_loss: 15.9818 +[titan] 2025-06-13 13:54:07,456 - root - INFO - lr: 3.6456e-04 gnorm: 1.20 [ 1:12:37< 1:43:47] +[titan] 2025-06-13 13:54:11,140 - root - INFO - step: 6180 loss: 20.2276 memory: 6.46GiB(27.34%) tps: 22,238 tflops: 22.38 mfu: 7.17% global_avg_ntp_loss: 3.5668 global_avg_mtp_loss: 16.6608 +[titan] 2025-06-13 13:54:11,140 - root - INFO - lr: 3.6433e-04 gnorm: 1.21 [ 
1:12:41< 1:43:44] +[titan] 2025-06-13 13:54:14,626 - root - INFO - step: 6185 loss: 19.6398 memory: 6.46GiB(27.34%) tps: 23,501 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.4303 global_avg_mtp_loss: 16.2095 +[titan] 2025-06-13 13:54:14,627 - root - INFO - lr: 3.6410e-04 gnorm: 1.14 [ 1:12:44< 1:43:40] +[titan] 2025-06-13 13:54:18,210 - root - INFO - step: 6190 loss: 20.7318 memory: 6.46GiB(27.34%) tps: 22,862 tflops: 23.01 mfu: 7.37% global_avg_ntp_loss: 3.6753 global_avg_mtp_loss: 17.0565 +[titan] 2025-06-13 13:54:18,210 - root - INFO - lr: 3.6387e-04 gnorm: 1.05 [ 1:12:48< 1:43:37] +[titan] 2025-06-13 13:54:21,818 - root - INFO - step: 6195 loss: 18.9481 memory: 6.46GiB(27.34%) tps: 22,710 tflops: 22.85 mfu: 7.33% global_avg_ntp_loss: 3.3611 global_avg_mtp_loss: 15.5870 +[titan] 2025-06-13 13:54:21,818 - root - INFO - lr: 3.6363e-04 gnorm: 1.59 [ 1:12:52< 1:43:34] +[titan] 2025-06-13 13:54:24,582 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:54:25,169 - root - INFO - step: 6200 loss: 20.3802 memory: 6.46GiB(27.34%) tps: 24,447 tflops: 24.60 mfu: 7.89% global_avg_ntp_loss: 3.5953 global_avg_mtp_loss: 16.7849 +[titan] 2025-06-13 13:54:25,169 - root - INFO - lr: 3.6340e-04 gnorm: 1.08 [ 1:12:55< 1:43:30] +[titan] 2025-06-13 13:54:28,473 - root - INFO - step: 6205 loss: 20.5849 memory: 6.46GiB(27.34%) tps: 24,794 tflops: 24.95 mfu: 8.00% global_avg_ntp_loss: 3.6307 global_avg_mtp_loss: 16.9541 +[titan] 2025-06-13 13:54:28,474 - root - INFO - lr: 3.6317e-04 gnorm: 1.12 [ 1:12:58< 1:43:26] +[titan] 2025-06-13 13:54:32,098 - root - INFO - step: 6210 loss: 20.4805 memory: 6.46GiB(27.34%) tps: 22,601 tflops: 22.75 mfu: 7.29% global_avg_ntp_loss: 3.5890 global_avg_mtp_loss: 16.8914 +[titan] 2025-06-13 13:54:32,099 - root - INFO - lr: 3.6294e-04 gnorm: 1.14 [ 1:13:02< 1:43:23] +[titan] 2025-06-13 13:54:35,569 - root - INFO - step: 6215 loss: 20.7868 memory: 6.46GiB(27.34%) tps: 23,607 tflops: 23.76 mfu: 7.61% 
global_avg_ntp_loss: 3.7006 global_avg_mtp_loss: 17.0862 +[titan] 2025-06-13 13:54:35,569 - root - INFO - lr: 3.6270e-04 gnorm: 1.16 [ 1:13:05< 1:43:19] +[titan] 2025-06-13 13:54:39,019 - root - INFO - step: 6220 loss: 20.3738 memory: 6.46GiB(27.34%) tps: 23,750 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 3.5689 global_avg_mtp_loss: 16.8050 +[titan] 2025-06-13 13:54:39,019 - root - INFO - lr: 3.6247e-04 gnorm: 1.06 [ 1:13:09< 1:43:15] +[titan] 2025-06-13 13:54:42,543 - root - INFO - step: 6225 loss: 19.4119 memory: 6.46GiB(27.34%) tps: 23,248 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 3.4003 global_avg_mtp_loss: 16.0116 +[titan] 2025-06-13 13:54:42,543 - root - INFO - lr: 3.6224e-04 gnorm: 1.05 [ 1:13:12< 1:43:12] +[titan] 2025-06-13 13:54:45,832 - root - INFO - step: 6230 loss: 20.7273 memory: 6.46GiB(27.34%) tps: 24,910 tflops: 25.07 mfu: 8.03% global_avg_ntp_loss: 3.7102 global_avg_mtp_loss: 17.0171 +[titan] 2025-06-13 13:54:45,832 - root - INFO - lr: 3.6201e-04 gnorm: 1.11 [ 1:13:16< 1:43:08] +[titan] 2025-06-13 13:54:49,692 - root - INFO - step: 6235 loss: 19.5657 memory: 6.46GiB(27.34%) tps: 21,224 tflops: 21.36 mfu: 6.85% global_avg_ntp_loss: 3.4515 global_avg_mtp_loss: 16.1142 +[titan] 2025-06-13 13:54:49,692 - root - INFO - lr: 3.6177e-04 gnorm: 1.10 [ 1:13:19< 1:43:05] +[titan] 2025-06-13 13:54:53,047 - root - INFO - step: 6240 loss: 20.0790 memory: 6.46GiB(27.34%) tps: 24,420 tflops: 24.58 mfu: 7.88% global_avg_ntp_loss: 3.5432 global_avg_mtp_loss: 16.5358 +[titan] 2025-06-13 13:54:53,048 - root - INFO - lr: 3.6154e-04 gnorm: 1.13 [ 1:13:23< 1:43:01] +[titan] 2025-06-13 13:54:56,422 - root - INFO - step: 6245 loss: 18.7264 memory: 6.46GiB(27.34%) tps: 24,277 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.2490 global_avg_mtp_loss: 15.4774 +[titan] 2025-06-13 13:54:56,423 - root - INFO - lr: 3.6131e-04 gnorm: 1.19 [ 1:13:26< 1:42:57] +[titan] 2025-06-13 13:54:59,397 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:55:00,130 - root - INFO - step: 6250 loss: 20.0121 memory: 6.46GiB(27.34%) tps: 22,100 tflops: 22.24 mfu: 7.13% global_avg_ntp_loss: 3.5126 global_avg_mtp_loss: 16.4995 +[titan] 2025-06-13 13:55:00,130 - root - INFO - lr: 3.6107e-04 gnorm: 1.06 [ 1:13:30< 1:42:54] +[titan] 2025-06-13 13:55:03,819 - root - INFO - step: 6255 loss: 20.3611 memory: 6.46GiB(27.34%) tps: 22,205 tflops: 22.35 mfu: 7.16% global_avg_ntp_loss: 3.6373 global_avg_mtp_loss: 16.7238 +[titan] 2025-06-13 13:55:03,819 - root - INFO - lr: 3.6084e-04 gnorm: 1.07 [ 1:13:34< 1:42:51] +[titan] 2025-06-13 13:55:07,874 - root - INFO - step: 6260 loss: 20.7310 memory: 6.46GiB(27.34%) tps: 20,203 tflops: 20.33 mfu: 6.52% global_avg_ntp_loss: 3.6595 global_avg_mtp_loss: 17.0714 +[titan] 2025-06-13 13:55:07,875 - root - INFO - lr: 3.6061e-04 gnorm: 1.11 [ 1:13:38< 1:42:48] +[titan] 2025-06-13 13:55:10,837 - root - INFO - step: 6265 loss: 20.5111 memory: 6.46GiB(27.34%) tps: 27,651 tflops: 27.83 mfu: 8.92% global_avg_ntp_loss: 3.6524 global_avg_mtp_loss: 16.8587 +[titan] 2025-06-13 13:55:10,838 - root - INFO - lr: 3.6037e-04 gnorm: 1.16 [ 1:13:41< 1:42:44] +[titan] 2025-06-13 13:55:14,658 - root - INFO - step: 6270 loss: 18.9961 memory: 6.46GiB(27.34%) tps: 21,444 tflops: 21.58 mfu: 6.92% global_avg_ntp_loss: 3.3503 global_avg_mtp_loss: 15.6458 +[titan] 2025-06-13 13:55:14,658 - root - INFO - lr: 3.6014e-04 gnorm: 1.35 [ 1:13:44< 1:42:41] +[titan] 2025-06-13 13:55:18,331 - root - INFO - step: 6275 loss: 20.3341 memory: 6.46GiB(27.34%) tps: 22,306 tflops: 22.45 mfu: 7.19% global_avg_ntp_loss: 3.6469 global_avg_mtp_loss: 16.6872 +[titan] 2025-06-13 13:55:18,331 - root - INFO - lr: 3.5991e-04 gnorm: 1.83 [ 1:13:48< 1:42:37] +[titan] 2025-06-13 13:55:21,532 - root - INFO - step: 6280 loss: 20.1895 memory: 6.46GiB(27.34%) tps: 25,596 tflops: 25.76 mfu: 8.26% global_avg_ntp_loss: 3.5773 global_avg_mtp_loss: 16.6123 +[titan] 2025-06-13 13:55:21,532 - root - INFO - lr: 3.5967e-04 gnorm: 1.37 [ 
1:13:51< 1:42:33] +[titan] 2025-06-13 13:55:25,142 - root - INFO - step: 6285 loss: 20.3389 memory: 6.46GiB(27.34%) tps: 22,696 tflops: 22.84 mfu: 7.32% global_avg_ntp_loss: 3.5935 global_avg_mtp_loss: 16.7454 +[titan] 2025-06-13 13:55:25,142 - root - INFO - lr: 3.5944e-04 gnorm: 1.10 [ 1:13:55< 1:42:30] +[titan] 2025-06-13 13:55:28,777 - root - INFO - step: 6290 loss: 18.3673 memory: 6.46GiB(27.34%) tps: 22,538 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.1764 global_avg_mtp_loss: 15.1909 +[titan] 2025-06-13 13:55:28,778 - root - INFO - lr: 3.5921e-04 gnorm: 1.42 [ 1:13:59< 1:42:26] +[titan] 2025-06-13 13:55:32,515 - root - INFO - step: 6295 loss: 20.2602 memory: 6.46GiB(27.34%) tps: 21,920 tflops: 22.06 mfu: 7.07% global_avg_ntp_loss: 3.5798 global_avg_mtp_loss: 16.6803 +[titan] 2025-06-13 13:55:32,515 - root - INFO - lr: 3.5897e-04 gnorm: 1.11 [ 1:14:02< 1:42:23] +[titan] 2025-06-13 13:55:35,807 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:55:36,397 - root - INFO - step: 6300 loss: 20.2072 memory: 6.46GiB(27.34%) tps: 21,108 tflops: 21.24 mfu: 6.81% global_avg_ntp_loss: 3.5697 global_avg_mtp_loss: 16.6375 +[titan] 2025-06-13 13:55:36,397 - root - INFO - lr: 3.5874e-04 gnorm: 1.30 [ 1:14:06< 1:42:20] +[titan] 2025-06-13 13:55:40,249 - root - INFO - step: 6305 loss: 21.3044 memory: 6.46GiB(27.34%) tps: 21,268 tflops: 21.40 mfu: 6.86% global_avg_ntp_loss: 3.7898 global_avg_mtp_loss: 17.5145 +[titan] 2025-06-13 13:55:40,249 - root - INFO - lr: 3.5850e-04 gnorm: 1.12 [ 1:14:10< 1:42:17] +[titan] 2025-06-13 13:55:43,677 - root - INFO - step: 6310 loss: 19.4397 memory: 6.46GiB(27.34%) tps: 23,896 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.4294 global_avg_mtp_loss: 16.0103 +[titan] 2025-06-13 13:55:43,678 - root - INFO - lr: 3.5827e-04 gnorm: 1.26 [ 1:14:13< 1:42:13] +[titan] 2025-06-13 13:55:47,331 - root - INFO - step: 6315 loss: 20.2494 memory: 6.46GiB(27.34%) tps: 22,427 tflops: 22.57 mfu: 7.23% 
global_avg_ntp_loss: 3.5800 global_avg_mtp_loss: 16.6693 +[titan] 2025-06-13 13:55:47,331 - root - INFO - lr: 3.5803e-04 gnorm: 1.11 [ 1:14:17< 1:42:10] +[titan] 2025-06-13 13:55:50,975 - root - INFO - step: 6320 loss: 20.9121 memory: 6.46GiB(27.34%) tps: 22,482 tflops: 22.63 mfu: 7.25% global_avg_ntp_loss: 3.7012 global_avg_mtp_loss: 17.2109 +[titan] 2025-06-13 13:55:50,975 - root - INFO - lr: 3.5780e-04 gnorm: 1.20 [ 1:14:21< 1:42:07] +[titan] 2025-06-13 13:55:54,080 - root - INFO - step: 6325 loss: 19.3369 memory: 6.46GiB(27.34%) tps: 26,387 tflops: 26.56 mfu: 8.51% global_avg_ntp_loss: 3.3810 global_avg_mtp_loss: 15.9558 +[titan] 2025-06-13 13:55:54,080 - root - INFO - lr: 3.5756e-04 gnorm: 1.25 [ 1:14:24< 1:42:03] +[titan] 2025-06-13 13:55:57,542 - root - INFO - step: 6330 loss: 19.1964 memory: 6.46GiB(27.34%) tps: 23,666 tflops: 23.82 mfu: 7.63% global_avg_ntp_loss: 3.3681 global_avg_mtp_loss: 15.8283 +[titan] 2025-06-13 13:55:57,542 - root - INFO - lr: 3.5733e-04 gnorm: 1.55 [ 1:14:27< 1:41:59] +[titan] 2025-06-13 13:56:01,738 - root - INFO - step: 6335 loss: 18.8682 memory: 6.46GiB(27.34%) tps: 19,525 tflops: 19.65 mfu: 6.30% global_avg_ntp_loss: 3.3051 global_avg_mtp_loss: 15.5631 +[titan] 2025-06-13 13:56:01,738 - root - INFO - lr: 3.5709e-04 gnorm: 1.22 [ 1:14:31< 1:41:56] +[titan] 2025-06-13 13:56:05,127 - root - INFO - step: 6340 loss: 21.0558 memory: 6.46GiB(27.34%) tps: 24,177 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.7832 global_avg_mtp_loss: 17.2726 +[titan] 2025-06-13 13:56:05,127 - root - INFO - lr: 3.5686e-04 gnorm: 1.13 [ 1:14:35< 1:41:53] +[titan] 2025-06-13 13:56:08,596 - root - INFO - step: 6345 loss: 21.2027 memory: 6.46GiB(27.34%) tps: 23,612 tflops: 23.76 mfu: 7.62% global_avg_ntp_loss: 3.7854 global_avg_mtp_loss: 17.4173 +[titan] 2025-06-13 13:56:08,597 - root - INFO - lr: 3.5662e-04 gnorm: 1.30 [ 1:14:38< 1:41:49] +[titan] 2025-06-13 13:56:11,456 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:56:12,219 - root - INFO - step: 6350 loss: 21.1157 memory: 6.46GiB(27.34%) tps: 22,614 tflops: 22.76 mfu: 7.29% global_avg_ntp_loss: 3.7563 global_avg_mtp_loss: 17.3594 +[titan] 2025-06-13 13:56:12,220 - root - INFO - lr: 3.5639e-04 gnorm: 1.11 [ 1:14:42< 1:41:46] +[titan] 2025-06-13 13:56:15,092 - root - INFO - step: 6355 loss: 18.9653 memory: 6.46GiB(27.34%) tps: 28,518 tflops: 28.70 mfu: 9.20% global_avg_ntp_loss: 3.3346 global_avg_mtp_loss: 15.6307 +[titan] 2025-06-13 13:56:15,093 - root - INFO - lr: 3.5615e-04 gnorm: 1.29 [ 1:14:45< 1:41:41] +[titan] 2025-06-13 13:56:18,246 - root - INFO - step: 6360 loss: 21.2534 memory: 6.46GiB(27.34%) tps: 25,979 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.7143 global_avg_mtp_loss: 17.5391 +[titan] 2025-06-13 13:56:18,246 - root - INFO - lr: 3.5592e-04 gnorm: 1.11 [ 1:14:48< 1:41:37] +[titan] 2025-06-13 13:56:21,732 - root - INFO - step: 6365 loss: 20.1555 memory: 6.46GiB(27.34%) tps: 23,502 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.5507 global_avg_mtp_loss: 16.6048 +[titan] 2025-06-13 13:56:21,732 - root - INFO - lr: 3.5568e-04 gnorm: 1.14 [ 1:14:51< 1:41:33] +[titan] 2025-06-13 13:56:24,928 - root - INFO - step: 6370 loss: 19.9845 memory: 6.46GiB(27.34%) tps: 25,634 tflops: 25.80 mfu: 8.27% global_avg_ntp_loss: 3.5714 global_avg_mtp_loss: 16.4131 +[titan] 2025-06-13 13:56:24,929 - root - INFO - lr: 3.5545e-04 gnorm: 1.18 [ 1:14:55< 1:41:30] +[titan] 2025-06-13 13:56:28,818 - root - INFO - step: 6375 loss: 20.8786 memory: 6.46GiB(27.34%) tps: 21,064 tflops: 21.20 mfu: 6.79% global_avg_ntp_loss: 3.6387 global_avg_mtp_loss: 17.2400 +[titan] 2025-06-13 13:56:28,818 - root - INFO - lr: 3.5521e-04 gnorm: 1.20 [ 1:14:59< 1:41:26] +[titan] 2025-06-13 13:56:32,045 - root - INFO - step: 6380 loss: 16.1207 memory: 6.46GiB(27.34%) tps: 25,390 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 2.7989 global_avg_mtp_loss: 13.3218 +[titan] 2025-06-13 13:56:32,046 - root - INFO - lr: 3.5497e-04 gnorm: 1.63 [ 
1:15:02< 1:41:23] +[titan] 2025-06-13 13:56:35,382 - root - INFO - step: 6385 loss: 18.8099 memory: 6.46GiB(27.34%) tps: 24,553 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.3122 global_avg_mtp_loss: 15.4977 +[titan] 2025-06-13 13:56:35,383 - root - INFO - lr: 3.5474e-04 gnorm: 1.16 [ 1:15:05< 1:41:19] +[titan] 2025-06-13 13:56:39,037 - root - INFO - step: 6390 loss: 18.4699 memory: 6.46GiB(27.34%) tps: 22,418 tflops: 22.56 mfu: 7.23% global_avg_ntp_loss: 3.2688 global_avg_mtp_loss: 15.2011 +[titan] 2025-06-13 13:56:39,037 - root - INFO - lr: 3.5450e-04 gnorm: 1.40 [ 1:15:09< 1:41:15] +[titan] 2025-06-13 13:56:42,338 - root - INFO - step: 6395 loss: 19.8354 memory: 6.46GiB(27.34%) tps: 24,823 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 3.4947 global_avg_mtp_loss: 16.3407 +[titan] 2025-06-13 13:56:42,338 - root - INFO - lr: 3.5427e-04 gnorm: 1.09 [ 1:15:12< 1:41:12] +[titan] 2025-06-13 13:56:45,732 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:56:46,490 - root - INFO - step: 6400 loss: 19.4523 memory: 6.46GiB(27.34%) tps: 19,733 tflops: 19.86 mfu: 6.36% global_avg_ntp_loss: 3.4267 global_avg_mtp_loss: 16.0255 +[titan] 2025-06-13 13:56:46,490 - root - INFO - lr: 3.5403e-04 gnorm: 1.19 [ 1:15:16< 1:41:09] +[titan] 2025-06-13 13:56:50,469 - root - INFO - step: 6405 loss: 20.2499 memory: 6.46GiB(27.34%) tps: 20,590 tflops: 20.72 mfu: 6.64% global_avg_ntp_loss: 3.5966 global_avg_mtp_loss: 16.6533 +[titan] 2025-06-13 13:56:50,469 - root - INFO - lr: 3.5379e-04 gnorm: 1.09 [ 1:15:20< 1:41:06] +[titan] 2025-06-13 13:56:53,898 - root - INFO - step: 6410 loss: 19.8857 memory: 6.46GiB(27.34%) tps: 23,894 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.4932 global_avg_mtp_loss: 16.3925 +[titan] 2025-06-13 13:56:53,898 - root - INFO - lr: 3.5356e-04 gnorm: 1.12 [ 1:15:24< 1:41:02] +[titan] 2025-06-13 13:56:57,786 - root - INFO - step: 6415 loss: 20.8998 memory: 6.46GiB(27.34%) tps: 21,069 tflops: 21.20 mfu: 6.80% 
global_avg_ntp_loss: 3.7132 global_avg_mtp_loss: 17.1866 +[titan] 2025-06-13 13:56:57,787 - root - INFO - lr: 3.5332e-04 gnorm: 1.03 [ 1:15:28< 1:40:59] +[titan] 2025-06-13 13:57:01,185 - root - INFO - step: 6420 loss: 21.0062 memory: 6.46GiB(27.34%) tps: 24,108 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 3.7488 global_avg_mtp_loss: 17.2574 +[titan] 2025-06-13 13:57:01,185 - root - INFO - lr: 3.5308e-04 gnorm: 1.44 [ 1:15:31< 1:40:56] +[titan] 2025-06-13 13:57:04,650 - root - INFO - step: 6425 loss: 20.5053 memory: 6.46GiB(27.34%) tps: 23,645 tflops: 23.80 mfu: 7.63% global_avg_ntp_loss: 3.6050 global_avg_mtp_loss: 16.9004 +[titan] 2025-06-13 13:57:04,650 - root - INFO - lr: 3.5285e-04 gnorm: 1.03 [ 1:15:34< 1:40:52] +[titan] 2025-06-13 13:57:08,822 - root - INFO - step: 6430 loss: 19.8715 memory: 6.46GiB(27.34%) tps: 19,638 tflops: 19.76 mfu: 6.33% global_avg_ntp_loss: 3.5026 global_avg_mtp_loss: 16.3690 +[titan] 2025-06-13 13:57:08,822 - root - INFO - lr: 3.5261e-04 gnorm: 1.16 [ 1:15:39< 1:40:49] +[titan] 2025-06-13 13:57:12,044 - root - INFO - step: 6435 loss: 19.9561 memory: 6.46GiB(27.34%) tps: 25,430 tflops: 25.59 mfu: 8.20% global_avg_ntp_loss: 3.5372 global_avg_mtp_loss: 16.4190 +[titan] 2025-06-13 13:57:12,044 - root - INFO - lr: 3.5237e-04 gnorm: 1.10 [ 1:15:42< 1:40:45] +[titan] 2025-06-13 13:57:15,308 - root - INFO - step: 6440 loss: 20.2491 memory: 6.46GiB(27.34%) tps: 25,102 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 3.6098 global_avg_mtp_loss: 16.6393 +[titan] 2025-06-13 13:57:15,308 - root - INFO - lr: 3.5214e-04 gnorm: 1.18 [ 1:15:45< 1:40:41] +[titan] 2025-06-13 13:57:19,068 - root - INFO - step: 6445 loss: 19.4672 memory: 6.46GiB(27.34%) tps: 21,785 tflops: 21.92 mfu: 7.03% global_avg_ntp_loss: 3.4441 global_avg_mtp_loss: 16.0231 +[titan] 2025-06-13 13:57:19,069 - root - INFO - lr: 3.5190e-04 gnorm: 1.29 [ 1:15:49< 1:40:38] +[titan] 2025-06-13 13:57:21,645 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:57:22,430 - root - INFO - step: 6450 loss: 18.3483 memory: 6.46GiB(27.34%) tps: 24,370 tflops: 24.53 mfu: 7.86% global_avg_ntp_loss: 3.2411 global_avg_mtp_loss: 15.1072 +[titan] 2025-06-13 13:57:22,431 - root - INFO - lr: 3.5166e-04 gnorm: 1.42 [ 1:15:52< 1:40:34] +[titan] 2025-06-13 13:57:26,088 - root - INFO - step: 6455 loss: 19.6660 memory: 6.46GiB(27.34%) tps: 22,401 tflops: 22.54 mfu: 7.23% global_avg_ntp_loss: 3.4576 global_avg_mtp_loss: 16.2083 +[titan] 2025-06-13 13:57:26,088 - root - INFO - lr: 3.5142e-04 gnorm: 1.30 [ 1:15:56< 1:40:31] +[titan] 2025-06-13 13:57:30,039 - root - INFO - step: 6460 loss: 21.1727 memory: 6.46GiB(27.34%) tps: 20,736 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 3.7563 global_avg_mtp_loss: 17.4164 +[titan] 2025-06-13 13:57:30,039 - root - INFO - lr: 3.5119e-04 gnorm: 1.41 [ 1:16:00< 1:40:28] +[titan] 2025-06-13 13:57:33,595 - root - INFO - step: 6465 loss: 19.7023 memory: 6.46GiB(27.34%) tps: 23,038 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.4822 global_avg_mtp_loss: 16.2201 +[titan] 2025-06-13 13:57:33,596 - root - INFO - lr: 3.5095e-04 gnorm: 1.17 [ 1:16:03< 1:40:25] +[titan] 2025-06-13 13:57:36,924 - root - INFO - step: 6470 loss: 21.1290 memory: 6.46GiB(27.34%) tps: 24,616 tflops: 24.77 mfu: 7.94% global_avg_ntp_loss: 3.7224 global_avg_mtp_loss: 17.4066 +[titan] 2025-06-13 13:57:36,924 - root - INFO - lr: 3.5071e-04 gnorm: 1.27 [ 1:16:07< 1:40:21] +[titan] 2025-06-13 13:57:40,162 - root - INFO - step: 6475 loss: 21.6893 memory: 6.46GiB(27.34%) tps: 25,303 tflops: 25.46 mfu: 8.16% global_avg_ntp_loss: 3.9303 global_avg_mtp_loss: 17.7591 +[titan] 2025-06-13 13:57:40,162 - root - INFO - lr: 3.5047e-04 gnorm: 1.28 [ 1:16:10< 1:40:17] +[titan] 2025-06-13 13:57:43,737 - root - INFO - step: 6480 loss: 19.8782 memory: 6.46GiB(27.34%) tps: 22,918 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.5551 global_avg_mtp_loss: 16.3231 +[titan] 2025-06-13 13:57:43,737 - root - INFO - lr: 3.5024e-04 gnorm: 1.24 [ 
1:16:13< 1:40:13] +[titan] 2025-06-13 13:57:47,153 - root - INFO - step: 6485 loss: 20.0735 memory: 6.46GiB(27.34%) tps: 23,980 tflops: 24.13 mfu: 7.73% global_avg_ntp_loss: 3.5446 global_avg_mtp_loss: 16.5289 +[titan] 2025-06-13 13:57:47,154 - root - INFO - lr: 3.5000e-04 gnorm: 1.26 [ 1:16:17< 1:40:10] +[titan] 2025-06-13 13:57:50,554 - root - INFO - step: 6490 loss: 21.3871 memory: 6.46GiB(27.34%) tps: 24,095 tflops: 24.25 mfu: 7.77% global_avg_ntp_loss: 3.8114 global_avg_mtp_loss: 17.5757 +[titan] 2025-06-13 13:57:50,554 - root - INFO - lr: 3.4976e-04 gnorm: 1.18 [ 1:16:20< 1:40:06] +[titan] 2025-06-13 13:57:54,096 - root - INFO - step: 6495 loss: 20.2205 memory: 6.46GiB(27.34%) tps: 23,132 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.5231 global_avg_mtp_loss: 16.6974 +[titan] 2025-06-13 13:57:54,096 - root - INFO - lr: 3.4952e-04 gnorm: 1.23 [ 1:16:24< 1:40:03] +[titan] 2025-06-13 13:57:56,459 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:57:57,283 - root - INFO - step: 6500 loss: 21.1208 memory: 6.46GiB(27.34%) tps: 25,703 tflops: 25.87 mfu: 8.29% global_avg_ntp_loss: 3.7557 global_avg_mtp_loss: 17.3651 +[titan] 2025-06-13 13:57:57,284 - root - INFO - lr: 3.4928e-04 gnorm: 1.23 [ 1:16:27< 1:39:59] +[titan] 2025-06-13 13:58:01,076 - root - INFO - step: 6505 loss: 20.8990 memory: 6.46GiB(27.34%) tps: 21,602 tflops: 21.74 mfu: 6.97% global_avg_ntp_loss: 3.7249 global_avg_mtp_loss: 17.1740 +[titan] 2025-06-13 13:58:01,076 - root - INFO - lr: 3.4905e-04 gnorm: 1.07 [ 1:16:31< 1:39:55] +[titan] 2025-06-13 13:58:04,345 - root - INFO - step: 6510 loss: 19.8546 memory: 6.46GiB(27.34%) tps: 25,061 tflops: 25.22 mfu: 8.08% global_avg_ntp_loss: 3.4849 global_avg_mtp_loss: 16.3697 +[titan] 2025-06-13 13:58:04,346 - root - INFO - lr: 3.4881e-04 gnorm: 1.07 [ 1:16:34< 1:39:52] +[titan] 2025-06-13 13:58:07,832 - root - INFO - step: 6515 loss: 19.7349 memory: 6.46GiB(27.34%) tps: 23,497 tflops: 23.65 mfu: 7.58% 
global_avg_ntp_loss: 3.4631 global_avg_mtp_loss: 16.2718 +[titan] 2025-06-13 13:58:07,832 - root - INFO - lr: 3.4857e-04 gnorm: 1.26 [ 1:16:38< 1:39:48] +[titan] 2025-06-13 13:58:10,986 - root - INFO - step: 6520 loss: 19.8113 memory: 6.46GiB(27.34%) tps: 25,976 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.4869 global_avg_mtp_loss: 16.3244 +[titan] 2025-06-13 13:58:10,986 - root - INFO - lr: 3.4833e-04 gnorm: 1.11 [ 1:16:41< 1:39:44] +[titan] 2025-06-13 13:58:14,473 - root - INFO - step: 6525 loss: 20.8616 memory: 6.46GiB(27.34%) tps: 23,500 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.7000 global_avg_mtp_loss: 17.1616 +[titan] 2025-06-13 13:58:14,473 - root - INFO - lr: 3.4809e-04 gnorm: 1.06 [ 1:16:44< 1:39:40] +[titan] 2025-06-13 13:58:18,089 - root - INFO - step: 6530 loss: 20.2745 memory: 6.46GiB(27.34%) tps: 22,657 tflops: 22.80 mfu: 7.31% global_avg_ntp_loss: 3.6372 global_avg_mtp_loss: 16.6374 +[titan] 2025-06-13 13:58:18,089 - root - INFO - lr: 3.4785e-04 gnorm: 1.17 [ 1:16:48< 1:39:37] +[titan] 2025-06-13 13:58:21,436 - root - INFO - step: 6535 loss: 20.0190 memory: 6.46GiB(27.34%) tps: 24,474 tflops: 24.63 mfu: 7.89% global_avg_ntp_loss: 3.5053 global_avg_mtp_loss: 16.5137 +[titan] 2025-06-13 13:58:21,437 - root - INFO - lr: 3.4761e-04 gnorm: 1.15 [ 1:16:51< 1:39:33] +[titan] 2025-06-13 13:58:25,029 - root - INFO - step: 6540 loss: 19.5249 memory: 6.46GiB(27.34%) tps: 22,805 tflops: 22.95 mfu: 7.36% global_avg_ntp_loss: 3.4798 global_avg_mtp_loss: 16.0451 +[titan] 2025-06-13 13:58:25,029 - root - INFO - lr: 3.4737e-04 gnorm: 1.17 [ 1:16:55< 1:39:30] +[titan] 2025-06-13 13:58:28,934 - root - INFO - step: 6545 loss: 17.9869 memory: 6.46GiB(27.34%) tps: 20,981 tflops: 21.11 mfu: 6.77% global_avg_ntp_loss: 3.1691 global_avg_mtp_loss: 14.8178 +[titan] 2025-06-13 13:58:28,934 - root - INFO - lr: 3.4714e-04 gnorm: 1.16 [ 1:16:59< 1:39:27] +[titan] 2025-06-13 13:58:31,624 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:58:32,197 - root - INFO - step: 6550 loss: 17.2982 memory: 6.46GiB(27.34%) tps: 25,104 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 2.9628 global_avg_mtp_loss: 14.3354 +[titan] 2025-06-13 13:58:32,198 - root - INFO - lr: 3.4690e-04 gnorm: 1.53 [ 1:17:02< 1:39:23] +[titan] 2025-06-13 13:58:35,938 - root - INFO - step: 6555 loss: 18.5377 memory: 6.46GiB(27.34%) tps: 21,902 tflops: 22.04 mfu: 7.06% global_avg_ntp_loss: 3.2612 global_avg_mtp_loss: 15.2766 +[titan] 2025-06-13 13:58:35,939 - root - INFO - lr: 3.4666e-04 gnorm: 1.24 [ 1:17:06< 1:39:20] +[titan] 2025-06-13 13:58:39,299 - root - INFO - step: 6560 loss: 20.5695 memory: 6.46GiB(27.34%) tps: 24,377 tflops: 24.53 mfu: 7.86% global_avg_ntp_loss: 3.6276 global_avg_mtp_loss: 16.9419 +[titan] 2025-06-13 13:58:39,299 - root - INFO - lr: 3.4642e-04 gnorm: 1.08 [ 1:17:09< 1:39:16] +[titan] 2025-06-13 13:58:42,660 - root - INFO - step: 6565 loss: 17.3649 memory: 6.46GiB(27.34%) tps: 24,379 tflops: 24.53 mfu: 7.86% global_avg_ntp_loss: 3.0211 global_avg_mtp_loss: 14.3438 +[titan] 2025-06-13 13:58:42,660 - root - INFO - lr: 3.4618e-04 gnorm: 1.41 [ 1:17:12< 1:39:12] +[titan] 2025-06-13 13:58:46,134 - root - INFO - step: 6570 loss: 20.3086 memory: 6.46GiB(27.34%) tps: 23,584 tflops: 23.73 mfu: 7.61% global_avg_ntp_loss: 3.6144 global_avg_mtp_loss: 16.6942 +[titan] 2025-06-13 13:58:46,134 - root - INFO - lr: 3.4594e-04 gnorm: 1.07 [ 1:17:16< 1:39:08] +[titan] 2025-06-13 13:58:49,470 - root - INFO - step: 6575 loss: 20.1046 memory: 6.46GiB(27.34%) tps: 24,559 tflops: 24.72 mfu: 7.92% global_avg_ntp_loss: 3.5563 global_avg_mtp_loss: 16.5483 +[titan] 2025-06-13 13:58:49,470 - root - INFO - lr: 3.4570e-04 gnorm: 1.28 [ 1:17:19< 1:39:05] +[titan] 2025-06-13 13:58:52,740 - root - INFO - step: 6580 loss: 18.9530 memory: 6.46GiB(27.34%) tps: 25,060 tflops: 25.22 mfu: 8.08% global_avg_ntp_loss: 3.2845 global_avg_mtp_loss: 15.6685 +[titan] 2025-06-13 13:58:52,740 - root - INFO - lr: 3.4546e-04 gnorm: 1.28 [ 
1:17:22< 1:39:01] +[titan] 2025-06-13 13:58:55,982 - root - INFO - step: 6585 loss: 20.1022 memory: 6.46GiB(27.34%) tps: 25,265 tflops: 25.43 mfu: 8.15% global_avg_ntp_loss: 3.5407 global_avg_mtp_loss: 16.5615 +[titan] 2025-06-13 13:58:55,983 - root - INFO - lr: 3.4522e-04 gnorm: 1.08 [ 1:17:26< 1:38:57] +[titan] 2025-06-13 13:58:59,893 - root - INFO - step: 6590 loss: 19.5202 memory: 6.46GiB(27.34%) tps: 20,952 tflops: 21.09 mfu: 6.76% global_avg_ntp_loss: 3.3948 global_avg_mtp_loss: 16.1255 +[titan] 2025-06-13 13:58:59,893 - root - INFO - lr: 3.4498e-04 gnorm: 1.19 [ 1:17:30< 1:38:54] +[titan] 2025-06-13 13:59:03,683 - root - INFO - step: 6595 loss: 20.5559 memory: 6.46GiB(27.34%) tps: 21,615 tflops: 21.75 mfu: 6.97% global_avg_ntp_loss: 3.6599 global_avg_mtp_loss: 16.8959 +[titan] 2025-06-13 13:59:03,684 - root - INFO - lr: 3.4474e-04 gnorm: 1.13 [ 1:17:33< 1:38:51] +[titan] 2025-06-13 13:59:06,982 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 13:59:07,483 - root - INFO - step: 6600 loss: 20.4297 memory: 6.46GiB(27.34%) tps: 21,563 tflops: 21.70 mfu: 6.96% global_avg_ntp_loss: 3.5824 global_avg_mtp_loss: 16.8473 +[titan] 2025-06-13 13:59:07,483 - root - INFO - lr: 3.4450e-04 gnorm: 1.15 [ 1:17:37< 1:38:47] +[titan] 2025-06-13 13:59:10,920 - root - INFO - step: 6605 loss: 19.1426 memory: 6.46GiB(27.34%) tps: 23,833 tflops: 23.99 mfu: 7.69% global_avg_ntp_loss: 3.2920 global_avg_mtp_loss: 15.8506 +[titan] 2025-06-13 13:59:10,921 - root - INFO - lr: 3.4426e-04 gnorm: 1.28 [ 1:17:41< 1:38:44] +[titan] 2025-06-13 13:59:14,663 - root - INFO - step: 6610 loss: 20.0896 memory: 6.46GiB(27.34%) tps: 21,893 tflops: 22.03 mfu: 7.06% global_avg_ntp_loss: 3.4884 global_avg_mtp_loss: 16.6012 +[titan] 2025-06-13 13:59:14,663 - root - INFO - lr: 3.4402e-04 gnorm: 1.21 [ 1:17:44< 1:38:41] +[titan] 2025-06-13 13:59:17,971 - root - INFO - step: 6615 loss: 19.0448 memory: 6.46GiB(27.34%) tps: 24,765 tflops: 24.92 mfu: 7.99% 
global_avg_ntp_loss: 3.3372 global_avg_mtp_loss: 15.7076 +[titan] 2025-06-13 13:59:17,971 - root - INFO - lr: 3.4378e-04 gnorm: 1.17 [ 1:17:48< 1:38:37] +[titan] 2025-06-13 13:59:21,263 - root - INFO - step: 6620 loss: 20.9221 memory: 6.46GiB(27.34%) tps: 24,891 tflops: 25.05 mfu: 8.03% global_avg_ntp_loss: 3.7268 global_avg_mtp_loss: 17.1953 +[titan] 2025-06-13 13:59:21,263 - root - INFO - lr: 3.4354e-04 gnorm: 1.08 [ 1:17:51< 1:38:33] +[titan] 2025-06-13 13:59:25,477 - root - INFO - step: 6625 loss: 19.2425 memory: 6.46GiB(27.34%) tps: 19,438 tflops: 19.56 mfu: 6.27% global_avg_ntp_loss: 3.3848 global_avg_mtp_loss: 15.8577 +[titan] 2025-06-13 13:59:25,478 - root - INFO - lr: 3.4330e-04 gnorm: 1.13 [ 1:17:55< 1:38:30] +[titan] 2025-06-13 13:59:28,787 - root - INFO - step: 6630 loss: 19.1377 memory: 6.46GiB(27.34%) tps: 24,757 tflops: 24.91 mfu: 7.99% global_avg_ntp_loss: 3.3371 global_avg_mtp_loss: 15.8006 +[titan] 2025-06-13 13:59:28,787 - root - INFO - lr: 3.4306e-04 gnorm: 2.08 [ 1:17:59< 1:38:26] +[titan] 2025-06-13 13:59:32,059 - root - INFO - step: 6635 loss: 19.1491 memory: 6.46GiB(27.34%) tps: 25,041 tflops: 25.20 mfu: 8.08% global_avg_ntp_loss: 3.3516 global_avg_mtp_loss: 15.7975 +[titan] 2025-06-13 13:59:32,059 - root - INFO - lr: 3.4282e-04 gnorm: 1.18 [ 1:18:02< 1:38:23] +[titan] 2025-06-13 13:59:35,726 - root - INFO - step: 6640 loss: 19.4661 memory: 6.46GiB(27.34%) tps: 22,339 tflops: 22.48 mfu: 7.21% global_avg_ntp_loss: 3.4111 global_avg_mtp_loss: 16.0549 +[titan] 2025-06-13 13:59:35,727 - root - INFO - lr: 3.4258e-04 gnorm: 1.16 [ 1:18:05< 1:38:19] +[titan] 2025-06-13 13:59:39,527 - root - INFO - step: 6645 loss: 20.9629 memory: 6.46GiB(27.34%) tps: 21,558 tflops: 21.70 mfu: 6.95% global_avg_ntp_loss: 3.6757 global_avg_mtp_loss: 17.2873 +[titan] 2025-06-13 13:59:39,527 - root - INFO - lr: 3.4234e-04 gnorm: 1.18 [ 1:18:09< 1:38:16] +[titan] 2025-06-13 13:59:42,855 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 13:59:43,301 - root - INFO - step: 6650 loss: 19.2920 memory: 6.46GiB(27.34%) tps: 21,707 tflops: 21.85 mfu: 7.00% global_avg_ntp_loss: 3.3455 global_avg_mtp_loss: 15.9465 +[titan] 2025-06-13 13:59:43,301 - root - INFO - lr: 3.4209e-04 gnorm: 1.50 [ 1:18:13< 1:38:13] +[titan] 2025-06-13 13:59:46,927 - root - INFO - step: 6655 loss: 18.8758 memory: 6.46GiB(27.34%) tps: 22,592 tflops: 22.74 mfu: 7.29% global_avg_ntp_loss: 3.3264 global_avg_mtp_loss: 15.5494 +[titan] 2025-06-13 13:59:46,928 - root - INFO - lr: 3.4185e-04 gnorm: 1.36 [ 1:18:17< 1:38:09] +[titan] 2025-06-13 13:59:47,612 - root - INFO - Dumping profiler traces at step 6656 +[titan] 2025-06-13 13:59:47,705 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 13:59:50,525 - root - INFO - step: 6660 loss: 20.3534 memory: 6.46GiB(27.34%) tps: 22,775 tflops: 22.92 mfu: 7.35% global_avg_ntp_loss: 3.7147 global_avg_mtp_loss: 16.6388 +[titan] 2025-06-13 13:59:50,525 - root - INFO - lr: 3.4161e-04 gnorm: 1.75 [ 1:18:20< 1:38:06] +[titan] 2025-06-13 13:59:54,200 - root - INFO - step: 6665 loss: 20.5012 memory: 6.46GiB(27.34%) tps: 22,296 tflops: 22.44 mfu: 7.19% global_avg_ntp_loss: 3.6183 global_avg_mtp_loss: 16.8829 +[titan] 2025-06-13 13:59:54,200 - root - INFO - lr: 3.4137e-04 gnorm: 1.15 [ 1:18:24< 1:38:03] +[titan] 2025-06-13 13:59:57,884 - root - INFO - step: 6670 loss: 15.9840 memory: 6.46GiB(27.34%) tps: 22,234 tflops: 22.38 mfu: 7.17% global_avg_ntp_loss: 2.7088 global_avg_mtp_loss: 13.2752 +[titan] 2025-06-13 13:59:57,885 - root - INFO - lr: 3.4113e-04 gnorm: 1.73 [ 1:18:28< 1:37:59] +[titan] 2025-06-13 14:00:01,549 - root - INFO - step: 6675 loss: 20.2170 memory: 6.46GiB(27.34%) tps: 22,358 tflops: 22.50 mfu: 7.21% global_avg_ntp_loss: 3.5993 global_avg_mtp_loss: 16.6177 +[titan] 2025-06-13 14:00:01,549 - root - INFO - lr: 3.4089e-04 gnorm: 1.18 [ 1:18:31< 1:37:56] +[titan] 2025-06-13 14:00:04,634 - root - INFO - step: 6680 loss: 19.7904 memory: 
6.46GiB(27.34%) tps: 26,553 tflops: 26.72 mfu: 8.56% global_avg_ntp_loss: 3.4809 global_avg_mtp_loss: 16.3095 +[titan] 2025-06-13 14:00:04,635 - root - INFO - lr: 3.4065e-04 gnorm: 1.06 [ 1:18:34< 1:37:52] +[titan] 2025-06-13 14:00:08,347 - root - INFO - step: 6685 loss: 14.5829 memory: 6.46GiB(27.34%) tps: 22,070 tflops: 22.21 mfu: 7.12% global_avg_ntp_loss: 2.5849 global_avg_mtp_loss: 11.9979 +[titan] 2025-06-13 14:00:08,347 - root - INFO - lr: 3.4041e-04 gnorm: 2.07 [ 1:18:38< 1:37:49] +[titan] 2025-06-13 14:00:11,714 - root - INFO - step: 6690 loss: 19.7575 memory: 6.46GiB(27.34%) tps: 24,328 tflops: 24.48 mfu: 7.85% global_avg_ntp_loss: 3.4768 global_avg_mtp_loss: 16.2807 +[titan] 2025-06-13 14:00:11,715 - root - INFO - lr: 3.4016e-04 gnorm: 1.26 [ 1:18:41< 1:37:45] +[titan] 2025-06-13 14:00:15,479 - root - INFO - step: 6695 loss: 19.6360 memory: 6.46GiB(27.34%) tps: 21,766 tflops: 21.90 mfu: 7.02% global_avg_ntp_loss: 3.4314 global_avg_mtp_loss: 16.2047 +[titan] 2025-06-13 14:00:15,479 - root - INFO - lr: 3.3992e-04 gnorm: 1.23 [ 1:18:45< 1:37:42] +[titan] 2025-06-13 14:00:18,238 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:00:18,892 - root - INFO - step: 6700 loss: 17.9150 memory: 6.46GiB(27.34%) tps: 24,006 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.1308 global_avg_mtp_loss: 14.7842 +[titan] 2025-06-13 14:00:18,892 - root - INFO - lr: 3.3968e-04 gnorm: 1.51 [ 1:18:49< 1:37:38] +[titan] 2025-06-13 14:00:22,336 - root - INFO - step: 6705 loss: 20.0062 memory: 6.46GiB(27.34%) tps: 23,785 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 3.5253 global_avg_mtp_loss: 16.4809 +[titan] 2025-06-13 14:00:22,336 - root - INFO - lr: 3.3944e-04 gnorm: 1.18 [ 1:18:52< 1:37:34] +[titan] 2025-06-13 14:00:25,656 - root - INFO - step: 6710 loss: 19.3844 memory: 6.46GiB(27.34%) tps: 24,677 tflops: 24.83 mfu: 7.96% global_avg_ntp_loss: 3.3685 global_avg_mtp_loss: 16.0159 +[titan] 2025-06-13 14:00:25,656 - root - INFO - lr: 3.3920e-04 gnorm: 1.13 [ 1:18:55< 1:37:31] +[titan] 2025-06-13 14:00:29,713 - root - INFO - step: 6715 loss: 20.0351 memory: 6.46GiB(27.34%) tps: 20,197 tflops: 20.33 mfu: 6.51% global_avg_ntp_loss: 3.5004 global_avg_mtp_loss: 16.5347 +[titan] 2025-06-13 14:00:29,713 - root - INFO - lr: 3.3896e-04 gnorm: 1.13 [ 1:18:59< 1:37:28] +[titan] 2025-06-13 14:00:33,023 - root - INFO - step: 6720 loss: 20.3470 memory: 6.46GiB(27.34%) tps: 24,750 tflops: 24.91 mfu: 7.98% global_avg_ntp_loss: 3.5393 global_avg_mtp_loss: 16.8078 +[titan] 2025-06-13 14:00:33,023 - root - INFO - lr: 3.3871e-04 gnorm: 1.25 [ 1:19:03< 1:37:24] +[titan] 2025-06-13 14:00:36,374 - root - INFO - step: 6725 loss: 19.8514 memory: 6.46GiB(27.34%) tps: 24,445 tflops: 24.60 mfu: 7.88% global_avg_ntp_loss: 3.4422 global_avg_mtp_loss: 16.4092 +[titan] 2025-06-13 14:00:36,375 - root - INFO - lr: 3.3847e-04 gnorm: 1.49 [ 1:19:06< 1:37:20] +[titan] 2025-06-13 14:00:39,880 - root - INFO - step: 6730 loss: 19.6388 memory: 6.46GiB(27.34%) tps: 23,375 tflops: 23.52 mfu: 7.54% global_avg_ntp_loss: 3.4280 global_avg_mtp_loss: 16.2108 +[titan] 2025-06-13 14:00:39,880 - root - INFO - lr: 3.3823e-04 gnorm: 1.19 [ 
1:19:10< 1:37:17] +[titan] 2025-06-13 14:00:43,209 - root - INFO - step: 6735 loss: 19.5427 memory: 6.46GiB(27.34%) tps: 24,610 tflops: 24.77 mfu: 7.94% global_avg_ntp_loss: 3.4109 global_avg_mtp_loss: 16.1318 +[titan] 2025-06-13 14:00:43,209 - root - INFO - lr: 3.3799e-04 gnorm: 1.14 [ 1:19:13< 1:37:13] +[titan] 2025-06-13 14:00:46,903 - root - INFO - step: 6740 loss: 20.5880 memory: 6.46GiB(27.34%) tps: 22,179 tflops: 22.32 mfu: 7.15% global_avg_ntp_loss: 3.5804 global_avg_mtp_loss: 17.0076 +[titan] 2025-06-13 14:00:46,903 - root - INFO - lr: 3.3774e-04 gnorm: 1.20 [ 1:19:17< 1:37:09] +[titan] 2025-06-13 14:00:50,393 - root - INFO - step: 6745 loss: 21.0103 memory: 6.46GiB(27.34%) tps: 23,478 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.7563 global_avg_mtp_loss: 17.2539 +[titan] 2025-06-13 14:00:50,393 - root - INFO - lr: 3.3750e-04 gnorm: 1.20 [ 1:19:20< 1:37:06] +[titan] 2025-06-13 14:00:53,496 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:00:54,177 - root - INFO - step: 6750 loss: 20.2056 memory: 6.46GiB(27.34%) tps: 21,647 tflops: 21.79 mfu: 6.98% global_avg_ntp_loss: 3.5516 global_avg_mtp_loss: 16.6540 +[titan] 2025-06-13 14:00:54,178 - root - INFO - lr: 3.3726e-04 gnorm: 1.18 [ 1:19:24< 1:37:03] +[titan] 2025-06-13 14:00:57,818 - root - INFO - step: 6755 loss: 21.1726 memory: 6.46GiB(27.34%) tps: 22,504 tflops: 22.65 mfu: 7.26% global_avg_ntp_loss: 3.7336 global_avg_mtp_loss: 17.4390 +[titan] 2025-06-13 14:00:57,818 - root - INFO - lr: 3.3702e-04 gnorm: 1.11 [ 1:19:28< 1:36:59] +[titan] 2025-06-13 14:01:01,402 - root - INFO - step: 6760 loss: 20.0735 memory: 6.46GiB(27.34%) tps: 22,861 tflops: 23.01 mfu: 7.37% global_avg_ntp_loss: 3.5481 global_avg_mtp_loss: 16.5254 +[titan] 2025-06-13 14:01:01,402 - root - INFO - lr: 3.3677e-04 gnorm: 1.24 [ 1:19:31< 1:36:56] +[titan] 2025-06-13 14:01:04,869 - root - INFO - step: 6765 loss: 20.6597 memory: 6.46GiB(27.34%) tps: 23,635 tflops: 23.79 mfu: 7.62% 
global_avg_ntp_loss: 3.6088 global_avg_mtp_loss: 17.0510 +[titan] 2025-06-13 14:01:04,869 - root - INFO - lr: 3.3653e-04 gnorm: 1.22 [ 1:19:35< 1:36:52] +[titan] 2025-06-13 14:01:08,201 - root - INFO - step: 6770 loss: 20.1762 memory: 6.46GiB(27.34%) tps: 24,583 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.5769 global_avg_mtp_loss: 16.5992 +[titan] 2025-06-13 14:01:08,202 - root - INFO - lr: 3.3629e-04 gnorm: 1.37 [ 1:19:38< 1:36:48] +[titan] 2025-06-13 14:01:11,618 - root - INFO - step: 6775 loss: 19.7472 memory: 6.46GiB(27.34%) tps: 23,982 tflops: 24.13 mfu: 7.74% global_avg_ntp_loss: 3.4313 global_avg_mtp_loss: 16.3159 +[titan] 2025-06-13 14:01:11,618 - root - INFO - lr: 3.3605e-04 gnorm: 1.33 [ 1:19:41< 1:36:45] +[titan] 2025-06-13 14:01:15,131 - root - INFO - step: 6780 loss: 20.3305 memory: 6.46GiB(27.34%) tps: 23,320 tflops: 23.47 mfu: 7.52% global_avg_ntp_loss: 3.5901 global_avg_mtp_loss: 16.7404 +[titan] 2025-06-13 14:01:15,132 - root - INFO - lr: 3.3580e-04 gnorm: 1.26 [ 1:19:45< 1:36:41] +[titan] 2025-06-13 14:01:18,646 - root - INFO - step: 6785 loss: 19.6451 memory: 6.46GiB(27.34%) tps: 23,315 tflops: 23.46 mfu: 7.52% global_avg_ntp_loss: 3.4420 global_avg_mtp_loss: 16.2031 +[titan] 2025-06-13 14:01:18,646 - root - INFO - lr: 3.3556e-04 gnorm: 1.23 [ 1:19:48< 1:36:38] +[titan] 2025-06-13 14:01:22,239 - root - INFO - step: 6790 loss: 21.0102 memory: 6.46GiB(27.34%) tps: 22,801 tflops: 22.95 mfu: 7.35% global_avg_ntp_loss: 3.7084 global_avg_mtp_loss: 17.3018 +[titan] 2025-06-13 14:01:22,239 - root - INFO - lr: 3.3532e-04 gnorm: 1.17 [ 1:19:52< 1:36:34] +[titan] 2025-06-13 14:01:26,220 - root - INFO - step: 6795 loss: 19.3292 memory: 6.46GiB(27.34%) tps: 20,580 tflops: 20.71 mfu: 6.64% global_avg_ntp_loss: 3.3771 global_avg_mtp_loss: 15.9521 +[titan] 2025-06-13 14:01:26,221 - root - INFO - lr: 3.3507e-04 gnorm: 1.23 [ 1:19:56< 1:36:31] +[titan] 2025-06-13 14:01:28,850 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:01:29,778 - root - INFO - step: 6800 loss: 19.8080 memory: 6.46GiB(27.34%) tps: 23,029 tflops: 23.18 mfu: 7.43% global_avg_ntp_loss: 3.5062 global_avg_mtp_loss: 16.3018 +[titan] 2025-06-13 14:01:29,778 - root - INFO - lr: 3.3483e-04 gnorm: 1.08 [ 1:19:59< 1:36:28] +[titan] 2025-06-13 14:01:33,451 - root - INFO - step: 6805 loss: 18.7958 memory: 6.46GiB(27.34%) tps: 22,305 tflops: 22.45 mfu: 7.19% global_avg_ntp_loss: 3.3805 global_avg_mtp_loss: 15.4153 +[titan] 2025-06-13 14:01:33,451 - root - INFO - lr: 3.3459e-04 gnorm: 1.19 [ 1:20:03< 1:36:24] +[titan] 2025-06-13 14:01:36,876 - root - INFO - step: 6810 loss: 19.7046 memory: 6.46GiB(27.34%) tps: 23,923 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 3.4617 global_avg_mtp_loss: 16.2429 +[titan] 2025-06-13 14:01:36,876 - root - INFO - lr: 3.3434e-04 gnorm: 1.07 [ 1:20:07< 1:36:21] +[titan] 2025-06-13 14:01:41,035 - root - INFO - step: 6815 loss: 19.3715 memory: 6.46GiB(27.34%) tps: 19,700 tflops: 19.83 mfu: 6.35% global_avg_ntp_loss: 3.4080 global_avg_mtp_loss: 15.9634 +[titan] 2025-06-13 14:01:41,035 - root - INFO - lr: 3.3410e-04 gnorm: 1.16 [ 1:20:11< 1:36:18] +[titan] 2025-06-13 14:01:44,395 - root - INFO - step: 6820 loss: 18.8657 memory: 6.46GiB(27.34%) tps: 24,384 tflops: 24.54 mfu: 7.87% global_avg_ntp_loss: 3.2886 global_avg_mtp_loss: 15.5771 +[titan] 2025-06-13 14:01:44,395 - root - INFO - lr: 3.3386e-04 gnorm: 1.19 [ 1:20:14< 1:36:14] +[titan] 2025-06-13 14:01:48,188 - root - INFO - step: 6825 loss: 18.8386 memory: 6.46GiB(27.34%) tps: 21,600 tflops: 21.74 mfu: 6.97% global_avg_ntp_loss: 3.3185 global_avg_mtp_loss: 15.5202 +[titan] 2025-06-13 14:01:48,188 - root - INFO - lr: 3.3361e-04 gnorm: 1.39 [ 1:20:18< 1:36:11] +[titan] 2025-06-13 14:01:51,557 - root - INFO - step: 6830 loss: 18.8760 memory: 6.46GiB(27.34%) tps: 24,317 tflops: 24.47 mfu: 7.84% global_avg_ntp_loss: 3.3130 global_avg_mtp_loss: 15.5630 +[titan] 2025-06-13 14:01:51,557 - root - INFO - lr: 3.3337e-04 gnorm: 1.40 [ 
1:20:21< 1:36:07] +[titan] 2025-06-13 14:01:54,776 - root - INFO - step: 6835 loss: 19.7444 memory: 6.46GiB(27.34%) tps: 25,457 tflops: 25.62 mfu: 8.21% global_avg_ntp_loss: 3.4910 global_avg_mtp_loss: 16.2534 +[titan] 2025-06-13 14:01:54,776 - root - INFO - lr: 3.3312e-04 gnorm: 1.29 [ 1:20:24< 1:36:03] +[titan] 2025-06-13 14:01:58,570 - root - INFO - step: 6840 loss: 20.6059 memory: 6.46GiB(27.34%) tps: 21,594 tflops: 21.73 mfu: 6.97% global_avg_ntp_loss: 3.6254 global_avg_mtp_loss: 16.9805 +[titan] 2025-06-13 14:01:58,570 - root - INFO - lr: 3.3288e-04 gnorm: 1.01 [ 1:20:28< 1:36:00] +[titan] 2025-06-13 14:02:02,250 - root - INFO - step: 6845 loss: 20.0747 memory: 6.46GiB(27.34%) tps: 22,260 tflops: 22.40 mfu: 7.18% global_avg_ntp_loss: 3.5028 global_avg_mtp_loss: 16.5719 +[titan] 2025-06-13 14:02:02,251 - root - INFO - lr: 3.3264e-04 gnorm: 1.12 [ 1:20:32< 1:35:57] +[titan] 2025-06-13 14:02:04,545 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:02:05,332 - root - INFO - step: 6850 loss: 16.3534 memory: 6.46GiB(27.34%) tps: 26,588 tflops: 26.76 mfu: 8.58% global_avg_ntp_loss: 2.8719 global_avg_mtp_loss: 13.4815 +[titan] 2025-06-13 14:02:05,332 - root - INFO - lr: 3.3239e-04 gnorm: 1.43 [ 1:20:35< 1:35:53] +[titan] 2025-06-13 14:02:08,401 - root - INFO - step: 6855 loss: 19.0169 memory: 6.46GiB(27.34%) tps: 26,699 tflops: 26.87 mfu: 8.61% global_avg_ntp_loss: 3.3197 global_avg_mtp_loss: 15.6972 +[titan] 2025-06-13 14:02:08,401 - root - INFO - lr: 3.3215e-04 gnorm: 1.46 [ 1:20:38< 1:35:49] +[titan] 2025-06-13 14:02:11,985 - root - INFO - step: 6860 loss: 17.1013 memory: 6.46GiB(27.34%) tps: 22,859 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.0280 global_avg_mtp_loss: 14.0734 +[titan] 2025-06-13 14:02:11,985 - root - INFO - lr: 3.3190e-04 gnorm: 1.48 [ 1:20:42< 1:35:45] +[titan] 2025-06-13 14:02:15,314 - root - INFO - step: 6865 loss: 20.1454 memory: 6.46GiB(27.34%) tps: 24,613 tflops: 24.77 mfu: 7.94% 
global_avg_ntp_loss: 3.5327 global_avg_mtp_loss: 16.6127 +[titan] 2025-06-13 14:02:15,314 - root - INFO - lr: 3.3166e-04 gnorm: 1.06 [ 1:20:45< 1:35:41] +[titan] 2025-06-13 14:02:18,892 - root - INFO - step: 6870 loss: 20.1548 memory: 6.46GiB(27.34%) tps: 22,898 tflops: 23.04 mfu: 7.39% global_avg_ntp_loss: 3.5063 global_avg_mtp_loss: 16.6485 +[titan] 2025-06-13 14:02:18,892 - root - INFO - lr: 3.3142e-04 gnorm: 1.15 [ 1:20:49< 1:35:38] +[titan] 2025-06-13 14:02:22,499 - root - INFO - step: 6875 loss: 20.6972 memory: 6.46GiB(27.34%) tps: 22,714 tflops: 22.86 mfu: 7.33% global_avg_ntp_loss: 3.6266 global_avg_mtp_loss: 17.0706 +[titan] 2025-06-13 14:02:22,499 - root - INFO - lr: 3.3117e-04 gnorm: 1.08 [ 1:20:52< 1:35:34] +[titan] 2025-06-13 14:02:25,832 - root - INFO - step: 6880 loss: 20.5089 memory: 6.46GiB(27.34%) tps: 24,580 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.6207 global_avg_mtp_loss: 16.8881 +[titan] 2025-06-13 14:02:25,832 - root - INFO - lr: 3.3093e-04 gnorm: 1.12 [ 1:20:56< 1:35:31] +[titan] 2025-06-13 14:02:29,512 - root - INFO - step: 6885 loss: 19.4999 memory: 6.46GiB(27.34%) tps: 22,261 tflops: 22.40 mfu: 7.18% global_avg_ntp_loss: 3.4547 global_avg_mtp_loss: 16.0452 +[titan] 2025-06-13 14:02:29,513 - root - INFO - lr: 3.3068e-04 gnorm: 1.19 [ 1:20:59< 1:35:27] +[titan] 2025-06-13 14:02:33,148 - root - INFO - step: 6890 loss: 20.9924 memory: 6.46GiB(27.34%) tps: 22,538 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.7281 global_avg_mtp_loss: 17.2643 +[titan] 2025-06-13 14:02:33,148 - root - INFO - lr: 3.3044e-04 gnorm: 1.40 [ 1:21:03< 1:35:24] +[titan] 2025-06-13 14:02:36,132 - root - INFO - step: 6895 loss: 20.0299 memory: 6.46GiB(27.34%) tps: 27,456 tflops: 27.63 mfu: 8.86% global_avg_ntp_loss: 3.5125 global_avg_mtp_loss: 16.5174 +[titan] 2025-06-13 14:02:36,132 - root - INFO - lr: 3.3019e-04 gnorm: 1.27 [ 1:21:06< 1:35:20] +[titan] 2025-06-13 14:02:38,968 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:02:39,527 - root - INFO - step: 6900 loss: 20.2183 memory: 6.46GiB(27.34%) tps: 24,129 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.5380 global_avg_mtp_loss: 16.6803 +[titan] 2025-06-13 14:02:39,528 - root - INFO - lr: 3.2995e-04 gnorm: 1.28 [ 1:21:09< 1:35:16] +[titan] 2025-06-13 14:02:43,110 - root - INFO - step: 6905 loss: 19.9927 memory: 6.46GiB(27.34%) tps: 22,867 tflops: 23.01 mfu: 7.38% global_avg_ntp_loss: 3.4926 global_avg_mtp_loss: 16.5001 +[titan] 2025-06-13 14:02:43,110 - root - INFO - lr: 3.2970e-04 gnorm: 1.15 [ 1:21:13< 1:35:13] +[titan] 2025-06-13 14:02:46,970 - root - INFO - step: 6910 loss: 20.7146 memory: 6.46GiB(27.34%) tps: 21,226 tflops: 21.36 mfu: 6.85% global_avg_ntp_loss: 3.6362 global_avg_mtp_loss: 17.0785 +[titan] 2025-06-13 14:02:46,970 - root - INFO - lr: 3.2946e-04 gnorm: 1.14 [ 1:21:17< 1:35:10] +[titan] 2025-06-13 14:02:50,209 - root - INFO - step: 6915 loss: 20.6513 memory: 6.46GiB(27.34%) tps: 25,298 tflops: 25.46 mfu: 8.16% global_avg_ntp_loss: 3.6531 global_avg_mtp_loss: 16.9982 +[titan] 2025-06-13 14:02:50,209 - root - INFO - lr: 3.2921e-04 gnorm: 1.29 [ 1:21:20< 1:35:06] +[titan] 2025-06-13 14:02:53,258 - root - INFO - step: 6920 loss: 18.1743 memory: 6.46GiB(27.34%) tps: 26,873 tflops: 27.04 mfu: 8.67% global_avg_ntp_loss: 3.1471 global_avg_mtp_loss: 15.0272 +[titan] 2025-06-13 14:02:53,258 - root - INFO - lr: 3.2897e-04 gnorm: 1.34 [ 1:21:23< 1:35:02] +[titan] 2025-06-13 14:02:56,664 - root - INFO - step: 6925 loss: 18.6509 memory: 6.46GiB(27.34%) tps: 24,049 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.2238 global_avg_mtp_loss: 15.4271 +[titan] 2025-06-13 14:02:56,665 - root - INFO - lr: 3.2872e-04 gnorm: 1.31 [ 1:21:26< 1:34:58] +[titan] 2025-06-13 14:03:00,100 - root - INFO - step: 6930 loss: 20.2749 memory: 6.46GiB(27.34%) tps: 23,847 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.5514 global_avg_mtp_loss: 16.7235 +[titan] 2025-06-13 14:03:00,100 - root - INFO - lr: 3.2848e-04 gnorm: 1.06 [ 
1:21:30< 1:34:54] +[titan] 2025-06-13 14:03:03,711 - root - INFO - step: 6935 loss: 19.9734 memory: 6.46GiB(27.34%) tps: 22,687 tflops: 22.83 mfu: 7.32% global_avg_ntp_loss: 3.4856 global_avg_mtp_loss: 16.4878 +[titan] 2025-06-13 14:03:03,712 - root - INFO - lr: 3.2823e-04 gnorm: 1.24 [ 1:21:33< 1:34:51] +[titan] 2025-06-13 14:03:07,221 - root - INFO - step: 6940 loss: 19.4896 memory: 6.46GiB(27.34%) tps: 23,341 tflops: 23.49 mfu: 7.53% global_avg_ntp_loss: 3.4014 global_avg_mtp_loss: 16.0881 +[titan] 2025-06-13 14:03:07,222 - root - INFO - lr: 3.2799e-04 gnorm: 1.11 [ 1:21:37< 1:34:47] +[titan] 2025-06-13 14:03:10,460 - root - INFO - step: 6945 loss: 18.0434 memory: 6.46GiB(27.34%) tps: 25,299 tflops: 25.46 mfu: 8.16% global_avg_ntp_loss: 3.1501 global_avg_mtp_loss: 14.8934 +[titan] 2025-06-13 14:03:10,460 - root - INFO - lr: 3.2774e-04 gnorm: 1.14 [ 1:21:40< 1:34:43] +[titan] 2025-06-13 14:03:13,408 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:03:14,071 - root - INFO - step: 6950 loss: 19.6825 memory: 6.46GiB(27.34%) tps: 22,690 tflops: 22.83 mfu: 7.32% global_avg_ntp_loss: 3.4527 global_avg_mtp_loss: 16.2298 +[titan] 2025-06-13 14:03:14,071 - root - INFO - lr: 3.2750e-04 gnorm: 1.58 [ 1:21:44< 1:34:40] +[titan] 2025-06-13 14:03:17,565 - root - INFO - step: 6955 loss: 19.9870 memory: 6.46GiB(27.34%) tps: 23,446 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.4707 global_avg_mtp_loss: 16.5163 +[titan] 2025-06-13 14:03:17,566 - root - INFO - lr: 3.2725e-04 gnorm: 1.04 [ 1:21:47< 1:34:36] +[titan] 2025-06-13 14:03:20,949 - root - INFO - step: 6960 loss: 20.8399 memory: 6.46GiB(27.34%) tps: 24,212 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 3.6897 global_avg_mtp_loss: 17.1502 +[titan] 2025-06-13 14:03:20,950 - root - INFO - lr: 3.2701e-04 gnorm: 1.07 [ 1:21:51< 1:34:33] +[titan] 2025-06-13 14:03:24,558 - root - INFO - step: 6965 loss: 19.4968 memory: 6.46GiB(27.34%) tps: 22,705 tflops: 22.85 mfu: 7.32% 
global_avg_ntp_loss: 3.4757 global_avg_mtp_loss: 16.0211 +[titan] 2025-06-13 14:03:24,558 - root - INFO - lr: 3.2676e-04 gnorm: 1.36 [ 1:21:54< 1:34:29] +[titan] 2025-06-13 14:03:28,234 - root - INFO - step: 6970 loss: 20.1904 memory: 6.46GiB(27.34%) tps: 22,289 tflops: 22.43 mfu: 7.19% global_avg_ntp_loss: 3.5513 global_avg_mtp_loss: 16.6391 +[titan] 2025-06-13 14:03:28,234 - root - INFO - lr: 3.2651e-04 gnorm: 1.24 [ 1:21:58< 1:34:26] +[titan] 2025-06-13 14:03:31,699 - root - INFO - step: 6975 loss: 19.9584 memory: 6.46GiB(27.34%) tps: 23,643 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 3.5110 global_avg_mtp_loss: 16.4474 +[titan] 2025-06-13 14:03:31,699 - root - INFO - lr: 3.2627e-04 gnorm: 1.17 [ 1:22:01< 1:34:22] +[titan] 2025-06-13 14:03:35,270 - root - INFO - step: 6980 loss: 19.6179 memory: 6.46GiB(27.34%) tps: 22,947 tflops: 23.09 mfu: 7.40% global_avg_ntp_loss: 3.3849 global_avg_mtp_loss: 16.2330 +[titan] 2025-06-13 14:03:35,270 - root - INFO - lr: 3.2602e-04 gnorm: 1.29 [ 1:22:05< 1:34:19] +[titan] 2025-06-13 14:03:38,388 - root - INFO - step: 6985 loss: 20.2688 memory: 6.46GiB(27.34%) tps: 26,278 tflops: 26.45 mfu: 8.48% global_avg_ntp_loss: 3.5989 global_avg_mtp_loss: 16.6699 +[titan] 2025-06-13 14:03:38,388 - root - INFO - lr: 3.2578e-04 gnorm: 1.10 [ 1:22:08< 1:34:15] +[titan] 2025-06-13 14:03:41,993 - root - INFO - step: 6990 loss: 20.5462 memory: 6.46GiB(27.34%) tps: 22,722 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.6464 global_avg_mtp_loss: 16.8998 +[titan] 2025-06-13 14:03:41,994 - root - INFO - lr: 3.2553e-04 gnorm: 1.05 [ 1:22:12< 1:34:11] +[titan] 2025-06-13 14:03:45,575 - root - INFO - step: 6995 loss: 18.9632 memory: 6.46GiB(27.34%) tps: 22,874 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 3.3203 global_avg_mtp_loss: 15.6429 +[titan] 2025-06-13 14:03:45,575 - root - INFO - lr: 3.2529e-04 gnorm: 1.31 [ 1:22:15< 1:34:08] +[titan] 2025-06-13 14:03:48,564 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:03:49,128 - root - INFO - step: 7000 loss: 20.2917 memory: 6.46GiB(27.34%) tps: 23,063 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.5411 global_avg_mtp_loss: 16.7507 +[titan] 2025-06-13 14:03:49,128 - root - INFO - lr: 3.2504e-04 gnorm: 1.17 [ 1:22:19< 1:34:04] +[titan] 2025-06-13 14:03:52,746 - root - INFO - step: 7005 loss: 19.9985 memory: 6.46GiB(27.34%) tps: 22,641 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.5050 global_avg_mtp_loss: 16.4935 +[titan] 2025-06-13 14:03:52,747 - root - INFO - lr: 3.2479e-04 gnorm: 1.13 [ 1:22:22< 1:34:01] +[titan] 2025-06-13 14:03:56,394 - root - INFO - step: 7010 loss: 19.9758 memory: 6.46GiB(27.34%) tps: 22,463 tflops: 22.61 mfu: 7.25% global_avg_ntp_loss: 3.5070 global_avg_mtp_loss: 16.4688 +[titan] 2025-06-13 14:03:56,394 - root - INFO - lr: 3.2455e-04 gnorm: 1.24 [ 1:22:26< 1:33:58] +[titan] 2025-06-13 14:04:00,233 - root - INFO - step: 7015 loss: 20.4464 memory: 6.46GiB(27.34%) tps: 21,340 tflops: 21.48 mfu: 6.88% global_avg_ntp_loss: 3.5457 global_avg_mtp_loss: 16.9007 +[titan] 2025-06-13 14:04:00,233 - root - INFO - lr: 3.2430e-04 gnorm: 1.26 [ 1:22:30< 1:33:54] +[titan] 2025-06-13 14:04:03,394 - root - INFO - step: 7020 loss: 19.2693 memory: 6.46GiB(27.34%) tps: 25,921 tflops: 26.09 mfu: 8.36% global_avg_ntp_loss: 3.3594 global_avg_mtp_loss: 15.9099 +[titan] 2025-06-13 14:04:03,394 - root - INFO - lr: 3.2405e-04 gnorm: 1.08 [ 1:22:33< 1:33:50] +[titan] 2025-06-13 14:04:07,040 - root - INFO - step: 7025 loss: 19.1866 memory: 6.46GiB(27.34%) tps: 22,475 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.3754 global_avg_mtp_loss: 15.8112 +[titan] 2025-06-13 14:04:07,040 - root - INFO - lr: 3.2381e-04 gnorm: 1.19 [ 1:22:37< 1:33:47] +[titan] 2025-06-13 14:04:10,880 - root - INFO - step: 7030 loss: 20.5128 memory: 6.46GiB(27.34%) tps: 21,331 tflops: 21.47 mfu: 6.88% global_avg_ntp_loss: 3.6229 global_avg_mtp_loss: 16.8900 +[titan] 2025-06-13 14:04:10,881 - root - INFO - lr: 3.2356e-04 gnorm: 1.23 [ 
1:22:41< 1:33:44] +[titan] 2025-06-13 14:04:14,436 - root - INFO - step: 7035 loss: 20.4669 memory: 6.46GiB(27.34%) tps: 23,041 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.6518 global_avg_mtp_loss: 16.8150 +[titan] 2025-06-13 14:04:14,436 - root - INFO - lr: 3.2331e-04 gnorm: 1.34 [ 1:22:44< 1:33:40] +[titan] 2025-06-13 14:04:17,979 - root - INFO - step: 7040 loss: 20.6538 memory: 6.46GiB(27.34%) tps: 23,127 tflops: 23.27 mfu: 7.46% global_avg_ntp_loss: 3.6477 global_avg_mtp_loss: 17.0061 +[titan] 2025-06-13 14:04:17,979 - root - INFO - lr: 3.2307e-04 gnorm: 1.08 [ 1:22:48< 1:33:37] +[titan] 2025-06-13 14:04:21,625 - root - INFO - step: 7045 loss: 21.2593 memory: 6.46GiB(27.34%) tps: 22,471 tflops: 22.61 mfu: 7.25% global_avg_ntp_loss: 3.7288 global_avg_mtp_loss: 17.5305 +[titan] 2025-06-13 14:04:21,625 - root - INFO - lr: 3.2282e-04 gnorm: 1.13 [ 1:22:51< 1:33:34] +[titan] 2025-06-13 14:04:24,111 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:04:24,960 - root - INFO - step: 7050 loss: 19.1890 memory: 6.46GiB(27.34%) tps: 24,568 tflops: 24.72 mfu: 7.92% global_avg_ntp_loss: 3.3499 global_avg_mtp_loss: 15.8391 +[titan] 2025-06-13 14:04:24,960 - root - INFO - lr: 3.2257e-04 gnorm: 1.25 [ 1:22:55< 1:33:30] +[titan] 2025-06-13 14:04:28,817 - root - INFO - step: 7055 loss: 19.8023 memory: 6.46GiB(27.34%) tps: 21,238 tflops: 21.37 mfu: 6.85% global_avg_ntp_loss: 3.4924 global_avg_mtp_loss: 16.3099 +[titan] 2025-06-13 14:04:28,817 - root - INFO - lr: 3.2233e-04 gnorm: 1.17 [ 1:22:58< 1:33:27] +[titan] 2025-06-13 14:04:32,438 - root - INFO - step: 7060 loss: 19.7970 memory: 6.46GiB(27.34%) tps: 22,626 tflops: 22.77 mfu: 7.30% global_avg_ntp_loss: 3.4694 global_avg_mtp_loss: 16.3276 +[titan] 2025-06-13 14:04:32,439 - root - INFO - lr: 3.2208e-04 gnorm: 1.09 [ 1:23:02< 1:33:23] +[titan] 2025-06-13 14:04:35,614 - root - INFO - step: 7065 loss: 19.4125 memory: 6.46GiB(27.34%) tps: 25,796 tflops: 25.96 mfu: 8.32% 
global_avg_ntp_loss: 3.3902 global_avg_mtp_loss: 16.0223 +[titan] 2025-06-13 14:04:35,615 - root - INFO - lr: 3.2183e-04 gnorm: 1.22 [ 1:23:05< 1:33:19] +[titan] 2025-06-13 14:04:40,954 - root - INFO - step: 7070 loss: 20.6826 memory: 6.46GiB(27.34%) tps: 15,342 tflops: 15.44 mfu: 4.95% global_avg_ntp_loss: 3.6128 global_avg_mtp_loss: 17.0699 +[titan] 2025-06-13 14:04:40,955 - root - INFO - lr: 3.2159e-04 gnorm: 1.10 [ 1:23:11< 1:33:18] +[titan] 2025-06-13 14:04:44,798 - root - INFO - step: 7075 loss: 20.6029 memory: 6.46GiB(27.34%) tps: 21,317 tflops: 21.45 mfu: 6.88% global_avg_ntp_loss: 3.5854 global_avg_mtp_loss: 17.0175 +[titan] 2025-06-13 14:04:44,798 - root - INFO - lr: 3.2134e-04 gnorm: 1.14 [ 1:23:14< 1:33:15] +[titan] 2025-06-13 14:04:48,362 - root - INFO - step: 7080 loss: 19.8783 memory: 6.46GiB(27.34%) tps: 22,986 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.4843 global_avg_mtp_loss: 16.3940 +[titan] 2025-06-13 14:04:48,363 - root - INFO - lr: 3.2109e-04 gnorm: 1.17 [ 1:23:18< 1:33:11] +[titan] 2025-06-13 14:04:51,631 - root - INFO - step: 7085 loss: 19.5074 memory: 6.46GiB(27.34%) tps: 25,067 tflops: 25.23 mfu: 8.09% global_avg_ntp_loss: 3.4783 global_avg_mtp_loss: 16.0291 +[titan] 2025-06-13 14:04:51,631 - root - INFO - lr: 3.2085e-04 gnorm: 1.19 [ 1:23:21< 1:33:07] +[titan] 2025-06-13 14:04:55,738 - root - INFO - step: 7090 loss: 19.3473 memory: 6.46GiB(27.34%) tps: 19,947 tflops: 20.07 mfu: 6.43% global_avg_ntp_loss: 3.3955 global_avg_mtp_loss: 15.9518 +[titan] 2025-06-13 14:04:55,739 - root - INFO - lr: 3.2060e-04 gnorm: 1.11 [ 1:23:25< 1:33:04] +[titan] 2025-06-13 14:04:58,969 - root - INFO - step: 7095 loss: 18.7456 memory: 6.46GiB(27.34%) tps: 25,359 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.2556 global_avg_mtp_loss: 15.4900 +[titan] 2025-06-13 14:04:58,969 - root - INFO - lr: 3.2035e-04 gnorm: 1.32 [ 1:23:29< 1:33:01] +[titan] 2025-06-13 14:05:01,583 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:05:02,289 - root - INFO - step: 7100 loss: 20.3421 memory: 6.46GiB(27.34%) tps: 24,679 tflops: 24.84 mfu: 7.96% global_avg_ntp_loss: 3.5849 global_avg_mtp_loss: 16.7572 +[titan] 2025-06-13 14:05:02,289 - root - INFO - lr: 3.2010e-04 gnorm: 1.13 [ 1:23:32< 1:32:57] +[titan] 2025-06-13 14:05:05,829 - root - INFO - step: 7105 loss: 20.7685 memory: 6.46GiB(27.34%) tps: 23,147 tflops: 23.29 mfu: 7.47% global_avg_ntp_loss: 3.6263 global_avg_mtp_loss: 17.1422 +[titan] 2025-06-13 14:05:05,829 - root - INFO - lr: 3.1986e-04 gnorm: 1.08 [ 1:23:35< 1:32:53] +[titan] 2025-06-13 14:05:09,677 - root - INFO - step: 7110 loss: 18.2486 memory: 6.46GiB(27.34%) tps: 21,290 tflops: 21.43 mfu: 6.87% global_avg_ntp_loss: 3.2034 global_avg_mtp_loss: 15.0452 +[titan] 2025-06-13 14:05:09,677 - root - INFO - lr: 3.1961e-04 gnorm: 1.11 [ 1:23:39< 1:32:50] +[titan] 2025-06-13 14:05:12,822 - root - INFO - step: 7115 loss: 18.9287 memory: 6.46GiB(27.34%) tps: 26,052 tflops: 26.22 mfu: 8.40% global_avg_ntp_loss: 3.2458 global_avg_mtp_loss: 15.6829 +[titan] 2025-06-13 14:05:12,822 - root - INFO - lr: 3.1936e-04 gnorm: 1.49 [ 1:23:42< 1:32:46] +[titan] 2025-06-13 14:05:16,189 - root - INFO - step: 7120 loss: 20.2525 memory: 6.46GiB(27.34%) tps: 24,333 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 3.5683 global_avg_mtp_loss: 16.6843 +[titan] 2025-06-13 14:05:16,189 - root - INFO - lr: 3.1912e-04 gnorm: 1.09 [ 1:23:46< 1:32:42] +[titan] 2025-06-13 14:05:19,887 - root - INFO - step: 7125 loss: 19.0526 memory: 6.46GiB(27.34%) tps: 22,153 tflops: 22.29 mfu: 7.15% global_avg_ntp_loss: 3.3047 global_avg_mtp_loss: 15.7479 +[titan] 2025-06-13 14:05:19,888 - root - INFO - lr: 3.1887e-04 gnorm: 1.17 [ 1:23:50< 1:32:39] +[titan] 2025-06-13 14:05:23,579 - root - INFO - step: 7130 loss: 19.6202 memory: 6.46GiB(27.34%) tps: 22,192 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 3.4556 global_avg_mtp_loss: 16.1646 +[titan] 2025-06-13 14:05:23,579 - root - INFO - lr: 3.1862e-04 gnorm: 1.23 [ 
1:23:53< 1:32:36] +[titan] 2025-06-13 14:05:27,116 - root - INFO - step: 7135 loss: 19.3047 memory: 6.46GiB(27.34%) tps: 23,164 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.3623 global_avg_mtp_loss: 15.9424 +[titan] 2025-06-13 14:05:27,116 - root - INFO - lr: 3.1837e-04 gnorm: 1.22 [ 1:23:57< 1:32:32] +[titan] 2025-06-13 14:05:30,625 - root - INFO - step: 7140 loss: 19.4903 memory: 6.46GiB(27.34%) tps: 23,350 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 3.3625 global_avg_mtp_loss: 16.1278 +[titan] 2025-06-13 14:05:30,625 - root - INFO - lr: 3.1812e-04 gnorm: 1.75 [ 1:24:00< 1:32:29] +[titan] 2025-06-13 14:05:33,835 - root - INFO - step: 7145 loss: 20.1464 memory: 6.46GiB(27.34%) tps: 25,526 tflops: 25.69 mfu: 8.23% global_avg_ntp_loss: 3.5072 global_avg_mtp_loss: 16.6392 +[titan] 2025-06-13 14:05:33,835 - root - INFO - lr: 3.1788e-04 gnorm: 1.12 [ 1:24:03< 1:32:25] +[titan] 2025-06-13 14:05:36,650 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:05:39,011 - root - INFO - step: 7150 loss: 19.9993 memory: 6.46GiB(27.34%) tps: 15,827 tflops: 15.93 mfu: 5.10% global_avg_ntp_loss: 3.5096 global_avg_mtp_loss: 16.4897 +[titan] 2025-06-13 14:05:39,012 - root - INFO - lr: 3.1763e-04 gnorm: 1.13 [ 1:24:09< 1:32:23] +[titan] 2025-06-13 14:05:42,067 - root - INFO - step: 7155 loss: 19.0363 memory: 6.46GiB(27.34%) tps: 26,812 tflops: 26.98 mfu: 8.65% global_avg_ntp_loss: 3.3221 global_avg_mtp_loss: 15.7142 +[titan] 2025-06-13 14:05:42,068 - root - INFO - lr: 3.1738e-04 gnorm: 1.73 [ 1:24:12< 1:32:19] +[titan] 2025-06-13 14:05:45,554 - root - INFO - step: 7160 loss: 20.7739 memory: 6.46GiB(27.34%) tps: 23,498 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.6531 global_avg_mtp_loss: 17.1208 +[titan] 2025-06-13 14:05:45,554 - root - INFO - lr: 3.1713e-04 gnorm: 1.31 [ 1:24:15< 1:32:15] +[titan] 2025-06-13 14:05:49,233 - root - INFO - step: 7165 loss: 19.8452 memory: 6.46GiB(27.34%) tps: 22,269 tflops: 22.41 mfu: 7.18% 
global_avg_ntp_loss: 3.4390 global_avg_mtp_loss: 16.4062 +[titan] 2025-06-13 14:05:49,233 - root - INFO - lr: 3.1689e-04 gnorm: 1.34 [ 1:24:19< 1:32:12] +[titan] 2025-06-13 14:05:51,642 - root - INFO - Dumping profiler traces at step 7168 +[titan] 2025-06-13 14:05:51,729 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:05:52,887 - root - INFO - step: 7170 loss: 19.5117 memory: 6.46GiB(27.34%) tps: 22,425 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 3.4132 global_avg_mtp_loss: 16.0985 +[titan] 2025-06-13 14:05:52,887 - root - INFO - lr: 3.1664e-04 gnorm: 1.64 [ 1:24:23< 1:32:09] +[titan] 2025-06-13 14:05:56,824 - root - INFO - step: 7175 loss: 19.2634 memory: 6.46GiB(27.34%) tps: 20,810 tflops: 20.94 mfu: 6.71% global_avg_ntp_loss: 3.3407 global_avg_mtp_loss: 15.9227 +[titan] 2025-06-13 14:05:56,824 - root - INFO - lr: 3.1639e-04 gnorm: 3.22 [ 1:24:26< 1:32:06] +[titan] 2025-06-13 14:06:00,426 - root - INFO - step: 7180 loss: 20.1675 memory: 6.46GiB(27.34%) tps: 22,743 tflops: 22.89 mfu: 7.34% global_avg_ntp_loss: 3.5137 global_avg_mtp_loss: 16.6537 +[titan] 2025-06-13 14:06:00,427 - root - INFO - lr: 3.1614e-04 gnorm: 1.12 [ 1:24:30< 1:32:02] +[titan] 2025-06-13 14:06:03,623 - root - INFO - step: 7185 loss: 18.7608 memory: 6.46GiB(27.34%) tps: 25,631 tflops: 25.79 mfu: 8.27% global_avg_ntp_loss: 3.2562 global_avg_mtp_loss: 15.5046 +[titan] 2025-06-13 14:06:03,623 - root - INFO - lr: 3.1589e-04 gnorm: 1.50 [ 1:24:33< 1:31:58] +[titan] 2025-06-13 14:06:07,389 - root - INFO - step: 7190 loss: 20.9886 memory: 6.46GiB(27.34%) tps: 21,756 tflops: 21.89 mfu: 7.02% global_avg_ntp_loss: 3.7940 global_avg_mtp_loss: 17.1945 +[titan] 2025-06-13 14:06:07,389 - root - INFO - lr: 3.1564e-04 gnorm: 1.37 [ 1:24:37< 1:31:55] +[titan] 2025-06-13 14:06:10,883 - root - INFO - step: 7195 loss: 20.2846 memory: 6.46GiB(27.34%) tps: 23,448 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.5631 global_avg_mtp_loss: 16.7215 +[titan] 2025-06-13 
14:06:10,883 - root - INFO - lr: 3.1540e-04 gnorm: 1.18 [ 1:24:41< 1:31:51] +[titan] 2025-06-13 14:06:13,897 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:06:14,502 - root - INFO - step: 7200 loss: 20.2197 memory: 6.46GiB(27.34%) tps: 22,636 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.5193 global_avg_mtp_loss: 16.7005 +[titan] 2025-06-13 14:06:14,503 - root - INFO - lr: 3.1515e-04 gnorm: 1.16 [ 1:24:44< 1:31:48] +[titan] 2025-06-13 14:06:17,734 - root - INFO - step: 7205 loss: 20.5592 memory: 6.46GiB(27.34%) tps: 25,356 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.6119 global_avg_mtp_loss: 16.9473 +[titan] 2025-06-13 14:06:17,734 - root - INFO - lr: 3.1490e-04 gnorm: 1.18 [ 1:24:47< 1:31:44] +[titan] 2025-06-13 14:06:21,406 - root - INFO - step: 7210 loss: 20.2993 memory: 6.46GiB(27.34%) tps: 22,308 tflops: 22.45 mfu: 7.20% global_avg_ntp_loss: 3.5786 global_avg_mtp_loss: 16.7207 +[titan] 2025-06-13 14:06:21,407 - root - INFO - lr: 3.1465e-04 gnorm: 1.21 [ 1:24:51< 1:31:41] +[titan] 2025-06-13 14:06:25,080 - root - INFO - step: 7215 loss: 19.9137 memory: 6.46GiB(27.34%) tps: 22,304 tflops: 22.45 mfu: 7.19% global_avg_ntp_loss: 3.5775 global_avg_mtp_loss: 16.3362 +[titan] 2025-06-13 14:06:25,080 - root - INFO - lr: 3.1440e-04 gnorm: 1.31 [ 1:24:55< 1:31:37] +[titan] 2025-06-13 14:06:28,546 - root - INFO - step: 7220 loss: 19.2520 memory: 6.46GiB(27.34%) tps: 23,636 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 3.3308 global_avg_mtp_loss: 15.9212 +[titan] 2025-06-13 14:06:28,546 - root - INFO - lr: 3.1415e-04 gnorm: 1.17 [ 1:24:58< 1:31:34] +[titan] 2025-06-13 14:06:31,928 - root - INFO - step: 7225 loss: 19.0451 memory: 6.46GiB(27.34%) tps: 24,225 tflops: 24.38 mfu: 7.81% global_avg_ntp_loss: 3.3028 global_avg_mtp_loss: 15.7423 +[titan] 2025-06-13 14:06:31,929 - root - INFO - lr: 3.1391e-04 gnorm: 1.16 [ 1:25:02< 1:31:30] +[titan] 2025-06-13 14:06:35,566 - root - INFO - step: 7230 loss: 17.8488 memory: 
6.46GiB(27.34%) tps: 22,525 tflops: 22.67 mfu: 7.27% global_avg_ntp_loss: 3.0831 global_avg_mtp_loss: 14.7657 +[titan] 2025-06-13 14:06:35,566 - root - INFO - lr: 3.1366e-04 gnorm: 1.61 [ 1:25:05< 1:31:27] +[titan] 2025-06-13 14:06:39,407 - root - INFO - step: 7235 loss: 20.5701 memory: 6.46GiB(27.34%) tps: 21,326 tflops: 21.46 mfu: 6.88% global_avg_ntp_loss: 3.6267 global_avg_mtp_loss: 16.9434 +[titan] 2025-06-13 14:06:39,408 - root - INFO - lr: 3.1341e-04 gnorm: 1.13 [ 1:25:09< 1:31:23] +[titan] 2025-06-13 14:06:42,772 - root - INFO - step: 7240 loss: 20.9629 memory: 6.46GiB(27.34%) tps: 24,354 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 3.6843 global_avg_mtp_loss: 17.2787 +[titan] 2025-06-13 14:06:42,772 - root - INFO - lr: 3.1316e-04 gnorm: 1.08 [ 1:25:12< 1:31:20] +[titan] 2025-06-13 14:06:46,439 - root - INFO - step: 7245 loss: 19.0886 memory: 6.46GiB(27.34%) tps: 22,337 tflops: 22.48 mfu: 7.20% global_avg_ntp_loss: 3.3232 global_avg_mtp_loss: 15.7654 +[titan] 2025-06-13 14:06:46,440 - root - INFO - lr: 3.1291e-04 gnorm: 1.05 [ 1:25:16< 1:31:16] +[titan] 2025-06-13 14:06:48,946 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:06:49,701 - root - INFO - step: 7250 loss: 14.7513 memory: 6.46GiB(27.34%) tps: 25,117 tflops: 25.28 mfu: 8.10% global_avg_ntp_loss: 2.6137 global_avg_mtp_loss: 12.1376 +[titan] 2025-06-13 14:06:49,702 - root - INFO - lr: 3.1266e-04 gnorm: 1.33 [ 1:25:19< 1:31:12] +[titan] 2025-06-13 14:06:53,098 - root - INFO - step: 7255 loss: 21.1043 memory: 6.46GiB(27.34%) tps: 24,118 tflops: 24.27 mfu: 7.78% global_avg_ntp_loss: 3.7257 global_avg_mtp_loss: 17.3786 +[titan] 2025-06-13 14:06:53,099 - root - INFO - lr: 3.1241e-04 gnorm: 1.33 [ 1:25:23< 1:31:09] +[titan] 2025-06-13 14:06:57,335 - root - INFO - step: 7260 loss: 19.6960 memory: 6.46GiB(27.34%) tps: 19,339 tflops: 19.46 mfu: 6.24% global_avg_ntp_loss: 3.3899 global_avg_mtp_loss: 16.3061 +[titan] 2025-06-13 14:06:57,335 - root - INFO - lr: 3.1216e-04 gnorm: 1.17 [ 1:25:27< 1:31:06] +[titan] 2025-06-13 14:07:00,771 - root - INFO - step: 7265 loss: 18.6820 memory: 6.46GiB(27.34%) tps: 23,847 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.2471 global_avg_mtp_loss: 15.4349 +[titan] 2025-06-13 14:07:00,771 - root - INFO - lr: 3.1191e-04 gnorm: 1.52 [ 1:25:30< 1:31:02] +[titan] 2025-06-13 14:07:04,142 - root - INFO - step: 7270 loss: 19.8420 memory: 6.46GiB(27.34%) tps: 24,304 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 3.5061 global_avg_mtp_loss: 16.3359 +[titan] 2025-06-13 14:07:04,142 - root - INFO - lr: 3.1167e-04 gnorm: 1.23 [ 1:25:34< 1:30:59] +[titan] 2025-06-13 14:07:08,200 - root - INFO - step: 7275 loss: 19.0018 memory: 6.46GiB(27.34%) tps: 20,190 tflops: 20.32 mfu: 6.51% global_avg_ntp_loss: 3.2850 global_avg_mtp_loss: 15.7168 +[titan] 2025-06-13 14:07:08,200 - root - INFO - lr: 3.1142e-04 gnorm: 1.20 [ 1:25:38< 1:30:56] +[titan] 2025-06-13 14:07:11,407 - root - INFO - step: 7280 loss: 19.6256 memory: 6.46GiB(27.34%) tps: 25,548 tflops: 25.71 mfu: 8.24% global_avg_ntp_loss: 3.4415 global_avg_mtp_loss: 16.1840 +[titan] 2025-06-13 14:07:11,407 - root - INFO - lr: 3.1117e-04 gnorm: 1.16 [ 
1:25:41< 1:30:52] +[titan] 2025-06-13 14:07:14,810 - root - INFO - step: 7285 loss: 20.1189 memory: 6.46GiB(27.34%) tps: 24,078 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.4966 global_avg_mtp_loss: 16.6223 +[titan] 2025-06-13 14:07:14,810 - root - INFO - lr: 3.1092e-04 gnorm: 1.24 [ 1:25:44< 1:30:48] +[titan] 2025-06-13 14:07:18,184 - root - INFO - step: 7290 loss: 20.4512 memory: 6.46GiB(27.34%) tps: 24,281 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.5735 global_avg_mtp_loss: 16.8777 +[titan] 2025-06-13 14:07:18,184 - root - INFO - lr: 3.1067e-04 gnorm: 1.20 [ 1:25:48< 1:30:44] +[titan] 2025-06-13 14:07:21,431 - root - INFO - step: 7295 loss: 20.0783 memory: 6.46GiB(27.34%) tps: 25,236 tflops: 25.40 mfu: 8.14% global_avg_ntp_loss: 3.5142 global_avg_mtp_loss: 16.5641 +[titan] 2025-06-13 14:07:21,431 - root - INFO - lr: 3.1042e-04 gnorm: 1.48 [ 1:25:51< 1:30:41] +[titan] 2025-06-13 14:07:24,001 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:07:24,661 - root - INFO - step: 7300 loss: 20.9677 memory: 6.46GiB(27.34%) tps: 25,365 tflops: 25.53 mfu: 8.18% global_avg_ntp_loss: 3.6830 global_avg_mtp_loss: 17.2847 +[titan] 2025-06-13 14:07:24,661 - root - INFO - lr: 3.1017e-04 gnorm: 1.14 [ 1:25:54< 1:30:37] +[titan] 2025-06-13 14:07:28,230 - root - INFO - step: 7305 loss: 18.8924 memory: 6.46GiB(27.34%) tps: 22,950 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.2210 global_avg_mtp_loss: 15.6714 +[titan] 2025-06-13 14:07:28,231 - root - INFO - lr: 3.0992e-04 gnorm: 1.30 [ 1:25:58< 1:30:33] +[titan] 2025-06-13 14:07:31,922 - root - INFO - step: 7310 loss: 21.0123 memory: 6.46GiB(27.34%) tps: 22,192 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 3.7175 global_avg_mtp_loss: 17.2949 +[titan] 2025-06-13 14:07:31,923 - root - INFO - lr: 3.0967e-04 gnorm: 1.11 [ 1:26:02< 1:30:30] +[titan] 2025-06-13 14:07:35,410 - root - INFO - step: 7315 loss: 19.2239 memory: 6.46GiB(27.34%) tps: 23,494 tflops: 23.64 mfu: 7.58% 
global_avg_ntp_loss: 3.3243 global_avg_mtp_loss: 15.8996 +[titan] 2025-06-13 14:07:35,410 - root - INFO - lr: 3.0942e-04 gnorm: 1.21 [ 1:26:05< 1:30:26] +[titan] 2025-06-13 14:07:39,088 - root - INFO - step: 7320 loss: 21.8134 memory: 6.46GiB(27.34%) tps: 22,275 tflops: 22.42 mfu: 7.18% global_avg_ntp_loss: 3.9905 global_avg_mtp_loss: 17.8229 +[titan] 2025-06-13 14:07:39,088 - root - INFO - lr: 3.0917e-04 gnorm: 1.16 [ 1:26:09< 1:30:23] +[titan] 2025-06-13 14:07:42,614 - root - INFO - step: 7325 loss: 19.3233 memory: 6.46GiB(27.34%) tps: 23,237 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.3723 global_avg_mtp_loss: 15.9510 +[titan] 2025-06-13 14:07:42,614 - root - INFO - lr: 3.0892e-04 gnorm: 1.44 [ 1:26:12< 1:30:19] +[titan] 2025-06-13 14:07:46,345 - root - INFO - step: 7330 loss: 20.0780 memory: 6.46GiB(27.34%) tps: 21,959 tflops: 22.10 mfu: 7.08% global_avg_ntp_loss: 3.5056 global_avg_mtp_loss: 16.5724 +[titan] 2025-06-13 14:07:46,345 - root - INFO - lr: 3.0867e-04 gnorm: 1.12 [ 1:26:16< 1:30:16] +[titan] 2025-06-13 14:07:50,144 - root - INFO - step: 7335 loss: 19.9057 memory: 6.46GiB(27.34%) tps: 21,564 tflops: 21.70 mfu: 6.96% global_avg_ntp_loss: 3.4664 global_avg_mtp_loss: 16.4393 +[titan] 2025-06-13 14:07:50,144 - root - INFO - lr: 3.0842e-04 gnorm: 1.29 [ 1:26:20< 1:30:13] +[titan] 2025-06-13 14:07:53,426 - root - INFO - step: 7340 loss: 18.0745 memory: 6.46GiB(27.34%) tps: 24,965 tflops: 25.12 mfu: 8.05% global_avg_ntp_loss: 3.1267 global_avg_mtp_loss: 14.9478 +[titan] 2025-06-13 14:07:53,426 - root - INFO - lr: 3.0817e-04 gnorm: 1.38 [ 1:26:23< 1:30:09] +[titan] 2025-06-13 14:07:57,066 - root - INFO - step: 7345 loss: 19.5653 memory: 6.46GiB(27.34%) tps: 22,509 tflops: 22.65 mfu: 7.26% global_avg_ntp_loss: 3.4515 global_avg_mtp_loss: 16.1138 +[titan] 2025-06-13 14:07:57,066 - root - INFO - lr: 3.0792e-04 gnorm: 1.21 [ 1:26:27< 1:30:06] +[titan] 2025-06-13 14:07:59,671 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:08:00,622 - root - INFO - step: 7350 loss: 19.3686 memory: 6.46GiB(27.34%) tps: 23,042 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.4103 global_avg_mtp_loss: 15.9583 +[titan] 2025-06-13 14:08:00,622 - root - INFO - lr: 3.0767e-04 gnorm: 1.08 [ 1:26:30< 1:30:02] +[titan] 2025-06-13 14:08:03,660 - root - INFO - step: 7355 loss: 20.5111 memory: 6.46GiB(27.34%) tps: 26,966 tflops: 27.14 mfu: 8.70% global_avg_ntp_loss: 3.6603 global_avg_mtp_loss: 16.8508 +[titan] 2025-06-13 14:08:03,660 - root - INFO - lr: 3.0743e-04 gnorm: 1.26 [ 1:26:33< 1:29:58] +[titan] 2025-06-13 14:08:07,203 - root - INFO - step: 7360 loss: 18.6802 memory: 6.46GiB(27.34%) tps: 23,122 tflops: 23.27 mfu: 7.46% global_avg_ntp_loss: 3.2709 global_avg_mtp_loss: 15.4093 +[titan] 2025-06-13 14:08:07,204 - root - INFO - lr: 3.0718e-04 gnorm: 1.55 [ 1:26:37< 1:29:55] +[titan] 2025-06-13 14:08:10,439 - root - INFO - step: 7365 loss: 18.5064 memory: 6.46GiB(27.34%) tps: 25,322 tflops: 25.48 mfu: 8.17% global_avg_ntp_loss: 3.1958 global_avg_mtp_loss: 15.3106 +[titan] 2025-06-13 14:08:10,440 - root - INFO - lr: 3.0693e-04 gnorm: 1.11 [ 1:26:40< 1:29:51] +[titan] 2025-06-13 14:08:14,486 - root - INFO - step: 7370 loss: 20.1499 memory: 6.46GiB(27.34%) tps: 20,247 tflops: 20.38 mfu: 6.53% global_avg_ntp_loss: 3.4793 global_avg_mtp_loss: 16.6706 +[titan] 2025-06-13 14:08:14,486 - root - INFO - lr: 3.0668e-04 gnorm: 1.11 [ 1:26:44< 1:29:48] +[titan] 2025-06-13 14:08:18,161 - root - INFO - step: 7375 loss: 19.9879 memory: 6.46GiB(27.34%) tps: 22,291 tflops: 22.43 mfu: 7.19% global_avg_ntp_loss: 3.5049 global_avg_mtp_loss: 16.4830 +[titan] 2025-06-13 14:08:18,162 - root - INFO - lr: 3.0643e-04 gnorm: 1.13 [ 1:26:48< 1:29:44] +[titan] 2025-06-13 14:08:21,291 - root - INFO - step: 7380 loss: 19.3722 memory: 6.46GiB(27.34%) tps: 26,181 tflops: 26.35 mfu: 8.44% global_avg_ntp_loss: 3.3828 global_avg_mtp_loss: 15.9894 +[titan] 2025-06-13 14:08:21,291 - root - INFO - lr: 3.0618e-04 gnorm: 1.19 [ 
1:26:51< 1:29:40] +[titan] 2025-06-13 14:08:25,191 - root - INFO - step: 7385 loss: 20.1202 memory: 6.46GiB(27.34%) tps: 21,006 tflops: 21.14 mfu: 6.78% global_avg_ntp_loss: 3.4941 global_avg_mtp_loss: 16.6261 +[titan] 2025-06-13 14:08:25,192 - root - INFO - lr: 3.0593e-04 gnorm: 1.28 [ 1:26:55< 1:29:37] +[titan] 2025-06-13 14:08:29,246 - root - INFO - step: 7390 loss: 19.0033 memory: 6.46GiB(27.34%) tps: 20,207 tflops: 20.34 mfu: 6.52% global_avg_ntp_loss: 3.2873 global_avg_mtp_loss: 15.7160 +[titan] 2025-06-13 14:08:29,246 - root - INFO - lr: 3.0568e-04 gnorm: 1.23 [ 1:26:59< 1:29:34] +[titan] 2025-06-13 14:08:33,085 - root - INFO - step: 7395 loss: 18.4935 memory: 6.46GiB(27.34%) tps: 21,339 tflops: 21.47 mfu: 6.88% global_avg_ntp_loss: 3.2153 global_avg_mtp_loss: 15.2781 +[titan] 2025-06-13 14:08:33,085 - root - INFO - lr: 3.0543e-04 gnorm: 1.20 [ 1:27:03< 1:29:31] +[titan] 2025-06-13 14:08:35,704 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:08:36,246 - root - INFO - step: 7400 loss: 19.6345 memory: 6.46GiB(27.34%) tps: 25,922 tflops: 26.09 mfu: 8.36% global_avg_ntp_loss: 3.4886 global_avg_mtp_loss: 16.1459 +[titan] 2025-06-13 14:08:36,246 - root - INFO - lr: 3.0518e-04 gnorm: 1.26 [ 1:27:06< 1:29:27] +[titan] 2025-06-13 14:08:40,235 - root - INFO - step: 7405 loss: 19.4773 memory: 6.46GiB(27.34%) tps: 20,536 tflops: 20.67 mfu: 6.62% global_avg_ntp_loss: 3.3949 global_avg_mtp_loss: 16.0824 +[titan] 2025-06-13 14:08:40,236 - root - INFO - lr: 3.0493e-04 gnorm: 1.23 [ 1:27:10< 1:29:24] +[titan] 2025-06-13 14:08:43,970 - root - INFO - step: 7410 loss: 18.7134 memory: 6.46GiB(27.34%) tps: 21,939 tflops: 22.08 mfu: 7.08% global_avg_ntp_loss: 3.2437 global_avg_mtp_loss: 15.4696 +[titan] 2025-06-13 14:08:43,970 - root - INFO - lr: 3.0467e-04 gnorm: 1.33 [ 1:27:14< 1:29:21] +[titan] 2025-06-13 14:08:47,642 - root - INFO - step: 7415 loss: 20.3781 memory: 6.46GiB(27.34%) tps: 22,308 tflops: 22.45 mfu: 7.20% 
global_avg_ntp_loss: 3.5731 global_avg_mtp_loss: 16.8050 +[titan] 2025-06-13 14:08:47,643 - root - INFO - lr: 3.0442e-04 gnorm: 1.15 [ 1:27:17< 1:29:17] +[titan] 2025-06-13 14:08:51,225 - root - INFO - step: 7420 loss: 19.6687 memory: 6.46GiB(27.34%) tps: 22,869 tflops: 23.01 mfu: 7.38% global_avg_ntp_loss: 3.4091 global_avg_mtp_loss: 16.2596 +[titan] 2025-06-13 14:08:51,225 - root - INFO - lr: 3.0417e-04 gnorm: 1.17 [ 1:27:21< 1:29:14] +[titan] 2025-06-13 14:08:54,926 - root - INFO - step: 7425 loss: 19.8529 memory: 6.46GiB(27.34%) tps: 22,134 tflops: 22.28 mfu: 7.14% global_avg_ntp_loss: 3.4864 global_avg_mtp_loss: 16.3665 +[titan] 2025-06-13 14:08:54,927 - root - INFO - lr: 3.0392e-04 gnorm: 1.13 [ 1:27:25< 1:29:11] +[titan] 2025-06-13 14:08:58,461 - root - INFO - step: 7430 loss: 18.6432 memory: 6.46GiB(27.34%) tps: 23,181 tflops: 23.33 mfu: 7.48% global_avg_ntp_loss: 3.2561 global_avg_mtp_loss: 15.3870 +[titan] 2025-06-13 14:08:58,461 - root - INFO - lr: 3.0367e-04 gnorm: 1.24 [ 1:27:28< 1:29:07] +[titan] 2025-06-13 14:09:02,330 - root - INFO - step: 7435 loss: 20.9818 memory: 6.46GiB(27.34%) tps: 21,173 tflops: 21.31 mfu: 6.83% global_avg_ntp_loss: 3.6850 global_avg_mtp_loss: 17.2968 +[titan] 2025-06-13 14:09:02,331 - root - INFO - lr: 3.0342e-04 gnorm: 1.04 [ 1:27:32< 1:29:04] +[titan] 2025-06-13 14:09:05,803 - root - INFO - step: 7440 loss: 20.8279 memory: 6.46GiB(27.34%) tps: 23,590 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.6026 global_avg_mtp_loss: 17.2253 +[titan] 2025-06-13 14:09:05,804 - root - INFO - lr: 3.0317e-04 gnorm: 1.12 [ 1:27:35< 1:29:00] +[titan] 2025-06-13 14:09:09,253 - root - INFO - step: 7445 loss: 19.8462 memory: 6.46GiB(27.34%) tps: 23,748 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 3.5098 global_avg_mtp_loss: 16.3365 +[titan] 2025-06-13 14:09:09,254 - root - INFO - lr: 3.0292e-04 gnorm: 1.30 [ 1:27:39< 1:28:57] +[titan] 2025-06-13 14:09:12,081 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:09:12,624 - root - INFO - step: 7450 loss: 20.4414 memory: 6.46GiB(27.34%) tps: 24,304 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 3.6635 global_avg_mtp_loss: 16.7779 +[titan] 2025-06-13 14:09:12,625 - root - INFO - lr: 3.0267e-04 gnorm: 1.36 [ 1:27:42< 1:28:53] +[titan] 2025-06-13 14:09:16,080 - root - INFO - step: 7455 loss: 19.8198 memory: 6.46GiB(27.34%) tps: 23,710 tflops: 23.86 mfu: 7.65% global_avg_ntp_loss: 3.4642 global_avg_mtp_loss: 16.3555 +[titan] 2025-06-13 14:09:16,080 - root - INFO - lr: 3.0242e-04 gnorm: 1.20 [ 1:27:46< 1:28:49] +[titan] 2025-06-13 14:09:19,993 - root - INFO - step: 7460 loss: 19.5942 memory: 6.46GiB(27.34%) tps: 20,937 tflops: 21.07 mfu: 6.75% global_avg_ntp_loss: 3.4455 global_avg_mtp_loss: 16.1487 +[titan] 2025-06-13 14:09:19,993 - root - INFO - lr: 3.0217e-04 gnorm: 1.29 [ 1:27:50< 1:28:46] +[titan] 2025-06-13 14:09:23,722 - root - INFO - step: 7465 loss: 20.4681 memory: 6.46GiB(27.34%) tps: 21,970 tflops: 22.11 mfu: 7.09% global_avg_ntp_loss: 3.5699 global_avg_mtp_loss: 16.8982 +[titan] 2025-06-13 14:09:23,723 - root - INFO - lr: 3.0192e-04 gnorm: 1.13 [ 1:27:53< 1:28:43] +[titan] 2025-06-13 14:09:27,354 - root - INFO - step: 7470 loss: 20.0336 memory: 6.46GiB(27.34%) tps: 22,559 tflops: 22.70 mfu: 7.28% global_avg_ntp_loss: 3.4894 global_avg_mtp_loss: 16.5442 +[titan] 2025-06-13 14:09:27,354 - root - INFO - lr: 3.0167e-04 gnorm: 1.17 [ 1:27:57< 1:28:39] +[titan] 2025-06-13 14:09:30,950 - root - INFO - step: 7475 loss: 19.4512 memory: 6.46GiB(27.34%) tps: 22,782 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.4053 global_avg_mtp_loss: 16.0459 +[titan] 2025-06-13 14:09:30,951 - root - INFO - lr: 3.0142e-04 gnorm: 1.12 [ 1:28:01< 1:28:36] +[titan] 2025-06-13 14:09:34,145 - root - INFO - step: 7480 loss: 18.4436 memory: 6.46GiB(27.34%) tps: 25,649 tflops: 25.81 mfu: 8.27% global_avg_ntp_loss: 3.2212 global_avg_mtp_loss: 15.2224 +[titan] 2025-06-13 14:09:34,145 - root - INFO - lr: 3.0117e-04 gnorm: 1.18 [ 
1:28:04< 1:28:32] +[titan] 2025-06-13 14:09:37,658 - root - INFO - step: 7485 loss: 19.5220 memory: 6.46GiB(27.34%) tps: 23,322 tflops: 23.47 mfu: 7.52% global_avg_ntp_loss: 3.3760 global_avg_mtp_loss: 16.1460 +[titan] 2025-06-13 14:09:37,658 - root - INFO - lr: 3.0092e-04 gnorm: 1.22 [ 1:28:07< 1:28:28] +[titan] 2025-06-13 14:09:41,588 - root - INFO - step: 7490 loss: 20.9529 memory: 6.46GiB(27.34%) tps: 20,846 tflops: 20.98 mfu: 6.72% global_avg_ntp_loss: 3.6748 global_avg_mtp_loss: 17.2781 +[titan] 2025-06-13 14:09:41,588 - root - INFO - lr: 3.0067e-04 gnorm: 1.36 [ 1:28:11< 1:28:25] +[titan] 2025-06-13 14:09:45,092 - root - INFO - step: 7495 loss: 20.1059 memory: 6.46GiB(27.34%) tps: 23,381 tflops: 23.53 mfu: 7.54% global_avg_ntp_loss: 3.5082 global_avg_mtp_loss: 16.5977 +[titan] 2025-06-13 14:09:45,092 - root - INFO - lr: 3.0042e-04 gnorm: 1.29 [ 1:28:15< 1:28:22] +[titan] 2025-06-13 14:09:47,508 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:09:48,062 - root - INFO - step: 7500 loss: 17.6524 memory: 6.46GiB(27.34%) tps: 27,588 tflops: 27.76 mfu: 8.90% global_avg_ntp_loss: 3.0517 global_avg_mtp_loss: 14.6008 +[titan] 2025-06-13 14:09:48,062 - root - INFO - lr: 3.0017e-04 gnorm: 1.34 [ 1:28:18< 1:28:18] +[titan] 2025-06-13 14:09:51,574 - root - INFO - step: 7505 loss: 20.4757 memory: 6.46GiB(27.34%) tps: 23,328 tflops: 23.48 mfu: 7.52% global_avg_ntp_loss: 3.5581 global_avg_mtp_loss: 16.9176 +[titan] 2025-06-13 14:09:51,574 - root - INFO - lr: 2.9991e-04 gnorm: 1.20 [ 1:28:21< 1:28:14] +[titan] 2025-06-13 14:09:55,186 - root - INFO - step: 7510 loss: 20.6178 memory: 6.46GiB(27.34%) tps: 22,687 tflops: 22.83 mfu: 7.32% global_avg_ntp_loss: 3.6147 global_avg_mtp_loss: 17.0031 +[titan] 2025-06-13 14:09:55,186 - root - INFO - lr: 2.9966e-04 gnorm: 1.06 [ 1:28:25< 1:28:11] +[titan] 2025-06-13 14:09:58,960 - root - INFO - step: 7515 loss: 20.0153 memory: 6.46GiB(27.34%) tps: 21,704 tflops: 21.84 mfu: 7.00% 
global_avg_ntp_loss: 3.5084 global_avg_mtp_loss: 16.5069 +[titan] 2025-06-13 14:09:58,961 - root - INFO - lr: 2.9941e-04 gnorm: 1.09 [ 1:28:29< 1:28:07] +[titan] 2025-06-13 14:10:02,511 - root - INFO - step: 7520 loss: 20.8632 memory: 6.46GiB(27.34%) tps: 23,077 tflops: 23.22 mfu: 7.44% global_avg_ntp_loss: 3.6752 global_avg_mtp_loss: 17.1879 +[titan] 2025-06-13 14:10:02,511 - root - INFO - lr: 2.9916e-04 gnorm: 1.12 [ 1:28:32< 1:28:04] +[titan] 2025-06-13 14:10:06,037 - root - INFO - step: 7525 loss: 20.4230 memory: 6.46GiB(27.34%) tps: 23,232 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 3.5633 global_avg_mtp_loss: 16.8597 +[titan] 2025-06-13 14:10:06,038 - root - INFO - lr: 2.9891e-04 gnorm: 1.10 [ 1:28:36< 1:28:00] +[titan] 2025-06-13 14:10:09,390 - root - INFO - step: 7530 loss: 19.5020 memory: 6.46GiB(27.34%) tps: 24,441 tflops: 24.60 mfu: 7.88% global_avg_ntp_loss: 3.4030 global_avg_mtp_loss: 16.0990 +[titan] 2025-06-13 14:10:09,390 - root - INFO - lr: 2.9866e-04 gnorm: 1.10 [ 1:28:39< 1:27:57] +[titan] 2025-06-13 14:10:13,126 - root - INFO - step: 7535 loss: 18.6964 memory: 6.46GiB(27.34%) tps: 21,927 tflops: 22.07 mfu: 7.07% global_avg_ntp_loss: 3.2562 global_avg_mtp_loss: 15.4402 +[titan] 2025-06-13 14:10:13,127 - root - INFO - lr: 2.9841e-04 gnorm: 1.18 [ 1:28:43< 1:27:53] +[titan] 2025-06-13 14:10:16,848 - root - INFO - step: 7540 loss: 19.8312 memory: 6.46GiB(27.34%) tps: 22,015 tflops: 22.16 mfu: 7.10% global_avg_ntp_loss: 3.4329 global_avg_mtp_loss: 16.3983 +[titan] 2025-06-13 14:10:16,848 - root - INFO - lr: 2.9816e-04 gnorm: 1.08 [ 1:28:46< 1:27:50] +[titan] 2025-06-13 14:10:20,427 - root - INFO - step: 7545 loss: 18.4303 memory: 6.46GiB(27.34%) tps: 22,892 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 3.1972 global_avg_mtp_loss: 15.2331 +[titan] 2025-06-13 14:10:20,427 - root - INFO - lr: 2.9791e-04 gnorm: 1.25 [ 1:28:50< 1:27:46] +[titan] 2025-06-13 14:10:23,374 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:10:24,109 - root - INFO - step: 7550 loss: 19.9769 memory: 6.46GiB(27.34%) tps: 22,249 tflops: 22.39 mfu: 7.18% global_avg_ntp_loss: 3.4946 global_avg_mtp_loss: 16.4823 +[titan] 2025-06-13 14:10:24,110 - root - INFO - lr: 2.9766e-04 gnorm: 1.21 [ 1:28:54< 1:27:43] +[titan] 2025-06-13 14:10:27,243 - root - INFO - step: 7555 loss: 20.5164 memory: 6.46GiB(27.34%) tps: 26,147 tflops: 26.31 mfu: 8.43% global_avg_ntp_loss: 3.5685 global_avg_mtp_loss: 16.9478 +[titan] 2025-06-13 14:10:27,243 - root - INFO - lr: 2.9740e-04 gnorm: 1.18 [ 1:28:57< 1:27:39] +[titan] 2025-06-13 14:10:30,730 - root - INFO - step: 7560 loss: 19.0468 memory: 6.46GiB(27.34%) tps: 23,496 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.3594 global_avg_mtp_loss: 15.6874 +[titan] 2025-06-13 14:10:30,730 - root - INFO - lr: 2.9715e-04 gnorm: 1.33 [ 1:29:00< 1:27:36] +[titan] 2025-06-13 14:10:34,211 - root - INFO - step: 7565 loss: 19.7378 memory: 6.46GiB(27.34%) tps: 23,536 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.4250 global_avg_mtp_loss: 16.3127 +[titan] 2025-06-13 14:10:34,211 - root - INFO - lr: 2.9690e-04 gnorm: 1.29 [ 1:29:04< 1:27:32] +[titan] 2025-06-13 14:10:37,744 - root - INFO - step: 7570 loss: 19.0819 memory: 6.46GiB(27.34%) tps: 23,189 tflops: 23.34 mfu: 7.48% global_avg_ntp_loss: 3.3557 global_avg_mtp_loss: 15.7262 +[titan] 2025-06-13 14:10:37,745 - root - INFO - lr: 2.9665e-04 gnorm: 1.28 [ 1:29:07< 1:27:28] +[titan] 2025-06-13 14:10:42,033 - root - INFO - step: 7575 loss: 17.6472 memory: 6.46GiB(27.34%) tps: 19,102 tflops: 19.22 mfu: 6.16% global_avg_ntp_loss: 3.0566 global_avg_mtp_loss: 14.5906 +[titan] 2025-06-13 14:10:42,034 - root - INFO - lr: 2.9640e-04 gnorm: 1.37 [ 1:29:12< 1:27:26] +[titan] 2025-06-13 14:10:45,348 - root - INFO - step: 7580 loss: 19.5326 memory: 6.46GiB(27.34%) tps: 24,717 tflops: 24.87 mfu: 7.97% global_avg_ntp_loss: 3.3941 global_avg_mtp_loss: 16.1385 +[titan] 2025-06-13 14:10:45,348 - root - INFO - lr: 2.9615e-04 gnorm: 1.25 [ 
1:29:15< 1:27:22] +[titan] 2025-06-13 14:10:48,990 - root - INFO - step: 7585 loss: 20.1923 memory: 6.46GiB(27.34%) tps: 22,494 tflops: 22.64 mfu: 7.26% global_avg_ntp_loss: 3.5386 global_avg_mtp_loss: 16.6537 +[titan] 2025-06-13 14:10:48,991 - root - INFO - lr: 2.9590e-04 gnorm: 1.16 [ 1:29:19< 1:27:18] +[titan] 2025-06-13 14:10:52,465 - root - INFO - step: 7590 loss: 20.4887 memory: 6.46GiB(27.34%) tps: 23,580 tflops: 23.73 mfu: 7.61% global_avg_ntp_loss: 3.5966 global_avg_mtp_loss: 16.8921 +[titan] 2025-06-13 14:10:52,466 - root - INFO - lr: 2.9565e-04 gnorm: 1.20 [ 1:29:22< 1:27:15] +[titan] 2025-06-13 14:10:56,545 - root - INFO - step: 7595 loss: 21.9370 memory: 6.46GiB(27.34%) tps: 20,082 tflops: 20.21 mfu: 6.48% global_avg_ntp_loss: 4.0542 global_avg_mtp_loss: 17.8828 +[titan] 2025-06-13 14:10:56,545 - root - INFO - lr: 2.9539e-04 gnorm: 1.45 [ 1:29:26< 1:27:12] +[titan] 2025-06-13 14:10:59,533 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:11:00,091 - root - INFO - step: 7600 loss: 19.1256 memory: 6.46GiB(27.34%) tps: 23,107 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 3.3193 global_avg_mtp_loss: 15.8063 +[titan] 2025-06-13 14:11:00,091 - root - INFO - lr: 2.9514e-04 gnorm: 1.16 [ 1:29:30< 1:27:08] +[titan] 2025-06-13 14:11:03,680 - root - INFO - step: 7605 loss: 18.3812 memory: 6.46GiB(27.34%) tps: 22,827 tflops: 22.97 mfu: 7.36% global_avg_ntp_loss: 3.2300 global_avg_mtp_loss: 15.1513 +[titan] 2025-06-13 14:11:03,680 - root - INFO - lr: 2.9489e-04 gnorm: 1.20 [ 1:29:33< 1:27:05] +[titan] 2025-06-13 14:11:07,001 - root - INFO - step: 7610 loss: 20.0453 memory: 6.46GiB(27.34%) tps: 24,667 tflops: 24.82 mfu: 7.96% global_avg_ntp_loss: 3.5154 global_avg_mtp_loss: 16.5299 +[titan] 2025-06-13 14:11:07,002 - root - INFO - lr: 2.9464e-04 gnorm: 1.26 [ 1:29:37< 1:27:01] +[titan] 2025-06-13 14:11:10,519 - root - INFO - step: 7615 loss: 18.0674 memory: 6.46GiB(27.34%) tps: 23,289 tflops: 23.44 mfu: 7.51% 
global_avg_ntp_loss: 3.1578 global_avg_mtp_loss: 14.9096 +[titan] 2025-06-13 14:11:10,520 - root - INFO - lr: 2.9439e-04 gnorm: 1.32 [ 1:29:40< 1:26:58] +[titan] 2025-06-13 14:11:14,155 - root - INFO - step: 7620 loss: 19.2552 memory: 6.46GiB(27.34%) tps: 22,532 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.3570 global_avg_mtp_loss: 15.8982 +[titan] 2025-06-13 14:11:14,156 - root - INFO - lr: 2.9414e-04 gnorm: 1.13 [ 1:29:44< 1:26:54] +[titan] 2025-06-13 14:11:17,678 - root - INFO - step: 7625 loss: 19.0972 memory: 6.46GiB(27.34%) tps: 23,260 tflops: 23.41 mfu: 7.50% global_avg_ntp_loss: 3.2937 global_avg_mtp_loss: 15.8035 +[titan] 2025-06-13 14:11:17,678 - root - INFO - lr: 2.9388e-04 gnorm: 1.21 [ 1:29:47< 1:26:51] +[titan] 2025-06-13 14:11:21,131 - root - INFO - step: 7630 loss: 19.6176 memory: 6.46GiB(27.34%) tps: 23,728 tflops: 23.88 mfu: 7.65% global_avg_ntp_loss: 3.3757 global_avg_mtp_loss: 16.2420 +[titan] 2025-06-13 14:11:21,131 - root - INFO - lr: 2.9363e-04 gnorm: 1.19 [ 1:29:51< 1:26:47] +[titan] 2025-06-13 14:11:24,402 - root - INFO - step: 7635 loss: 18.8397 memory: 6.46GiB(27.34%) tps: 25,048 tflops: 25.21 mfu: 8.08% global_avg_ntp_loss: 3.2932 global_avg_mtp_loss: 15.5465 +[titan] 2025-06-13 14:11:24,402 - root - INFO - lr: 2.9338e-04 gnorm: 1.17 [ 1:29:54< 1:26:43] +[titan] 2025-06-13 14:11:27,820 - root - INFO - step: 7640 loss: 19.9331 memory: 6.46GiB(27.34%) tps: 23,966 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.4650 global_avg_mtp_loss: 16.4681 +[titan] 2025-06-13 14:11:27,821 - root - INFO - lr: 2.9313e-04 gnorm: 1.11 [ 1:29:57< 1:26:40] +[titan] 2025-06-13 14:11:30,996 - root - INFO - step: 7645 loss: 19.3775 memory: 6.46GiB(27.34%) tps: 25,801 tflops: 25.97 mfu: 8.32% global_avg_ntp_loss: 3.3707 global_avg_mtp_loss: 16.0068 +[titan] 2025-06-13 14:11:30,996 - root - INFO - lr: 2.9288e-04 gnorm: 1.59 [ 1:30:01< 1:26:36] +[titan] 2025-06-13 14:11:33,928 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:11:34,598 - root - INFO - step: 7650 loss: 18.9271 memory: 6.46GiB(27.34%) tps: 22,743 tflops: 22.89 mfu: 7.34% global_avg_ntp_loss: 3.3100 global_avg_mtp_loss: 15.6171 +[titan] 2025-06-13 14:11:34,599 - root - INFO - lr: 2.9263e-04 gnorm: 1.22 [ 1:30:04< 1:26:32] +[titan] 2025-06-13 14:11:37,988 - root - INFO - step: 7655 loss: 19.1214 memory: 6.46GiB(27.34%) tps: 24,172 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.3246 global_avg_mtp_loss: 15.7968 +[titan] 2025-06-13 14:11:37,988 - root - INFO - lr: 2.9238e-04 gnorm: 1.39 [ 1:30:08< 1:26:29] +[titan] 2025-06-13 14:11:41,454 - root - INFO - step: 7660 loss: 18.5803 memory: 6.46GiB(27.34%) tps: 23,636 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 3.1879 global_avg_mtp_loss: 15.3924 +[titan] 2025-06-13 14:11:41,454 - root - INFO - lr: 2.9212e-04 gnorm: 1.32 [ 1:30:11< 1:26:25] +[titan] 2025-06-13 14:11:44,573 - root - INFO - step: 7665 loss: 17.9392 memory: 6.46GiB(27.34%) tps: 26,268 tflops: 26.44 mfu: 8.47% global_avg_ntp_loss: 3.0983 global_avg_mtp_loss: 14.8409 +[titan] 2025-06-13 14:11:44,573 - root - INFO - lr: 2.9187e-04 gnorm: 1.54 [ 1:30:14< 1:26:21] +[titan] 2025-06-13 14:11:48,158 - root - INFO - step: 7670 loss: 19.4233 memory: 6.46GiB(27.34%) tps: 22,853 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.4090 global_avg_mtp_loss: 16.0143 +[titan] 2025-06-13 14:11:48,158 - root - INFO - lr: 2.9162e-04 gnorm: 1.17 [ 1:30:18< 1:26:18] +[titan] 2025-06-13 14:11:51,624 - root - INFO - step: 7675 loss: 17.5434 memory: 6.46GiB(27.34%) tps: 23,641 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 3.0547 global_avg_mtp_loss: 14.4887 +[titan] 2025-06-13 14:11:51,624 - root - INFO - lr: 2.9137e-04 gnorm: 1.37 [ 1:30:21< 1:26:14] +[titan] 2025-06-13 14:11:55,121 - root - INFO - step: 7680 loss: 19.6693 memory: 6.46GiB(27.34%) tps: 23,432 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.3995 global_avg_mtp_loss: 16.2698 +[titan] 2025-06-13 14:11:55,121 - root - INFO - lr: 2.9112e-04 gnorm: 3.87 [ 
1:30:25< 1:26:10] +[titan] 2025-06-13 14:11:55,280 - root - INFO - Dumping profiler traces at step 7680 +[titan] 2025-06-13 14:11:55,379 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 14:11:58,926 - root - INFO - step: 7685 loss: 19.6861 memory: 6.46GiB(27.34%) tps: 21,527 tflops: 21.66 mfu: 6.94% global_avg_ntp_loss: 3.4673 global_avg_mtp_loss: 16.2188 +[titan] 2025-06-13 14:11:58,927 - root - INFO - lr: 2.9086e-04 gnorm: 1.13 [ 1:30:29< 1:26:07] +[titan] 2025-06-13 14:12:02,370 - root - INFO - step: 7690 loss: 19.8880 memory: 6.46GiB(27.34%) tps: 23,792 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 3.4358 global_avg_mtp_loss: 16.4522 +[titan] 2025-06-13 14:12:02,370 - root - INFO - lr: 2.9061e-04 gnorm: 1.09 [ 1:30:32< 1:26:04] +[titan] 2025-06-13 14:12:06,042 - root - INFO - step: 7695 loss: 18.6956 memory: 6.46GiB(27.34%) tps: 22,310 tflops: 22.45 mfu: 7.20% global_avg_ntp_loss: 3.2647 global_avg_mtp_loss: 15.4309 +[titan] 2025-06-13 14:12:06,043 - root - INFO - lr: 2.9036e-04 gnorm: 1.18 [ 1:30:36< 1:26:00] +[titan] 2025-06-13 14:12:08,650 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:12:09,208 - root - INFO - step: 7700 loss: 19.9061 memory: 6.46GiB(27.34%) tps: 25,878 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 3.4781 global_avg_mtp_loss: 16.4281 +[titan] 2025-06-13 14:12:09,209 - root - INFO - lr: 2.9011e-04 gnorm: 1.17 [ 1:30:39< 1:25:56] +[titan] 2025-06-13 14:12:13,122 - root - INFO - step: 7705 loss: 15.8351 memory: 6.46GiB(27.34%) tps: 20,934 tflops: 21.07 mfu: 6.75% global_avg_ntp_loss: 2.7634 global_avg_mtp_loss: 13.0717 +[titan] 2025-06-13 14:12:13,122 - root - INFO - lr: 2.8986e-04 gnorm: 1.57 [ 1:30:43< 1:25:53] +[titan] 2025-06-13 14:12:16,313 - root - INFO - step: 7710 loss: 20.7804 memory: 6.46GiB(27.34%) tps: 25,678 tflops: 25.84 mfu: 8.28% global_avg_ntp_loss: 3.6728 global_avg_mtp_loss: 17.1076 +[titan] 2025-06-13 14:12:16,313 - root - INFO - lr: 2.8961e-04 gnorm: 1.26 [ 1:30:46< 1:25:49] +[titan] 2025-06-13 14:12:19,790 - root - INFO - step: 7715 loss: 20.5037 memory: 6.46GiB(27.34%) tps: 23,562 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 3.5954 global_avg_mtp_loss: 16.9083 +[titan] 2025-06-13 14:12:19,790 - root - INFO - lr: 2.8935e-04 gnorm: 1.23 [ 1:30:49< 1:25:46] +[titan] 2025-06-13 14:12:23,241 - root - INFO - step: 7720 loss: 19.9184 memory: 6.46GiB(27.34%) tps: 23,740 tflops: 23.89 mfu: 7.66% global_avg_ntp_loss: 3.4697 global_avg_mtp_loss: 16.4487 +[titan] 2025-06-13 14:12:23,242 - root - INFO - lr: 2.8910e-04 gnorm: 1.25 [ 1:30:53< 1:25:42] +[titan] 2025-06-13 14:12:26,816 - root - INFO - step: 7725 loss: 20.2111 memory: 6.46GiB(27.34%) tps: 22,918 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.5273 global_avg_mtp_loss: 16.6838 +[titan] 2025-06-13 14:12:26,817 - root - INFO - lr: 2.8885e-04 gnorm: 1.33 [ 1:30:56< 1:25:39] +[titan] 2025-06-13 14:12:30,527 - root - INFO - step: 7730 loss: 18.6529 memory: 6.46GiB(27.34%) tps: 22,079 tflops: 22.22 mfu: 7.12% global_avg_ntp_loss: 3.2070 global_avg_mtp_loss: 15.4459 +[titan] 2025-06-13 14:12:30,528 - root - INFO - lr: 2.8860e-04 gnorm: 1.30 [ 
1:31:00< 1:25:35] +[titan] 2025-06-13 14:12:33,712 - root - INFO - step: 7735 loss: 19.9418 memory: 6.46GiB(27.34%) tps: 25,721 tflops: 25.89 mfu: 8.30% global_avg_ntp_loss: 3.4876 global_avg_mtp_loss: 16.4542 +[titan] 2025-06-13 14:12:33,713 - root - INFO - lr: 2.8835e-04 gnorm: 1.22 [ 1:31:03< 1:25:31] +[titan] 2025-06-13 14:12:37,458 - root - INFO - step: 7740 loss: 19.7126 memory: 6.46GiB(27.34%) tps: 21,871 tflops: 22.01 mfu: 7.05% global_avg_ntp_loss: 3.4606 global_avg_mtp_loss: 16.2519 +[titan] 2025-06-13 14:12:37,459 - root - INFO - lr: 2.8809e-04 gnorm: 1.16 [ 1:31:07< 1:25:28] +[titan] 2025-06-13 14:12:40,912 - root - INFO - step: 7745 loss: 17.7655 memory: 6.46GiB(27.34%) tps: 23,724 tflops: 23.88 mfu: 7.65% global_avg_ntp_loss: 3.0642 global_avg_mtp_loss: 14.7013 +[titan] 2025-06-13 14:12:40,912 - root - INFO - lr: 2.8784e-04 gnorm: 1.54 [ 1:31:11< 1:25:24] +[titan] 2025-06-13 14:12:43,905 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:12:44,615 - root - INFO - step: 7750 loss: 20.1900 memory: 6.46GiB(27.34%) tps: 22,125 tflops: 22.27 mfu: 7.14% global_avg_ntp_loss: 3.5481 global_avg_mtp_loss: 16.6419 +[titan] 2025-06-13 14:12:44,615 - root - INFO - lr: 2.8759e-04 gnorm: 1.15 [ 1:31:14< 1:25:21] +[titan] 2025-06-13 14:12:48,031 - root - INFO - step: 7755 loss: 20.4627 memory: 6.46GiB(27.34%) tps: 23,983 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 3.6262 global_avg_mtp_loss: 16.8365 +[titan] 2025-06-13 14:12:48,031 - root - INFO - lr: 2.8734e-04 gnorm: 1.15 [ 1:31:18< 1:25:17] +[titan] 2025-06-13 14:12:51,959 - root - INFO - step: 7760 loss: 20.0952 memory: 6.46GiB(27.34%) tps: 20,856 tflops: 20.99 mfu: 6.73% global_avg_ntp_loss: 3.5504 global_avg_mtp_loss: 16.5448 +[titan] 2025-06-13 14:12:51,959 - root - INFO - lr: 2.8709e-04 gnorm: 1.48 [ 1:31:22< 1:25:14] +[titan] 2025-06-13 14:12:55,554 - root - INFO - step: 7765 loss: 19.7439 memory: 6.46GiB(27.34%) tps: 22,792 tflops: 22.94 mfu: 7.35% 
global_avg_ntp_loss: 3.4244 global_avg_mtp_loss: 16.3195 +[titan] 2025-06-13 14:12:55,555 - root - INFO - lr: 2.8683e-04 gnorm: 1.29 [ 1:31:25< 1:25:11] +[titan] 2025-06-13 14:12:59,018 - root - INFO - step: 7770 loss: 18.8463 memory: 6.46GiB(27.34%) tps: 23,656 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.3534 global_avg_mtp_loss: 15.4929 +[titan] 2025-06-13 14:12:59,018 - root - INFO - lr: 2.8658e-04 gnorm: 1.47 [ 1:31:29< 1:25:07] +[titan] 2025-06-13 14:13:02,829 - root - INFO - step: 7775 loss: 20.1836 memory: 6.46GiB(27.34%) tps: 21,495 tflops: 21.63 mfu: 6.93% global_avg_ntp_loss: 3.5369 global_avg_mtp_loss: 16.6467 +[titan] 2025-06-13 14:13:02,830 - root - INFO - lr: 2.8633e-04 gnorm: 1.14 [ 1:31:32< 1:25:04] +[titan] 2025-06-13 14:13:06,161 - root - INFO - step: 7780 loss: 20.7523 memory: 6.46GiB(27.34%) tps: 24,596 tflops: 24.75 mfu: 7.93% global_avg_ntp_loss: 3.6997 global_avg_mtp_loss: 17.0527 +[titan] 2025-06-13 14:13:06,161 - root - INFO - lr: 2.8608e-04 gnorm: 1.22 [ 1:31:36< 1:25:00] +[titan] 2025-06-13 14:13:09,570 - root - INFO - step: 7785 loss: 18.9691 memory: 6.46GiB(27.34%) tps: 24,029 tflops: 24.18 mfu: 7.75% global_avg_ntp_loss: 3.2763 global_avg_mtp_loss: 15.6928 +[titan] 2025-06-13 14:13:09,571 - root - INFO - lr: 2.8583e-04 gnorm: 1.20 [ 1:31:39< 1:24:56] +[titan] 2025-06-13 14:13:12,945 - root - INFO - step: 7790 loss: 18.1841 memory: 6.46GiB(27.34%) tps: 24,278 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.1627 global_avg_mtp_loss: 15.0214 +[titan] 2025-06-13 14:13:12,945 - root - INFO - lr: 2.8557e-04 gnorm: 1.25 [ 1:31:43< 1:24:53] +[titan] 2025-06-13 14:13:16,590 - root - INFO - step: 7795 loss: 21.0000 memory: 6.46GiB(27.34%) tps: 22,480 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.7462 global_avg_mtp_loss: 17.2538 +[titan] 2025-06-13 14:13:16,590 - root - INFO - lr: 2.8532e-04 gnorm: 1.15 [ 1:31:46< 1:24:49] +[titan] 2025-06-13 14:13:19,600 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:13:20,212 - root - INFO - step: 7800 loss: 19.3755 memory: 6.46GiB(27.34%) tps: 22,616 tflops: 22.76 mfu: 7.29% global_avg_ntp_loss: 3.3845 global_avg_mtp_loss: 15.9910 +[titan] 2025-06-13 14:13:20,213 - root - INFO - lr: 2.8507e-04 gnorm: 1.35 [ 1:31:50< 1:24:46] +[titan] 2025-06-13 14:13:23,801 - root - INFO - step: 7805 loss: 20.3454 memory: 6.46GiB(27.34%) tps: 22,832 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 3.5912 global_avg_mtp_loss: 16.7541 +[titan] 2025-06-13 14:13:23,801 - root - INFO - lr: 2.8482e-04 gnorm: 1.16 [ 1:31:53< 1:24:42] +[titan] 2025-06-13 14:13:27,354 - root - INFO - step: 7810 loss: 18.6239 memory: 6.46GiB(27.34%) tps: 23,060 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.2606 global_avg_mtp_loss: 15.3633 +[titan] 2025-06-13 14:13:27,354 - root - INFO - lr: 2.8456e-04 gnorm: 1.45 [ 1:31:57< 1:24:39] +[titan] 2025-06-13 14:13:31,080 - root - INFO - step: 7815 loss: 20.0820 memory: 6.46GiB(27.34%) tps: 21,985 tflops: 22.13 mfu: 7.09% global_avg_ntp_loss: 3.4709 global_avg_mtp_loss: 16.6112 +[titan] 2025-06-13 14:13:31,081 - root - INFO - lr: 2.8431e-04 gnorm: 6.39 [ 1:32:01< 1:24:36] +[titan] 2025-06-13 14:13:34,176 - root - INFO - step: 7820 loss: 20.2220 memory: 6.46GiB(27.34%) tps: 26,467 tflops: 26.64 mfu: 8.54% global_avg_ntp_loss: 3.5281 global_avg_mtp_loss: 16.6938 +[titan] 2025-06-13 14:13:34,176 - root - INFO - lr: 2.8406e-04 gnorm: 1.18 [ 1:32:04< 1:24:32] +[titan] 2025-06-13 14:13:37,579 - root - INFO - step: 7825 loss: 19.9503 memory: 6.46GiB(27.34%) tps: 24,079 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.4355 global_avg_mtp_loss: 16.5149 +[titan] 2025-06-13 14:13:37,579 - root - INFO - lr: 2.8381e-04 gnorm: 1.22 [ 1:32:07< 1:24:28] +[titan] 2025-06-13 14:13:40,808 - root - INFO - step: 7830 loss: 19.3764 memory: 6.46GiB(27.34%) tps: 25,373 tflops: 25.53 mfu: 8.18% global_avg_ntp_loss: 3.3569 global_avg_mtp_loss: 16.0195 +[titan] 2025-06-13 14:13:40,808 - root - INFO - lr: 2.8356e-04 gnorm: 1.23 [ 
1:32:10< 1:24:24] +[titan] 2025-06-13 14:13:44,597 - root - INFO - step: 7835 loss: 19.2520 memory: 6.46GiB(27.34%) tps: 21,622 tflops: 21.76 mfu: 6.97% global_avg_ntp_loss: 3.3581 global_avg_mtp_loss: 15.8938 +[titan] 2025-06-13 14:13:44,597 - root - INFO - lr: 2.8330e-04 gnorm: 1.26 [ 1:32:14< 1:24:21] +[titan] 2025-06-13 14:13:48,171 - root - INFO - step: 7840 loss: 20.2990 memory: 6.46GiB(27.34%) tps: 22,924 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 3.6094 global_avg_mtp_loss: 16.6896 +[titan] 2025-06-13 14:13:48,171 - root - INFO - lr: 2.8305e-04 gnorm: 1.27 [ 1:32:18< 1:24:17] +[titan] 2025-06-13 14:13:51,768 - root - INFO - step: 7845 loss: 18.4489 memory: 6.46GiB(27.34%) tps: 22,780 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.2041 global_avg_mtp_loss: 15.2448 +[titan] 2025-06-13 14:13:51,768 - root - INFO - lr: 2.8280e-04 gnorm: 1.18 [ 1:32:21< 1:24:14] +[titan] 2025-06-13 14:13:54,318 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:13:55,092 - root - INFO - step: 7850 loss: 19.4553 memory: 6.46GiB(27.34%) tps: 24,647 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 3.3365 global_avg_mtp_loss: 16.1189 +[titan] 2025-06-13 14:13:55,092 - root - INFO - lr: 2.8255e-04 gnorm: 1.11 [ 1:32:25< 1:24:10] +[titan] 2025-06-13 14:13:58,801 - root - INFO - step: 7855 loss: 20.0448 memory: 6.46GiB(27.34%) tps: 22,087 tflops: 22.23 mfu: 7.12% global_avg_ntp_loss: 3.4839 global_avg_mtp_loss: 16.5609 +[titan] 2025-06-13 14:13:58,802 - root - INFO - lr: 2.8229e-04 gnorm: 1.10 [ 1:32:28< 1:24:07] +[titan] 2025-06-13 14:14:02,086 - root - INFO - step: 7860 loss: 17.6533 memory: 6.46GiB(27.34%) tps: 24,952 tflops: 25.11 mfu: 8.05% global_avg_ntp_loss: 3.0830 global_avg_mtp_loss: 14.5703 +[titan] 2025-06-13 14:14:02,086 - root - INFO - lr: 2.8204e-04 gnorm: 1.22 [ 1:32:32< 1:24:03] +[titan] 2025-06-13 14:14:05,812 - root - INFO - step: 7865 loss: 18.9479 memory: 6.46GiB(27.34%) tps: 21,988 tflops: 22.13 mfu: 7.09% 
global_avg_ntp_loss: 3.3010 global_avg_mtp_loss: 15.6469 +[titan] 2025-06-13 14:14:05,812 - root - INFO - lr: 2.8179e-04 gnorm: 1.48 [ 1:32:35< 1:24:00] +[titan] 2025-06-13 14:14:09,428 - root - INFO - step: 7870 loss: 19.9725 memory: 6.46GiB(27.34%) tps: 22,657 tflops: 22.80 mfu: 7.31% global_avg_ntp_loss: 3.5097 global_avg_mtp_loss: 16.4628 +[titan] 2025-06-13 14:14:09,428 - root - INFO - lr: 2.8154e-04 gnorm: 1.24 [ 1:32:39< 1:23:56] +[titan] 2025-06-13 14:14:12,983 - root - INFO - step: 7875 loss: 17.4465 memory: 6.46GiB(27.34%) tps: 23,044 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.0766 global_avg_mtp_loss: 14.3699 +[titan] 2025-06-13 14:14:12,984 - root - INFO - lr: 2.8128e-04 gnorm: 1.38 [ 1:32:43< 1:23:53] +[titan] 2025-06-13 14:14:16,150 - root - INFO - step: 7880 loss: 18.6852 memory: 6.46GiB(27.34%) tps: 25,877 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 3.2431 global_avg_mtp_loss: 15.4421 +[titan] 2025-06-13 14:14:16,150 - root - INFO - lr: 2.8103e-04 gnorm: 1.47 [ 1:32:46< 1:23:49] +[titan] 2025-06-13 14:14:19,332 - root - INFO - step: 7885 loss: 19.9001 memory: 6.46GiB(27.34%) tps: 25,743 tflops: 25.91 mfu: 8.30% global_avg_ntp_loss: 3.4526 global_avg_mtp_loss: 16.4475 +[titan] 2025-06-13 14:14:19,332 - root - INFO - lr: 2.8078e-04 gnorm: 1.25 [ 1:32:49< 1:23:45] +[titan] 2025-06-13 14:14:23,220 - root - INFO - step: 7890 loss: 18.9004 memory: 6.46GiB(27.34%) tps: 21,072 tflops: 21.21 mfu: 6.80% global_avg_ntp_loss: 3.2929 global_avg_mtp_loss: 15.6075 +[titan] 2025-06-13 14:14:23,220 - root - INFO - lr: 2.8053e-04 gnorm: 1.21 [ 1:32:53< 1:23:42] +[titan] 2025-06-13 14:14:27,137 - root - INFO - step: 7895 loss: 20.0779 memory: 6.46GiB(27.34%) tps: 20,918 tflops: 21.05 mfu: 6.75% global_avg_ntp_loss: 3.5234 global_avg_mtp_loss: 16.5545 +[titan] 2025-06-13 14:14:27,138 - root - INFO - lr: 2.8028e-04 gnorm: 1.21 [ 1:32:57< 1:23:39] +[titan] 2025-06-13 14:14:29,580 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:14:30,178 - root - INFO - step: 7900 loss: 15.3341 memory: 6.46GiB(27.34%) tps: 26,948 tflops: 27.12 mfu: 8.69% global_avg_ntp_loss: 2.6501 global_avg_mtp_loss: 12.6840 +[titan] 2025-06-13 14:14:30,178 - root - INFO - lr: 2.8002e-04 gnorm: 2.98 [ 1:33:00< 1:23:35] +[titan] 2025-06-13 14:14:33,990 - root - INFO - step: 7905 loss: 20.4756 memory: 6.46GiB(27.34%) tps: 21,490 tflops: 21.63 mfu: 6.93% global_avg_ntp_loss: 3.6094 global_avg_mtp_loss: 16.8662 +[titan] 2025-06-13 14:14:33,990 - root - INFO - lr: 2.7977e-04 gnorm: 1.13 [ 1:33:04< 1:23:31] +[titan] 2025-06-13 14:14:37,317 - root - INFO - step: 7910 loss: 20.6579 memory: 6.46GiB(27.34%) tps: 24,626 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 3.5960 global_avg_mtp_loss: 17.0618 +[titan] 2025-06-13 14:14:37,317 - root - INFO - lr: 2.7952e-04 gnorm: 1.39 [ 1:33:07< 1:23:28] +[titan] 2025-06-13 14:14:41,141 - root - INFO - step: 7915 loss: 20.5230 memory: 6.46GiB(27.34%) tps: 21,423 tflops: 21.56 mfu: 6.91% global_avg_ntp_loss: 3.5898 global_avg_mtp_loss: 16.9332 +[titan] 2025-06-13 14:14:41,142 - root - INFO - lr: 2.7927e-04 gnorm: 1.18 [ 1:33:11< 1:23:24] +[titan] 2025-06-13 14:14:44,697 - root - INFO - step: 7920 loss: 20.7964 memory: 6.46GiB(27.34%) tps: 23,040 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.6471 global_avg_mtp_loss: 17.1493 +[titan] 2025-06-13 14:14:44,698 - root - INFO - lr: 2.7901e-04 gnorm: 1.14 [ 1:33:14< 1:23:21] +[titan] 2025-06-13 14:14:48,056 - root - INFO - step: 7925 loss: 18.4884 memory: 6.46GiB(27.34%) tps: 24,394 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 3.2191 global_avg_mtp_loss: 15.2693 +[titan] 2025-06-13 14:14:48,056 - root - INFO - lr: 2.7876e-04 gnorm: 1.17 [ 1:33:18< 1:23:17] +[titan] 2025-06-13 14:14:51,259 - root - INFO - step: 7930 loss: 18.6130 memory: 6.46GiB(27.34%) tps: 25,579 tflops: 25.74 mfu: 8.25% global_avg_ntp_loss: 3.2184 global_avg_mtp_loss: 15.3946 +[titan] 2025-06-13 14:14:51,259 - root - INFO - lr: 2.7851e-04 gnorm: 1.31 [ 
1:33:21< 1:23:13] +[titan] 2025-06-13 14:14:55,046 - root - INFO - step: 7935 loss: 19.6302 memory: 6.46GiB(27.34%) tps: 21,633 tflops: 21.77 mfu: 6.98% global_avg_ntp_loss: 3.4670 global_avg_mtp_loss: 16.1632 +[titan] 2025-06-13 14:14:55,047 - root - INFO - lr: 2.7826e-04 gnorm: 1.34 [ 1:33:25< 1:23:10] +[titan] 2025-06-13 14:14:58,819 - root - INFO - step: 7940 loss: 19.9904 memory: 6.46GiB(27.34%) tps: 21,718 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 3.5495 global_avg_mtp_loss: 16.4409 +[titan] 2025-06-13 14:14:58,819 - root - INFO - lr: 2.7800e-04 gnorm: 1.32 [ 1:33:28< 1:23:07] +[titan] 2025-06-13 14:15:02,391 - root - INFO - step: 7945 loss: 19.6230 memory: 6.46GiB(27.34%) tps: 22,934 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 3.4644 global_avg_mtp_loss: 16.1585 +[titan] 2025-06-13 14:15:02,392 - root - INFO - lr: 2.7775e-04 gnorm: 1.34 [ 1:33:32< 1:23:03] +[titan] 2025-06-13 14:15:05,453 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:15:06,063 - root - INFO - step: 7950 loss: 19.5213 memory: 6.46GiB(27.34%) tps: 22,315 tflops: 22.46 mfu: 7.20% global_avg_ntp_loss: 3.4002 global_avg_mtp_loss: 16.1211 +[titan] 2025-06-13 14:15:06,063 - root - INFO - lr: 2.7750e-04 gnorm: 1.15 [ 1:33:36< 1:23:00] +[titan] 2025-06-13 14:15:09,584 - root - INFO - step: 7955 loss: 19.5203 memory: 6.46GiB(27.34%) tps: 23,267 tflops: 23.41 mfu: 7.50% global_avg_ntp_loss: 3.4294 global_avg_mtp_loss: 16.0909 +[titan] 2025-06-13 14:15:09,585 - root - INFO - lr: 2.7725e-04 gnorm: 1.34 [ 1:33:39< 1:22:56] +[titan] 2025-06-13 14:15:13,446 - root - INFO - step: 7960 loss: 18.2737 memory: 6.46GiB(27.34%) tps: 21,215 tflops: 21.35 mfu: 6.84% global_avg_ntp_loss: 3.1236 global_avg_mtp_loss: 15.1501 +[titan] 2025-06-13 14:15:13,446 - root - INFO - lr: 2.7699e-04 gnorm: 1.29 [ 1:33:43< 1:22:53] +[titan] 2025-06-13 14:15:17,053 - root - INFO - step: 7965 loss: 20.2118 memory: 6.46GiB(27.34%) tps: 22,713 tflops: 22.86 mfu: 7.33% 
global_avg_ntp_loss: 3.5170 global_avg_mtp_loss: 16.6948 +[titan] 2025-06-13 14:15:17,054 - root - INFO - lr: 2.7674e-04 gnorm: 1.13 [ 1:33:47< 1:22:50] +[titan] 2025-06-13 14:15:21,000 - root - INFO - step: 7970 loss: 19.8313 memory: 6.46GiB(27.34%) tps: 20,761 tflops: 20.89 mfu: 6.70% global_avg_ntp_loss: 3.4513 global_avg_mtp_loss: 16.3800 +[titan] 2025-06-13 14:15:21,000 - root - INFO - lr: 2.7649e-04 gnorm: 1.15 [ 1:33:51< 1:22:46] +[titan] 2025-06-13 14:15:24,870 - root - INFO - step: 7975 loss: 17.7478 memory: 6.46GiB(27.34%) tps: 21,169 tflops: 21.30 mfu: 6.83% global_avg_ntp_loss: 3.0502 global_avg_mtp_loss: 14.6976 +[titan] 2025-06-13 14:15:24,870 - root - INFO - lr: 2.7624e-04 gnorm: 1.56 [ 1:33:54< 1:22:43] +[titan] 2025-06-13 14:15:28,820 - root - INFO - step: 7980 loss: 20.2337 memory: 6.46GiB(27.34%) tps: 20,742 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 3.5589 global_avg_mtp_loss: 16.6747 +[titan] 2025-06-13 14:15:28,820 - root - INFO - lr: 2.7598e-04 gnorm: 1.13 [ 1:33:58< 1:22:40] +[titan] 2025-06-13 14:15:32,177 - root - INFO - step: 7985 loss: 19.8631 memory: 6.46GiB(27.34%) tps: 24,407 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 3.4300 global_avg_mtp_loss: 16.4331 +[titan] 2025-06-13 14:15:32,177 - root - INFO - lr: 2.7573e-04 gnorm: 1.67 [ 1:34:02< 1:22:36] +[titan] 2025-06-13 14:15:36,068 - root - INFO - step: 7990 loss: 19.9236 memory: 6.46GiB(27.34%) tps: 21,052 tflops: 21.19 mfu: 6.79% global_avg_ntp_loss: 3.4850 global_avg_mtp_loss: 16.4386 +[titan] 2025-06-13 14:15:36,069 - root - INFO - lr: 2.7548e-04 gnorm: 1.15 [ 1:34:06< 1:22:33] +[titan] 2025-06-13 14:15:39,352 - root - INFO - step: 7995 loss: 19.7122 memory: 6.46GiB(27.34%) tps: 24,954 tflops: 25.11 mfu: 8.05% global_avg_ntp_loss: 3.4646 global_avg_mtp_loss: 16.2476 +[titan] 2025-06-13 14:15:39,352 - root - INFO - lr: 2.7523e-04 gnorm: 1.10 [ 1:34:09< 1:22:29] +[titan] 2025-06-13 14:15:42,305 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:15:43,353 - root - INFO - step: 8000 loss: 20.9564 memory: 6.46GiB(27.34%) tps: 20,474 tflops: 20.60 mfu: 6.60% global_avg_ntp_loss: 3.6520 global_avg_mtp_loss: 17.3044 +[titan] 2025-06-13 14:15:43,354 - root - INFO - lr: 2.7497e-04 gnorm: 1.15 [ 1:34:13< 1:22:26] +[titan] 2025-06-13 14:15:47,097 - root - INFO - step: 8005 loss: 20.2992 memory: 6.46GiB(27.34%) tps: 21,885 tflops: 22.02 mfu: 7.06% global_avg_ntp_loss: 3.5623 global_avg_mtp_loss: 16.7369 +[titan] 2025-06-13 14:15:47,097 - root - INFO - lr: 2.7472e-04 gnorm: 1.14 [ 1:34:17< 1:22:23] +[titan] 2025-06-13 14:15:50,526 - root - INFO - step: 8010 loss: 16.7758 memory: 6.46GiB(27.34%) tps: 23,894 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 2.9159 global_avg_mtp_loss: 13.8599 +[titan] 2025-06-13 14:15:50,526 - root - INFO - lr: 2.7447e-04 gnorm: 1.22 [ 1:34:20< 1:22:19] +[titan] 2025-06-13 14:15:53,924 - root - INFO - step: 8015 loss: 19.2763 memory: 6.46GiB(27.34%) tps: 24,113 tflops: 24.27 mfu: 7.78% global_avg_ntp_loss: 3.3385 global_avg_mtp_loss: 15.9379 +[titan] 2025-06-13 14:15:53,924 - root - INFO - lr: 2.7422e-04 gnorm: 1.27 [ 1:34:23< 1:22:16] +[titan] 2025-06-13 14:15:57,532 - root - INFO - step: 8020 loss: 19.9670 memory: 6.46GiB(27.34%) tps: 22,708 tflops: 22.85 mfu: 7.32% global_avg_ntp_loss: 3.4724 global_avg_mtp_loss: 16.4947 +[titan] 2025-06-13 14:15:57,532 - root - INFO - lr: 2.7397e-04 gnorm: 1.33 [ 1:34:27< 1:22:12] +[titan] 2025-06-13 14:16:01,174 - root - INFO - step: 8025 loss: 19.9371 memory: 6.46GiB(27.34%) tps: 22,491 tflops: 22.63 mfu: 7.25% global_avg_ntp_loss: 3.5008 global_avg_mtp_loss: 16.4363 +[titan] 2025-06-13 14:16:01,175 - root - INFO - lr: 2.7371e-04 gnorm: 1.19 [ 1:34:31< 1:22:09] +[titan] 2025-06-13 14:16:04,563 - root - INFO - step: 8030 loss: 20.4688 memory: 6.46GiB(27.34%) tps: 24,178 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.6442 global_avg_mtp_loss: 16.8245 +[titan] 2025-06-13 14:16:04,563 - root - INFO - lr: 2.7346e-04 gnorm: 1.27 [ 
1:34:34< 1:22:05] +[titan] 2025-06-13 14:16:08,350 - root - INFO - step: 8035 loss: 20.2178 memory: 6.46GiB(27.34%) tps: 21,637 tflops: 21.77 mfu: 6.98% global_avg_ntp_loss: 3.5434 global_avg_mtp_loss: 16.6744 +[titan] 2025-06-13 14:16:08,350 - root - INFO - lr: 2.7321e-04 gnorm: 1.32 [ 1:34:38< 1:22:02] +[titan] 2025-06-13 14:16:11,786 - root - INFO - step: 8040 loss: 19.2334 memory: 6.46GiB(27.34%) tps: 23,843 tflops: 23.99 mfu: 7.69% global_avg_ntp_loss: 3.4037 global_avg_mtp_loss: 15.8298 +[titan] 2025-06-13 14:16:11,787 - root - INFO - lr: 2.7296e-04 gnorm: 1.19 [ 1:34:41< 1:21:58] +[titan] 2025-06-13 14:16:15,423 - root - INFO - step: 8045 loss: 20.0833 memory: 6.46GiB(27.34%) tps: 22,529 tflops: 22.67 mfu: 7.27% global_avg_ntp_loss: 3.4658 global_avg_mtp_loss: 16.6175 +[titan] 2025-06-13 14:16:15,423 - root - INFO - lr: 2.7270e-04 gnorm: 1.38 [ 1:34:45< 1:21:55] +[titan] 2025-06-13 14:16:18,365 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:16:19,258 - root - INFO - step: 8050 loss: 20.0057 memory: 6.46GiB(27.34%) tps: 21,361 tflops: 21.50 mfu: 6.89% global_avg_ntp_loss: 3.4668 global_avg_mtp_loss: 16.5389 +[titan] 2025-06-13 14:16:19,259 - root - INFO - lr: 2.7245e-04 gnorm: 1.11 [ 1:34:49< 1:21:51] +[titan] 2025-06-13 14:16:22,663 - root - INFO - step: 8055 loss: 19.7055 memory: 6.46GiB(27.34%) tps: 24,065 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 3.4386 global_avg_mtp_loss: 16.2669 +[titan] 2025-06-13 14:16:22,663 - root - INFO - lr: 2.7220e-04 gnorm: 1.34 [ 1:34:52< 1:21:48] +[titan] 2025-06-13 14:16:26,074 - root - INFO - step: 8060 loss: 19.3929 memory: 6.46GiB(27.34%) tps: 24,020 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.3770 global_avg_mtp_loss: 16.0159 +[titan] 2025-06-13 14:16:26,074 - root - INFO - lr: 2.7195e-04 gnorm: 1.18 [ 1:34:56< 1:21:44] +[titan] 2025-06-13 14:16:29,570 - root - INFO - step: 8065 loss: 16.5074 memory: 6.46GiB(27.34%) tps: 23,433 tflops: 23.58 mfu: 7.56% 
global_avg_ntp_loss: 2.8498 global_avg_mtp_loss: 13.6576 +[titan] 2025-06-13 14:16:29,571 - root - INFO - lr: 2.7169e-04 gnorm: 1.52 [ 1:34:59< 1:21:41] +[titan] 2025-06-13 14:16:33,176 - root - INFO - step: 8070 loss: 18.5414 memory: 6.46GiB(27.34%) tps: 22,723 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.2601 global_avg_mtp_loss: 15.2812 +[titan] 2025-06-13 14:16:33,176 - root - INFO - lr: 2.7144e-04 gnorm: 1.67 [ 1:35:03< 1:21:37] +[titan] 2025-06-13 14:16:36,981 - root - INFO - step: 8075 loss: 20.3402 memory: 6.46GiB(27.34%) tps: 21,529 tflops: 21.67 mfu: 6.94% global_avg_ntp_loss: 3.5634 global_avg_mtp_loss: 16.7768 +[titan] 2025-06-13 14:16:36,982 - root - INFO - lr: 2.7119e-04 gnorm: 1.17 [ 1:35:07< 1:21:34] +[titan] 2025-06-13 14:16:40,438 - root - INFO - step: 8080 loss: 20.5659 memory: 6.46GiB(27.34%) tps: 23,703 tflops: 23.85 mfu: 7.65% global_avg_ntp_loss: 3.6244 global_avg_mtp_loss: 16.9415 +[titan] 2025-06-13 14:16:40,438 - root - INFO - lr: 2.7094e-04 gnorm: 1.06 [ 1:35:10< 1:21:30] +[titan] 2025-06-13 14:16:43,791 - root - INFO - step: 8085 loss: 19.8157 memory: 6.46GiB(27.34%) tps: 24,435 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.4512 global_avg_mtp_loss: 16.3645 +[titan] 2025-06-13 14:16:43,792 - root - INFO - lr: 2.7068e-04 gnorm: 1.21 [ 1:35:13< 1:21:26] +[titan] 2025-06-13 14:16:47,645 - root - INFO - step: 8090 loss: 19.7183 memory: 6.46GiB(27.34%) tps: 21,261 tflops: 21.40 mfu: 6.86% global_avg_ntp_loss: 3.4551 global_avg_mtp_loss: 16.2632 +[titan] 2025-06-13 14:16:47,645 - root - INFO - lr: 2.7043e-04 gnorm: 1.21 [ 1:35:17< 1:21:23] +[titan] 2025-06-13 14:16:51,069 - root - INFO - step: 8095 loss: 18.7747 memory: 6.46GiB(27.34%) tps: 23,931 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 3.2524 global_avg_mtp_loss: 15.5223 +[titan] 2025-06-13 14:16:51,069 - root - INFO - lr: 2.7018e-04 gnorm: 1.07 [ 1:35:21< 1:21:20] +[titan] 2025-06-13 14:16:53,673 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:16:54,407 - root - INFO - step: 8100 loss: 19.4233 memory: 6.46GiB(27.34%) tps: 24,543 tflops: 24.70 mfu: 7.92% global_avg_ntp_loss: 3.3584 global_avg_mtp_loss: 16.0649 +[titan] 2025-06-13 14:16:54,407 - root - INFO - lr: 2.6993e-04 gnorm: 1.29 [ 1:35:24< 1:21:16] +[titan] 2025-06-13 14:16:59,575 - root - INFO - step: 8105 loss: 18.3497 memory: 6.46GiB(27.34%) tps: 15,852 tflops: 15.95 mfu: 5.11% global_avg_ntp_loss: 3.2438 global_avg_mtp_loss: 15.1059 +[titan] 2025-06-13 14:16:59,576 - root - INFO - lr: 2.6967e-04 gnorm: 1.67 [ 1:35:29< 1:21:14] +[titan] 2025-06-13 14:17:03,419 - root - INFO - step: 8110 loss: 20.0666 memory: 6.46GiB(27.34%) tps: 21,313 tflops: 21.45 mfu: 6.87% global_avg_ntp_loss: 3.6112 global_avg_mtp_loss: 16.4554 +[titan] 2025-06-13 14:17:03,420 - root - INFO - lr: 2.6942e-04 gnorm: 1.29 [ 1:35:33< 1:21:10] +[titan] 2025-06-13 14:17:07,211 - root - INFO - step: 8115 loss: 20.9319 memory: 6.46GiB(27.34%) tps: 21,609 tflops: 21.75 mfu: 6.97% global_avg_ntp_loss: 3.6939 global_avg_mtp_loss: 17.2381 +[titan] 2025-06-13 14:17:07,211 - root - INFO - lr: 2.6917e-04 gnorm: 1.28 [ 1:35:37< 1:21:07] +[titan] 2025-06-13 14:17:10,648 - root - INFO - step: 8120 loss: 20.1148 memory: 6.46GiB(27.34%) tps: 23,835 tflops: 23.99 mfu: 7.69% global_avg_ntp_loss: 3.5594 global_avg_mtp_loss: 16.5554 +[titan] 2025-06-13 14:17:10,649 - root - INFO - lr: 2.6892e-04 gnorm: 1.93 [ 1:35:40< 1:21:04] +[titan] 2025-06-13 14:17:14,165 - root - INFO - step: 8125 loss: 20.2551 memory: 6.46GiB(27.34%) tps: 23,298 tflops: 23.45 mfu: 7.51% global_avg_ntp_loss: 3.5255 global_avg_mtp_loss: 16.7296 +[titan] 2025-06-13 14:17:14,165 - root - INFO - lr: 2.6866e-04 gnorm: 1.20 [ 1:35:44< 1:21:00] +[titan] 2025-06-13 14:17:17,635 - root - INFO - step: 8130 loss: 15.7105 memory: 6.46GiB(27.34%) tps: 23,615 tflops: 23.77 mfu: 7.62% global_avg_ntp_loss: 2.6998 global_avg_mtp_loss: 13.0107 +[titan] 2025-06-13 14:17:17,635 - root - INFO - lr: 2.6841e-04 gnorm: 1.35 [ 
1:35:47< 1:20:56] +[titan] 2025-06-13 14:17:21,173 - root - INFO - step: 8135 loss: 19.9111 memory: 6.46GiB(27.34%) tps: 23,157 tflops: 23.30 mfu: 7.47% global_avg_ntp_loss: 3.4796 global_avg_mtp_loss: 16.4314 +[titan] 2025-06-13 14:17:21,173 - root - INFO - lr: 2.6816e-04 gnorm: 1.14 [ 1:35:51< 1:20:53] +[titan] 2025-06-13 14:17:24,857 - root - INFO - step: 8140 loss: 20.5072 memory: 6.46GiB(27.34%) tps: 22,237 tflops: 22.38 mfu: 7.17% global_avg_ntp_loss: 3.6552 global_avg_mtp_loss: 16.8519 +[titan] 2025-06-13 14:17:24,857 - root - INFO - lr: 2.6791e-04 gnorm: 1.93 [ 1:35:54< 1:20:49] +[titan] 2025-06-13 14:17:30,395 - root - INFO - step: 8145 loss: 20.5143 memory: 6.46GiB(27.34%) tps: 14,794 tflops: 14.89 mfu: 4.77% global_avg_ntp_loss: 3.6323 global_avg_mtp_loss: 16.8820 +[titan] 2025-06-13 14:17:30,395 - root - INFO - lr: 2.6766e-04 gnorm: 1.12 [ 1:36:00< 1:20:48] +[titan] 2025-06-13 14:17:33,587 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:17:34,264 - root - INFO - step: 8150 loss: 20.7279 memory: 6.46GiB(27.34%) tps: 21,172 tflops: 21.31 mfu: 6.83% global_avg_ntp_loss: 3.6535 global_avg_mtp_loss: 17.0744 +[titan] 2025-06-13 14:17:34,265 - root - INFO - lr: 2.6740e-04 gnorm: 1.24 [ 1:36:04< 1:20:44] +[titan] 2025-06-13 14:17:37,526 - root - INFO - step: 8155 loss: 20.2214 memory: 6.46GiB(27.34%) tps: 25,119 tflops: 25.28 mfu: 8.10% global_avg_ntp_loss: 3.5564 global_avg_mtp_loss: 16.6650 +[titan] 2025-06-13 14:17:37,526 - root - INFO - lr: 2.6715e-04 gnorm: 1.12 [ 1:36:07< 1:20:41] +[titan] 2025-06-13 14:17:41,198 - root - INFO - step: 8160 loss: 20.5212 memory: 6.46GiB(27.34%) tps: 22,316 tflops: 22.46 mfu: 7.20% global_avg_ntp_loss: 3.6066 global_avg_mtp_loss: 16.9146 +[titan] 2025-06-13 14:17:41,198 - root - INFO - lr: 2.6690e-04 gnorm: 1.16 [ 1:36:11< 1:20:37] +[titan] 2025-06-13 14:17:44,780 - root - INFO - step: 8165 loss: 21.2671 memory: 6.46GiB(27.34%) tps: 22,874 tflops: 23.02 mfu: 7.38% 
global_avg_ntp_loss: 3.7821 global_avg_mtp_loss: 17.4850 +[titan] 2025-06-13 14:17:44,780 - root - INFO - lr: 2.6665e-04 gnorm: 1.24 [ 1:36:14< 1:20:34] +[titan] 2025-06-13 14:17:48,176 - root - INFO - step: 8170 loss: 20.0943 memory: 6.46GiB(27.34%) tps: 24,125 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.5087 global_avg_mtp_loss: 16.5855 +[titan] 2025-06-13 14:17:48,176 - root - INFO - lr: 2.6639e-04 gnorm: 1.21 [ 1:36:18< 1:20:30] +[titan] 2025-06-13 14:17:51,535 - root - INFO - step: 8175 loss: 20.2236 memory: 6.46GiB(27.34%) tps: 24,393 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 3.5169 global_avg_mtp_loss: 16.7066 +[titan] 2025-06-13 14:17:51,535 - root - INFO - lr: 2.6614e-04 gnorm: 1.13 [ 1:36:21< 1:20:26] +[titan] 2025-06-13 14:17:55,525 - root - INFO - step: 8180 loss: 19.8800 memory: 6.46GiB(27.34%) tps: 20,532 tflops: 20.66 mfu: 6.62% global_avg_ntp_loss: 3.4412 global_avg_mtp_loss: 16.4388 +[titan] 2025-06-13 14:17:55,525 - root - INFO - lr: 2.6589e-04 gnorm: 1.35 [ 1:36:25< 1:20:23] +[titan] 2025-06-13 14:17:59,087 - root - INFO - step: 8185 loss: 17.8997 memory: 6.46GiB(27.34%) tps: 23,002 tflops: 23.15 mfu: 7.42% global_avg_ntp_loss: 3.1213 global_avg_mtp_loss: 14.7784 +[titan] 2025-06-13 14:17:59,087 - root - INFO - lr: 2.6564e-04 gnorm: 1.48 [ 1:36:29< 1:20:20] +[titan] 2025-06-13 14:18:02,262 - root - INFO - step: 8190 loss: 18.6917 memory: 6.46GiB(27.34%) tps: 25,802 tflops: 25.97 mfu: 8.32% global_avg_ntp_loss: 3.2551 global_avg_mtp_loss: 15.4366 +[titan] 2025-06-13 14:18:02,262 - root - INFO - lr: 2.6539e-04 gnorm: 1.39 [ 1:36:32< 1:20:16] +[titan] 2025-06-13 14:18:03,745 - root - INFO - Dumping profiler traces at step 8192 +[titan] 2025-06-13 14:18:03,838 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:18:05,689 - root - INFO - step: 8195 loss: 19.9739 memory: 6.46GiB(27.34%) tps: 23,910 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.4607 global_avg_mtp_loss: 16.5133 +[titan] 2025-06-13 
14:18:05,689 - root - INFO - lr: 2.6513e-04 gnorm: 1.15 [ 1:36:35< 1:20:12] +[titan] 2025-06-13 14:18:08,458 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:18:09,127 - root - INFO - step: 8200 loss: 19.5004 memory: 6.46GiB(27.34%) tps: 23,831 tflops: 23.98 mfu: 7.69% global_avg_ntp_loss: 3.3853 global_avg_mtp_loss: 16.1152 +[titan] 2025-06-13 14:18:09,127 - root - INFO - lr: 2.6488e-04 gnorm: 1.33 [ 1:36:39< 1:20:09] +[titan] 2025-06-13 14:18:12,377 - root - INFO - step: 8205 loss: 19.9452 memory: 6.46GiB(27.34%) tps: 25,203 tflops: 25.36 mfu: 8.13% global_avg_ntp_loss: 3.4570 global_avg_mtp_loss: 16.4883 +[titan] 2025-06-13 14:18:12,378 - root - INFO - lr: 2.6463e-04 gnorm: 1.17 [ 1:36:42< 1:20:05] +[titan] 2025-06-13 14:18:15,983 - root - INFO - step: 8210 loss: 19.9275 memory: 6.46GiB(27.34%) tps: 22,723 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.4781 global_avg_mtp_loss: 16.4494 +[titan] 2025-06-13 14:18:15,983 - root - INFO - lr: 2.6438e-04 gnorm: 1.33 [ 1:36:46< 1:20:01] +[titan] 2025-06-13 14:18:19,897 - root - INFO - step: 8215 loss: 20.9916 memory: 6.46GiB(27.34%) tps: 20,935 tflops: 21.07 mfu: 6.75% global_avg_ntp_loss: 3.6909 global_avg_mtp_loss: 17.3007 +[titan] 2025-06-13 14:18:19,897 - root - INFO - lr: 2.6412e-04 gnorm: 1.22 [ 1:36:49< 1:19:58] +[titan] 2025-06-13 14:18:23,044 - root - INFO - step: 8220 loss: 19.9055 memory: 6.46GiB(27.34%) tps: 26,036 tflops: 26.20 mfu: 8.40% global_avg_ntp_loss: 3.4198 global_avg_mtp_loss: 16.4857 +[titan] 2025-06-13 14:18:23,044 - root - INFO - lr: 2.6387e-04 gnorm: 1.54 [ 1:36:53< 1:19:54] +[titan] 2025-06-13 14:18:26,820 - root - INFO - step: 8225 loss: 20.3854 memory: 6.46GiB(27.34%) tps: 21,692 tflops: 21.83 mfu: 7.00% global_avg_ntp_loss: 3.5691 global_avg_mtp_loss: 16.8162 +[titan] 2025-06-13 14:18:26,821 - root - INFO - lr: 2.6362e-04 gnorm: 1.08 [ 1:36:56< 1:19:51] +[titan] 2025-06-13 14:18:31,073 - root - INFO - step: 8230 loss: 19.3541 memory: 
6.46GiB(27.34%) tps: 19,267 tflops: 19.39 mfu: 6.21% global_avg_ntp_loss: 3.4222 global_avg_mtp_loss: 15.9318 +[titan] 2025-06-13 14:18:31,073 - root - INFO - lr: 2.6337e-04 gnorm: 1.28 [ 1:37:01< 1:19:48] +[titan] 2025-06-13 14:18:34,637 - root - INFO - step: 8235 loss: 19.6107 memory: 6.46GiB(27.34%) tps: 22,988 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.4014 global_avg_mtp_loss: 16.2094 +[titan] 2025-06-13 14:18:34,637 - root - INFO - lr: 2.6312e-04 gnorm: 1.48 [ 1:37:04< 1:19:44] +[titan] 2025-06-13 14:18:38,127 - root - INFO - step: 8240 loss: 20.5779 memory: 6.46GiB(27.34%) tps: 23,472 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 3.5781 global_avg_mtp_loss: 16.9998 +[titan] 2025-06-13 14:18:38,128 - root - INFO - lr: 2.6286e-04 gnorm: 1.10 [ 1:37:08< 1:19:41] +[titan] 2025-06-13 14:18:41,549 - root - INFO - step: 8245 loss: 19.9473 memory: 6.46GiB(27.34%) tps: 23,946 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 3.4632 global_avg_mtp_loss: 16.4841 +[titan] 2025-06-13 14:18:41,549 - root - INFO - lr: 2.6261e-04 gnorm: 1.23 [ 1:37:11< 1:19:37] +[titan] 2025-06-13 14:18:44,681 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:18:45,383 - root - INFO - step: 8250 loss: 19.0187 memory: 6.46GiB(27.34%) tps: 21,368 tflops: 21.50 mfu: 6.89% global_avg_ntp_loss: 3.3205 global_avg_mtp_loss: 15.6982 +[titan] 2025-06-13 14:18:45,383 - root - INFO - lr: 2.6236e-04 gnorm: 1.27 [ 1:37:15< 1:19:34] +[titan] 2025-06-13 14:18:48,680 - root - INFO - step: 8255 loss: 20.2184 memory: 6.46GiB(27.34%) tps: 24,848 tflops: 25.01 mfu: 8.01% global_avg_ntp_loss: 3.5218 global_avg_mtp_loss: 16.6966 +[titan] 2025-06-13 14:18:48,681 - root - INFO - lr: 2.6211e-04 gnorm: 1.27 [ 1:37:18< 1:19:30] +[titan] 2025-06-13 14:18:52,317 - root - INFO - step: 8260 loss: 20.2393 memory: 6.46GiB(27.34%) tps: 22,529 tflops: 22.67 mfu: 7.27% global_avg_ntp_loss: 3.5222 global_avg_mtp_loss: 16.7171 +[titan] 2025-06-13 14:18:52,317 - root - INFO - lr: 2.6186e-04 gnorm: 1.13 [ 1:37:22< 1:19:27] +[titan] 2025-06-13 14:18:56,017 - root - INFO - step: 8265 loss: 19.0739 memory: 6.46GiB(27.34%) tps: 22,144 tflops: 22.28 mfu: 7.14% global_avg_ntp_loss: 3.3421 global_avg_mtp_loss: 15.7318 +[titan] 2025-06-13 14:18:56,017 - root - INFO - lr: 2.6160e-04 gnorm: 1.30 [ 1:37:26< 1:19:23] +[titan] 2025-06-13 14:18:59,310 - root - INFO - step: 8270 loss: 18.7320 memory: 6.46GiB(27.34%) tps: 24,884 tflops: 25.04 mfu: 8.03% global_avg_ntp_loss: 3.2470 global_avg_mtp_loss: 15.4850 +[titan] 2025-06-13 14:18:59,310 - root - INFO - lr: 2.6135e-04 gnorm: 1.42 [ 1:37:29< 1:19:20] +[titan] 2025-06-13 14:19:03,289 - root - INFO - step: 8275 loss: 19.8159 memory: 6.46GiB(27.34%) tps: 20,590 tflops: 20.72 mfu: 6.64% global_avg_ntp_loss: 3.4745 global_avg_mtp_loss: 16.3414 +[titan] 2025-06-13 14:19:03,289 - root - INFO - lr: 2.6110e-04 gnorm: 1.14 [ 1:37:33< 1:19:16] +[titan] 2025-06-13 14:19:06,880 - root - INFO - step: 8280 loss: 20.6537 memory: 6.46GiB(27.34%) tps: 22,815 tflops: 22.96 mfu: 7.36% global_avg_ntp_loss: 3.7153 global_avg_mtp_loss: 16.9384 +[titan] 2025-06-13 14:19:06,880 - root - INFO - lr: 2.6085e-04 gnorm: 1.39 [ 
1:37:36< 1:19:13] +[titan] 2025-06-13 14:19:10,518 - root - INFO - step: 8285 loss: 19.1923 memory: 6.46GiB(27.34%) tps: 22,517 tflops: 22.66 mfu: 7.26% global_avg_ntp_loss: 3.2819 global_avg_mtp_loss: 15.9104 +[titan] 2025-06-13 14:19:10,519 - root - INFO - lr: 2.6060e-04 gnorm: 1.26 [ 1:37:40< 1:19:09] +[titan] 2025-06-13 14:19:14,013 - root - INFO - step: 8290 loss: 20.5147 memory: 6.46GiB(27.34%) tps: 23,447 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.5863 global_avg_mtp_loss: 16.9283 +[titan] 2025-06-13 14:19:14,013 - root - INFO - lr: 2.6034e-04 gnorm: 1.10 [ 1:37:44< 1:19:06] +[titan] 2025-06-13 14:19:17,507 - root - INFO - step: 8295 loss: 19.9766 memory: 6.46GiB(27.34%) tps: 23,444 tflops: 23.59 mfu: 7.56% global_avg_ntp_loss: 3.4596 global_avg_mtp_loss: 16.5170 +[titan] 2025-06-13 14:19:17,508 - root - INFO - lr: 2.6009e-04 gnorm: 1.20 [ 1:37:47< 1:19:02] +[titan] 2025-06-13 14:19:20,725 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:19:21,283 - root - INFO - step: 8300 loss: 19.6079 memory: 6.46GiB(27.34%) tps: 21,700 tflops: 21.84 mfu: 7.00% global_avg_ntp_loss: 3.4424 global_avg_mtp_loss: 16.1655 +[titan] 2025-06-13 14:19:21,283 - root - INFO - lr: 2.5984e-04 gnorm: 1.16 [ 1:37:51< 1:18:59] +[titan] 2025-06-13 14:19:24,591 - root - INFO - step: 8305 loss: 17.4125 memory: 6.46GiB(27.34%) tps: 24,770 tflops: 24.93 mfu: 7.99% global_avg_ntp_loss: 3.0031 global_avg_mtp_loss: 14.4094 +[titan] 2025-06-13 14:19:24,591 - root - INFO - lr: 2.5959e-04 gnorm: 1.42 [ 1:37:54< 1:18:55] +[titan] 2025-06-13 14:19:27,991 - root - INFO - step: 8310 loss: 20.4034 memory: 6.46GiB(27.34%) tps: 24,092 tflops: 24.25 mfu: 7.77% global_avg_ntp_loss: 3.5257 global_avg_mtp_loss: 16.8776 +[titan] 2025-06-13 14:19:27,992 - root - INFO - lr: 2.5934e-04 gnorm: 1.29 [ 1:37:58< 1:18:52] +[titan] 2025-06-13 14:19:31,576 - root - INFO - step: 8315 loss: 19.7038 memory: 6.46GiB(27.34%) tps: 22,854 tflops: 23.00 mfu: 7.37% 
global_avg_ntp_loss: 3.4519 global_avg_mtp_loss: 16.2519 +[titan] 2025-06-13 14:19:31,577 - root - INFO - lr: 2.5908e-04 gnorm: 1.14 [ 1:38:01< 1:18:48] +[titan] 2025-06-13 14:19:35,353 - root - INFO - step: 8320 loss: 19.5842 memory: 6.46GiB(27.34%) tps: 21,693 tflops: 21.83 mfu: 7.00% global_avg_ntp_loss: 3.3695 global_avg_mtp_loss: 16.2147 +[titan] 2025-06-13 14:19:35,353 - root - INFO - lr: 2.5883e-04 gnorm: 1.26 [ 1:38:05< 1:18:45] +[titan] 2025-06-13 14:19:38,598 - root - INFO - step: 8325 loss: 19.8172 memory: 6.46GiB(27.34%) tps: 25,249 tflops: 25.41 mfu: 8.14% global_avg_ntp_loss: 3.4688 global_avg_mtp_loss: 16.3484 +[titan] 2025-06-13 14:19:38,598 - root - INFO - lr: 2.5858e-04 gnorm: 1.12 [ 1:38:08< 1:18:41] +[titan] 2025-06-13 14:19:42,084 - root - INFO - step: 8330 loss: 18.2216 memory: 6.46GiB(27.34%) tps: 23,504 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.1638 global_avg_mtp_loss: 15.0578 +[titan] 2025-06-13 14:19:42,084 - root - INFO - lr: 2.5833e-04 gnorm: 1.16 [ 1:38:12< 1:18:37] +[titan] 2025-06-13 14:19:45,884 - root - INFO - step: 8335 loss: 20.4764 memory: 6.46GiB(27.34%) tps: 21,559 tflops: 21.70 mfu: 6.95% global_avg_ntp_loss: 3.5848 global_avg_mtp_loss: 16.8916 +[titan] 2025-06-13 14:19:45,884 - root - INFO - lr: 2.5808e-04 gnorm: 1.14 [ 1:38:15< 1:18:34] +[titan] 2025-06-13 14:19:49,238 - root - INFO - step: 8340 loss: 20.6237 memory: 6.46GiB(27.34%) tps: 24,430 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.5945 global_avg_mtp_loss: 17.0292 +[titan] 2025-06-13 14:19:49,238 - root - INFO - lr: 2.5783e-04 gnorm: 1.28 [ 1:38:19< 1:18:30] +[titan] 2025-06-13 14:19:52,659 - root - INFO - step: 8345 loss: 19.6085 memory: 6.46GiB(27.34%) tps: 23,948 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 3.4908 global_avg_mtp_loss: 16.1177 +[titan] 2025-06-13 14:19:52,659 - root - INFO - lr: 2.5757e-04 gnorm: 1.36 [ 1:38:22< 1:18:27] +[titan] 2025-06-13 14:19:55,606 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:19:56,311 - root - INFO - step: 8350 loss: 18.6325 memory: 6.46GiB(27.34%) tps: 22,436 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 3.2169 global_avg_mtp_loss: 15.4156 +[titan] 2025-06-13 14:19:56,311 - root - INFO - lr: 2.5732e-04 gnorm: 1.30 [ 1:38:26< 1:18:23] +[titan] 2025-06-13 14:20:00,093 - root - INFO - step: 8355 loss: 20.8243 memory: 6.46GiB(27.34%) tps: 21,660 tflops: 21.80 mfu: 6.99% global_avg_ntp_loss: 3.6351 global_avg_mtp_loss: 17.1892 +[titan] 2025-06-13 14:20:00,094 - root - INFO - lr: 2.5707e-04 gnorm: 1.08 [ 1:38:30< 1:18:20] +[titan] 2025-06-13 14:20:03,732 - root - INFO - step: 8360 loss: 20.5343 memory: 6.46GiB(27.34%) tps: 22,519 tflops: 22.66 mfu: 7.26% global_avg_ntp_loss: 3.6250 global_avg_mtp_loss: 16.9093 +[titan] 2025-06-13 14:20:03,732 - root - INFO - lr: 2.5682e-04 gnorm: 1.12 [ 1:38:33< 1:18:17] +[titan] 2025-06-13 14:20:07,301 - root - INFO - step: 8365 loss: 18.9687 memory: 6.46GiB(27.34%) tps: 22,952 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.4027 global_avg_mtp_loss: 15.5660 +[titan] 2025-06-13 14:20:07,302 - root - INFO - lr: 2.5657e-04 gnorm: 1.30 [ 1:38:37< 1:18:13] +[titan] 2025-06-13 14:20:10,843 - root - INFO - step: 8370 loss: 20.0536 memory: 6.46GiB(27.34%) tps: 23,130 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.4486 global_avg_mtp_loss: 16.6049 +[titan] 2025-06-13 14:20:10,844 - root - INFO - lr: 2.5632e-04 gnorm: 1.36 [ 1:38:40< 1:18:10] +[titan] 2025-06-13 14:20:14,281 - root - INFO - step: 8375 loss: 18.5764 memory: 6.46GiB(27.34%) tps: 23,832 tflops: 23.98 mfu: 7.69% global_avg_ntp_loss: 3.1635 global_avg_mtp_loss: 15.4130 +[titan] 2025-06-13 14:20:14,282 - root - INFO - lr: 2.5606e-04 gnorm: 1.49 [ 1:38:44< 1:18:06] +[titan] 2025-06-13 14:20:18,098 - root - INFO - step: 8380 loss: 20.1092 memory: 6.46GiB(27.34%) tps: 21,464 tflops: 21.60 mfu: 6.92% global_avg_ntp_loss: 3.4713 global_avg_mtp_loss: 16.6380 +[titan] 2025-06-13 14:20:18,099 - root - INFO - lr: 2.5581e-04 gnorm: 1.23 [ 
1:38:48< 1:18:03] +[titan] 2025-06-13 14:20:21,273 - root - INFO - step: 8385 loss: 18.9632 memory: 6.46GiB(27.34%) tps: 25,807 tflops: 25.97 mfu: 8.32% global_avg_ntp_loss: 3.2670 global_avg_mtp_loss: 15.6962 +[titan] 2025-06-13 14:20:21,274 - root - INFO - lr: 2.5556e-04 gnorm: 1.46 [ 1:38:51< 1:17:59] +[titan] 2025-06-13 14:20:24,850 - root - INFO - step: 8390 loss: 18.9510 memory: 6.46GiB(27.34%) tps: 22,904 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.2855 global_avg_mtp_loss: 15.6655 +[titan] 2025-06-13 14:20:24,851 - root - INFO - lr: 2.5531e-04 gnorm: 1.25 [ 1:38:54< 1:17:55] +[titan] 2025-06-13 14:20:28,202 - root - INFO - step: 8395 loss: 19.5736 memory: 6.46GiB(27.34%) tps: 24,444 tflops: 24.60 mfu: 7.88% global_avg_ntp_loss: 3.3814 global_avg_mtp_loss: 16.1922 +[titan] 2025-06-13 14:20:28,202 - root - INFO - lr: 2.5506e-04 gnorm: 1.17 [ 1:38:58< 1:17:52] +[titan] 2025-06-13 14:20:30,649 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:20:31,347 - root - INFO - step: 8400 loss: 20.3499 memory: 6.46GiB(27.34%) tps: 26,055 tflops: 26.22 mfu: 8.40% global_avg_ntp_loss: 3.5410 global_avg_mtp_loss: 16.8089 +[titan] 2025-06-13 14:20:31,347 - root - INFO - lr: 2.5481e-04 gnorm: 1.07 [ 1:39:01< 1:17:48] +[titan] 2025-06-13 14:20:35,118 - root - INFO - step: 8405 loss: 17.3607 memory: 6.46GiB(27.34%) tps: 21,723 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 2.9899 global_avg_mtp_loss: 14.3708 +[titan] 2025-06-13 14:20:35,119 - root - INFO - lr: 2.5456e-04 gnorm: 1.46 [ 1:39:05< 1:17:44] +[titan] 2025-06-13 14:20:40,043 - root - INFO - step: 8410 loss: 20.6127 memory: 6.46GiB(27.34%) tps: 16,635 tflops: 16.74 mfu: 5.37% global_avg_ntp_loss: 3.6329 global_avg_mtp_loss: 16.9798 +[titan] 2025-06-13 14:20:40,043 - root - INFO - lr: 2.5430e-04 gnorm: 1.15 [ 1:39:10< 1:17:42] +[titan] 2025-06-13 14:20:43,191 - root - INFO - step: 8415 loss: 18.9480 memory: 6.46GiB(27.34%) tps: 26,025 tflops: 26.19 mfu: 8.39% 
global_avg_ntp_loss: 3.2863 global_avg_mtp_loss: 15.6617 +[titan] 2025-06-13 14:20:43,192 - root - INFO - lr: 2.5405e-04 gnorm: 1.28 [ 1:39:13< 1:17:38] +[titan] 2025-06-13 14:20:46,769 - root - INFO - step: 8420 loss: 20.5671 memory: 6.46GiB(27.34%) tps: 22,898 tflops: 23.04 mfu: 7.39% global_avg_ntp_loss: 3.5885 global_avg_mtp_loss: 16.9786 +[titan] 2025-06-13 14:20:46,770 - root - INFO - lr: 2.5380e-04 gnorm: 1.17 [ 1:39:16< 1:17:35] +[titan] 2025-06-13 14:20:50,351 - root - INFO - step: 8425 loss: 19.2700 memory: 6.46GiB(27.34%) tps: 22,873 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 3.3670 global_avg_mtp_loss: 15.9030 +[titan] 2025-06-13 14:20:50,351 - root - INFO - lr: 2.5355e-04 gnorm: 1.23 [ 1:39:20< 1:17:31] +[titan] 2025-06-13 14:20:53,556 - root - INFO - step: 8430 loss: 18.9776 memory: 6.46GiB(27.34%) tps: 25,565 tflops: 25.73 mfu: 8.25% global_avg_ntp_loss: 3.2902 global_avg_mtp_loss: 15.6873 +[titan] 2025-06-13 14:20:53,557 - root - INFO - lr: 2.5330e-04 gnorm: 1.38 [ 1:39:23< 1:17:27] +[titan] 2025-06-13 14:20:57,280 - root - INFO - step: 8435 loss: 19.3895 memory: 6.46GiB(27.34%) tps: 21,999 tflops: 22.14 mfu: 7.10% global_avg_ntp_loss: 3.3723 global_avg_mtp_loss: 16.0171 +[titan] 2025-06-13 14:20:57,281 - root - INFO - lr: 2.5305e-04 gnorm: 1.19 [ 1:39:27< 1:17:24] +[titan] 2025-06-13 14:21:00,938 - root - INFO - step: 8440 loss: 19.0611 memory: 6.46GiB(27.34%) tps: 22,404 tflops: 22.55 mfu: 7.23% global_avg_ntp_loss: 3.3184 global_avg_mtp_loss: 15.7427 +[titan] 2025-06-13 14:21:00,938 - root - INFO - lr: 2.5280e-04 gnorm: 1.19 [ 1:39:30< 1:17:20] +[titan] 2025-06-13 14:21:04,233 - root - INFO - step: 8445 loss: 20.1645 memory: 6.46GiB(27.34%) tps: 24,860 tflops: 25.02 mfu: 8.02% global_avg_ntp_loss: 3.4915 global_avg_mtp_loss: 16.6730 +[titan] 2025-06-13 14:21:04,234 - root - INFO - lr: 2.5255e-04 gnorm: 1.28 [ 1:39:34< 1:17:17] +[titan] 2025-06-13 14:21:06,908 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:21:07,774 - root - INFO - step: 8450 loss: 20.2277 memory: 6.46GiB(27.34%) tps: 23,141 tflops: 23.29 mfu: 7.46% global_avg_ntp_loss: 3.5173 global_avg_mtp_loss: 16.7104 +[titan] 2025-06-13 14:21:07,774 - root - INFO - lr: 2.5229e-04 gnorm: 1.26 [ 1:39:37< 1:17:13] +[titan] 2025-06-13 14:21:11,051 - root - INFO - step: 8455 loss: 18.7872 memory: 6.46GiB(27.34%) tps: 25,002 tflops: 25.16 mfu: 8.06% global_avg_ntp_loss: 3.2541 global_avg_mtp_loss: 15.5331 +[titan] 2025-06-13 14:21:11,051 - root - INFO - lr: 2.5204e-04 gnorm: 1.40 [ 1:39:41< 1:17:09] +[titan] 2025-06-13 14:21:14,724 - root - INFO - step: 8460 loss: 20.2340 memory: 6.46GiB(27.34%) tps: 22,304 tflops: 22.45 mfu: 7.19% global_avg_ntp_loss: 3.5374 global_avg_mtp_loss: 16.6966 +[titan] 2025-06-13 14:21:14,724 - root - INFO - lr: 2.5179e-04 gnorm: 1.22 [ 1:39:44< 1:17:06] +[titan] 2025-06-13 14:21:18,334 - root - INFO - step: 8465 loss: 19.3315 memory: 6.46GiB(27.34%) tps: 22,695 tflops: 22.84 mfu: 7.32% global_avg_ntp_loss: 3.3276 global_avg_mtp_loss: 16.0039 +[titan] 2025-06-13 14:21:18,334 - root - INFO - lr: 2.5154e-04 gnorm: 1.42 [ 1:39:48< 1:17:03] +[titan] 2025-06-13 14:21:21,676 - root - INFO - step: 8470 loss: 20.0317 memory: 6.46GiB(27.34%) tps: 24,515 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 3.4505 global_avg_mtp_loss: 16.5812 +[titan] 2025-06-13 14:21:21,677 - root - INFO - lr: 2.5129e-04 gnorm: 1.35 [ 1:39:51< 1:16:59] +[titan] 2025-06-13 14:21:25,018 - root - INFO - step: 8475 loss: 20.5726 memory: 6.46GiB(27.34%) tps: 24,516 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 3.5413 global_avg_mtp_loss: 17.0313 +[titan] 2025-06-13 14:21:25,019 - root - INFO - lr: 2.5104e-04 gnorm: 1.29 [ 1:39:55< 1:16:55] +[titan] 2025-06-13 14:21:28,499 - root - INFO - step: 8480 loss: 19.3808 memory: 6.46GiB(27.34%) tps: 23,542 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.4534 global_avg_mtp_loss: 15.9273 +[titan] 2025-06-13 14:21:28,499 - root - INFO - lr: 2.5079e-04 gnorm: 1.47 [ 
1:39:58< 1:16:52] +[titan] 2025-06-13 14:21:32,026 - root - INFO - step: 8485 loss: 18.2388 memory: 6.46GiB(27.34%) tps: 23,226 tflops: 23.37 mfu: 7.49% global_avg_ntp_loss: 3.1336 global_avg_mtp_loss: 15.1052 +[titan] 2025-06-13 14:21:32,026 - root - INFO - lr: 2.5054e-04 gnorm: 1.52 [ 1:40:02< 1:16:48] +[titan] 2025-06-13 14:21:35,254 - root - INFO - step: 8490 loss: 19.6109 memory: 6.46GiB(27.34%) tps: 25,386 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 3.4163 global_avg_mtp_loss: 16.1946 +[titan] 2025-06-13 14:21:35,254 - root - INFO - lr: 2.5029e-04 gnorm: 1.28 [ 1:40:05< 1:16:44] +[titan] 2025-06-13 14:21:39,182 - root - INFO - step: 8495 loss: 18.7783 memory: 6.46GiB(27.34%) tps: 20,854 tflops: 20.99 mfu: 6.73% global_avg_ntp_loss: 3.2696 global_avg_mtp_loss: 15.5087 +[titan] 2025-06-13 14:21:39,182 - root - INFO - lr: 2.5004e-04 gnorm: 1.45 [ 1:40:09< 1:16:41] +[titan] 2025-06-13 14:21:41,636 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:21:42,584 - root - INFO - step: 8500 loss: 20.1298 memory: 6.46GiB(27.34%) tps: 24,083 tflops: 24.24 mfu: 7.77% global_avg_ntp_loss: 3.5137 global_avg_mtp_loss: 16.6162 +[titan] 2025-06-13 14:21:42,584 - root - INFO - lr: 2.4978e-04 gnorm: 1.20 [ 1:40:12< 1:16:37] +[titan] 2025-06-13 14:21:46,151 - root - INFO - step: 8505 loss: 19.7284 memory: 6.46GiB(27.34%) tps: 22,970 tflops: 23.12 mfu: 7.41% global_avg_ntp_loss: 3.4381 global_avg_mtp_loss: 16.2903 +[titan] 2025-06-13 14:21:46,151 - root - INFO - lr: 2.4953e-04 gnorm: 1.39 [ 1:40:16< 1:16:34] +[titan] 2025-06-13 14:21:49,742 - root - INFO - step: 8510 loss: 20.5916 memory: 6.46GiB(27.34%) tps: 22,814 tflops: 22.96 mfu: 7.36% global_avg_ntp_loss: 3.5658 global_avg_mtp_loss: 17.0258 +[titan] 2025-06-13 14:21:49,743 - root - INFO - lr: 2.4928e-04 gnorm: 1.20 [ 1:40:19< 1:16:30] +[titan] 2025-06-13 14:21:53,037 - root - INFO - step: 8515 loss: 18.4054 memory: 6.46GiB(27.34%) tps: 24,871 tflops: 25.03 mfu: 8.02% 
global_avg_ntp_loss: 3.2050 global_avg_mtp_loss: 15.2005 +[titan] 2025-06-13 14:21:53,037 - root - INFO - lr: 2.4903e-04 gnorm: 1.44 [ 1:40:23< 1:16:27] +[titan] 2025-06-13 14:21:56,503 - root - INFO - step: 8520 loss: 20.0078 memory: 6.46GiB(27.34%) tps: 23,633 tflops: 23.78 mfu: 7.62% global_avg_ntp_loss: 3.4726 global_avg_mtp_loss: 16.5352 +[titan] 2025-06-13 14:21:56,504 - root - INFO - lr: 2.4878e-04 gnorm: 1.22 [ 1:40:26< 1:16:23] +[titan] 2025-06-13 14:22:00,260 - root - INFO - step: 8525 loss: 20.5602 memory: 6.46GiB(27.34%) tps: 21,809 tflops: 21.95 mfu: 7.03% global_avg_ntp_loss: 3.6062 global_avg_mtp_loss: 16.9539 +[titan] 2025-06-13 14:22:00,261 - root - INFO - lr: 2.4853e-04 gnorm: 1.12 [ 1:40:30< 1:16:20] +[titan] 2025-06-13 14:22:03,737 - root - INFO - step: 8530 loss: 19.6904 memory: 6.46GiB(27.34%) tps: 23,567 tflops: 23.72 mfu: 7.60% global_avg_ntp_loss: 3.3972 global_avg_mtp_loss: 16.2932 +[titan] 2025-06-13 14:22:03,737 - root - INFO - lr: 2.4828e-04 gnorm: 1.25 [ 1:40:33< 1:16:16] +[titan] 2025-06-13 14:22:07,167 - root - INFO - step: 8535 loss: 20.7156 memory: 6.46GiB(27.34%) tps: 23,882 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.6296 global_avg_mtp_loss: 17.0860 +[titan] 2025-06-13 14:22:07,168 - root - INFO - lr: 2.4803e-04 gnorm: 1.07 [ 1:40:37< 1:16:12] +[titan] 2025-06-13 14:22:10,293 - root - INFO - step: 8540 loss: 19.2872 memory: 6.46GiB(27.34%) tps: 26,215 tflops: 26.38 mfu: 8.46% global_avg_ntp_loss: 3.3498 global_avg_mtp_loss: 15.9374 +[titan] 2025-06-13 14:22:10,293 - root - INFO - lr: 2.4778e-04 gnorm: 1.19 [ 1:40:40< 1:16:09] +[titan] 2025-06-13 14:22:14,201 - root - INFO - step: 8545 loss: 19.8705 memory: 6.46GiB(27.34%) tps: 20,964 tflops: 21.10 mfu: 6.76% global_avg_ntp_loss: 3.4511 global_avg_mtp_loss: 16.4194 +[titan] 2025-06-13 14:22:14,201 - root - INFO - lr: 2.4753e-04 gnorm: 1.11 [ 1:40:44< 1:16:05] +[titan] 2025-06-13 14:22:17,706 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:22:18,311 - root - INFO - step: 8550 loss: 20.0431 memory: 6.46GiB(27.34%) tps: 19,935 tflops: 20.06 mfu: 6.43% global_avg_ntp_loss: 3.4355 global_avg_mtp_loss: 16.6076 +[titan] 2025-06-13 14:22:18,311 - root - INFO - lr: 2.4728e-04 gnorm: 1.26 [ 1:40:48< 1:16:02] +[titan] 2025-06-13 14:22:21,370 - root - INFO - step: 8555 loss: 19.9975 memory: 6.46GiB(27.34%) tps: 26,783 tflops: 26.95 mfu: 8.64% global_avg_ntp_loss: 3.5208 global_avg_mtp_loss: 16.4768 +[titan] 2025-06-13 14:22:21,370 - root - INFO - lr: 2.4703e-04 gnorm: 1.18 [ 1:40:51< 1:15:58] +[titan] 2025-06-13 14:22:24,890 - root - INFO - step: 8560 loss: 20.5717 memory: 6.46GiB(27.34%) tps: 23,275 tflops: 23.42 mfu: 7.51% global_avg_ntp_loss: 3.6166 global_avg_mtp_loss: 16.9551 +[titan] 2025-06-13 14:22:24,890 - root - INFO - lr: 2.4678e-04 gnorm: 1.15 [ 1:40:54< 1:15:55] +[titan] 2025-06-13 14:22:28,305 - root - INFO - step: 8565 loss: 19.6751 memory: 6.46GiB(27.34%) tps: 23,989 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 3.3917 global_avg_mtp_loss: 16.2834 +[titan] 2025-06-13 14:22:28,306 - root - INFO - lr: 2.4653e-04 gnorm: 1.14 [ 1:40:58< 1:15:51] +[titan] 2025-06-13 14:22:32,074 - root - INFO - step: 8570 loss: 19.8460 memory: 6.46GiB(27.34%) tps: 21,739 tflops: 21.88 mfu: 7.01% global_avg_ntp_loss: 3.4422 global_avg_mtp_loss: 16.4039 +[titan] 2025-06-13 14:22:32,074 - root - INFO - lr: 2.4628e-04 gnorm: 1.29 [ 1:41:02< 1:15:48] +[titan] 2025-06-13 14:22:35,392 - root - INFO - step: 8575 loss: 17.7124 memory: 6.46GiB(27.34%) tps: 24,695 tflops: 24.85 mfu: 7.97% global_avg_ntp_loss: 3.0499 global_avg_mtp_loss: 14.6625 +[titan] 2025-06-13 14:22:35,392 - root - INFO - lr: 2.4603e-04 gnorm: 1.28 [ 1:41:05< 1:15:44] +[titan] 2025-06-13 14:22:38,980 - root - INFO - step: 8580 loss: 18.5639 memory: 6.46GiB(27.34%) tps: 22,832 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 3.2422 global_avg_mtp_loss: 15.3216 +[titan] 2025-06-13 14:22:38,980 - root - INFO - lr: 2.4578e-04 gnorm: 1.52 [ 
1:41:08< 1:15:41] +[titan] 2025-06-13 14:22:42,409 - root - INFO - step: 8585 loss: 19.3130 memory: 6.46GiB(27.34%) tps: 23,897 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.3498 global_avg_mtp_loss: 15.9632 +[titan] 2025-06-13 14:22:42,409 - root - INFO - lr: 2.4553e-04 gnorm: 1.20 [ 1:41:12< 1:15:37] +[titan] 2025-06-13 14:22:46,081 - root - INFO - step: 8590 loss: 17.9145 memory: 6.46GiB(27.34%) tps: 22,307 tflops: 22.45 mfu: 7.20% global_avg_ntp_loss: 3.0921 global_avg_mtp_loss: 14.8224 +[titan] 2025-06-13 14:22:46,082 - root - INFO - lr: 2.4528e-04 gnorm: 1.14 [ 1:41:16< 1:15:34] +[titan] 2025-06-13 14:22:49,649 - root - INFO - step: 8595 loss: 19.5990 memory: 6.46GiB(27.34%) tps: 22,964 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 3.4101 global_avg_mtp_loss: 16.1889 +[titan] 2025-06-13 14:22:49,650 - root - INFO - lr: 2.4502e-04 gnorm: 1.32 [ 1:41:19< 1:15:30] +[titan] 2025-06-13 14:22:52,623 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:22:53,188 - root - INFO - step: 8600 loss: 19.0187 memory: 6.46GiB(27.34%) tps: 23,156 tflops: 23.30 mfu: 7.47% global_avg_ntp_loss: 3.3025 global_avg_mtp_loss: 15.7162 +[titan] 2025-06-13 14:22:53,188 - root - INFO - lr: 2.4477e-04 gnorm: 1.30 [ 1:41:23< 1:15:27] +[titan] 2025-06-13 14:22:57,136 - root - INFO - step: 8605 loss: 20.2949 memory: 6.46GiB(27.34%) tps: 20,748 tflops: 20.88 mfu: 6.69% global_avg_ntp_loss: 3.5367 global_avg_mtp_loss: 16.7582 +[titan] 2025-06-13 14:22:57,137 - root - INFO - lr: 2.4452e-04 gnorm: 1.25 [ 1:41:27< 1:15:23] +[titan] 2025-06-13 14:23:00,493 - root - INFO - step: 8610 loss: 19.6954 memory: 6.46GiB(27.34%) tps: 24,407 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 3.3797 global_avg_mtp_loss: 16.3157 +[titan] 2025-06-13 14:23:00,494 - root - INFO - lr: 2.4427e-04 gnorm: 1.18 [ 1:41:30< 1:15:20] +[titan] 2025-06-13 14:23:03,932 - root - INFO - step: 8615 loss: 19.5411 memory: 6.46GiB(27.34%) tps: 23,830 tflops: 23.98 mfu: 7.69% 
global_avg_ntp_loss: 3.4124 global_avg_mtp_loss: 16.1287 +[titan] 2025-06-13 14:23:03,932 - root - INFO - lr: 2.4402e-04 gnorm: 1.18 [ 1:41:33< 1:15:16] +[titan] 2025-06-13 14:23:07,475 - root - INFO - step: 8620 loss: 18.6666 memory: 6.46GiB(27.34%) tps: 23,120 tflops: 23.27 mfu: 7.46% global_avg_ntp_loss: 3.2518 global_avg_mtp_loss: 15.4148 +[titan] 2025-06-13 14:23:07,476 - root - INFO - lr: 2.4377e-04 gnorm: 1.19 [ 1:41:37< 1:15:12] +[titan] 2025-06-13 14:23:10,847 - root - INFO - step: 8625 loss: 17.5898 memory: 6.46GiB(27.34%) tps: 24,298 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.0551 global_avg_mtp_loss: 14.5347 +[titan] 2025-06-13 14:23:10,847 - root - INFO - lr: 2.4352e-04 gnorm: 1.35 [ 1:41:40< 1:15:09] +[titan] 2025-06-13 14:23:14,336 - root - INFO - step: 8630 loss: 20.3639 memory: 6.46GiB(27.34%) tps: 23,482 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.5469 global_avg_mtp_loss: 16.8170 +[titan] 2025-06-13 14:23:14,336 - root - INFO - lr: 2.4327e-04 gnorm: 1.16 [ 1:41:44< 1:15:05] +[titan] 2025-06-13 14:23:18,015 - root - INFO - step: 8635 loss: 19.9040 memory: 6.46GiB(27.34%) tps: 22,268 tflops: 22.41 mfu: 7.18% global_avg_ntp_loss: 3.4802 global_avg_mtp_loss: 16.4238 +[titan] 2025-06-13 14:23:18,016 - root - INFO - lr: 2.4302e-04 gnorm: 1.35 [ 1:41:48< 1:15:02] +[titan] 2025-06-13 14:23:21,691 - root - INFO - step: 8640 loss: 15.7737 memory: 6.46GiB(27.34%) tps: 22,291 tflops: 22.43 mfu: 7.19% global_avg_ntp_loss: 2.7360 global_avg_mtp_loss: 13.0378 +[titan] 2025-06-13 14:23:21,691 - root - INFO - lr: 2.4277e-04 gnorm: 1.21 [ 1:41:51< 1:14:58] +[titan] 2025-06-13 14:23:25,387 - root - INFO - step: 8645 loss: 19.5568 memory: 6.46GiB(27.34%) tps: 22,169 tflops: 22.31 mfu: 7.15% global_avg_ntp_loss: 3.4237 global_avg_mtp_loss: 16.1331 +[titan] 2025-06-13 14:23:25,387 - root - INFO - lr: 2.4252e-04 gnorm: 1.31 [ 1:41:55< 1:14:55] +[titan] 2025-06-13 14:23:28,181 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:23:28,934 - root - INFO - step: 8650 loss: 20.5155 memory: 6.46GiB(27.34%) tps: 23,094 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 3.5788 global_avg_mtp_loss: 16.9367 +[titan] 2025-06-13 14:23:28,935 - root - INFO - lr: 2.4228e-04 gnorm: 1.21 [ 1:41:58< 1:14:51] +[titan] 2025-06-13 14:23:32,475 - root - INFO - step: 8655 loss: 19.7811 memory: 6.46GiB(27.34%) tps: 23,141 tflops: 23.29 mfu: 7.46% global_avg_ntp_loss: 3.4468 global_avg_mtp_loss: 16.3343 +[titan] 2025-06-13 14:23:32,475 - root - INFO - lr: 2.4203e-04 gnorm: 1.16 [ 1:42:02< 1:14:48] +[titan] 2025-06-13 14:23:35,641 - root - INFO - step: 8660 loss: 17.5097 memory: 6.46GiB(27.34%) tps: 25,876 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 3.0552 global_avg_mtp_loss: 14.4544 +[titan] 2025-06-13 14:23:35,642 - root - INFO - lr: 2.4178e-04 gnorm: 1.35 [ 1:42:05< 1:14:44] +[titan] 2025-06-13 14:23:39,145 - root - INFO - step: 8665 loss: 19.5405 memory: 6.46GiB(27.34%) tps: 23,384 tflops: 23.53 mfu: 7.54% global_avg_ntp_loss: 3.3840 global_avg_mtp_loss: 16.1566 +[titan] 2025-06-13 14:23:39,145 - root - INFO - lr: 2.4153e-04 gnorm: 1.34 [ 1:42:09< 1:14:41] +[titan] 2025-06-13 14:23:42,702 - root - INFO - step: 8670 loss: 16.9739 memory: 6.46GiB(27.34%) tps: 23,035 tflops: 23.18 mfu: 7.43% global_avg_ntp_loss: 2.9344 global_avg_mtp_loss: 14.0395 +[titan] 2025-06-13 14:23:42,702 - root - INFO - lr: 2.4128e-04 gnorm: 1.75 [ 1:42:12< 1:14:37] +[titan] 2025-06-13 14:23:46,146 - root - INFO - step: 8675 loss: 18.7848 memory: 6.46GiB(27.34%) tps: 23,789 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 3.2751 global_avg_mtp_loss: 15.5096 +[titan] 2025-06-13 14:23:46,146 - root - INFO - lr: 2.4103e-04 gnorm: 1.38 [ 1:42:16< 1:14:33] +[titan] 2025-06-13 14:23:49,271 - root - INFO - step: 8680 loss: 19.4275 memory: 6.46GiB(27.34%) tps: 26,215 tflops: 26.38 mfu: 8.46% global_avg_ntp_loss: 3.3593 global_avg_mtp_loss: 16.0682 +[titan] 2025-06-13 14:23:49,272 - root - INFO - lr: 2.4078e-04 gnorm: 1.17 [ 
1:42:19< 1:14:30] +[titan] 2025-06-13 14:23:52,801 - root - INFO - step: 8685 loss: 20.8090 memory: 6.46GiB(27.34%) tps: 23,214 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.6841 global_avg_mtp_loss: 17.1249 +[titan] 2025-06-13 14:23:52,801 - root - INFO - lr: 2.4053e-04 gnorm: 1.13 [ 1:42:22< 1:14:26] +[titan] 2025-06-13 14:23:59,185 - root - INFO - step: 8690 loss: 18.3224 memory: 6.46GiB(27.34%) tps: 12,833 tflops: 12.91 mfu: 4.14% global_avg_ntp_loss: 3.1781 global_avg_mtp_loss: 15.1444 +[titan] 2025-06-13 14:23:59,185 - root - INFO - lr: 2.4028e-04 gnorm: 1.50 [ 1:42:29< 1:14:25] +[titan] 2025-06-13 14:24:02,617 - root - INFO - step: 8695 loss: 18.7807 memory: 6.46GiB(27.34%) tps: 23,867 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 3.2343 global_avg_mtp_loss: 15.5464 +[titan] 2025-06-13 14:24:02,618 - root - INFO - lr: 2.4003e-04 gnorm: 1.38 [ 1:42:32< 1:14:21] +[titan] 2025-06-13 14:24:05,095 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:24:05,663 - root - INFO - step: 8700 loss: 18.6490 memory: 6.46GiB(27.34%) tps: 26,903 tflops: 27.07 mfu: 8.68% global_avg_ntp_loss: 3.1999 global_avg_mtp_loss: 15.4491 +[titan] 2025-06-13 14:24:05,664 - root - INFO - lr: 2.3978e-04 gnorm: 1.54 [ 1:42:35< 1:14:17] +[titan] 2025-06-13 14:24:08,353 - root - INFO - Dumping profiler traces at step 8704 +[titan] 2025-06-13 14:24:08,449 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:24:09,030 - root - INFO - step: 8705 loss: 18.5834 memory: 6.46GiB(27.34%) tps: 24,336 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 3.2141 global_avg_mtp_loss: 15.3693 +[titan] 2025-06-13 14:24:09,030 - root - INFO - lr: 2.3953e-04 gnorm: 1.54 [ 1:42:39< 1:14:13] +[titan] 2025-06-13 14:24:12,714 - root - INFO - step: 8710 loss: 20.0935 memory: 6.46GiB(27.34%) tps: 22,239 tflops: 22.38 mfu: 7.17% global_avg_ntp_loss: 3.5002 global_avg_mtp_loss: 16.5933 +[titan] 2025-06-13 14:24:12,714 - root - INFO - lr: 
2.3928e-04 gnorm: 1.30 [ 1:42:42< 1:14:10] +[titan] 2025-06-13 14:24:15,919 - root - INFO - step: 8715 loss: 17.6714 memory: 6.46GiB(27.34%) tps: 25,567 tflops: 25.73 mfu: 8.25% global_avg_ntp_loss: 3.0358 global_avg_mtp_loss: 14.6356 +[titan] 2025-06-13 14:24:15,919 - root - INFO - lr: 2.3903e-04 gnorm: 1.57 [ 1:42:45< 1:14:06] +[titan] 2025-06-13 14:24:19,193 - root - INFO - step: 8720 loss: 17.7156 memory: 6.46GiB(27.34%) tps: 25,020 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 3.1352 global_avg_mtp_loss: 14.5804 +[titan] 2025-06-13 14:24:19,194 - root - INFO - lr: 2.3878e-04 gnorm: 1.66 [ 1:42:49< 1:14:02] +[titan] 2025-06-13 14:24:23,162 - root - INFO - step: 8725 loss: 20.4688 memory: 6.46GiB(27.34%) tps: 20,644 tflops: 20.78 mfu: 6.66% global_avg_ntp_loss: 3.5306 global_avg_mtp_loss: 16.9382 +[titan] 2025-06-13 14:24:23,162 - root - INFO - lr: 2.3853e-04 gnorm: 1.45 [ 1:42:53< 1:13:59] +[titan] 2025-06-13 14:24:26,240 - root - INFO - step: 8730 loss: 18.1702 memory: 6.46GiB(27.34%) tps: 26,617 tflops: 26.79 mfu: 8.59% global_avg_ntp_loss: 3.1737 global_avg_mtp_loss: 14.9965 +[titan] 2025-06-13 14:24:26,240 - root - INFO - lr: 2.3828e-04 gnorm: 1.31 [ 1:42:56< 1:13:55] +[titan] 2025-06-13 14:24:29,840 - root - INFO - step: 8735 loss: 19.9031 memory: 6.46GiB(27.34%) tps: 22,760 tflops: 22.90 mfu: 7.34% global_avg_ntp_loss: 3.4347 global_avg_mtp_loss: 16.4684 +[titan] 2025-06-13 14:24:29,840 - root - INFO - lr: 2.3804e-04 gnorm: 1.28 [ 1:42:59< 1:13:52] +[titan] 2025-06-13 14:24:33,245 - root - INFO - step: 8740 loss: 20.1038 memory: 6.46GiB(27.34%) tps: 24,061 tflops: 24.21 mfu: 7.76% global_avg_ntp_loss: 3.4566 global_avg_mtp_loss: 16.6472 +[titan] 2025-06-13 14:24:33,246 - root - INFO - lr: 2.3779e-04 gnorm: 1.15 [ 1:43:03< 1:13:48] +[titan] 2025-06-13 14:24:36,930 - root - INFO - step: 8745 loss: 19.5176 memory: 6.46GiB(27.34%) tps: 22,233 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 3.4780 global_avg_mtp_loss: 16.0395 +[titan] 2025-06-13 14:24:36,931 - 
root - INFO - lr: 2.3754e-04 gnorm: 1.23 [ 1:43:06< 1:13:45] +[titan] 2025-06-13 14:24:39,520 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:24:40,065 - root - INFO - step: 8750 loss: 19.4458 memory: 6.46GiB(27.34%) tps: 26,138 tflops: 26.30 mfu: 8.43% global_avg_ntp_loss: 3.3741 global_avg_mtp_loss: 16.0717 +[titan] 2025-06-13 14:24:40,065 - root - INFO - lr: 2.3729e-04 gnorm: 1.15 [ 1:43:10< 1:13:41] +[titan] 2025-06-13 14:24:43,670 - root - INFO - step: 8755 loss: 20.9230 memory: 6.46GiB(27.34%) tps: 22,724 tflops: 22.87 mfu: 7.33% global_avg_ntp_loss: 3.6699 global_avg_mtp_loss: 17.2531 +[titan] 2025-06-13 14:24:43,671 - root - INFO - lr: 2.3704e-04 gnorm: 1.16 [ 1:43:13< 1:13:37] +[titan] 2025-06-13 14:24:47,409 - root - INFO - step: 8760 loss: 20.4610 memory: 6.46GiB(27.34%) tps: 21,916 tflops: 22.06 mfu: 7.07% global_avg_ntp_loss: 3.5823 global_avg_mtp_loss: 16.8787 +[titan] 2025-06-13 14:24:47,409 - root - INFO - lr: 2.3679e-04 gnorm: 1.17 [ 1:43:17< 1:13:34] +[titan] 2025-06-13 14:24:51,099 - root - INFO - step: 8765 loss: 19.9821 memory: 6.46GiB(27.34%) tps: 22,199 tflops: 22.34 mfu: 7.16% global_avg_ntp_loss: 3.4807 global_avg_mtp_loss: 16.5015 +[titan] 2025-06-13 14:24:51,100 - root - INFO - lr: 2.3654e-04 gnorm: 1.31 [ 1:43:21< 1:13:31] +[titan] 2025-06-13 14:24:54,884 - root - INFO - step: 8770 loss: 19.7887 memory: 6.46GiB(27.34%) tps: 21,646 tflops: 21.78 mfu: 6.98% global_avg_ntp_loss: 3.3857 global_avg_mtp_loss: 16.4030 +[titan] 2025-06-13 14:24:54,885 - root - INFO - lr: 2.3629e-04 gnorm: 1.35 [ 1:43:24< 1:13:27] +[titan] 2025-06-13 14:24:58,319 - root - INFO - step: 8775 loss: 19.1551 memory: 6.46GiB(27.34%) tps: 23,852 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.3346 global_avg_mtp_loss: 15.8205 +[titan] 2025-06-13 14:24:58,319 - root - INFO - lr: 2.3605e-04 gnorm: 1.20 [ 1:43:28< 1:13:24] +[titan] 2025-06-13 14:25:01,752 - root - INFO - step: 8780 loss: 18.7366 memory: 6.46GiB(27.34%) tps: 
23,870 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 3.2010 global_avg_mtp_loss: 15.5357 +[titan] 2025-06-13 14:25:01,752 - root - INFO - lr: 2.3580e-04 gnorm: 1.51 [ 1:43:31< 1:13:20] +[titan] 2025-06-13 14:25:05,115 - root - INFO - step: 8785 loss: 20.0697 memory: 6.46GiB(27.34%) tps: 24,358 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 3.4551 global_avg_mtp_loss: 16.6146 +[titan] 2025-06-13 14:25:05,116 - root - INFO - lr: 2.3555e-04 gnorm: 1.14 [ 1:43:35< 1:13:16] +[titan] 2025-06-13 14:25:08,546 - root - INFO - step: 8790 loss: 19.2806 memory: 6.46GiB(27.34%) tps: 23,886 tflops: 24.04 mfu: 7.70% global_avg_ntp_loss: 3.3040 global_avg_mtp_loss: 15.9765 +[titan] 2025-06-13 14:25:08,546 - root - INFO - lr: 2.3530e-04 gnorm: 1.47 [ 1:43:38< 1:13:13] +[titan] 2025-06-13 14:25:12,253 - root - INFO - step: 8795 loss: 17.6598 memory: 6.46GiB(27.34%) tps: 22,098 tflops: 22.24 mfu: 7.13% global_avg_ntp_loss: 3.0618 global_avg_mtp_loss: 14.5981 +[titan] 2025-06-13 14:25:12,253 - root - INFO - lr: 2.3505e-04 gnorm: 1.52 [ 1:43:42< 1:13:09] +[titan] 2025-06-13 14:25:15,177 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:25:16,051 - root - INFO - step: 8800 loss: 19.3793 memory: 6.46GiB(27.34%) tps: 21,574 tflops: 21.71 mfu: 6.96% global_avg_ntp_loss: 3.3462 global_avg_mtp_loss: 16.0331 +[titan] 2025-06-13 14:25:16,051 - root - INFO - lr: 2.3480e-04 gnorm: 1.23 [ 1:43:46< 1:13:06] +[titan] 2025-06-13 14:25:19,673 - root - INFO - step: 8805 loss: 19.1273 memory: 6.46GiB(27.34%) tps: 22,619 tflops: 22.76 mfu: 7.30% global_avg_ntp_loss: 3.3380 global_avg_mtp_loss: 15.7893 +[titan] 2025-06-13 14:25:19,673 - root - INFO - lr: 2.3455e-04 gnorm: 1.25 [ 1:43:49< 1:13:03] +[titan] 2025-06-13 14:25:22,995 - root - INFO - step: 8810 loss: 19.7131 memory: 6.46GiB(27.34%) tps: 24,662 tflops: 24.82 mfu: 7.96% global_avg_ntp_loss: 3.4112 global_avg_mtp_loss: 16.3020 +[titan] 2025-06-13 14:25:22,995 - root - INFO - lr: 2.3431e-04 gnorm: 1.31 [ 1:43:52< 1:12:59] +[titan] 2025-06-13 14:25:26,039 - root - INFO - step: 8815 loss: 18.3275 memory: 6.46GiB(27.34%) tps: 26,913 tflops: 27.08 mfu: 8.68% global_avg_ntp_loss: 3.1849 global_avg_mtp_loss: 15.1426 +[titan] 2025-06-13 14:25:26,040 - root - INFO - lr: 2.3406e-04 gnorm: 1.28 [ 1:43:56< 1:12:55] +[titan] 2025-06-13 14:25:30,048 - root - INFO - step: 8820 loss: 20.8967 memory: 6.46GiB(27.34%) tps: 20,438 tflops: 20.57 mfu: 6.59% global_avg_ntp_loss: 3.6338 global_avg_mtp_loss: 17.2629 +[titan] 2025-06-13 14:25:30,048 - root - INFO - lr: 2.3381e-04 gnorm: 1.24 [ 1:44:00< 1:12:52] +[titan] 2025-06-13 14:25:33,762 - root - INFO - step: 8825 loss: 19.2743 memory: 6.46GiB(27.34%) tps: 22,061 tflops: 22.20 mfu: 7.12% global_avg_ntp_loss: 3.3446 global_avg_mtp_loss: 15.9297 +[titan] 2025-06-13 14:25:33,762 - root - INFO - lr: 2.3356e-04 gnorm: 1.24 [ 1:44:03< 1:12:48] +[titan] 2025-06-13 14:25:37,389 - root - INFO - step: 8830 loss: 19.6118 memory: 6.46GiB(27.34%) tps: 22,589 tflops: 22.73 mfu: 7.29% global_avg_ntp_loss: 3.4031 global_avg_mtp_loss: 16.2088 +[titan] 2025-06-13 14:25:37,389 - root - INFO - lr: 2.3331e-04 gnorm: 1.27 [ 
1:44:07< 1:12:45] +[titan] 2025-06-13 14:25:41,561 - root - INFO - step: 8835 loss: 20.8377 memory: 6.46GiB(27.34%) tps: 19,635 tflops: 19.76 mfu: 6.33% global_avg_ntp_loss: 3.6302 global_avg_mtp_loss: 17.2075 +[titan] 2025-06-13 14:25:41,562 - root - INFO - lr: 2.3307e-04 gnorm: 1.30 [ 1:44:11< 1:12:42] +[titan] 2025-06-13 14:25:45,157 - root - INFO - step: 8840 loss: 20.2830 memory: 6.46GiB(27.34%) tps: 22,786 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.5581 global_avg_mtp_loss: 16.7249 +[titan] 2025-06-13 14:25:45,157 - root - INFO - lr: 2.3282e-04 gnorm: 1.14 [ 1:44:15< 1:12:38] +[titan] 2025-06-13 14:25:48,649 - root - INFO - step: 8845 loss: 18.8583 memory: 6.46GiB(27.34%) tps: 23,460 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 3.2663 global_avg_mtp_loss: 15.5920 +[titan] 2025-06-13 14:25:48,650 - root - INFO - lr: 2.3257e-04 gnorm: 1.29 [ 1:44:18< 1:12:35] +[titan] 2025-06-13 14:25:51,427 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:25:52,067 - root - INFO - step: 8850 loss: 20.0310 memory: 6.46GiB(27.34%) tps: 23,973 tflops: 24.13 mfu: 7.73% global_avg_ntp_loss: 3.4700 global_avg_mtp_loss: 16.5610 +[titan] 2025-06-13 14:25:52,067 - root - INFO - lr: 2.3232e-04 gnorm: 1.42 [ 1:44:22< 1:12:31] +[titan] 2025-06-13 14:25:55,604 - root - INFO - step: 8855 loss: 19.7440 memory: 6.46GiB(27.34%) tps: 23,166 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.4335 global_avg_mtp_loss: 16.3104 +[titan] 2025-06-13 14:25:55,604 - root - INFO - lr: 2.3207e-04 gnorm: 1.37 [ 1:44:25< 1:12:28] +[titan] 2025-06-13 14:25:59,238 - root - INFO - step: 8860 loss: 18.2242 memory: 6.46GiB(27.34%) tps: 22,545 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.1302 global_avg_mtp_loss: 15.0941 +[titan] 2025-06-13 14:25:59,238 - root - INFO - lr: 2.3183e-04 gnorm: 1.31 [ 1:44:29< 1:12:24] +[titan] 2025-06-13 14:26:02,465 - root - INFO - step: 8865 loss: 17.9440 memory: 6.46GiB(27.34%) tps: 25,385 tflops: 25.55 mfu: 8.19% 
global_avg_ntp_loss: 3.0863 global_avg_mtp_loss: 14.8577 +[titan] 2025-06-13 14:26:02,466 - root - INFO - lr: 2.3158e-04 gnorm: 1.52 [ 1:44:32< 1:12:20] +[titan] 2025-06-13 14:26:06,093 - root - INFO - step: 8870 loss: 20.1972 memory: 6.46GiB(27.34%) tps: 22,587 tflops: 22.73 mfu: 7.29% global_avg_ntp_loss: 3.4878 global_avg_mtp_loss: 16.7095 +[titan] 2025-06-13 14:26:06,093 - root - INFO - lr: 2.3133e-04 gnorm: 1.25 [ 1:44:36< 1:12:17] +[titan] 2025-06-13 14:26:09,261 - root - INFO - step: 8875 loss: 19.8116 memory: 6.46GiB(27.34%) tps: 25,857 tflops: 26.02 mfu: 8.34% global_avg_ntp_loss: 3.4410 global_avg_mtp_loss: 16.3706 +[titan] 2025-06-13 14:26:09,262 - root - INFO - lr: 2.3108e-04 gnorm: 1.88 [ 1:44:39< 1:12:13] +[titan] 2025-06-13 14:26:12,401 - root - INFO - step: 8880 loss: 18.5649 memory: 6.46GiB(27.34%) tps: 26,099 tflops: 26.26 mfu: 8.42% global_avg_ntp_loss: 3.2014 global_avg_mtp_loss: 15.3635 +[titan] 2025-06-13 14:26:12,401 - root - INFO - lr: 2.3084e-04 gnorm: 1.46 [ 1:44:42< 1:12:09] +[titan] 2025-06-13 14:26:15,605 - root - INFO - step: 8885 loss: 19.3950 memory: 6.46GiB(27.34%) tps: 25,572 tflops: 25.74 mfu: 8.25% global_avg_ntp_loss: 3.3768 global_avg_mtp_loss: 16.0182 +[titan] 2025-06-13 14:26:15,605 - root - INFO - lr: 2.3059e-04 gnorm: 1.61 [ 1:44:45< 1:12:05] +[titan] 2025-06-13 14:26:19,470 - root - INFO - step: 8890 loss: 19.6518 memory: 6.46GiB(27.34%) tps: 21,199 tflops: 21.33 mfu: 6.84% global_avg_ntp_loss: 3.4095 global_avg_mtp_loss: 16.2423 +[titan] 2025-06-13 14:26:19,470 - root - INFO - lr: 2.3034e-04 gnorm: 1.29 [ 1:44:49< 1:12:02] +[titan] 2025-06-13 14:26:22,917 - root - INFO - step: 8895 loss: 19.9409 memory: 6.46GiB(27.34%) tps: 23,763 tflops: 23.91 mfu: 7.66% global_avg_ntp_loss: 3.4772 global_avg_mtp_loss: 16.4637 +[titan] 2025-06-13 14:26:22,918 - root - INFO - lr: 2.3009e-04 gnorm: 1.14 [ 1:44:52< 1:11:59] +[titan] 2025-06-13 14:26:25,589 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:26:26,274 - root - INFO - step: 8900 loss: 20.7662 memory: 6.46GiB(27.34%) tps: 24,410 tflops: 24.57 mfu: 7.87% global_avg_ntp_loss: 3.6709 global_avg_mtp_loss: 17.0953 +[titan] 2025-06-13 14:26:26,274 - root - INFO - lr: 2.2985e-04 gnorm: 1.37 [ 1:44:56< 1:11:55] +[titan] 2025-06-13 14:26:30,137 - root - INFO - step: 8905 loss: 20.5805 memory: 6.46GiB(27.34%) tps: 21,210 tflops: 21.35 mfu: 6.84% global_avg_ntp_loss: 3.5414 global_avg_mtp_loss: 17.0391 +[titan] 2025-06-13 14:26:30,137 - root - INFO - lr: 2.2960e-04 gnorm: 1.48 [ 1:45:00< 1:11:52] +[titan] 2025-06-13 14:26:33,290 - root - INFO - step: 8910 loss: 19.8300 memory: 6.46GiB(27.34%) tps: 25,979 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.4499 global_avg_mtp_loss: 16.3801 +[titan] 2025-06-13 14:26:33,291 - root - INFO - lr: 2.2935e-04 gnorm: 1.23 [ 1:45:03< 1:11:48] +[titan] 2025-06-13 14:26:36,695 - root - INFO - step: 8915 loss: 19.3354 memory: 6.46GiB(27.34%) tps: 24,063 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 3.3692 global_avg_mtp_loss: 15.9662 +[titan] 2025-06-13 14:26:36,695 - root - INFO - lr: 2.2910e-04 gnorm: 1.13 [ 1:45:06< 1:11:44] +[titan] 2025-06-13 14:26:40,233 - root - INFO - step: 8920 loss: 19.7314 memory: 6.46GiB(27.34%) tps: 23,159 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.4224 global_avg_mtp_loss: 16.3090 +[titan] 2025-06-13 14:26:40,233 - root - INFO - lr: 2.2886e-04 gnorm: 1.29 [ 1:45:10< 1:11:41] +[titan] 2025-06-13 14:26:43,206 - root - INFO - step: 8925 loss: 20.2199 memory: 6.46GiB(27.34%) tps: 27,564 tflops: 27.74 mfu: 8.89% global_avg_ntp_loss: 3.5307 global_avg_mtp_loss: 16.6892 +[titan] 2025-06-13 14:26:43,206 - root - INFO - lr: 2.2861e-04 gnorm: 1.13 [ 1:45:13< 1:11:37] +[titan] 2025-06-13 14:26:47,063 - root - INFO - step: 8930 loss: 19.6290 memory: 6.46GiB(27.34%) tps: 21,239 tflops: 21.37 mfu: 6.85% global_avg_ntp_loss: 3.4091 global_avg_mtp_loss: 16.2198 +[titan] 2025-06-13 14:26:47,063 - root - INFO - lr: 2.2836e-04 gnorm: 1.19 [ 
1:45:17< 1:11:33] +[titan] 2025-06-13 14:26:50,886 - root - INFO - step: 8935 loss: 18.9739 memory: 6.46GiB(27.34%) tps: 21,434 tflops: 21.57 mfu: 6.91% global_avg_ntp_loss: 3.2725 global_avg_mtp_loss: 15.7014 +[titan] 2025-06-13 14:26:50,886 - root - INFO - lr: 2.2812e-04 gnorm: 1.12 [ 1:45:20< 1:11:30] +[titan] 2025-06-13 14:26:54,402 - root - INFO - step: 8940 loss: 17.1003 memory: 6.46GiB(27.34%) tps: 23,299 tflops: 23.45 mfu: 7.52% global_avg_ntp_loss: 2.9531 global_avg_mtp_loss: 14.1472 +[titan] 2025-06-13 14:26:54,402 - root - INFO - lr: 2.2787e-04 gnorm: 1.35 [ 1:45:24< 1:11:26] +[titan] 2025-06-13 14:26:57,645 - root - INFO - step: 8945 loss: 19.6379 memory: 6.46GiB(27.34%) tps: 25,264 tflops: 25.42 mfu: 8.15% global_avg_ntp_loss: 3.3990 global_avg_mtp_loss: 16.2389 +[titan] 2025-06-13 14:26:57,645 - root - INFO - lr: 2.2762e-04 gnorm: 1.20 [ 1:45:27< 1:11:23] +[titan] 2025-06-13 14:27:00,252 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:27:01,394 - root - INFO - step: 8950 loss: 19.2448 memory: 6.46GiB(27.34%) tps: 21,856 tflops: 22.00 mfu: 7.05% global_avg_ntp_loss: 3.3172 global_avg_mtp_loss: 15.9275 +[titan] 2025-06-13 14:27:01,394 - root - INFO - lr: 2.2738e-04 gnorm: 1.24 [ 1:45:31< 1:11:19] +[titan] 2025-06-13 14:27:04,573 - root - INFO - step: 8955 loss: 19.5828 memory: 6.46GiB(27.34%) tps: 25,773 tflops: 25.94 mfu: 8.31% global_avg_ntp_loss: 3.5046 global_avg_mtp_loss: 16.0783 +[titan] 2025-06-13 14:27:04,573 - root - INFO - lr: 2.2713e-04 gnorm: 2.22 [ 1:45:34< 1:11:16] +[titan] 2025-06-13 14:27:07,994 - root - INFO - step: 8960 loss: 19.1278 memory: 6.46GiB(27.34%) tps: 23,946 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 3.3105 global_avg_mtp_loss: 15.8172 +[titan] 2025-06-13 14:27:07,994 - root - INFO - lr: 2.2688e-04 gnorm: 1.24 [ 1:45:37< 1:11:12] +[titan] 2025-06-13 14:27:11,774 - root - INFO - step: 8965 loss: 18.9796 memory: 6.46GiB(27.34%) tps: 21,675 tflops: 21.81 mfu: 6.99% 
global_avg_ntp_loss: 3.3154 global_avg_mtp_loss: 15.6642 +[titan] 2025-06-13 14:27:11,775 - root - INFO - lr: 2.2664e-04 gnorm: 1.18 [ 1:45:41< 1:11:09] +[titan] 2025-06-13 14:27:16,963 - root - INFO - step: 8970 loss: 18.5944 memory: 6.46GiB(27.34%) tps: 15,789 tflops: 15.89 mfu: 5.09% global_avg_ntp_loss: 3.2424 global_avg_mtp_loss: 15.3521 +[titan] 2025-06-13 14:27:16,964 - root - INFO - lr: 2.2639e-04 gnorm: 1.31 [ 1:45:46< 1:11:06] +[titan] 2025-06-13 14:27:20,292 - root - INFO - step: 8975 loss: 18.4105 memory: 6.46GiB(27.34%) tps: 24,617 tflops: 24.77 mfu: 7.94% global_avg_ntp_loss: 3.2087 global_avg_mtp_loss: 15.2019 +[titan] 2025-06-13 14:27:20,292 - root - INFO - lr: 2.2614e-04 gnorm: 1.44 [ 1:45:50< 1:11:02] +[titan] 2025-06-13 14:27:23,686 - root - INFO - step: 8980 loss: 14.8196 memory: 6.46GiB(27.34%) tps: 24,140 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 2.5700 global_avg_mtp_loss: 12.2496 +[titan] 2025-06-13 14:27:23,686 - root - INFO - lr: 2.2590e-04 gnorm: 1.29 [ 1:45:53< 1:10:59] +[titan] 2025-06-13 14:27:27,305 - root - INFO - step: 8985 loss: 19.8260 memory: 6.46GiB(27.34%) tps: 22,638 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.4490 global_avg_mtp_loss: 16.3769 +[titan] 2025-06-13 14:27:27,305 - root - INFO - lr: 2.2565e-04 gnorm: 1.13 [ 1:45:57< 1:10:55] +[titan] 2025-06-13 14:27:30,873 - root - INFO - step: 8990 loss: 19.1011 memory: 6.46GiB(27.34%) tps: 22,964 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 3.3185 global_avg_mtp_loss: 15.7826 +[titan] 2025-06-13 14:27:30,873 - root - INFO - lr: 2.2540e-04 gnorm: 1.53 [ 1:46:00< 1:10:52] +[titan] 2025-06-13 14:27:34,498 - root - INFO - step: 8995 loss: 19.1843 memory: 6.46GiB(27.34%) tps: 22,600 tflops: 22.74 mfu: 7.29% global_avg_ntp_loss: 3.2999 global_avg_mtp_loss: 15.8843 +[titan] 2025-06-13 14:27:34,498 - root - INFO - lr: 2.2516e-04 gnorm: 1.27 [ 1:46:04< 1:10:48] +[titan] 2025-06-13 14:27:37,228 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:27:38,069 - root - INFO - step: 9000 loss: 17.8551 memory: 6.46GiB(27.34%) tps: 22,941 tflops: 23.09 mfu: 7.40% global_avg_ntp_loss: 3.1264 global_avg_mtp_loss: 14.7287 +[titan] 2025-06-13 14:27:38,069 - root - INFO - lr: 2.2491e-04 gnorm: 1.63 [ 1:46:08< 1:10:45] +[titan] 2025-06-13 14:27:41,411 - root - INFO - step: 9005 loss: 19.9445 memory: 6.46GiB(27.34%) tps: 24,516 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 3.4709 global_avg_mtp_loss: 16.4736 +[titan] 2025-06-13 14:27:41,411 - root - INFO - lr: 2.2467e-04 gnorm: 1.20 [ 1:46:11< 1:10:41] +[titan] 2025-06-13 14:27:44,686 - root - INFO - step: 9010 loss: 19.4573 memory: 6.46GiB(27.34%) tps: 25,014 tflops: 25.17 mfu: 8.07% global_avg_ntp_loss: 3.3777 global_avg_mtp_loss: 16.0797 +[titan] 2025-06-13 14:27:44,687 - root - INFO - lr: 2.2442e-04 gnorm: 1.43 [ 1:46:14< 1:10:37] +[titan] 2025-06-13 14:27:48,394 - root - INFO - step: 9015 loss: 19.7769 memory: 6.46GiB(27.34%) tps: 22,100 tflops: 22.24 mfu: 7.13% global_avg_ntp_loss: 3.4801 global_avg_mtp_loss: 16.2967 +[titan] 2025-06-13 14:27:48,394 - root - INFO - lr: 2.2417e-04 gnorm: 1.14 [ 1:46:18< 1:10:34] +[titan] 2025-06-13 14:27:52,034 - root - INFO - step: 9020 loss: 19.5472 memory: 6.46GiB(27.34%) tps: 22,506 tflops: 22.65 mfu: 7.26% global_avg_ntp_loss: 3.4061 global_avg_mtp_loss: 16.1411 +[titan] 2025-06-13 14:27:52,035 - root - INFO - lr: 2.2393e-04 gnorm: 1.52 [ 1:46:22< 1:10:31] +[titan] 2025-06-13 14:27:55,219 - root - INFO - step: 9025 loss: 12.4408 memory: 6.46GiB(27.34%) tps: 25,727 tflops: 25.89 mfu: 8.30% global_avg_ntp_loss: 2.1230 global_avg_mtp_loss: 10.3178 +[titan] 2025-06-13 14:27:55,219 - root - INFO - lr: 2.2368e-04 gnorm: 1.58 [ 1:46:25< 1:10:27] +[titan] 2025-06-13 14:27:58,882 - root - INFO - step: 9030 loss: 19.1224 memory: 6.46GiB(27.34%) tps: 22,366 tflops: 22.51 mfu: 7.21% global_avg_ntp_loss: 3.3443 global_avg_mtp_loss: 15.7781 +[titan] 2025-06-13 14:27:58,882 - root - INFO - lr: 2.2344e-04 gnorm: 1.29 [ 
1:46:28< 1:10:23] +[titan] 2025-06-13 14:28:02,438 - root - INFO - step: 9035 loss: 19.5576 memory: 6.46GiB(27.34%) tps: 23,039 tflops: 23.19 mfu: 7.43% global_avg_ntp_loss: 3.3698 global_avg_mtp_loss: 16.1877 +[titan] 2025-06-13 14:28:02,439 - root - INFO - lr: 2.2319e-04 gnorm: 1.43 [ 1:46:32< 1:10:20] +[titan] 2025-06-13 14:28:06,097 - root - INFO - step: 9040 loss: 19.9545 memory: 6.46GiB(27.34%) tps: 22,394 tflops: 22.54 mfu: 7.22% global_avg_ntp_loss: 3.4908 global_avg_mtp_loss: 16.4638 +[titan] 2025-06-13 14:28:06,097 - root - INFO - lr: 2.2294e-04 gnorm: 1.29 [ 1:46:36< 1:10:16] +[titan] 2025-06-13 14:28:09,430 - root - INFO - step: 9045 loss: 20.0908 memory: 6.46GiB(27.34%) tps: 24,579 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.4977 global_avg_mtp_loss: 16.5931 +[titan] 2025-06-13 14:28:09,431 - root - INFO - lr: 2.2270e-04 gnorm: 1.27 [ 1:46:39< 1:10:13] +[titan] 2025-06-13 14:28:12,213 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:28:12,750 - root - INFO - step: 9050 loss: 17.8362 memory: 6.46GiB(27.34%) tps: 24,684 tflops: 24.84 mfu: 7.96% global_avg_ntp_loss: 3.0470 global_avg_mtp_loss: 14.7892 +[titan] 2025-06-13 14:28:12,750 - root - INFO - lr: 2.2245e-04 gnorm: 1.25 [ 1:46:42< 1:10:09] +[titan] 2025-06-13 14:28:16,303 - root - INFO - step: 9055 loss: 20.0030 memory: 6.46GiB(27.34%) tps: 23,060 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.4665 global_avg_mtp_loss: 16.5365 +[titan] 2025-06-13 14:28:16,303 - root - INFO - lr: 2.2221e-04 gnorm: 1.19 [ 1:46:46< 1:10:05] +[titan] 2025-06-13 14:28:21,021 - root - INFO - step: 9060 loss: 19.9864 memory: 6.46GiB(27.34%) tps: 17,365 tflops: 17.48 mfu: 5.60% global_avg_ntp_loss: 3.4775 global_avg_mtp_loss: 16.5089 +[titan] 2025-06-13 14:28:21,021 - root - INFO - lr: 2.2196e-04 gnorm: 1.28 [ 1:46:50< 1:10:03] +[titan] 2025-06-13 14:28:24,303 - root - INFO - step: 9065 loss: 19.8896 memory: 6.46GiB(27.34%) tps: 24,965 tflops: 25.12 mfu: 8.05% 
global_avg_ntp_loss: 3.4346 global_avg_mtp_loss: 16.4549 +[titan] 2025-06-13 14:28:24,303 - root - INFO - lr: 2.2172e-04 gnorm: 1.27 [ 1:46:54< 1:09:59] +[titan] 2025-06-13 14:28:27,924 - root - INFO - step: 9070 loss: 19.2761 memory: 6.46GiB(27.34%) tps: 22,625 tflops: 22.77 mfu: 7.30% global_avg_ntp_loss: 3.2757 global_avg_mtp_loss: 16.0004 +[titan] 2025-06-13 14:28:27,924 - root - INFO - lr: 2.2147e-04 gnorm: 1.34 [ 1:46:57< 1:09:56] +[titan] 2025-06-13 14:28:30,717 - root - INFO - step: 9075 loss: 18.2701 memory: 6.46GiB(27.34%) tps: 29,334 tflops: 29.52 mfu: 9.46% global_avg_ntp_loss: 3.1379 global_avg_mtp_loss: 15.1323 +[titan] 2025-06-13 14:28:30,717 - root - INFO - lr: 2.2123e-04 gnorm: 3.15 [ 1:47:00< 1:09:52] +[titan] 2025-06-13 14:28:34,311 - root - INFO - step: 9080 loss: 18.7880 memory: 6.46GiB(27.34%) tps: 22,794 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.2670 global_avg_mtp_loss: 15.5210 +[titan] 2025-06-13 14:28:34,312 - root - INFO - lr: 2.2098e-04 gnorm: 1.42 [ 1:47:04< 1:09:48] +[titan] 2025-06-13 14:28:37,651 - root - INFO - step: 9085 loss: 18.1445 memory: 6.46GiB(27.34%) tps: 24,534 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.1243 global_avg_mtp_loss: 15.0201 +[titan] 2025-06-13 14:28:37,651 - root - INFO - lr: 2.2074e-04 gnorm: 1.35 [ 1:47:07< 1:09:44] +[titan] 2025-06-13 14:28:41,303 - root - INFO - step: 9090 loss: 19.1950 memory: 6.46GiB(27.34%) tps: 22,432 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 3.3337 global_avg_mtp_loss: 15.8613 +[titan] 2025-06-13 14:28:41,303 - root - INFO - lr: 2.2049e-04 gnorm: 1.26 [ 1:47:11< 1:09:41] +[titan] 2025-06-13 14:28:44,914 - root - INFO - step: 9095 loss: 18.3376 memory: 6.46GiB(27.34%) tps: 22,690 tflops: 22.83 mfu: 7.32% global_avg_ntp_loss: 3.2004 global_avg_mtp_loss: 15.1373 +[titan] 2025-06-13 14:28:44,914 - root - INFO - lr: 2.2025e-04 gnorm: 2.21 [ 1:47:14< 1:09:37] +[titan] 2025-06-13 14:28:47,951 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:28:48,512 - root - INFO - step: 9100 loss: 19.5284 memory: 6.46GiB(27.34%) tps: 22,768 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.3760 global_avg_mtp_loss: 16.1525 +[titan] 2025-06-13 14:28:48,513 - root - INFO - lr: 2.2000e-04 gnorm: 1.23 [ 1:47:18< 1:09:34] +[titan] 2025-06-13 14:28:52,088 - root - INFO - step: 9105 loss: 15.9889 memory: 6.46GiB(27.34%) tps: 22,915 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 2.7804 global_avg_mtp_loss: 13.2085 +[titan] 2025-06-13 14:28:52,088 - root - INFO - lr: 2.1976e-04 gnorm: 1.17 [ 1:47:22< 1:09:30] +[titan] 2025-06-13 14:28:55,892 - root - INFO - step: 9110 loss: 18.0448 memory: 6.46GiB(27.34%) tps: 21,534 tflops: 21.67 mfu: 6.95% global_avg_ntp_loss: 3.1236 global_avg_mtp_loss: 14.9212 +[titan] 2025-06-13 14:28:55,892 - root - INFO - lr: 2.1951e-04 gnorm: 1.10 [ 1:47:25< 1:09:27] +[titan] 2025-06-13 14:28:59,209 - root - INFO - step: 9115 loss: 19.1151 memory: 6.46GiB(27.34%) tps: 24,703 tflops: 24.86 mfu: 7.97% global_avg_ntp_loss: 3.2836 global_avg_mtp_loss: 15.8315 +[titan] 2025-06-13 14:28:59,209 - root - INFO - lr: 2.1927e-04 gnorm: 1.34 [ 1:47:29< 1:09:23] +[titan] 2025-06-13 14:29:03,235 - root - INFO - step: 9120 loss: 17.5154 memory: 6.46GiB(27.34%) tps: 20,351 tflops: 20.48 mfu: 6.56% global_avg_ntp_loss: 3.0466 global_avg_mtp_loss: 14.4688 +[titan] 2025-06-13 14:29:03,235 - root - INFO - lr: 2.1902e-04 gnorm: 1.23 [ 1:47:33< 1:09:20] +[titan] 2025-06-13 14:29:06,612 - root - INFO - step: 9125 loss: 19.4011 memory: 6.46GiB(27.34%) tps: 24,256 tflops: 24.41 mfu: 7.82% global_avg_ntp_loss: 3.3818 global_avg_mtp_loss: 16.0193 +[titan] 2025-06-13 14:29:06,613 - root - INFO - lr: 2.1878e-04 gnorm: 1.52 [ 1:47:36< 1:09:16] +[titan] 2025-06-13 14:29:10,139 - root - INFO - step: 9130 loss: 19.6755 memory: 6.46GiB(27.34%) tps: 23,233 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 3.3915 global_avg_mtp_loss: 16.2840 +[titan] 2025-06-13 14:29:10,139 - root - INFO - lr: 2.1854e-04 gnorm: 1.21 [ 
1:47:40< 1:09:13] +[titan] 2025-06-13 14:29:13,498 - root - INFO - step: 9135 loss: 18.9697 memory: 6.46GiB(27.34%) tps: 24,393 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 3.2630 global_avg_mtp_loss: 15.7067 +[titan] 2025-06-13 14:29:13,498 - root - INFO - lr: 2.1829e-04 gnorm: 1.40 [ 1:47:43< 1:09:09] +[titan] 2025-06-13 14:29:17,425 - root - INFO - step: 9140 loss: 19.8018 memory: 6.46GiB(27.34%) tps: 20,862 tflops: 20.99 mfu: 6.73% global_avg_ntp_loss: 3.4152 global_avg_mtp_loss: 16.3867 +[titan] 2025-06-13 14:29:17,425 - root - INFO - lr: 2.1805e-04 gnorm: 1.20 [ 1:47:47< 1:09:06] +[titan] 2025-06-13 14:29:20,931 - root - INFO - step: 9145 loss: 18.8595 memory: 6.46GiB(27.34%) tps: 23,366 tflops: 23.51 mfu: 7.54% global_avg_ntp_loss: 3.2630 global_avg_mtp_loss: 15.5965 +[titan] 2025-06-13 14:29:20,932 - root - INFO - lr: 2.1780e-04 gnorm: 1.31 [ 1:47:50< 1:09:02] +[titan] 2025-06-13 14:29:23,464 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:29:24,026 - root - INFO - step: 9150 loss: 20.0619 memory: 6.46GiB(27.34%) tps: 26,477 tflops: 26.65 mfu: 8.54% global_avg_ntp_loss: 3.5237 global_avg_mtp_loss: 16.5382 +[titan] 2025-06-13 14:29:24,026 - root - INFO - lr: 2.1756e-04 gnorm: 1.45 [ 1:47:53< 1:08:59] +[titan] 2025-06-13 14:29:27,757 - root - INFO - step: 9155 loss: 20.5796 memory: 6.46GiB(27.34%) tps: 21,956 tflops: 22.10 mfu: 7.08% global_avg_ntp_loss: 3.6294 global_avg_mtp_loss: 16.9502 +[titan] 2025-06-13 14:29:27,758 - root - INFO - lr: 2.1731e-04 gnorm: 1.18 [ 1:47:57< 1:08:55] +[titan] 2025-06-13 14:29:31,286 - root - INFO - step: 9160 loss: 19.4966 memory: 6.46GiB(27.34%) tps: 23,222 tflops: 23.37 mfu: 7.49% global_avg_ntp_loss: 3.3841 global_avg_mtp_loss: 16.1125 +[titan] 2025-06-13 14:29:31,286 - root - INFO - lr: 2.1707e-04 gnorm: 1.21 [ 1:48:01< 1:08:52] +[titan] 2025-06-13 14:29:34,813 - root - INFO - step: 9165 loss: 19.8921 memory: 6.46GiB(27.34%) tps: 23,227 tflops: 23.38 mfu: 7.49% 
global_avg_ntp_loss: 3.4806 global_avg_mtp_loss: 16.4115 +[titan] 2025-06-13 14:29:34,813 - root - INFO - lr: 2.1683e-04 gnorm: 1.22 [ 1:48:04< 1:08:48] +[titan] 2025-06-13 14:29:38,690 - root - INFO - step: 9170 loss: 19.7885 memory: 6.46GiB(27.34%) tps: 21,130 tflops: 21.26 mfu: 6.82% global_avg_ntp_loss: 3.4310 global_avg_mtp_loss: 16.3575 +[titan] 2025-06-13 14:29:38,691 - root - INFO - lr: 2.1658e-04 gnorm: 1.29 [ 1:48:08< 1:08:45] +[titan] 2025-06-13 14:29:41,879 - root - INFO - step: 9175 loss: 18.6195 memory: 6.46GiB(27.34%) tps: 25,698 tflops: 25.86 mfu: 8.29% global_avg_ntp_loss: 3.1962 global_avg_mtp_loss: 15.4233 +[titan] 2025-06-13 14:29:41,879 - root - INFO - lr: 2.1634e-04 gnorm: 1.43 [ 1:48:11< 1:08:41] +[titan] 2025-06-13 14:29:45,324 - root - INFO - step: 9180 loss: 20.7748 memory: 6.46GiB(27.34%) tps: 23,780 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 3.6146 global_avg_mtp_loss: 17.1602 +[titan] 2025-06-13 14:29:45,324 - root - INFO - lr: 2.1610e-04 gnorm: 1.28 [ 1:48:15< 1:08:37] +[titan] 2025-06-13 14:29:48,646 - root - INFO - step: 9185 loss: 21.1979 memory: 6.46GiB(27.34%) tps: 24,661 tflops: 24.82 mfu: 7.95% global_avg_ntp_loss: 3.7128 global_avg_mtp_loss: 17.4852 +[titan] 2025-06-13 14:29:48,646 - root - INFO - lr: 2.1585e-04 gnorm: 1.21 [ 1:48:18< 1:08:34] +[titan] 2025-06-13 14:29:51,850 - root - INFO - step: 9190 loss: 18.1121 memory: 6.46GiB(27.34%) tps: 25,575 tflops: 25.74 mfu: 8.25% global_avg_ntp_loss: 3.1152 global_avg_mtp_loss: 14.9969 +[titan] 2025-06-13 14:29:51,850 - root - INFO - lr: 2.1561e-04 gnorm: 1.32 [ 1:48:21< 1:08:30] +[titan] 2025-06-13 14:29:55,762 - root - INFO - step: 9195 loss: 20.8084 memory: 6.46GiB(27.34%) tps: 20,943 tflops: 21.08 mfu: 6.76% global_avg_ntp_loss: 3.6455 global_avg_mtp_loss: 17.1629 +[titan] 2025-06-13 14:29:55,762 - root - INFO - lr: 2.1536e-04 gnorm: 1.13 [ 1:48:25< 1:08:27] +[titan] 2025-06-13 14:29:58,527 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:29:59,109 - root - INFO - step: 9200 loss: 20.2174 memory: 6.46GiB(27.34%) tps: 24,482 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 3.5235 global_avg_mtp_loss: 16.6939 +[titan] 2025-06-13 14:29:59,109 - root - INFO - lr: 2.1512e-04 gnorm: 1.44 [ 1:48:29< 1:08:23] +[titan] 2025-06-13 14:30:02,460 - root - INFO - step: 9205 loss: 18.9372 memory: 6.46GiB(27.34%) tps: 24,446 tflops: 24.60 mfu: 7.89% global_avg_ntp_loss: 3.2391 global_avg_mtp_loss: 15.6981 +[titan] 2025-06-13 14:30:02,460 - root - INFO - lr: 2.1488e-04 gnorm: 1.49 [ 1:48:32< 1:08:19] +[titan] 2025-06-13 14:30:05,651 - root - INFO - step: 9210 loss: 20.1298 memory: 6.46GiB(27.34%) tps: 25,675 tflops: 25.84 mfu: 8.28% global_avg_ntp_loss: 3.4564 global_avg_mtp_loss: 16.6735 +[titan] 2025-06-13 14:30:05,651 - root - INFO - lr: 2.1464e-04 gnorm: 1.32 [ 1:48:35< 1:08:16] +[titan] 2025-06-13 14:30:09,129 - root - INFO - step: 9215 loss: 20.0537 memory: 6.46GiB(27.34%) tps: 23,556 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 3.4615 global_avg_mtp_loss: 16.5921 +[titan] 2025-06-13 14:30:09,129 - root - INFO - lr: 2.1439e-04 gnorm: 1.27 [ 1:48:39< 1:08:12] +[titan] 2025-06-13 14:30:10,072 - root - INFO - Dumping profiler traces at step 9216 +[titan] 2025-06-13 14:30:10,157 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:30:12,715 - root - INFO - step: 9220 loss: 19.5708 memory: 6.46GiB(27.34%) tps: 22,848 tflops: 22.99 mfu: 7.37% global_avg_ntp_loss: 3.3401 global_avg_mtp_loss: 16.2307 +[titan] 2025-06-13 14:30:12,715 - root - INFO - lr: 2.1415e-04 gnorm: 1.32 [ 1:48:42< 1:08:09] +[titan] 2025-06-13 14:30:15,943 - root - INFO - step: 9225 loss: 17.8337 memory: 6.46GiB(27.34%) tps: 25,385 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 3.0712 global_avg_mtp_loss: 14.7625 +[titan] 2025-06-13 14:30:15,943 - root - INFO - lr: 2.1391e-04 gnorm: 1.34 [ 1:48:45< 1:08:05] +[titan] 2025-06-13 14:30:19,717 - root - INFO - step: 9230 loss: 20.3336 memory: 
6.46GiB(27.34%) tps: 21,707 tflops: 21.85 mfu: 7.00% global_avg_ntp_loss: 3.4921 global_avg_mtp_loss: 16.8415 +[titan] 2025-06-13 14:30:19,718 - root - INFO - lr: 2.1366e-04 gnorm: 1.19 [ 1:48:49< 1:08:01] +[titan] 2025-06-13 14:30:23,472 - root - INFO - step: 9235 loss: 19.0068 memory: 6.46GiB(27.34%) tps: 21,819 tflops: 21.96 mfu: 7.04% global_avg_ntp_loss: 3.2872 global_avg_mtp_loss: 15.7196 +[titan] 2025-06-13 14:30:23,473 - root - INFO - lr: 2.1342e-04 gnorm: 1.19 [ 1:48:53< 1:07:58] +[titan] 2025-06-13 14:30:26,623 - root - INFO - step: 9240 loss: 19.7595 memory: 6.46GiB(27.34%) tps: 26,004 tflops: 26.17 mfu: 8.39% global_avg_ntp_loss: 3.4040 global_avg_mtp_loss: 16.3555 +[titan] 2025-06-13 14:30:26,623 - root - INFO - lr: 2.1318e-04 gnorm: 1.24 [ 1:48:56< 1:07:54] +[titan] 2025-06-13 14:30:30,107 - root - INFO - step: 9245 loss: 17.9477 memory: 6.46GiB(27.34%) tps: 23,518 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.1291 global_avg_mtp_loss: 14.8186 +[titan] 2025-06-13 14:30:30,107 - root - INFO - lr: 2.1293e-04 gnorm: 1.20 [ 1:49:00< 1:07:51] +[titan] 2025-06-13 14:30:32,735 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:30:33,276 - root - INFO - step: 9250 loss: 19.0759 memory: 6.46GiB(27.34%) tps: 25,853 tflops: 26.02 mfu: 8.34% global_avg_ntp_loss: 3.2700 global_avg_mtp_loss: 15.8059 +[titan] 2025-06-13 14:30:33,276 - root - INFO - lr: 2.1269e-04 gnorm: 1.52 [ 1:49:03< 1:07:47] +[titan] 2025-06-13 14:30:36,462 - root - INFO - step: 9255 loss: 20.1941 memory: 6.46GiB(27.34%) tps: 25,720 tflops: 25.88 mfu: 8.30% global_avg_ntp_loss: 3.5468 global_avg_mtp_loss: 16.6473 +[titan] 2025-06-13 14:30:36,462 - root - INFO - lr: 2.1245e-04 gnorm: 1.13 [ 1:49:06< 1:07:43] +[titan] 2025-06-13 14:30:40,042 - root - INFO - step: 9260 loss: 19.4227 memory: 6.46GiB(27.34%) tps: 22,886 tflops: 23.03 mfu: 7.38% global_avg_ntp_loss: 3.3323 global_avg_mtp_loss: 16.0904 +[titan] 2025-06-13 14:30:40,042 - root - INFO - lr: 2.1221e-04 gnorm: 1.19 [ 1:49:09< 1:07:40] +[titan] 2025-06-13 14:30:43,529 - root - INFO - step: 9265 loss: 20.0861 memory: 6.46GiB(27.34%) tps: 23,493 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 3.4596 global_avg_mtp_loss: 16.6265 +[titan] 2025-06-13 14:30:43,529 - root - INFO - lr: 2.1196e-04 gnorm: 1.17 [ 1:49:13< 1:07:36] +[titan] 2025-06-13 14:30:46,793 - root - INFO - step: 9270 loss: 18.7648 memory: 6.46GiB(27.34%) tps: 25,102 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 3.2077 global_avg_mtp_loss: 15.5571 +[titan] 2025-06-13 14:30:46,793 - root - INFO - lr: 2.1172e-04 gnorm: 1.47 [ 1:49:16< 1:07:32] +[titan] 2025-06-13 14:30:50,268 - root - INFO - step: 9275 loss: 18.5111 memory: 6.46GiB(27.34%) tps: 23,574 tflops: 23.72 mfu: 7.60% global_avg_ntp_loss: 3.2005 global_avg_mtp_loss: 15.3106 +[titan] 2025-06-13 14:30:50,269 - root - INFO - lr: 2.1148e-04 gnorm: 1.31 [ 1:49:20< 1:07:29] +[titan] 2025-06-13 14:30:53,780 - root - INFO - step: 9280 loss: 19.5348 memory: 6.46GiB(27.34%) tps: 23,334 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.3487 global_avg_mtp_loss: 16.1861 +[titan] 2025-06-13 14:30:53,780 - root - INFO - lr: 2.1124e-04 gnorm: 1.24 [ 
1:49:23< 1:07:25] +[titan] 2025-06-13 14:30:56,938 - root - INFO - step: 9285 loss: 20.3398 memory: 6.46GiB(27.34%) tps: 25,947 tflops: 26.11 mfu: 8.37% global_avg_ntp_loss: 3.5391 global_avg_mtp_loss: 16.8007 +[titan] 2025-06-13 14:30:56,938 - root - INFO - lr: 2.1100e-04 gnorm: 1.19 [ 1:49:26< 1:07:21] +[titan] 2025-06-13 14:31:00,515 - root - INFO - step: 9290 loss: 19.3897 memory: 6.46GiB(27.34%) tps: 22,900 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.3727 global_avg_mtp_loss: 16.0170 +[titan] 2025-06-13 14:31:00,516 - root - INFO - lr: 2.1075e-04 gnorm: 1.16 [ 1:49:30< 1:07:18] +[titan] 2025-06-13 14:31:03,717 - root - INFO - step: 9295 loss: 19.2874 memory: 6.46GiB(27.34%) tps: 25,590 tflops: 25.75 mfu: 8.25% global_avg_ntp_loss: 3.3433 global_avg_mtp_loss: 15.9441 +[titan] 2025-06-13 14:31:03,717 - root - INFO - lr: 2.1051e-04 gnorm: 1.35 [ 1:49:33< 1:07:14] +[titan] 2025-06-13 14:31:06,773 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:31:07,589 - root - INFO - step: 9300 loss: 19.9561 memory: 6.46GiB(27.34%) tps: 21,159 tflops: 21.29 mfu: 6.82% global_avg_ntp_loss: 3.4121 global_avg_mtp_loss: 16.5440 +[titan] 2025-06-13 14:31:07,590 - root - INFO - lr: 2.1027e-04 gnorm: 1.24 [ 1:49:37< 1:07:11] +[titan] 2025-06-13 14:31:10,990 - root - INFO - step: 9305 loss: 18.4848 memory: 6.46GiB(27.34%) tps: 24,094 tflops: 24.25 mfu: 7.77% global_avg_ntp_loss: 3.1881 global_avg_mtp_loss: 15.2967 +[titan] 2025-06-13 14:31:10,990 - root - INFO - lr: 2.1003e-04 gnorm: 1.33 [ 1:49:40< 1:07:07] +[titan] 2025-06-13 14:31:14,044 - root - INFO - step: 9310 loss: 16.6295 memory: 6.46GiB(27.34%) tps: 26,827 tflops: 27.00 mfu: 8.65% global_avg_ntp_loss: 2.8945 global_avg_mtp_loss: 13.7351 +[titan] 2025-06-13 14:31:14,044 - root - INFO - lr: 2.0979e-04 gnorm: 1.24 [ 1:49:43< 1:07:03] +[titan] 2025-06-13 14:31:17,626 - root - INFO - step: 9315 loss: 19.3960 memory: 6.46GiB(27.34%) tps: 22,868 tflops: 23.01 mfu: 7.38% 
global_avg_ntp_loss: 3.3962 global_avg_mtp_loss: 15.9998 +[titan] 2025-06-13 14:31:17,627 - root - INFO - lr: 2.0955e-04 gnorm: 1.33 [ 1:49:47< 1:07:00] +[titan] 2025-06-13 14:31:21,122 - root - INFO - step: 9320 loss: 19.1780 memory: 6.46GiB(27.34%) tps: 23,438 tflops: 23.59 mfu: 7.56% global_avg_ntp_loss: 3.3506 global_avg_mtp_loss: 15.8274 +[titan] 2025-06-13 14:31:21,122 - root - INFO - lr: 2.0930e-04 gnorm: 1.28 [ 1:49:51< 1:06:56] +[titan] 2025-06-13 14:31:24,454 - root - INFO - step: 9325 loss: 19.3594 memory: 6.46GiB(27.34%) tps: 24,586 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.3300 global_avg_mtp_loss: 16.0294 +[titan] 2025-06-13 14:31:24,455 - root - INFO - lr: 2.0906e-04 gnorm: 1.34 [ 1:49:54< 1:06:53] +[titan] 2025-06-13 14:31:28,067 - root - INFO - step: 9330 loss: 19.0789 memory: 6.46GiB(27.34%) tps: 22,679 tflops: 22.82 mfu: 7.32% global_avg_ntp_loss: 3.2782 global_avg_mtp_loss: 15.8007 +[titan] 2025-06-13 14:31:28,068 - root - INFO - lr: 2.0882e-04 gnorm: 1.33 [ 1:49:58< 1:06:49] +[titan] 2025-06-13 14:31:31,865 - root - INFO - step: 9335 loss: 19.3739 memory: 6.46GiB(27.34%) tps: 21,572 tflops: 21.71 mfu: 6.96% global_avg_ntp_loss: 3.3739 global_avg_mtp_loss: 16.0000 +[titan] 2025-06-13 14:31:31,866 - root - INFO - lr: 2.0858e-04 gnorm: 1.29 [ 1:50:01< 1:06:46] +[titan] 2025-06-13 14:31:35,352 - root - INFO - step: 9340 loss: 20.1750 memory: 6.46GiB(27.34%) tps: 23,495 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.5221 global_avg_mtp_loss: 16.6529 +[titan] 2025-06-13 14:31:35,353 - root - INFO - lr: 2.0834e-04 gnorm: 1.21 [ 1:50:05< 1:06:42] +[titan] 2025-06-13 14:31:38,841 - root - INFO - step: 9345 loss: 19.7496 memory: 6.46GiB(27.34%) tps: 23,489 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 3.3873 global_avg_mtp_loss: 16.3623 +[titan] 2025-06-13 14:31:38,841 - root - INFO - lr: 2.0810e-04 gnorm: 1.33 [ 1:50:08< 1:06:39] +[titan] 2025-06-13 14:31:41,753 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:31:42,341 - root - INFO - step: 9350 loss: 19.3243 memory: 6.46GiB(27.34%) tps: 23,406 tflops: 23.56 mfu: 7.55% global_avg_ntp_loss: 3.3627 global_avg_mtp_loss: 15.9616 +[titan] 2025-06-13 14:31:42,341 - root - INFO - lr: 2.0786e-04 gnorm: 1.42 [ 1:50:12< 1:06:35] +[titan] 2025-06-13 14:31:45,678 - root - INFO - step: 9355 loss: 19.4388 memory: 6.46GiB(27.34%) tps: 24,553 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.3322 global_avg_mtp_loss: 16.1067 +[titan] 2025-06-13 14:31:45,678 - root - INFO - lr: 2.0762e-04 gnorm: 1.17 [ 1:50:15< 1:06:31] +[titan] 2025-06-13 14:31:49,138 - root - INFO - step: 9360 loss: 17.8874 memory: 6.46GiB(27.34%) tps: 23,679 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 3.0581 global_avg_mtp_loss: 14.8293 +[titan] 2025-06-13 14:31:49,138 - root - INFO - lr: 2.0738e-04 gnorm: 1.50 [ 1:50:19< 1:06:28] +[titan] 2025-06-13 14:31:52,761 - root - INFO - step: 9365 loss: 19.9511 memory: 6.46GiB(27.34%) tps: 22,610 tflops: 22.75 mfu: 7.29% global_avg_ntp_loss: 3.4623 global_avg_mtp_loss: 16.4888 +[titan] 2025-06-13 14:31:52,762 - root - INFO - lr: 2.0713e-04 gnorm: 1.17 [ 1:50:22< 1:06:24] +[titan] 2025-06-13 14:31:56,128 - root - INFO - step: 9370 loss: 19.1169 memory: 6.46GiB(27.34%) tps: 24,334 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 3.3144 global_avg_mtp_loss: 15.8025 +[titan] 2025-06-13 14:31:56,129 - root - INFO - lr: 2.0689e-04 gnorm: 1.25 [ 1:50:26< 1:06:21] +[titan] 2025-06-13 14:31:59,737 - root - INFO - step: 9375 loss: 20.7900 memory: 6.46GiB(27.34%) tps: 22,704 tflops: 22.85 mfu: 7.32% global_avg_ntp_loss: 3.6758 global_avg_mtp_loss: 17.1142 +[titan] 2025-06-13 14:31:59,738 - root - INFO - lr: 2.0665e-04 gnorm: 1.18 [ 1:50:29< 1:06:17] +[titan] 2025-06-13 14:32:02,906 - root - INFO - step: 9380 loss: 15.4306 memory: 6.46GiB(27.34%) tps: 25,854 tflops: 26.02 mfu: 8.34% global_avg_ntp_loss: 2.6542 global_avg_mtp_loss: 12.7764 +[titan] 2025-06-13 14:32:02,907 - root - INFO - lr: 2.0641e-04 gnorm: 1.41 [ 
1:50:32< 1:06:14] +[titan] 2025-06-13 14:32:06,437 - root - INFO - step: 9385 loss: 20.0163 memory: 6.46GiB(27.34%) tps: 23,203 tflops: 23.35 mfu: 7.48% global_avg_ntp_loss: 3.4962 global_avg_mtp_loss: 16.5201 +[titan] 2025-06-13 14:32:06,438 - root - INFO - lr: 2.0617e-04 gnorm: 1.20 [ 1:50:36< 1:06:10] +[titan] 2025-06-13 14:32:09,555 - root - INFO - step: 9390 loss: 18.4741 memory: 6.46GiB(27.34%) tps: 26,280 tflops: 26.45 mfu: 8.48% global_avg_ntp_loss: 3.1969 global_avg_mtp_loss: 15.2772 +[titan] 2025-06-13 14:32:09,555 - root - INFO - lr: 2.0593e-04 gnorm: 1.71 [ 1:50:39< 1:06:06] +[titan] 2025-06-13 14:32:13,524 - root - INFO - step: 9395 loss: 18.9522 memory: 6.46GiB(27.34%) tps: 20,639 tflops: 20.77 mfu: 6.66% global_avg_ntp_loss: 3.2571 global_avg_mtp_loss: 15.6950 +[titan] 2025-06-13 14:32:13,525 - root - INFO - lr: 2.0569e-04 gnorm: 1.36 [ 1:50:43< 1:06:03] +[titan] 2025-06-13 14:32:16,144 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:32:16,587 - root - INFO - step: 9400 loss: 19.7048 memory: 6.46GiB(27.34%) tps: 26,757 tflops: 26.93 mfu: 8.63% global_avg_ntp_loss: 3.3824 global_avg_mtp_loss: 16.3225 +[titan] 2025-06-13 14:32:16,587 - root - INFO - lr: 2.0545e-04 gnorm: 8.58 [ 1:50:46< 1:05:59] +[titan] 2025-06-13 14:32:20,154 - root - INFO - step: 9405 loss: 18.8357 memory: 6.46GiB(27.34%) tps: 22,964 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 3.2378 global_avg_mtp_loss: 15.5979 +[titan] 2025-06-13 14:32:20,155 - root - INFO - lr: 2.0521e-04 gnorm: 1.31 [ 1:50:50< 1:05:56] +[titan] 2025-06-13 14:32:23,243 - root - INFO - step: 9410 loss: 19.0675 memory: 6.46GiB(27.34%) tps: 26,528 tflops: 26.70 mfu: 8.56% global_avg_ntp_loss: 3.2453 global_avg_mtp_loss: 15.8222 +[titan] 2025-06-13 14:32:23,243 - root - INFO - lr: 2.0497e-04 gnorm: 1.21 [ 1:50:53< 1:05:52] +[titan] 2025-06-13 14:32:27,258 - root - INFO - step: 9415 loss: 18.4765 memory: 6.46GiB(27.34%) tps: 20,407 tflops: 20.54 mfu: 6.58% 
global_avg_ntp_loss: 3.1604 global_avg_mtp_loss: 15.3160 +[titan] 2025-06-13 14:32:27,258 - root - INFO - lr: 2.0473e-04 gnorm: 1.29 [ 1:50:57< 1:05:49] +[titan] 2025-06-13 14:32:30,893 - root - INFO - step: 9420 loss: 21.3494 memory: 6.46GiB(27.34%) tps: 22,540 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.7451 global_avg_mtp_loss: 17.6043 +[titan] 2025-06-13 14:32:30,893 - root - INFO - lr: 2.0449e-04 gnorm: 1.30 [ 1:51:00< 1:05:45] +[titan] 2025-06-13 14:32:34,473 - root - INFO - step: 9425 loss: 19.7069 memory: 6.46GiB(27.34%) tps: 22,884 tflops: 23.03 mfu: 7.38% global_avg_ntp_loss: 3.4242 global_avg_mtp_loss: 16.2827 +[titan] 2025-06-13 14:32:34,473 - root - INFO - lr: 2.0425e-04 gnorm: 1.33 [ 1:51:04< 1:05:42] +[titan] 2025-06-13 14:32:37,881 - root - INFO - step: 9430 loss: 19.6805 memory: 6.46GiB(27.34%) tps: 24,036 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.4278 global_avg_mtp_loss: 16.2527 +[titan] 2025-06-13 14:32:37,882 - root - INFO - lr: 2.0401e-04 gnorm: 1.20 [ 1:51:07< 1:05:38] +[titan] 2025-06-13 14:32:41,132 - root - INFO - step: 9435 loss: 19.7880 memory: 6.46GiB(27.34%) tps: 25,207 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.4350 global_avg_mtp_loss: 16.3530 +[titan] 2025-06-13 14:32:41,132 - root - INFO - lr: 2.0377e-04 gnorm: 1.38 [ 1:51:11< 1:05:34] +[titan] 2025-06-13 14:32:44,298 - root - INFO - step: 9440 loss: 18.3249 memory: 6.46GiB(27.34%) tps: 25,878 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 3.1995 global_avg_mtp_loss: 15.1254 +[titan] 2025-06-13 14:32:44,298 - root - INFO - lr: 2.0353e-04 gnorm: 1.87 [ 1:51:14< 1:05:31] +[titan] 2025-06-13 14:32:48,460 - root - INFO - step: 9445 loss: 19.2482 memory: 6.46GiB(27.34%) tps: 19,684 tflops: 19.81 mfu: 6.35% global_avg_ntp_loss: 3.2897 global_avg_mtp_loss: 15.9586 +[titan] 2025-06-13 14:32:48,461 - root - INFO - lr: 2.0329e-04 gnorm: 1.22 [ 1:51:18< 1:05:27] +[titan] 2025-06-13 14:32:51,625 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:32:52,158 - root - INFO - step: 9450 loss: 20.1846 memory: 6.46GiB(27.34%) tps: 22,157 tflops: 22.30 mfu: 7.15% global_avg_ntp_loss: 3.5175 global_avg_mtp_loss: 16.6672 +[titan] 2025-06-13 14:32:52,158 - root - INFO - lr: 2.0306e-04 gnorm: 1.18 [ 1:51:22< 1:05:24] +[titan] 2025-06-13 14:32:55,652 - root - INFO - step: 9455 loss: 18.6997 memory: 6.46GiB(27.34%) tps: 23,447 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.2870 global_avg_mtp_loss: 15.4127 +[titan] 2025-06-13 14:32:55,652 - root - INFO - lr: 2.0282e-04 gnorm: 1.11 [ 1:51:25< 1:05:20] +[titan] 2025-06-13 14:32:59,360 - root - INFO - step: 9460 loss: 19.9897 memory: 6.46GiB(27.34%) tps: 22,093 tflops: 22.23 mfu: 7.13% global_avg_ntp_loss: 3.4694 global_avg_mtp_loss: 16.5204 +[titan] 2025-06-13 14:32:59,361 - root - INFO - lr: 2.0258e-04 gnorm: 1.12 [ 1:51:29< 1:05:17] +[titan] 2025-06-13 14:33:03,236 - root - INFO - step: 9465 loss: 19.5946 memory: 6.46GiB(27.34%) tps: 21,139 tflops: 21.27 mfu: 6.82% global_avg_ntp_loss: 3.3955 global_avg_mtp_loss: 16.1991 +[titan] 2025-06-13 14:33:03,237 - root - INFO - lr: 2.0234e-04 gnorm: 1.28 [ 1:51:33< 1:05:14] +[titan] 2025-06-13 14:33:06,668 - root - INFO - step: 9470 loss: 18.8235 memory: 6.46GiB(27.34%) tps: 23,875 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.2316 global_avg_mtp_loss: 15.5920 +[titan] 2025-06-13 14:33:06,669 - root - INFO - lr: 2.0210e-04 gnorm: 1.25 [ 1:51:36< 1:05:10] +[titan] 2025-06-13 14:33:10,060 - root - INFO - step: 9475 loss: 20.5704 memory: 6.46GiB(27.34%) tps: 24,155 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 3.5612 global_avg_mtp_loss: 17.0092 +[titan] 2025-06-13 14:33:10,060 - root - INFO - lr: 2.0186e-04 gnorm: 1.37 [ 1:51:39< 1:05:06] +[titan] 2025-06-13 14:33:13,077 - root - INFO - step: 9480 loss: 19.7433 memory: 6.46GiB(27.34%) tps: 27,155 tflops: 27.33 mfu: 8.76% global_avg_ntp_loss: 3.4229 global_avg_mtp_loss: 16.3204 +[titan] 2025-06-13 14:33:13,078 - root - INFO - lr: 2.0162e-04 gnorm: 1.13 [ 
1:51:43< 1:05:03] +[titan] 2025-06-13 14:33:16,450 - root - INFO - step: 9485 loss: 19.5291 memory: 6.46GiB(27.34%) tps: 24,290 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.4615 global_avg_mtp_loss: 16.0676 +[titan] 2025-06-13 14:33:16,451 - root - INFO - lr: 2.0138e-04 gnorm: 1.37 [ 1:51:46< 1:04:59] +[titan] 2025-06-13 14:33:20,204 - root - INFO - step: 9490 loss: 19.7715 memory: 6.46GiB(27.34%) tps: 21,824 tflops: 21.96 mfu: 7.04% global_avg_ntp_loss: 3.4617 global_avg_mtp_loss: 16.3098 +[titan] 2025-06-13 14:33:20,205 - root - INFO - lr: 2.0115e-04 gnorm: 1.23 [ 1:51:50< 1:04:55] +[titan] 2025-06-13 14:33:24,265 - root - INFO - step: 9495 loss: 20.0435 memory: 6.46GiB(27.34%) tps: 20,179 tflops: 20.31 mfu: 6.51% global_avg_ntp_loss: 3.4487 global_avg_mtp_loss: 16.5949 +[titan] 2025-06-13 14:33:24,265 - root - INFO - lr: 2.0091e-04 gnorm: 1.20 [ 1:51:54< 1:04:52] +[titan] 2025-06-13 14:33:27,016 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:33:27,940 - root - INFO - step: 9500 loss: 20.0893 memory: 6.46GiB(27.34%) tps: 22,295 tflops: 22.44 mfu: 7.19% global_avg_ntp_loss: 3.4588 global_avg_mtp_loss: 16.6304 +[titan] 2025-06-13 14:33:27,940 - root - INFO - lr: 2.0067e-04 gnorm: 1.29 [ 1:51:57< 1:04:49] +[titan] 2025-06-13 14:33:31,632 - root - INFO - step: 9505 loss: 19.6103 memory: 6.46GiB(27.34%) tps: 22,189 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 3.3692 global_avg_mtp_loss: 16.2411 +[titan] 2025-06-13 14:33:31,632 - root - INFO - lr: 2.0043e-04 gnorm: 1.15 [ 1:52:01< 1:04:45] +[titan] 2025-06-13 14:33:35,115 - root - INFO - step: 9510 loss: 20.3180 memory: 6.46GiB(27.34%) tps: 23,524 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.6152 global_avg_mtp_loss: 16.7028 +[titan] 2025-06-13 14:33:35,115 - root - INFO - lr: 2.0019e-04 gnorm: 1.25 [ 1:52:05< 1:04:42] +[titan] 2025-06-13 14:33:38,508 - root - INFO - step: 9515 loss: 19.5165 memory: 6.46GiB(27.34%) tps: 24,149 tflops: 24.30 mfu: 7.79% 
global_avg_ntp_loss: 3.3539 global_avg_mtp_loss: 16.1626 +[titan] 2025-06-13 14:33:38,508 - root - INFO - lr: 1.9995e-04 gnorm: 1.10 [ 1:52:08< 1:04:38] +[titan] 2025-06-13 14:33:42,388 - root - INFO - step: 9520 loss: 20.1178 memory: 6.46GiB(27.34%) tps: 21,113 tflops: 21.25 mfu: 6.81% global_avg_ntp_loss: 3.5025 global_avg_mtp_loss: 16.6153 +[titan] 2025-06-13 14:33:42,389 - root - INFO - lr: 1.9972e-04 gnorm: 1.16 [ 1:52:12< 1:04:35] +[titan] 2025-06-13 14:33:45,866 - root - INFO - step: 9525 loss: 18.7402 memory: 6.46GiB(27.34%) tps: 23,560 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 3.2150 global_avg_mtp_loss: 15.5252 +[titan] 2025-06-13 14:33:45,866 - root - INFO - lr: 1.9948e-04 gnorm: 1.44 [ 1:52:15< 1:04:31] +[titan] 2025-06-13 14:33:49,768 - root - INFO - step: 9530 loss: 19.5326 memory: 6.46GiB(27.34%) tps: 20,997 tflops: 21.13 mfu: 6.77% global_avg_ntp_loss: 3.3860 global_avg_mtp_loss: 16.1466 +[titan] 2025-06-13 14:33:49,768 - root - INFO - lr: 1.9924e-04 gnorm: 1.24 [ 1:52:19< 1:04:28] +[titan] 2025-06-13 14:33:53,561 - root - INFO - step: 9535 loss: 19.3406 memory: 6.46GiB(27.34%) tps: 21,595 tflops: 21.73 mfu: 6.97% global_avg_ntp_loss: 3.2950 global_avg_mtp_loss: 16.0455 +[titan] 2025-06-13 14:33:53,562 - root - INFO - lr: 1.9900e-04 gnorm: 1.24 [ 1:52:23< 1:04:25] +[titan] 2025-06-13 14:33:56,683 - root - INFO - step: 9540 loss: 17.0356 memory: 6.46GiB(27.34%) tps: 26,249 tflops: 26.42 mfu: 8.47% global_avg_ntp_loss: 2.9012 global_avg_mtp_loss: 14.1344 +[titan] 2025-06-13 14:33:56,683 - root - INFO - lr: 1.9877e-04 gnorm: 1.36 [ 1:52:26< 1:04:21] +[titan] 2025-06-13 14:33:59,966 - root - INFO - step: 9545 loss: 20.9710 memory: 6.46GiB(27.34%) tps: 24,952 tflops: 25.11 mfu: 8.05% global_avg_ntp_loss: 3.7015 global_avg_mtp_loss: 17.2696 +[titan] 2025-06-13 14:33:59,967 - root - INFO - lr: 1.9853e-04 gnorm: 1.30 [ 1:52:29< 1:04:17] +[titan] 2025-06-13 14:34:02,705 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:34:03,530 - root - INFO - step: 9550 loss: 19.0553 memory: 6.46GiB(27.34%) tps: 22,992 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.2442 global_avg_mtp_loss: 15.8111 +[titan] 2025-06-13 14:34:03,530 - root - INFO - lr: 1.9829e-04 gnorm: 1.16 [ 1:52:33< 1:04:14] +[titan] 2025-06-13 14:34:07,524 - root - INFO - step: 9555 loss: 18.9625 memory: 6.46GiB(27.34%) tps: 20,512 tflops: 20.64 mfu: 6.62% global_avg_ntp_loss: 3.2869 global_avg_mtp_loss: 15.6757 +[titan] 2025-06-13 14:34:07,524 - root - INFO - lr: 1.9805e-04 gnorm: 1.22 [ 1:52:37< 1:04:10] +[titan] 2025-06-13 14:34:11,241 - root - INFO - step: 9560 loss: 20.0281 memory: 6.46GiB(27.34%) tps: 22,040 tflops: 22.18 mfu: 7.11% global_avg_ntp_loss: 3.4564 global_avg_mtp_loss: 16.5717 +[titan] 2025-06-13 14:34:11,242 - root - INFO - lr: 1.9782e-04 gnorm: 1.13 [ 1:52:41< 1:04:07] +[titan] 2025-06-13 14:34:14,138 - root - INFO - step: 9565 loss: 19.6087 memory: 6.46GiB(27.34%) tps: 28,283 tflops: 28.46 mfu: 9.12% global_avg_ntp_loss: 3.4039 global_avg_mtp_loss: 16.2048 +[titan] 2025-06-13 14:34:14,139 - root - INFO - lr: 1.9758e-04 gnorm: 1.18 [ 1:52:44< 1:04:03] +[titan] 2025-06-13 14:34:17,803 - root - INFO - step: 9570 loss: 18.9905 memory: 6.46GiB(27.34%) tps: 22,358 tflops: 22.50 mfu: 7.21% global_avg_ntp_loss: 3.2544 global_avg_mtp_loss: 15.7361 +[titan] 2025-06-13 14:34:17,803 - root - INFO - lr: 1.9734e-04 gnorm: 1.19 [ 1:52:47< 1:03:59] +[titan] 2025-06-13 14:34:21,625 - root - INFO - step: 9575 loss: 20.1482 memory: 6.46GiB(27.34%) tps: 21,435 tflops: 21.57 mfu: 6.91% global_avg_ntp_loss: 3.5457 global_avg_mtp_loss: 16.6026 +[titan] 2025-06-13 14:34:21,625 - root - INFO - lr: 1.9711e-04 gnorm: 1.14 [ 1:52:51< 1:03:56] +[titan] 2025-06-13 14:34:25,097 - root - INFO - step: 9580 loss: 18.2400 memory: 6.46GiB(27.34%) tps: 23,595 tflops: 23.75 mfu: 7.61% global_avg_ntp_loss: 3.1192 global_avg_mtp_loss: 15.1207 +[titan] 2025-06-13 14:34:25,098 - root - INFO - lr: 1.9687e-04 gnorm: 1.25 [ 
1:52:55< 1:03:53] +[titan] 2025-06-13 14:34:28,582 - root - INFO - step: 9585 loss: 19.3778 memory: 6.46GiB(27.34%) tps: 23,510 tflops: 23.66 mfu: 7.58% global_avg_ntp_loss: 3.2786 global_avg_mtp_loss: 16.0991 +[titan] 2025-06-13 14:34:28,582 - root - INFO - lr: 1.9663e-04 gnorm: 1.25 [ 1:52:58< 1:03:49] +[titan] 2025-06-13 14:34:32,005 - root - INFO - step: 9590 loss: 19.1506 memory: 6.46GiB(27.34%) tps: 23,940 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.3529 global_avg_mtp_loss: 15.7977 +[titan] 2025-06-13 14:34:32,005 - root - INFO - lr: 1.9640e-04 gnorm: 1.42 [ 1:53:01< 1:03:45] +[titan] 2025-06-13 14:34:35,363 - root - INFO - step: 9595 loss: 18.8975 memory: 6.46GiB(27.34%) tps: 24,392 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 3.2615 global_avg_mtp_loss: 15.6360 +[titan] 2025-06-13 14:34:35,364 - root - INFO - lr: 1.9616e-04 gnorm: 1.30 [ 1:53:05< 1:03:42] +[titan] 2025-06-13 14:34:38,122 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:34:38,904 - root - INFO - step: 9600 loss: 19.5049 memory: 6.46GiB(27.34%) tps: 23,144 tflops: 23.29 mfu: 7.47% global_avg_ntp_loss: 3.3703 global_avg_mtp_loss: 16.1346 +[titan] 2025-06-13 14:34:38,904 - root - INFO - lr: 1.9592e-04 gnorm: 1.21 [ 1:53:08< 1:03:38] +[titan] 2025-06-13 14:34:42,430 - root - INFO - step: 9605 loss: 19.5990 memory: 6.46GiB(27.34%) tps: 23,238 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.3997 global_avg_mtp_loss: 16.1993 +[titan] 2025-06-13 14:34:42,430 - root - INFO - lr: 1.9569e-04 gnorm: 1.35 [ 1:53:12< 1:03:35] +[titan] 2025-06-13 14:34:46,049 - root - INFO - step: 9610 loss: 20.0601 memory: 6.46GiB(27.34%) tps: 22,634 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.5946 global_avg_mtp_loss: 16.4655 +[titan] 2025-06-13 14:34:46,050 - root - INFO - lr: 1.9545e-04 gnorm: 1.29 [ 1:53:15< 1:03:31] +[titan] 2025-06-13 14:34:49,405 - root - INFO - step: 9615 loss: 19.4615 memory: 6.46GiB(27.34%) tps: 24,419 tflops: 24.57 mfu: 7.88% 
global_avg_ntp_loss: 3.3907 global_avg_mtp_loss: 16.0708 +[titan] 2025-06-13 14:34:49,405 - root - INFO - lr: 1.9521e-04 gnorm: 1.26 [ 1:53:19< 1:03:28] +[titan] 2025-06-13 14:34:53,257 - root - INFO - step: 9620 loss: 20.0429 memory: 6.46GiB(27.34%) tps: 21,266 tflops: 21.40 mfu: 6.86% global_avg_ntp_loss: 3.4764 global_avg_mtp_loss: 16.5665 +[titan] 2025-06-13 14:34:53,257 - root - INFO - lr: 1.9498e-04 gnorm: 1.27 [ 1:53:23< 1:03:24] +[titan] 2025-06-13 14:34:56,735 - root - INFO - step: 9625 loss: 20.3617 memory: 6.46GiB(27.34%) tps: 23,555 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 3.5269 global_avg_mtp_loss: 16.8347 +[titan] 2025-06-13 14:34:56,736 - root - INFO - lr: 1.9474e-04 gnorm: 1.25 [ 1:53:26< 1:03:21] +[titan] 2025-06-13 14:35:00,426 - root - INFO - step: 9630 loss: 17.4690 memory: 6.46GiB(27.34%) tps: 22,202 tflops: 22.34 mfu: 7.16% global_avg_ntp_loss: 2.9339 global_avg_mtp_loss: 14.5351 +[titan] 2025-06-13 14:35:00,426 - root - INFO - lr: 1.9451e-04 gnorm: 1.52 [ 1:53:30< 1:03:17] +[titan] 2025-06-13 14:35:03,728 - root - INFO - step: 9635 loss: 20.8185 memory: 6.46GiB(27.34%) tps: 24,808 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 3.6269 global_avg_mtp_loss: 17.1916 +[titan] 2025-06-13 14:35:03,728 - root - INFO - lr: 1.9427e-04 gnorm: 1.15 [ 1:53:33< 1:03:13] +[titan] 2025-06-13 14:35:07,201 - root - INFO - step: 9640 loss: 20.0038 memory: 6.46GiB(27.34%) tps: 23,589 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.4459 global_avg_mtp_loss: 16.5579 +[titan] 2025-06-13 14:35:07,202 - root - INFO - lr: 1.9404e-04 gnorm: 1.28 [ 1:53:37< 1:03:10] +[titan] 2025-06-13 14:35:11,204 - root - INFO - step: 9645 loss: 19.3208 memory: 6.46GiB(27.34%) tps: 20,472 tflops: 20.60 mfu: 6.60% global_avg_ntp_loss: 3.3341 global_avg_mtp_loss: 15.9867 +[titan] 2025-06-13 14:35:11,204 - root - INFO - lr: 1.9380e-04 gnorm: 1.20 [ 1:53:41< 1:03:07] +[titan] 2025-06-13 14:35:13,819 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:35:14,295 - root - INFO - step: 9650 loss: 19.1099 memory: 6.46GiB(27.34%) tps: 26,506 tflops: 26.67 mfu: 8.55% global_avg_ntp_loss: 3.2987 global_avg_mtp_loss: 15.8112 +[titan] 2025-06-13 14:35:14,295 - root - INFO - lr: 1.9356e-04 gnorm: 1.62 [ 1:53:44< 1:03:03] +[titan] 2025-06-13 14:35:18,199 - root - INFO - step: 9655 loss: 19.9623 memory: 6.46GiB(27.34%) tps: 20,984 tflops: 21.12 mfu: 6.77% global_avg_ntp_loss: 3.4727 global_avg_mtp_loss: 16.4897 +[titan] 2025-06-13 14:35:18,200 - root - INFO - lr: 1.9333e-04 gnorm: 1.15 [ 1:53:48< 1:03:00] +[titan] 2025-06-13 14:35:21,892 - root - INFO - step: 9660 loss: 19.1179 memory: 6.46GiB(27.34%) tps: 22,190 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 3.2723 global_avg_mtp_loss: 15.8456 +[titan] 2025-06-13 14:35:21,892 - root - INFO - lr: 1.9309e-04 gnorm: 1.25 [ 1:53:51< 1:02:56] +[titan] 2025-06-13 14:35:25,470 - root - INFO - step: 9665 loss: 20.1016 memory: 6.46GiB(27.34%) tps: 22,898 tflops: 23.04 mfu: 7.39% global_avg_ntp_loss: 3.4751 global_avg_mtp_loss: 16.6264 +[titan] 2025-06-13 14:35:25,470 - root - INFO - lr: 1.9286e-04 gnorm: 1.25 [ 1:53:55< 1:02:53] +[titan] 2025-06-13 14:35:29,606 - root - INFO - step: 9670 loss: 19.4358 memory: 6.46GiB(27.34%) tps: 19,809 tflops: 19.94 mfu: 6.39% global_avg_ntp_loss: 3.3325 global_avg_mtp_loss: 16.1033 +[titan] 2025-06-13 14:35:29,606 - root - INFO - lr: 1.9262e-04 gnorm: 1.11 [ 1:53:59< 1:02:49] +[titan] 2025-06-13 14:35:32,484 - root - INFO - step: 9675 loss: 18.8669 memory: 6.46GiB(27.34%) tps: 28,465 tflops: 28.65 mfu: 9.18% global_avg_ntp_loss: 3.2684 global_avg_mtp_loss: 15.5986 +[titan] 2025-06-13 14:35:32,484 - root - INFO - lr: 1.9239e-04 gnorm: 1.56 [ 1:54:02< 1:02:45] +[titan] 2025-06-13 14:35:36,015 - root - INFO - step: 9680 loss: 19.7306 memory: 6.46GiB(27.34%) tps: 23,207 tflops: 23.35 mfu: 7.49% global_avg_ntp_loss: 3.4236 global_avg_mtp_loss: 16.3070 +[titan] 2025-06-13 14:35:36,015 - root - INFO - lr: 1.9215e-04 gnorm: 1.19 [ 
1:54:05< 1:02:42] +[titan] 2025-06-13 14:35:39,741 - root - INFO - step: 9685 loss: 19.7742 memory: 6.46GiB(27.34%) tps: 21,984 tflops: 22.12 mfu: 7.09% global_avg_ntp_loss: 3.4464 global_avg_mtp_loss: 16.3278 +[titan] 2025-06-13 14:35:39,742 - root - INFO - lr: 1.9192e-04 gnorm: 1.20 [ 1:54:09< 1:02:38] +[titan] 2025-06-13 14:35:43,281 - root - INFO - step: 9690 loss: 18.9382 memory: 6.46GiB(27.34%) tps: 23,144 tflops: 23.29 mfu: 7.47% global_avg_ntp_loss: 3.2498 global_avg_mtp_loss: 15.6884 +[titan] 2025-06-13 14:35:43,282 - root - INFO - lr: 1.9168e-04 gnorm: 1.32 [ 1:54:13< 1:02:35] +[titan] 2025-06-13 14:35:46,679 - root - INFO - step: 9695 loss: 19.7202 memory: 6.46GiB(27.34%) tps: 24,118 tflops: 24.27 mfu: 7.78% global_avg_ntp_loss: 3.4158 global_avg_mtp_loss: 16.3044 +[titan] 2025-06-13 14:35:46,679 - root - INFO - lr: 1.9145e-04 gnorm: 1.35 [ 1:54:16< 1:02:31] +[titan] 2025-06-13 14:35:49,570 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:35:50,119 - root - INFO - step: 9700 loss: 19.5359 memory: 6.46GiB(27.34%) tps: 23,810 tflops: 23.96 mfu: 7.68% global_avg_ntp_loss: 3.3977 global_avg_mtp_loss: 16.1382 +[titan] 2025-06-13 14:35:50,120 - root - INFO - lr: 1.9122e-04 gnorm: 1.23 [ 1:54:20< 1:02:28] +[titan] 2025-06-13 14:35:53,847 - root - INFO - step: 9705 loss: 19.0620 memory: 6.46GiB(27.34%) tps: 21,980 tflops: 22.12 mfu: 7.09% global_avg_ntp_loss: 3.2818 global_avg_mtp_loss: 15.7802 +[titan] 2025-06-13 14:35:53,847 - root - INFO - lr: 1.9098e-04 gnorm: 1.15 [ 1:54:23< 1:02:24] +[titan] 2025-06-13 14:35:57,214 - root - INFO - step: 9710 loss: 18.9714 memory: 6.46GiB(27.34%) tps: 24,329 tflops: 24.48 mfu: 7.85% global_avg_ntp_loss: 3.2850 global_avg_mtp_loss: 15.6864 +[titan] 2025-06-13 14:35:57,215 - root - INFO - lr: 1.9075e-04 gnorm: 1.48 [ 1:54:27< 1:02:21] +[titan] 2025-06-13 14:36:00,751 - root - INFO - step: 9715 loss: 18.3309 memory: 6.46GiB(27.34%) tps: 23,167 tflops: 23.31 mfu: 7.47% 
global_avg_ntp_loss: 3.1862 global_avg_mtp_loss: 15.1446 +[titan] 2025-06-13 14:36:00,751 - root - INFO - lr: 1.9051e-04 gnorm: 1.23 [ 1:54:30< 1:02:17] +[titan] 2025-06-13 14:36:04,218 - root - INFO - step: 9720 loss: 19.5089 memory: 6.46GiB(27.34%) tps: 23,632 tflops: 23.78 mfu: 7.62% global_avg_ntp_loss: 3.3494 global_avg_mtp_loss: 16.1595 +[titan] 2025-06-13 14:36:04,218 - root - INFO - lr: 1.9028e-04 gnorm: 1.21 [ 1:54:34< 1:02:14] +[titan] 2025-06-13 14:36:08,030 - root - INFO - step: 9725 loss: 19.0723 memory: 6.46GiB(27.34%) tps: 21,494 tflops: 21.63 mfu: 6.93% global_avg_ntp_loss: 3.2983 global_avg_mtp_loss: 15.7740 +[titan] 2025-06-13 14:36:08,030 - root - INFO - lr: 1.9005e-04 gnorm: 1.26 [ 1:54:37< 1:02:10] +[titan] 2025-06-13 14:36:10,278 - root - INFO - Dumping profiler traces at step 9728 +[titan] 2025-06-13 14:36:10,370 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:36:11,567 - root - INFO - step: 9730 loss: 19.7434 memory: 6.46GiB(27.34%) tps: 23,159 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.3948 global_avg_mtp_loss: 16.3486 +[titan] 2025-06-13 14:36:11,568 - root - INFO - lr: 1.8981e-04 gnorm: 1.32 [ 1:54:41< 1:02:07] +[titan] 2025-06-13 14:36:15,365 - root - INFO - step: 9735 loss: 20.0378 memory: 6.46GiB(27.34%) tps: 21,572 tflops: 21.71 mfu: 6.96% global_avg_ntp_loss: 3.5119 global_avg_mtp_loss: 16.5259 +[titan] 2025-06-13 14:36:15,366 - root - INFO - lr: 1.8958e-04 gnorm: 1.21 [ 1:54:45< 1:02:03] +[titan] 2025-06-13 14:36:18,789 - root - INFO - step: 9740 loss: 19.4563 memory: 6.46GiB(27.34%) tps: 23,928 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 3.3702 global_avg_mtp_loss: 16.0861 +[titan] 2025-06-13 14:36:18,790 - root - INFO - lr: 1.8935e-04 gnorm: 1.21 [ 1:54:48< 1:02:00] +[titan] 2025-06-13 14:36:22,380 - root - INFO - step: 9745 loss: 20.1798 memory: 6.46GiB(27.34%) tps: 22,816 tflops: 22.96 mfu: 7.36% global_avg_ntp_loss: 3.4708 global_avg_mtp_loss: 16.7090 +[titan] 2025-06-13 
14:36:22,381 - root - INFO - lr: 1.8911e-04 gnorm: 1.21 [ 1:54:52< 1:01:56] +[titan] 2025-06-13 14:36:25,181 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:36:25,867 - root - INFO - step: 9750 loss: 18.2932 memory: 6.46GiB(27.34%) tps: 23,499 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.1424 global_avg_mtp_loss: 15.1509 +[titan] 2025-06-13 14:36:25,867 - root - INFO - lr: 1.8888e-04 gnorm: 1.27 [ 1:54:55< 1:01:53] +[titan] 2025-06-13 14:36:29,628 - root - INFO - step: 9755 loss: 18.8706 memory: 6.46GiB(27.34%) tps: 21,786 tflops: 21.92 mfu: 7.03% global_avg_ntp_loss: 3.3030 global_avg_mtp_loss: 15.5677 +[titan] 2025-06-13 14:36:29,628 - root - INFO - lr: 1.8865e-04 gnorm: 1.25 [ 1:54:59< 1:01:49] +[titan] 2025-06-13 14:36:33,660 - root - INFO - step: 9760 loss: 20.4403 memory: 6.46GiB(27.34%) tps: 20,319 tflops: 20.45 mfu: 6.55% global_avg_ntp_loss: 3.5590 global_avg_mtp_loss: 16.8813 +[titan] 2025-06-13 14:36:33,660 - root - INFO - lr: 1.8841e-04 gnorm: 1.18 [ 1:55:03< 1:01:46] +[titan] 2025-06-13 14:36:37,249 - root - INFO - step: 9765 loss: 18.6812 memory: 6.46GiB(27.34%) tps: 22,826 tflops: 22.97 mfu: 7.36% global_avg_ntp_loss: 3.2272 global_avg_mtp_loss: 15.4540 +[titan] 2025-06-13 14:36:37,249 - root - INFO - lr: 1.8818e-04 gnorm: 1.33 [ 1:55:07< 1:01:42] +[titan] 2025-06-13 14:36:41,074 - root - INFO - step: 9770 loss: 20.1477 memory: 6.46GiB(27.34%) tps: 21,422 tflops: 21.56 mfu: 6.91% global_avg_ntp_loss: 3.4856 global_avg_mtp_loss: 16.6621 +[titan] 2025-06-13 14:36:41,074 - root - INFO - lr: 1.8795e-04 gnorm: 1.36 [ 1:55:10< 1:01:39] +[titan] 2025-06-13 14:36:44,327 - root - INFO - step: 9775 loss: 20.3695 memory: 6.46GiB(27.34%) tps: 25,182 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 3.5209 global_avg_mtp_loss: 16.8486 +[titan] 2025-06-13 14:36:44,328 - root - INFO - lr: 1.8771e-04 gnorm: 1.20 [ 1:55:14< 1:01:35] +[titan] 2025-06-13 14:36:47,739 - root - INFO - step: 9780 loss: 18.8705 memory: 
6.46GiB(27.34%) tps: 24,017 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.3037 global_avg_mtp_loss: 15.5668 +[titan] 2025-06-13 14:36:47,739 - root - INFO - lr: 1.8748e-04 gnorm: 1.27 [ 1:55:17< 1:01:32] +[titan] 2025-06-13 14:36:51,392 - root - INFO - step: 9785 loss: 19.3874 memory: 6.46GiB(27.34%) tps: 22,425 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 3.3385 global_avg_mtp_loss: 16.0489 +[titan] 2025-06-13 14:36:51,392 - root - INFO - lr: 1.8725e-04 gnorm: 1.34 [ 1:55:21< 1:01:28] +[titan] 2025-06-13 14:36:55,007 - root - INFO - step: 9790 loss: 19.4410 memory: 6.46GiB(27.34%) tps: 22,662 tflops: 22.81 mfu: 7.31% global_avg_ntp_loss: 3.3582 global_avg_mtp_loss: 16.0828 +[titan] 2025-06-13 14:36:55,008 - root - INFO - lr: 1.8702e-04 gnorm: 1.26 [ 1:55:24< 1:01:25] +[titan] 2025-06-13 14:36:58,403 - root - INFO - step: 9795 loss: 19.5947 memory: 6.46GiB(27.34%) tps: 24,130 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.3901 global_avg_mtp_loss: 16.2046 +[titan] 2025-06-13 14:36:58,403 - root - INFO - lr: 1.8678e-04 gnorm: 1.20 [ 1:55:28< 1:01:21] +[titan] 2025-06-13 14:37:01,321 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:37:02,176 - root - INFO - step: 9800 loss: 17.8414 memory: 6.46GiB(27.34%) tps: 21,716 tflops: 21.85 mfu: 7.00% global_avg_ntp_loss: 3.1116 global_avg_mtp_loss: 14.7298 +[titan] 2025-06-13 14:37:02,176 - root - INFO - lr: 1.8655e-04 gnorm: 1.27 [ 1:55:32< 1:01:18] +[titan] 2025-06-13 14:37:05,277 - root - INFO - step: 9805 loss: 16.2017 memory: 6.46GiB(27.34%) tps: 26,416 tflops: 26.58 mfu: 8.52% global_avg_ntp_loss: 2.7836 global_avg_mtp_loss: 13.4180 +[titan] 2025-06-13 14:37:05,278 - root - INFO - lr: 1.8632e-04 gnorm: 1.67 [ 1:55:35< 1:01:14] +[titan] 2025-06-13 14:37:09,997 - root - INFO - step: 9810 loss: 19.3240 memory: 6.46GiB(27.34%) tps: 17,360 tflops: 17.47 mfu: 5.60% global_avg_ntp_loss: 3.3227 global_avg_mtp_loss: 16.0013 +[titan] 2025-06-13 14:37:09,997 - root - INFO - lr: 1.8609e-04 gnorm: 1.24 [ 1:55:39< 1:01:11] +[titan] 2025-06-13 14:37:13,399 - root - INFO - step: 9815 loss: 19.9701 memory: 6.46GiB(27.34%) tps: 24,080 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.4367 global_avg_mtp_loss: 16.5334 +[titan] 2025-06-13 14:37:13,402 - root - INFO - lr: 1.8586e-04 gnorm: 1.26 [ 1:55:43< 1:01:07] +[titan] 2025-06-13 14:37:16,798 - root - INFO - step: 9820 loss: 20.5329 memory: 6.46GiB(27.34%) tps: 24,129 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.6245 global_avg_mtp_loss: 16.9085 +[titan] 2025-06-13 14:37:16,798 - root - INFO - lr: 1.8562e-04 gnorm: 1.27 [ 1:55:46< 1:01:04] +[titan] 2025-06-13 14:37:20,319 - root - INFO - step: 9825 loss: 19.1962 memory: 6.46GiB(27.34%) tps: 23,269 tflops: 23.42 mfu: 7.51% global_avg_ntp_loss: 3.3099 global_avg_mtp_loss: 15.8863 +[titan] 2025-06-13 14:37:20,319 - root - INFO - lr: 1.8539e-04 gnorm: 1.21 [ 1:55:50< 1:01:00] +[titan] 2025-06-13 14:37:23,374 - root - INFO - step: 9830 loss: 17.6184 memory: 6.46GiB(27.34%) tps: 26,817 tflops: 26.99 mfu: 8.65% global_avg_ntp_loss: 3.0350 global_avg_mtp_loss: 14.5834 +[titan] 2025-06-13 14:37:23,374 - root - INFO - lr: 1.8516e-04 gnorm: 1.58 [ 
1:55:53< 1:00:57] +[titan] 2025-06-13 14:37:26,921 - root - INFO - step: 9835 loss: 19.9662 memory: 6.46GiB(27.34%) tps: 23,098 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 3.4483 global_avg_mtp_loss: 16.5179 +[titan] 2025-06-13 14:37:26,921 - root - INFO - lr: 1.8493e-04 gnorm: 1.15 [ 1:55:56< 1:00:53] +[titan] 2025-06-13 14:37:30,493 - root - INFO - step: 9840 loss: 20.3939 memory: 6.46GiB(27.34%) tps: 22,938 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 3.5731 global_avg_mtp_loss: 16.8209 +[titan] 2025-06-13 14:37:30,493 - root - INFO - lr: 1.8470e-04 gnorm: 1.22 [ 1:56:00< 1:00:49] +[titan] 2025-06-13 14:37:33,608 - root - INFO - step: 9845 loss: 19.7304 memory: 6.46GiB(27.34%) tps: 26,298 tflops: 26.47 mfu: 8.48% global_avg_ntp_loss: 3.3852 global_avg_mtp_loss: 16.3452 +[titan] 2025-06-13 14:37:33,609 - root - INFO - lr: 1.8447e-04 gnorm: 1.29 [ 1:56:03< 1:00:46] +[titan] 2025-06-13 14:37:36,835 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:37:37,477 - root - INFO - step: 9850 loss: 19.0741 memory: 6.46GiB(27.34%) tps: 21,176 tflops: 21.31 mfu: 6.83% global_avg_ntp_loss: 3.2798 global_avg_mtp_loss: 15.7943 +[titan] 2025-06-13 14:37:37,477 - root - INFO - lr: 1.8424e-04 gnorm: 1.31 [ 1:56:07< 1:00:42] +[titan] 2025-06-13 14:37:40,966 - root - INFO - step: 9855 loss: 19.2416 memory: 6.46GiB(27.34%) tps: 23,485 tflops: 23.63 mfu: 7.58% global_avg_ntp_loss: 3.3013 global_avg_mtp_loss: 15.9403 +[titan] 2025-06-13 14:37:40,966 - root - INFO - lr: 1.8401e-04 gnorm: 1.32 [ 1:56:10< 1:00:39] +[titan] 2025-06-13 14:37:44,325 - root - INFO - step: 9860 loss: 19.0508 memory: 6.46GiB(27.34%) tps: 24,388 tflops: 24.54 mfu: 7.87% global_avg_ntp_loss: 3.2394 global_avg_mtp_loss: 15.8114 +[titan] 2025-06-13 14:37:44,326 - root - INFO - lr: 1.8377e-04 gnorm: 1.20 [ 1:56:14< 1:00:35] +[titan] 2025-06-13 14:37:47,790 - root - INFO - step: 9865 loss: 19.7441 memory: 6.46GiB(27.34%) tps: 23,650 tflops: 23.80 mfu: 7.63% 
global_avg_ntp_loss: 3.3924 global_avg_mtp_loss: 16.3518 +[titan] 2025-06-13 14:37:47,790 - root - INFO - lr: 1.8354e-04 gnorm: 1.15 [ 1:56:17< 1:00:32] +[titan] 2025-06-13 14:37:51,840 - root - INFO - step: 9870 loss: 21.7743 memory: 6.46GiB(27.34%) tps: 20,227 tflops: 20.36 mfu: 6.52% global_avg_ntp_loss: 3.7719 global_avg_mtp_loss: 18.0024 +[titan] 2025-06-13 14:37:51,841 - root - INFO - lr: 1.8331e-04 gnorm: 1.47 [ 1:56:21< 1:00:28] +[titan] 2025-06-13 14:37:55,676 - root - INFO - step: 9875 loss: 17.4386 memory: 6.46GiB(27.34%) tps: 21,362 tflops: 21.50 mfu: 6.89% global_avg_ntp_loss: 2.9725 global_avg_mtp_loss: 14.4662 +[titan] 2025-06-13 14:37:55,676 - root - INFO - lr: 1.8308e-04 gnorm: 1.78 [ 1:56:25< 1:00:25] +[titan] 2025-06-13 14:37:58,881 - root - INFO - step: 9880 loss: 15.3133 memory: 6.46GiB(27.34%) tps: 25,562 tflops: 25.72 mfu: 8.25% global_avg_ntp_loss: 2.6460 global_avg_mtp_loss: 12.6673 +[titan] 2025-06-13 14:37:58,881 - root - INFO - lr: 1.8285e-04 gnorm: 1.35 [ 1:56:28< 1:00:21] +[titan] 2025-06-13 14:38:02,162 - root - INFO - step: 9885 loss: 19.0152 memory: 6.46GiB(27.34%) tps: 24,972 tflops: 25.13 mfu: 8.05% global_avg_ntp_loss: 3.2841 global_avg_mtp_loss: 15.7311 +[titan] 2025-06-13 14:38:02,162 - root - INFO - lr: 1.8262e-04 gnorm: 1.20 [ 1:56:32< 1:00:18] +[titan] 2025-06-13 14:38:05,923 - root - INFO - step: 9890 loss: 19.3393 memory: 6.46GiB(27.34%) tps: 21,781 tflops: 21.92 mfu: 7.03% global_avg_ntp_loss: 3.3951 global_avg_mtp_loss: 15.9441 +[titan] 2025-06-13 14:38:05,924 - root - INFO - lr: 1.8239e-04 gnorm: 1.24 [ 1:56:35< 1:00:14] +[titan] 2025-06-13 14:38:09,400 - root - INFO - step: 9895 loss: 18.2080 memory: 6.46GiB(27.34%) tps: 23,563 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 3.1614 global_avg_mtp_loss: 15.0465 +[titan] 2025-06-13 14:38:09,401 - root - INFO - lr: 1.8216e-04 gnorm: 1.32 [ 1:56:39< 1:00:11] +[titan] 2025-06-13 14:38:12,674 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:38:13,341 - root - INFO - step: 9900 loss: 19.5660 memory: 6.46GiB(27.34%) tps: 20,791 tflops: 20.92 mfu: 6.71% global_avg_ntp_loss: 3.3540 global_avg_mtp_loss: 16.2119 +[titan] 2025-06-13 14:38:13,341 - root - INFO - lr: 1.8193e-04 gnorm: 1.18 [ 1:56:43< 1:00:07] +[titan] 2025-06-13 14:38:16,733 - root - INFO - step: 9905 loss: 20.0017 memory: 6.46GiB(27.34%) tps: 24,151 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.4440 global_avg_mtp_loss: 16.5577 +[titan] 2025-06-13 14:38:16,734 - root - INFO - lr: 1.8170e-04 gnorm: 1.19 [ 1:56:46< 1:00:04] +[titan] 2025-06-13 14:38:20,005 - root - INFO - step: 9910 loss: 20.0980 memory: 6.46GiB(27.34%) tps: 25,048 tflops: 25.21 mfu: 8.08% global_avg_ntp_loss: 3.4905 global_avg_mtp_loss: 16.6076 +[titan] 2025-06-13 14:38:20,005 - root - INFO - lr: 1.8147e-04 gnorm: 1.20 [ 1:56:49< 1:00:00] +[titan] 2025-06-13 14:38:23,396 - root - INFO - step: 9915 loss: 18.7589 memory: 6.46GiB(27.34%) tps: 24,158 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 3.2780 global_avg_mtp_loss: 15.4809 +[titan] 2025-06-13 14:38:23,396 - root - INFO - lr: 1.8124e-04 gnorm: 1.96 [ 1:56:53< 0:59:56] +[titan] 2025-06-13 14:38:26,537 - root - INFO - step: 9920 loss: 18.1078 memory: 6.46GiB(27.34%) tps: 26,089 tflops: 26.26 mfu: 8.42% global_avg_ntp_loss: 3.1021 global_avg_mtp_loss: 15.0057 +[titan] 2025-06-13 14:38:26,537 - root - INFO - lr: 1.8101e-04 gnorm: 1.28 [ 1:56:56< 0:59:53] +[titan] 2025-06-13 14:38:30,353 - root - INFO - step: 9925 loss: 20.4459 memory: 6.46GiB(27.34%) tps: 21,468 tflops: 21.61 mfu: 6.92% global_avg_ntp_loss: 3.5544 global_avg_mtp_loss: 16.8915 +[titan] 2025-06-13 14:38:30,353 - root - INFO - lr: 1.8078e-04 gnorm: 1.19 [ 1:57:00< 0:59:49] +[titan] 2025-06-13 14:38:33,554 - root - INFO - step: 9930 loss: 19.1001 memory: 6.46GiB(27.34%) tps: 25,598 tflops: 25.76 mfu: 8.26% global_avg_ntp_loss: 3.2497 global_avg_mtp_loss: 15.8503 +[titan] 2025-06-13 14:38:33,554 - root - INFO - lr: 1.8056e-04 gnorm: 1.29 [ 
1:57:03< 0:59:45] +[titan] 2025-06-13 14:38:37,423 - root - INFO - step: 9935 loss: 19.7558 memory: 6.46GiB(27.34%) tps: 21,174 tflops: 21.31 mfu: 6.83% global_avg_ntp_loss: 3.4321 global_avg_mtp_loss: 16.3237 +[titan] 2025-06-13 14:38:37,423 - root - INFO - lr: 1.8033e-04 gnorm: 1.40 [ 1:57:07< 0:59:42] +[titan] 2025-06-13 14:38:42,248 - root - INFO - step: 9940 loss: 18.3657 memory: 6.46GiB(27.34%) tps: 16,981 tflops: 17.09 mfu: 5.48% global_avg_ntp_loss: 3.1675 global_avg_mtp_loss: 15.1982 +[titan] 2025-06-13 14:38:42,248 - root - INFO - lr: 1.8010e-04 gnorm: 1.20 [ 1:57:12< 0:59:39] +[titan] 2025-06-13 14:38:45,964 - root - INFO - step: 9945 loss: 17.8947 memory: 6.46GiB(27.34%) tps: 22,049 tflops: 22.19 mfu: 7.11% global_avg_ntp_loss: 3.1063 global_avg_mtp_loss: 14.7884 +[titan] 2025-06-13 14:38:45,964 - root - INFO - lr: 1.7987e-04 gnorm: 1.26 [ 1:57:15< 0:59:36] +[titan] 2025-06-13 14:38:49,603 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:38:50,171 - root - INFO - step: 9950 loss: 19.6694 memory: 6.46GiB(27.34%) tps: 19,473 tflops: 19.60 mfu: 6.28% global_avg_ntp_loss: 3.4354 global_avg_mtp_loss: 16.2341 +[titan] 2025-06-13 14:38:50,171 - root - INFO - lr: 1.7964e-04 gnorm: 1.40 [ 1:57:20< 0:59:33] +[titan] 2025-06-13 14:38:53,727 - root - INFO - step: 9955 loss: 19.4985 memory: 6.46GiB(27.34%) tps: 23,037 tflops: 23.18 mfu: 7.43% global_avg_ntp_loss: 3.3924 global_avg_mtp_loss: 16.1061 +[titan] 2025-06-13 14:38:53,727 - root - INFO - lr: 1.7941e-04 gnorm: 1.23 [ 1:57:23< 0:59:29] +[titan] 2025-06-13 14:38:57,098 - root - INFO - step: 9960 loss: 16.9429 memory: 6.46GiB(27.34%) tps: 24,309 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 2.9010 global_avg_mtp_loss: 14.0419 +[titan] 2025-06-13 14:38:57,098 - root - INFO - lr: 1.7918e-04 gnorm: 1.76 [ 1:57:26< 0:59:25] +[titan] 2025-06-13 14:39:00,394 - root - INFO - step: 9965 loss: 18.8581 memory: 6.46GiB(27.34%) tps: 24,854 tflops: 25.01 mfu: 8.02% 
global_avg_ntp_loss: 3.2030 global_avg_mtp_loss: 15.6551 +[titan] 2025-06-13 14:39:00,394 - root - INFO - lr: 1.7895e-04 gnorm: 1.52 [ 1:57:30< 0:59:22] +[titan] 2025-06-13 14:39:04,022 - root - INFO - step: 9970 loss: 19.7837 memory: 6.46GiB(27.34%) tps: 22,586 tflops: 22.73 mfu: 7.29% global_avg_ntp_loss: 3.4333 global_avg_mtp_loss: 16.3505 +[titan] 2025-06-13 14:39:04,022 - root - INFO - lr: 1.7873e-04 gnorm: 1.29 [ 1:57:33< 0:59:18] +[titan] 2025-06-13 14:39:07,432 - root - INFO - step: 9975 loss: 19.0446 memory: 6.46GiB(27.34%) tps: 24,024 tflops: 24.18 mfu: 7.75% global_avg_ntp_loss: 3.2904 global_avg_mtp_loss: 15.7543 +[titan] 2025-06-13 14:39:07,432 - root - INFO - lr: 1.7850e-04 gnorm: 1.20 [ 1:57:37< 0:59:15] +[titan] 2025-06-13 14:39:11,147 - root - INFO - step: 9980 loss: 20.0519 memory: 6.46GiB(27.34%) tps: 22,057 tflops: 22.20 mfu: 7.11% global_avg_ntp_loss: 3.4960 global_avg_mtp_loss: 16.5559 +[titan] 2025-06-13 14:39:11,147 - root - INFO - lr: 1.7827e-04 gnorm: 1.16 [ 1:57:41< 0:59:11] +[titan] 2025-06-13 14:39:14,396 - root - INFO - step: 9985 loss: 19.3290 memory: 6.46GiB(27.34%) tps: 25,213 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.3388 global_avg_mtp_loss: 15.9903 +[titan] 2025-06-13 14:39:14,396 - root - INFO - lr: 1.7804e-04 gnorm: 1.21 [ 1:57:44< 0:59:08] +[titan] 2025-06-13 14:39:18,056 - root - INFO - step: 9990 loss: 19.1178 memory: 6.46GiB(27.34%) tps: 22,385 tflops: 22.53 mfu: 7.22% global_avg_ntp_loss: 3.3277 global_avg_mtp_loss: 15.7901 +[titan] 2025-06-13 14:39:18,056 - root - INFO - lr: 1.7782e-04 gnorm: 1.29 [ 1:57:47< 0:59:04] +[titan] 2025-06-13 14:39:21,195 - root - INFO - step: 9995 loss: 19.5431 memory: 6.46GiB(27.34%) tps: 26,100 tflops: 26.27 mfu: 8.42% global_avg_ntp_loss: 3.3954 global_avg_mtp_loss: 16.1478 +[titan] 2025-06-13 14:39:21,196 - root - INFO - lr: 1.7759e-04 gnorm: 1.18 [ 1:57:51< 0:59:00] +[titan] 2025-06-13 14:39:24,047 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:39:24,689 - root - INFO - step: 10000 loss: 18.5252 memory: 6.46GiB(27.34%) tps: 23,450 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.1728 global_avg_mtp_loss: 15.3523 +[titan] 2025-06-13 14:39:24,689 - root - INFO - lr: 1.7736e-04 gnorm: 1.25 [ 1:57:54< 0:58:57] +[titan] 2025-06-13 14:39:24,690 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-06-13 14:39:25,747 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds. +[titan] 2025-06-13 14:39:25,747 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 1.06 seconds. +[titan] 2025-06-13 14:39:28,982 - root - INFO - step: 10005 loss: 18.8759 memory: 6.46GiB(27.34%) tps: 19,086 tflops: 19.21 mfu: 6.16% global_avg_ntp_loss: 3.3052 global_avg_mtp_loss: 15.5707 +[titan] 2025-06-13 14:39:28,982 - root - INFO - lr: 1.7713e-04 gnorm: 1.16 [ 1:57:58< 0:58:54] +[titan] 2025-06-13 14:39:32,984 - root - INFO - step: 10010 loss: 18.2907 memory: 6.46GiB(27.34%) tps: 20,470 tflops: 20.60 mfu: 6.60% global_avg_ntp_loss: 3.1476 global_avg_mtp_loss: 15.1431 +[titan] 2025-06-13 14:39:32,984 - root - INFO - lr: 1.7691e-04 gnorm: 1.32 [ 1:58:02< 0:58:50] +[titan] 2025-06-13 14:39:36,345 - root - INFO - step: 10015 loss: 18.4957 memory: 6.46GiB(27.34%) tps: 24,375 tflops: 24.53 mfu: 7.86% global_avg_ntp_loss: 3.1878 global_avg_mtp_loss: 15.3079 +[titan] 2025-06-13 14:39:36,346 - root - INFO - lr: 1.7668e-04 gnorm: 1.35 [ 1:58:06< 0:58:47] +[titan] 2025-06-13 14:39:40,207 - root - INFO - step: 10020 loss: 18.0652 memory: 6.46GiB(27.34%) tps: 21,218 tflops: 21.35 mfu: 6.84% global_avg_ntp_loss: 3.1326 global_avg_mtp_loss: 14.9326 +[titan] 2025-06-13 14:39:40,207 - root - INFO - lr: 1.7645e-04 gnorm: 1.26 [ 1:58:10< 0:58:43] +[titan] 2025-06-13 14:39:43,696 - root - INFO - step: 10025 loss: 18.8936 memory: 6.46GiB(27.34%) tps: 23,480 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.2621 global_avg_mtp_loss: 15.6316 +[titan] 2025-06-13 
14:39:43,696 - root - INFO - lr: 1.7622e-04 gnorm: 1.33 [ 1:58:13< 0:58:40] +[titan] 2025-06-13 14:39:47,959 - root - INFO - step: 10030 loss: 18.7066 memory: 6.46GiB(27.34%) tps: 19,219 tflops: 19.34 mfu: 6.20% global_avg_ntp_loss: 3.2650 global_avg_mtp_loss: 15.4416 +[titan] 2025-06-13 14:39:47,959 - root - INFO - lr: 1.7600e-04 gnorm: 1.42 [ 1:58:17< 0:58:37] +[titan] 2025-06-13 14:39:51,035 - root - INFO - step: 10035 loss: 18.9353 memory: 6.46GiB(27.34%) tps: 26,640 tflops: 26.81 mfu: 8.59% global_avg_ntp_loss: 3.2764 global_avg_mtp_loss: 15.6589 +[titan] 2025-06-13 14:39:51,035 - root - INFO - lr: 1.7577e-04 gnorm: 1.27 [ 1:58:20< 0:58:33] +[titan] 2025-06-13 14:39:54,973 - root - INFO - step: 10040 loss: 19.3735 memory: 6.46GiB(27.34%) tps: 20,803 tflops: 20.94 mfu: 6.71% global_avg_ntp_loss: 3.3310 global_avg_mtp_loss: 16.0425 +[titan] 2025-06-13 14:39:54,974 - root - INFO - lr: 1.7554e-04 gnorm: 2.86 [ 1:58:24< 0:58:29] +[titan] 2025-06-13 14:39:58,368 - root - INFO - step: 10045 loss: 18.5163 memory: 6.46GiB(27.34%) tps: 24,136 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.2084 global_avg_mtp_loss: 15.3078 +[titan] 2025-06-13 14:39:58,368 - root - INFO - lr: 1.7532e-04 gnorm: 1.39 [ 1:58:28< 0:58:26] +[titan] 2025-06-13 14:40:00,802 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:40:01,708 - root - INFO - step: 10050 loss: 19.4952 memory: 6.46GiB(27.34%) tps: 24,531 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.3397 global_avg_mtp_loss: 16.1555 +[titan] 2025-06-13 14:40:01,708 - root - INFO - lr: 1.7509e-04 gnorm: 1.23 [ 1:58:31< 0:58:22] +[titan] 2025-06-13 14:40:05,285 - root - INFO - step: 10055 loss: 16.4828 memory: 6.46GiB(27.34%) tps: 22,903 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 2.8114 global_avg_mtp_loss: 13.6714 +[titan] 2025-06-13 14:40:05,285 - root - INFO - lr: 1.7487e-04 gnorm: 1.85 [ 1:58:35< 0:58:19] +[titan] 2025-06-13 14:40:08,681 - root - INFO - step: 10060 loss: 17.6990 memory: 6.46GiB(27.34%) tps: 24,127 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.0140 global_avg_mtp_loss: 14.6850 +[titan] 2025-06-13 14:40:08,681 - root - INFO - lr: 1.7464e-04 gnorm: 1.61 [ 1:58:38< 0:58:15] +[titan] 2025-06-13 14:40:12,205 - root - INFO - step: 10065 loss: 19.2714 memory: 6.46GiB(27.34%) tps: 23,252 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 3.3867 global_avg_mtp_loss: 15.8847 +[titan] 2025-06-13 14:40:12,205 - root - INFO - lr: 1.7441e-04 gnorm: 1.26 [ 1:58:42< 0:58:12] +[titan] 2025-06-13 14:40:15,864 - root - INFO - step: 10070 loss: 18.1096 memory: 6.46GiB(27.34%) tps: 22,390 tflops: 22.53 mfu: 7.22% global_avg_ntp_loss: 3.1622 global_avg_mtp_loss: 14.9474 +[titan] 2025-06-13 14:40:15,864 - root - INFO - lr: 1.7419e-04 gnorm: 1.30 [ 1:58:45< 0:58:08] +[titan] 2025-06-13 14:40:19,334 - root - INFO - step: 10075 loss: 18.8404 memory: 6.46GiB(27.34%) tps: 23,612 tflops: 23.76 mfu: 7.62% global_avg_ntp_loss: 3.2179 global_avg_mtp_loss: 15.6225 +[titan] 2025-06-13 14:40:19,334 - root - INFO - lr: 1.7396e-04 gnorm: 1.42 [ 1:58:49< 0:58:04] +[titan] 2025-06-13 14:40:22,923 - root - INFO - step: 10080 loss: 17.9147 memory: 6.46GiB(27.34%) tps: 22,826 tflops: 22.97 mfu: 7.36% global_avg_ntp_loss: 3.0668 global_avg_mtp_loss: 14.8479 +[titan] 2025-06-13 14:40:22,923 - root - INFO - lr: 1.7374e-04 gnorm: 
1.56 [ 1:58:52< 0:58:01] +[titan] 2025-06-13 14:40:26,779 - root - INFO - step: 10085 loss: 19.4934 memory: 6.46GiB(27.34%) tps: 21,248 tflops: 21.38 mfu: 6.85% global_avg_ntp_loss: 3.3342 global_avg_mtp_loss: 16.1591 +[titan] 2025-06-13 14:40:26,780 - root - INFO - lr: 1.7351e-04 gnorm: 1.23 [ 1:58:56< 0:57:58] +[titan] 2025-06-13 14:40:30,182 - root - INFO - step: 10090 loss: 20.0447 memory: 6.46GiB(27.34%) tps: 24,076 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.4758 global_avg_mtp_loss: 16.5689 +[titan] 2025-06-13 14:40:30,183 - root - INFO - lr: 1.7329e-04 gnorm: 1.17 [ 1:59:00< 0:57:54] +[titan] 2025-06-13 14:40:33,680 - root - INFO - step: 10095 loss: 19.2489 memory: 6.46GiB(27.34%) tps: 23,422 tflops: 23.57 mfu: 7.55% global_avg_ntp_loss: 3.3084 global_avg_mtp_loss: 15.9405 +[titan] 2025-06-13 14:40:33,681 - root - INFO - lr: 1.7306e-04 gnorm: 1.33 [ 1:59:03< 0:57:50] +[titan] 2025-06-13 14:40:36,557 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:40:37,250 - root - INFO - step: 10100 loss: 19.3021 memory: 6.46GiB(27.34%) tps: 22,950 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.2760 global_avg_mtp_loss: 16.0260 +[titan] 2025-06-13 14:40:37,251 - root - INFO - lr: 1.7284e-04 gnorm: 1.47 [ 1:59:07< 0:57:47] +[titan] 2025-06-13 14:40:41,271 - root - INFO - step: 10105 loss: 18.2146 memory: 6.46GiB(27.34%) tps: 20,379 tflops: 20.51 mfu: 6.57% global_avg_ntp_loss: 3.0843 global_avg_mtp_loss: 15.1303 +[titan] 2025-06-13 14:40:41,271 - root - INFO - lr: 1.7261e-04 gnorm: 1.46 [ 1:59:11< 0:57:44] +[titan] 2025-06-13 14:40:44,579 - root - INFO - step: 10110 loss: 19.3919 memory: 6.46GiB(27.34%) tps: 24,764 tflops: 24.92 mfu: 7.99% global_avg_ntp_loss: 3.2911 global_avg_mtp_loss: 16.1007 +[titan] 2025-06-13 14:40:44,580 - root - INFO - lr: 1.7239e-04 gnorm: 1.28 [ 1:59:14< 0:57:40] +[titan] 2025-06-13 14:40:48,627 - root - INFO - step: 10115 loss: 19.4358 memory: 6.46GiB(27.34%) tps: 20,240 tflops: 20.37 mfu: 6.53% 
global_avg_ntp_loss: 3.3755 global_avg_mtp_loss: 16.0603 +[titan] 2025-06-13 14:40:48,627 - root - INFO - lr: 1.7216e-04 gnorm: 1.27 [ 1:59:18< 0:57:37] +[titan] 2025-06-13 14:40:52,182 - root - INFO - step: 10120 loss: 20.0040 memory: 6.46GiB(27.34%) tps: 23,049 tflops: 23.20 mfu: 7.43% global_avg_ntp_loss: 3.4448 global_avg_mtp_loss: 16.5592 +[titan] 2025-06-13 14:40:52,182 - root - INFO - lr: 1.7194e-04 gnorm: 1.26 [ 1:59:22< 0:57:33] +[titan] 2025-06-13 14:40:55,592 - root - INFO - step: 10125 loss: 19.1720 memory: 6.46GiB(27.34%) tps: 24,021 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.3084 global_avg_mtp_loss: 15.8636 +[titan] 2025-06-13 14:40:55,593 - root - INFO - lr: 1.7171e-04 gnorm: 1.44 [ 1:59:25< 0:57:30] +[titan] 2025-06-13 14:40:59,189 - root - INFO - step: 10130 loss: 19.2715 memory: 6.46GiB(27.34%) tps: 22,783 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.3120 global_avg_mtp_loss: 15.9594 +[titan] 2025-06-13 14:40:59,189 - root - INFO - lr: 1.7149e-04 gnorm: 1.21 [ 1:59:29< 0:57:26] +[titan] 2025-06-13 14:41:02,684 - root - INFO - step: 10135 loss: 18.9989 memory: 6.46GiB(27.34%) tps: 23,443 tflops: 23.59 mfu: 7.56% global_avg_ntp_loss: 3.2695 global_avg_mtp_loss: 15.7294 +[titan] 2025-06-13 14:41:02,684 - root - INFO - lr: 1.7127e-04 gnorm: 1.20 [ 1:59:32< 0:57:22] +[titan] 2025-06-13 14:41:06,253 - root - INFO - step: 10140 loss: 19.7164 memory: 6.46GiB(27.34%) tps: 22,957 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.3905 global_avg_mtp_loss: 16.3259 +[titan] 2025-06-13 14:41:06,253 - root - INFO - lr: 1.7104e-04 gnorm: 1.21 [ 1:59:36< 0:57:19] +[titan] 2025-06-13 14:41:10,203 - root - INFO - step: 10145 loss: 19.6096 memory: 6.46GiB(27.34%) tps: 20,742 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 3.3723 global_avg_mtp_loss: 16.2374 +[titan] 2025-06-13 14:41:10,203 - root - INFO - lr: 1.7082e-04 gnorm: 1.19 [ 1:59:40< 0:57:16] +[titan] 2025-06-13 14:41:12,856 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:41:13,540 - root - INFO - step: 10150 loss: 19.1735 memory: 6.46GiB(27.34%) tps: 24,555 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.3385 global_avg_mtp_loss: 15.8350 +[titan] 2025-06-13 14:41:13,540 - root - INFO - lr: 1.7059e-04 gnorm: 1.43 [ 1:59:43< 0:57:12] +[titan] 2025-06-13 14:41:16,770 - root - INFO - step: 10155 loss: 19.2418 memory: 6.46GiB(27.34%) tps: 25,362 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.3070 global_avg_mtp_loss: 15.9348 +[titan] 2025-06-13 14:41:16,770 - root - INFO - lr: 1.7037e-04 gnorm: 1.23 [ 1:59:46< 0:57:08] +[titan] 2025-06-13 14:41:20,309 - root - INFO - step: 10160 loss: 19.4241 memory: 6.46GiB(27.34%) tps: 23,151 tflops: 23.30 mfu: 7.47% global_avg_ntp_loss: 3.3378 global_avg_mtp_loss: 16.0863 +[titan] 2025-06-13 14:41:20,310 - root - INFO - lr: 1.7015e-04 gnorm: 1.34 [ 1:59:50< 0:57:05] +[titan] 2025-06-13 14:41:23,870 - root - INFO - step: 10165 loss: 17.8603 memory: 6.46GiB(27.34%) tps: 23,011 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 3.0546 global_avg_mtp_loss: 14.8057 +[titan] 2025-06-13 14:41:23,870 - root - INFO - lr: 1.6992e-04 gnorm: 1.60 [ 1:59:53< 0:57:01] +[titan] 2025-06-13 14:41:27,188 - root - INFO - step: 10170 loss: 19.7506 memory: 6.46GiB(27.34%) tps: 24,691 tflops: 24.85 mfu: 7.96% global_avg_ntp_loss: 3.4329 global_avg_mtp_loss: 16.3177 +[titan] 2025-06-13 14:41:27,188 - root - INFO - lr: 1.6970e-04 gnorm: 1.22 [ 1:59:57< 0:56:58] +[titan] 2025-06-13 14:41:30,544 - root - INFO - step: 10175 loss: 17.6700 memory: 6.46GiB(27.34%) tps: 24,410 tflops: 24.57 mfu: 7.87% global_avg_ntp_loss: 3.0358 global_avg_mtp_loss: 14.6343 +[titan] 2025-06-13 14:41:30,545 - root - INFO - lr: 1.6948e-04 gnorm: 1.30 [ 2:00:00< 0:56:54] +[titan] 2025-06-13 14:41:33,944 - root - INFO - step: 10180 loss: 15.1464 memory: 6.46GiB(27.34%) tps: 24,102 tflops: 24.26 mfu: 7.77% global_avg_ntp_loss: 2.6379 global_avg_mtp_loss: 12.5085 +[titan] 2025-06-13 14:41:33,944 - root - INFO - lr: 1.6925e-04 gnorm: 
1.56 [ 2:00:03< 0:56:50] +[titan] 2025-06-13 14:41:37,415 - root - INFO - step: 10185 loss: 15.9671 memory: 6.46GiB(27.34%) tps: 23,606 tflops: 23.76 mfu: 7.61% global_avg_ntp_loss: 2.7178 global_avg_mtp_loss: 13.2493 +[titan] 2025-06-13 14:41:37,415 - root - INFO - lr: 1.6903e-04 gnorm: 2.22 [ 2:00:07< 0:56:47] +[titan] 2025-06-13 14:41:40,780 - root - INFO - step: 10190 loss: 19.6082 memory: 6.46GiB(27.34%) tps: 24,342 tflops: 24.50 mfu: 7.85% global_avg_ntp_loss: 3.4481 global_avg_mtp_loss: 16.1601 +[titan] 2025-06-13 14:41:40,781 - root - INFO - lr: 1.6881e-04 gnorm: 1.37 [ 2:00:10< 0:56:43] +[titan] 2025-06-13 14:41:44,705 - root - INFO - step: 10195 loss: 19.4970 memory: 6.46GiB(27.34%) tps: 20,876 tflops: 21.01 mfu: 6.73% global_avg_ntp_loss: 3.3216 global_avg_mtp_loss: 16.1754 +[titan] 2025-06-13 14:41:44,705 - root - INFO - lr: 1.6859e-04 gnorm: 1.18 [ 2:00:14< 0:56:40] +[titan] 2025-06-13 14:41:47,232 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:41:47,936 - root - INFO - step: 10200 loss: 19.9276 memory: 6.46GiB(27.34%) tps: 25,361 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.4644 global_avg_mtp_loss: 16.4631 +[titan] 2025-06-13 14:41:47,936 - root - INFO - lr: 1.6836e-04 gnorm: 1.25 [ 2:00:17< 0:56:36] +[titan] 2025-06-13 14:41:51,419 - root - INFO - step: 10205 loss: 19.1602 memory: 6.46GiB(27.34%) tps: 23,518 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.2774 global_avg_mtp_loss: 15.8828 +[titan] 2025-06-13 14:41:51,420 - root - INFO - lr: 1.6814e-04 gnorm: 1.25 [ 2:00:21< 0:56:33] +[titan] 2025-06-13 14:41:55,199 - root - INFO - step: 10210 loss: 19.4492 memory: 6.46GiB(27.34%) tps: 21,677 tflops: 21.81 mfu: 6.99% global_avg_ntp_loss: 3.3142 global_avg_mtp_loss: 16.1350 +[titan] 2025-06-13 14:41:55,199 - root - INFO - lr: 1.6792e-04 gnorm: 1.21 [ 2:00:25< 0:56:29] +[titan] 2025-06-13 14:41:58,874 - root - INFO - step: 10215 loss: 20.0468 memory: 6.46GiB(27.34%) tps: 22,295 tflops: 22.44 mfu: 7.19% 
global_avg_ntp_loss: 3.5303 global_avg_mtp_loss: 16.5164 +[titan] 2025-06-13 14:41:58,874 - root - INFO - lr: 1.6770e-04 gnorm: 1.23 [ 2:00:28< 0:56:26] +[titan] 2025-06-13 14:42:02,409 - root - INFO - step: 10220 loss: 19.6761 memory: 6.46GiB(27.34%) tps: 23,179 tflops: 23.33 mfu: 7.48% global_avg_ntp_loss: 3.3982 global_avg_mtp_loss: 16.2779 +[titan] 2025-06-13 14:42:02,410 - root - INFO - lr: 1.6748e-04 gnorm: 1.20 [ 2:00:32< 0:56:22] +[titan] 2025-06-13 14:42:06,350 - root - INFO - step: 10225 loss: 18.6876 memory: 6.46GiB(27.34%) tps: 20,793 tflops: 20.93 mfu: 6.71% global_avg_ntp_loss: 3.2103 global_avg_mtp_loss: 15.4773 +[titan] 2025-06-13 14:42:06,350 - root - INFO - lr: 1.6725e-04 gnorm: 1.21 [ 2:00:36< 0:56:19] +[titan] 2025-06-13 14:42:09,709 - root - INFO - step: 10230 loss: 15.9557 memory: 6.46GiB(27.34%) tps: 24,390 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 2.7496 global_avg_mtp_loss: 13.2061 +[titan] 2025-06-13 14:42:09,709 - root - INFO - lr: 1.6703e-04 gnorm: 1.25 [ 2:00:39< 0:56:15] +[titan] 2025-06-13 14:42:14,056 - root - INFO - step: 10235 loss: 18.7765 memory: 6.46GiB(27.34%) tps: 18,846 tflops: 18.97 mfu: 6.08% global_avg_ntp_loss: 3.2164 global_avg_mtp_loss: 15.5600 +[titan] 2025-06-13 14:42:14,056 - root - INFO - lr: 1.6681e-04 gnorm: 1.34 [ 2:00:43< 0:56:12] +[titan] 2025-06-13 14:42:17,592 - root - INFO - step: 10240 loss: 19.5397 memory: 6.46GiB(27.34%) tps: 23,169 tflops: 23.32 mfu: 7.47% global_avg_ntp_loss: 3.3417 global_avg_mtp_loss: 16.1980 +[titan] 2025-06-13 14:42:17,593 - root - INFO - lr: 1.6659e-04 gnorm: 1.24 [ 2:00:47< 0:56:08] +[titan] 2025-06-13 14:42:17,724 - root - INFO - Dumping profiler traces at step 10240 +[titan] 2025-06-13 14:42:17,818 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:42:21,349 - root - INFO - step: 10245 loss: 19.1527 memory: 6.46GiB(27.34%) tps: 21,807 tflops: 21.95 mfu: 7.03% global_avg_ntp_loss: 3.3222 global_avg_mtp_loss: 15.8305 +[titan] 2025-06-13 
14:42:21,350 - root - INFO - lr: 1.6637e-04 gnorm: 1.17 [ 2:00:51< 0:56:05] +[titan] 2025-06-13 14:42:23,791 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:42:24,709 - root - INFO - step: 10250 loss: 20.2282 memory: 6.46GiB(27.34%) tps: 24,389 tflops: 24.54 mfu: 7.87% global_avg_ntp_loss: 3.5035 global_avg_mtp_loss: 16.7247 +[titan] 2025-06-13 14:42:24,709 - root - INFO - lr: 1.6615e-04 gnorm: 1.19 [ 2:00:54< 0:56:01] +[titan] 2025-06-13 14:42:28,769 - root - INFO - step: 10255 loss: 19.7054 memory: 6.46GiB(27.34%) tps: 20,179 tflops: 20.31 mfu: 6.51% global_avg_ntp_loss: 3.4234 global_avg_mtp_loss: 16.2820 +[titan] 2025-06-13 14:42:28,769 - root - INFO - lr: 1.6593e-04 gnorm: 1.20 [ 2:00:58< 0:55:58] +[titan] 2025-06-13 14:42:31,977 - root - INFO - step: 10260 loss: 20.2240 memory: 6.46GiB(27.34%) tps: 25,538 tflops: 25.70 mfu: 8.24% global_avg_ntp_loss: 3.5146 global_avg_mtp_loss: 16.7095 +[titan] 2025-06-13 14:42:31,978 - root - INFO - lr: 1.6571e-04 gnorm: 1.35 [ 2:01:01< 0:55:54] +[titan] 2025-06-13 14:42:35,810 - root - INFO - step: 10265 loss: 17.2026 memory: 6.46GiB(27.34%) tps: 21,373 tflops: 21.51 mfu: 6.89% global_avg_ntp_loss: 2.9788 global_avg_mtp_loss: 14.2238 +[titan] 2025-06-13 14:42:35,811 - root - INFO - lr: 1.6549e-04 gnorm: 1.68 [ 2:01:05< 0:55:51] +[titan] 2025-06-13 14:42:39,424 - root - INFO - step: 10270 loss: 18.6971 memory: 6.46GiB(27.34%) tps: 22,672 tflops: 22.82 mfu: 7.31% global_avg_ntp_loss: 3.2132 global_avg_mtp_loss: 15.4839 +[titan] 2025-06-13 14:42:39,425 - root - INFO - lr: 1.6527e-04 gnorm: 1.17 [ 2:01:09< 0:55:47] +[titan] 2025-06-13 14:42:43,037 - root - INFO - step: 10275 loss: 18.9595 memory: 6.46GiB(27.34%) tps: 22,680 tflops: 22.82 mfu: 7.32% global_avg_ntp_loss: 3.3118 global_avg_mtp_loss: 15.6477 +[titan] 2025-06-13 14:42:43,037 - root - INFO - lr: 1.6505e-04 gnorm: 1.26 [ 2:01:12< 0:55:44] +[titan] 2025-06-13 14:42:46,503 - root - INFO - step: 10280 loss: 19.4570 memory: 
6.46GiB(27.34%) tps: 23,635 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 3.3027 global_avg_mtp_loss: 16.1543 +[titan] 2025-06-13 14:42:46,504 - root - INFO - lr: 1.6483e-04 gnorm: 1.20 [ 2:01:16< 0:55:40] +[titan] 2025-06-13 14:42:50,100 - root - INFO - step: 10285 loss: 19.3463 memory: 6.46GiB(27.34%) tps: 22,778 tflops: 22.92 mfu: 7.35% global_avg_ntp_loss: 3.3225 global_avg_mtp_loss: 16.0238 +[titan] 2025-06-13 14:42:50,100 - root - INFO - lr: 1.6461e-04 gnorm: 1.24 [ 2:01:19< 0:55:37] +[titan] 2025-06-13 14:42:53,641 - root - INFO - step: 10290 loss: 19.0099 memory: 6.46GiB(27.34%) tps: 23,135 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.2824 global_avg_mtp_loss: 15.7275 +[titan] 2025-06-13 14:42:53,642 - root - INFO - lr: 1.6439e-04 gnorm: 1.24 [ 2:01:23< 0:55:33] +[titan] 2025-06-13 14:42:57,122 - root - INFO - step: 10295 loss: 19.9184 memory: 6.46GiB(27.34%) tps: 23,541 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.4435 global_avg_mtp_loss: 16.4748 +[titan] 2025-06-13 14:42:57,122 - root - INFO - lr: 1.6417e-04 gnorm: 1.27 [ 2:01:26< 0:55:30] +[titan] 2025-06-13 14:42:59,900 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:43:00,873 - root - INFO - step: 10300 loss: 19.7468 memory: 6.46GiB(27.34%) tps: 21,839 tflops: 21.98 mfu: 7.04% global_avg_ntp_loss: 3.4292 global_avg_mtp_loss: 16.3176 +[titan] 2025-06-13 14:43:00,874 - root - INFO - lr: 1.6395e-04 gnorm: 1.33 [ 2:01:30< 0:55:26] +[titan] 2025-06-13 14:43:04,708 - root - INFO - step: 10305 loss: 18.1083 memory: 6.46GiB(27.34%) tps: 21,366 tflops: 21.50 mfu: 6.89% global_avg_ntp_loss: 3.0626 global_avg_mtp_loss: 15.0458 +[titan] 2025-06-13 14:43:04,708 - root - INFO - lr: 1.6373e-04 gnorm: 1.45 [ 2:01:34< 0:55:23] +[titan] 2025-06-13 14:43:08,588 - root - INFO - step: 10310 loss: 17.5962 memory: 6.46GiB(27.34%) tps: 21,116 tflops: 21.25 mfu: 6.81% global_avg_ntp_loss: 3.0275 global_avg_mtp_loss: 14.5688 +[titan] 2025-06-13 14:43:08,588 - root - INFO - lr: 1.6351e-04 gnorm: 1.28 [ 2:01:38< 0:55:20] +[titan] 2025-06-13 14:43:12,163 - root - INFO - step: 10315 loss: 19.6204 memory: 6.46GiB(27.34%) tps: 22,914 tflops: 23.06 mfu: 7.39% global_avg_ntp_loss: 3.3892 global_avg_mtp_loss: 16.2312 +[titan] 2025-06-13 14:43:12,164 - root - INFO - lr: 1.6329e-04 gnorm: 1.24 [ 2:01:42< 0:55:16] +[titan] 2025-06-13 14:43:16,201 - root - INFO - step: 10320 loss: 18.3792 memory: 6.46GiB(27.34%) tps: 20,290 tflops: 20.42 mfu: 6.54% global_avg_ntp_loss: 3.1719 global_avg_mtp_loss: 15.2073 +[titan] 2025-06-13 14:43:16,202 - root - INFO - lr: 1.6307e-04 gnorm: 1.45 [ 2:01:46< 0:55:13] +[titan] 2025-06-13 14:43:19,953 - root - INFO - step: 10325 loss: 17.7691 memory: 6.46GiB(27.34%) tps: 21,836 tflops: 21.98 mfu: 7.04% global_avg_ntp_loss: 3.1007 global_avg_mtp_loss: 14.6685 +[titan] 2025-06-13 14:43:19,954 - root - INFO - lr: 1.6285e-04 gnorm: 1.40 [ 2:01:49< 0:55:09] +[titan] 2025-06-13 14:43:23,443 - root - INFO - step: 10330 loss: 15.6400 memory: 6.46GiB(27.34%) tps: 23,476 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 2.6855 global_avg_mtp_loss: 12.9546 +[titan] 2025-06-13 14:43:23,444 - root - INFO - lr: 1.6263e-04 gnorm: 
1.52 [ 2:01:53< 0:55:06] +[titan] 2025-06-13 14:43:26,792 - root - INFO - step: 10335 loss: 18.5348 memory: 6.46GiB(27.34%) tps: 24,465 tflops: 24.62 mfu: 7.89% global_avg_ntp_loss: 3.2122 global_avg_mtp_loss: 15.3225 +[titan] 2025-06-13 14:43:26,793 - root - INFO - lr: 1.6241e-04 gnorm: 1.57 [ 2:01:56< 0:55:02] +[titan] 2025-06-13 14:43:30,321 - root - INFO - step: 10340 loss: 19.6383 memory: 6.46GiB(27.34%) tps: 23,215 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.4298 global_avg_mtp_loss: 16.2085 +[titan] 2025-06-13 14:43:30,322 - root - INFO - lr: 1.6219e-04 gnorm: 1.27 [ 2:02:00< 0:54:59] +[titan] 2025-06-13 14:43:34,149 - root - INFO - step: 10345 loss: 20.0214 memory: 6.46GiB(27.34%) tps: 21,404 tflops: 21.54 mfu: 6.90% global_avg_ntp_loss: 3.4663 global_avg_mtp_loss: 16.5551 +[titan] 2025-06-13 14:43:34,150 - root - INFO - lr: 1.6198e-04 gnorm: 1.16 [ 2:02:03< 0:54:55] +[titan] 2025-06-13 14:43:36,775 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:43:37,439 - root - INFO - step: 10350 loss: 16.3236 memory: 6.46GiB(27.34%) tps: 24,907 tflops: 25.07 mfu: 8.03% global_avg_ntp_loss: 2.7994 global_avg_mtp_loss: 13.5242 +[titan] 2025-06-13 14:43:37,439 - root - INFO - lr: 1.6176e-04 gnorm: 1.95 [ 2:02:07< 0:54:51] +[titan] 2025-06-13 14:43:40,979 - root - INFO - step: 10355 loss: 18.1374 memory: 6.46GiB(27.34%) tps: 23,145 tflops: 23.29 mfu: 7.47% global_avg_ntp_loss: 3.1096 global_avg_mtp_loss: 15.0278 +[titan] 2025-06-13 14:43:40,980 - root - INFO - lr: 1.6154e-04 gnorm: 1.23 [ 2:02:10< 0:54:48] +[titan] 2025-06-13 14:43:44,063 - root - INFO - step: 10360 loss: 19.5487 memory: 6.46GiB(27.34%) tps: 26,568 tflops: 26.74 mfu: 8.57% global_avg_ntp_loss: 3.3495 global_avg_mtp_loss: 16.1992 +[titan] 2025-06-13 14:43:44,063 - root - INFO - lr: 1.6132e-04 gnorm: 1.27 [ 2:02:13< 0:54:44] +[titan] 2025-06-13 14:43:47,549 - root - INFO - step: 10365 loss: 19.0329 memory: 6.46GiB(27.34%) tps: 23,500 tflops: 23.65 mfu: 7.58% 
global_avg_ntp_loss: 3.3199 global_avg_mtp_loss: 15.7130 +[titan] 2025-06-13 14:43:47,549 - root - INFO - lr: 1.6110e-04 gnorm: 1.41 [ 2:02:17< 0:54:41] +[titan] 2025-06-13 14:43:51,287 - root - INFO - step: 10370 loss: 19.6907 memory: 6.46GiB(27.34%) tps: 21,916 tflops: 22.06 mfu: 7.07% global_avg_ntp_loss: 3.3987 global_avg_mtp_loss: 16.2920 +[titan] 2025-06-13 14:43:51,288 - root - INFO - lr: 1.6089e-04 gnorm: 1.14 [ 2:02:21< 0:54:37] +[titan] 2025-06-13 14:43:54,988 - root - INFO - step: 10375 loss: 19.1707 memory: 6.46GiB(27.34%) tps: 22,138 tflops: 22.28 mfu: 7.14% global_avg_ntp_loss: 3.3193 global_avg_mtp_loss: 15.8514 +[titan] 2025-06-13 14:43:54,988 - root - INFO - lr: 1.6067e-04 gnorm: 1.20 [ 2:02:24< 0:54:34] +[titan] 2025-06-13 14:43:59,000 - root - INFO - step: 10380 loss: 17.3870 memory: 6.46GiB(27.34%) tps: 20,421 tflops: 20.55 mfu: 6.59% global_avg_ntp_loss: 2.9845 global_avg_mtp_loss: 14.4024 +[titan] 2025-06-13 14:43:59,001 - root - INFO - lr: 1.6045e-04 gnorm: 1.29 [ 2:02:28< 0:54:30] +[titan] 2025-06-13 14:44:03,036 - root - INFO - step: 10385 loss: 19.3182 memory: 6.46GiB(27.34%) tps: 20,299 tflops: 20.43 mfu: 6.55% global_avg_ntp_loss: 3.3325 global_avg_mtp_loss: 15.9857 +[titan] 2025-06-13 14:44:03,037 - root - INFO - lr: 1.6023e-04 gnorm: 1.31 [ 2:02:32< 0:54:27] +[titan] 2025-06-13 14:44:06,560 - root - INFO - step: 10390 loss: 19.3805 memory: 6.46GiB(27.34%) tps: 23,249 tflops: 23.40 mfu: 7.50% global_avg_ntp_loss: 3.3583 global_avg_mtp_loss: 16.0222 +[titan] 2025-06-13 14:44:06,561 - root - INFO - lr: 1.6002e-04 gnorm: 1.28 [ 2:02:36< 0:54:23] +[titan] 2025-06-13 14:44:09,983 - root - INFO - step: 10395 loss: 19.6300 memory: 6.46GiB(27.34%) tps: 23,934 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.4120 global_avg_mtp_loss: 16.2179 +[titan] 2025-06-13 14:44:09,984 - root - INFO - lr: 1.5980e-04 gnorm: 1.14 [ 2:02:39< 0:54:20] +[titan] 2025-06-13 14:44:13,013 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:44:13,666 - root - INFO - step: 10400 loss: 19.9176 memory: 6.46GiB(27.34%) tps: 22,245 tflops: 22.39 mfu: 7.18% global_avg_ntp_loss: 3.4548 global_avg_mtp_loss: 16.4628 +[titan] 2025-06-13 14:44:13,667 - root - INFO - lr: 1.5958e-04 gnorm: 1.16 [ 2:02:43< 0:54:16] +[titan] 2025-06-13 14:44:17,089 - root - INFO - step: 10405 loss: 19.9950 memory: 6.46GiB(27.34%) tps: 23,937 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.4806 global_avg_mtp_loss: 16.5145 +[titan] 2025-06-13 14:44:17,090 - root - INFO - lr: 1.5937e-04 gnorm: 1.18 [ 2:02:46< 0:54:13] +[titan] 2025-06-13 14:44:20,792 - root - INFO - step: 10410 loss: 19.0682 memory: 6.46GiB(27.34%) tps: 22,129 tflops: 22.27 mfu: 7.14% global_avg_ntp_loss: 3.3012 global_avg_mtp_loss: 15.7670 +[titan] 2025-06-13 14:44:20,792 - root - INFO - lr: 1.5915e-04 gnorm: 1.38 [ 2:02:50< 0:54:09] +[titan] 2025-06-13 14:44:24,340 - root - INFO - step: 10415 loss: 18.8308 memory: 6.46GiB(27.34%) tps: 23,088 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 3.2793 global_avg_mtp_loss: 15.5515 +[titan] 2025-06-13 14:44:24,341 - root - INFO - lr: 1.5893e-04 gnorm: 1.46 [ 2:02:54< 0:54:06] +[titan] 2025-06-13 14:44:28,274 - root - INFO - step: 10420 loss: 19.2976 memory: 6.46GiB(27.34%) tps: 20,827 tflops: 20.96 mfu: 6.72% global_avg_ntp_loss: 3.3325 global_avg_mtp_loss: 15.9650 +[titan] 2025-06-13 14:44:28,274 - root - INFO - lr: 1.5872e-04 gnorm: 1.13 [ 2:02:58< 0:54:02] +[titan] 2025-06-13 14:44:31,533 - root - INFO - step: 10425 loss: 18.2693 memory: 6.46GiB(27.34%) tps: 25,139 tflops: 25.30 mfu: 8.11% global_avg_ntp_loss: 3.1531 global_avg_mtp_loss: 15.1162 +[titan] 2025-06-13 14:44:31,534 - root - INFO - lr: 1.5850e-04 gnorm: 1.36 [ 2:03:01< 0:53:59] +[titan] 2025-06-13 14:44:35,017 - root - INFO - step: 10430 loss: 18.4609 memory: 6.46GiB(27.34%) tps: 23,520 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.1770 global_avg_mtp_loss: 15.2839 +[titan] 2025-06-13 14:44:35,017 - root - INFO - lr: 1.5829e-04 gnorm: 
1.43 [ 2:03:04< 0:53:55] +[titan] 2025-06-13 14:44:38,682 - root - INFO - step: 10435 loss: 18.3250 memory: 6.46GiB(27.34%) tps: 22,350 tflops: 22.49 mfu: 7.21% global_avg_ntp_loss: 3.1621 global_avg_mtp_loss: 15.1629 +[titan] 2025-06-13 14:44:38,683 - root - INFO - lr: 1.5807e-04 gnorm: 1.44 [ 2:03:08< 0:53:52] +[titan] 2025-06-13 14:44:41,946 - root - INFO - step: 10440 loss: 16.5645 memory: 6.46GiB(27.34%) tps: 25,101 tflops: 25.26 mfu: 8.10% global_avg_ntp_loss: 2.8429 global_avg_mtp_loss: 13.7216 +[titan] 2025-06-13 14:44:41,947 - root - INFO - lr: 1.5785e-04 gnorm: 1.37 [ 2:03:11< 0:53:48] +[titan] 2025-06-13 14:44:45,778 - root - INFO - step: 10445 loss: 19.9162 memory: 6.46GiB(27.34%) tps: 21,384 tflops: 21.52 mfu: 6.90% global_avg_ntp_loss: 3.4473 global_avg_mtp_loss: 16.4689 +[titan] 2025-06-13 14:44:45,778 - root - INFO - lr: 1.5764e-04 gnorm: 1.19 [ 2:03:15< 0:53:45] +[titan] 2025-06-13 14:44:48,524 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:44:49,072 - root - INFO - step: 10450 loss: 18.9539 memory: 6.46GiB(27.34%) tps: 24,872 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 3.2350 global_avg_mtp_loss: 15.7189 +[titan] 2025-06-13 14:44:49,072 - root - INFO - lr: 1.5742e-04 gnorm: 1.22 [ 2:03:18< 0:53:41] +[titan] 2025-06-13 14:44:52,812 - root - INFO - step: 10455 loss: 19.5549 memory: 6.46GiB(27.34%) tps: 21,905 tflops: 22.05 mfu: 7.07% global_avg_ntp_loss: 3.3953 global_avg_mtp_loss: 16.1596 +[titan] 2025-06-13 14:44:52,812 - root - INFO - lr: 1.5721e-04 gnorm: 1.31 [ 2:03:22< 0:53:38] +[titan] 2025-06-13 14:44:56,004 - root - INFO - step: 10460 loss: 19.7954 memory: 6.46GiB(27.34%) tps: 25,667 tflops: 25.83 mfu: 8.28% global_avg_ntp_loss: 3.4433 global_avg_mtp_loss: 16.3521 +[titan] 2025-06-13 14:44:56,005 - root - INFO - lr: 1.5699e-04 gnorm: 1.25 [ 2:03:25< 0:53:34] +[titan] 2025-06-13 14:44:59,453 - root - INFO - step: 10465 loss: 18.8973 memory: 6.46GiB(27.34%) tps: 23,754 tflops: 23.91 mfu: 7.66% 
global_avg_ntp_loss: 3.2264 global_avg_mtp_loss: 15.6708 +[titan] 2025-06-13 14:44:59,454 - root - INFO - lr: 1.5678e-04 gnorm: 1.49 [ 2:03:29< 0:53:30] +[titan] 2025-06-13 14:45:03,693 - root - INFO - step: 10470 loss: 18.8758 memory: 6.46GiB(27.34%) tps: 19,324 tflops: 19.45 mfu: 6.23% global_avg_ntp_loss: 3.2448 global_avg_mtp_loss: 15.6310 +[titan] 2025-06-13 14:45:03,693 - root - INFO - lr: 1.5656e-04 gnorm: 1.22 [ 2:03:33< 0:53:27] +[titan] 2025-06-13 14:45:06,777 - root - INFO - step: 10475 loss: 19.8921 memory: 6.46GiB(27.34%) tps: 26,566 tflops: 26.74 mfu: 8.57% global_avg_ntp_loss: 3.3993 global_avg_mtp_loss: 16.4928 +[titan] 2025-06-13 14:45:06,778 - root - INFO - lr: 1.5635e-04 gnorm: 1.27 [ 2:03:36< 0:53:23] +[titan] 2025-06-13 14:45:10,073 - root - INFO - step: 10480 loss: 18.9897 memory: 6.46GiB(27.34%) tps: 24,857 tflops: 25.02 mfu: 8.02% global_avg_ntp_loss: 3.3173 global_avg_mtp_loss: 15.6724 +[titan] 2025-06-13 14:45:10,074 - root - INFO - lr: 1.5614e-04 gnorm: 1.17 [ 2:03:39< 0:53:20] +[titan] 2025-06-13 14:45:13,294 - root - INFO - step: 10485 loss: 19.0479 memory: 6.46GiB(27.34%) tps: 25,437 tflops: 25.60 mfu: 8.20% global_avg_ntp_loss: 3.3013 global_avg_mtp_loss: 15.7466 +[titan] 2025-06-13 14:45:13,295 - root - INFO - lr: 1.5592e-04 gnorm: 1.32 [ 2:03:43< 0:53:16] +[titan] 2025-06-13 14:45:17,902 - root - INFO - step: 10490 loss: 19.7925 memory: 6.46GiB(27.34%) tps: 17,780 tflops: 17.89 mfu: 5.74% global_avg_ntp_loss: 3.3996 global_avg_mtp_loss: 16.3929 +[titan] 2025-06-13 14:45:17,903 - root - INFO - lr: 1.5571e-04 gnorm: 1.16 [ 2:03:47< 0:53:13] +[titan] 2025-06-13 14:45:21,557 - root - INFO - step: 10495 loss: 18.5511 memory: 6.46GiB(27.34%) tps: 22,420 tflops: 22.56 mfu: 7.23% global_avg_ntp_loss: 3.1752 global_avg_mtp_loss: 15.3759 +[titan] 2025-06-13 14:45:21,557 - root - INFO - lr: 1.5549e-04 gnorm: 1.27 [ 2:03:51< 0:53:09] +[titan] 2025-06-13 14:45:24,007 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:45:24,782 - root - INFO - step: 10500 loss: 18.8231 memory: 6.46GiB(27.34%) tps: 25,402 tflops: 25.56 mfu: 8.19% global_avg_ntp_loss: 3.2615 global_avg_mtp_loss: 15.5616 +[titan] 2025-06-13 14:45:24,782 - root - INFO - lr: 1.5528e-04 gnorm: 1.25 [ 2:03:54< 0:53:06] +[titan] 2025-06-13 14:45:28,566 - root - INFO - step: 10505 loss: 20.1617 memory: 6.46GiB(27.34%) tps: 21,649 tflops: 21.79 mfu: 6.98% global_avg_ntp_loss: 3.5456 global_avg_mtp_loss: 16.6161 +[titan] 2025-06-13 14:45:28,567 - root - INFO - lr: 1.5507e-04 gnorm: 1.12 [ 2:03:58< 0:53:02] +[titan] 2025-06-13 14:45:32,278 - root - INFO - step: 10510 loss: 19.9039 memory: 6.46GiB(27.34%) tps: 22,076 tflops: 22.22 mfu: 7.12% global_avg_ntp_loss: 3.4067 global_avg_mtp_loss: 16.4971 +[titan] 2025-06-13 14:45:32,278 - root - INFO - lr: 1.5485e-04 gnorm: 1.31 [ 2:04:02< 0:52:59] +[titan] 2025-06-13 14:45:35,913 - root - INFO - step: 10515 loss: 19.8721 memory: 6.46GiB(27.34%) tps: 22,538 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.4407 global_avg_mtp_loss: 16.4314 +[titan] 2025-06-13 14:45:35,913 - root - INFO - lr: 1.5464e-04 gnorm: 1.30 [ 2:04:05< 0:52:55] +[titan] 2025-06-13 14:45:39,552 - root - INFO - step: 10520 loss: 18.3155 memory: 6.46GiB(27.34%) tps: 22,514 tflops: 22.66 mfu: 7.26% global_avg_ntp_loss: 3.1406 global_avg_mtp_loss: 15.1749 +[titan] 2025-06-13 14:45:39,552 - root - INFO - lr: 1.5443e-04 gnorm: 1.23 [ 2:04:09< 0:52:52] +[titan] 2025-06-13 14:45:43,543 - root - INFO - step: 10525 loss: 19.8270 memory: 6.46GiB(27.34%) tps: 20,529 tflops: 20.66 mfu: 6.62% global_avg_ntp_loss: 3.4252 global_avg_mtp_loss: 16.4018 +[titan] 2025-06-13 14:45:43,543 - root - INFO - lr: 1.5421e-04 gnorm: 1.36 [ 2:04:13< 0:52:49] +[titan] 2025-06-13 14:45:46,890 - root - INFO - step: 10530 loss: 18.8234 memory: 6.46GiB(27.34%) tps: 24,479 tflops: 24.63 mfu: 7.90% global_avg_ntp_loss: 3.2342 global_avg_mtp_loss: 15.5893 +[titan] 2025-06-13 14:45:46,890 - root - INFO - lr: 1.5400e-04 gnorm: 
1.23 [ 2:04:16< 0:52:45] +[titan] 2025-06-13 14:45:50,402 - root - INFO - step: 10535 loss: 19.4130 memory: 6.46GiB(27.34%) tps: 23,331 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.3453 global_avg_mtp_loss: 16.0677 +[titan] 2025-06-13 14:45:50,403 - root - INFO - lr: 1.5379e-04 gnorm: 1.19 [ 2:04:20< 0:52:41] +[titan] 2025-06-13 14:45:53,991 - root - INFO - step: 10540 loss: 20.2943 memory: 6.46GiB(27.34%) tps: 22,829 tflops: 22.97 mfu: 7.36% global_avg_ntp_loss: 3.5318 global_avg_mtp_loss: 16.7625 +[titan] 2025-06-13 14:45:53,991 - root - INFO - lr: 1.5357e-04 gnorm: 1.14 [ 2:04:23< 0:52:38] +[titan] 2025-06-13 14:45:57,454 - root - INFO - step: 10545 loss: 20.2065 memory: 6.46GiB(27.34%) tps: 23,658 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.5256 global_avg_mtp_loss: 16.6809 +[titan] 2025-06-13 14:45:57,455 - root - INFO - lr: 1.5336e-04 gnorm: 1.23 [ 2:04:27< 0:52:34] +[titan] 2025-06-13 14:46:00,484 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:46:01,037 - root - INFO - step: 10550 loss: 19.7962 memory: 6.46GiB(27.34%) tps: 22,869 tflops: 23.01 mfu: 7.38% global_avg_ntp_loss: 3.4164 global_avg_mtp_loss: 16.3798 +[titan] 2025-06-13 14:46:01,037 - root - INFO - lr: 1.5315e-04 gnorm: 1.20 [ 2:04:30< 0:52:31] +[titan] 2025-06-13 14:46:04,597 - root - INFO - step: 10555 loss: 20.1695 memory: 6.46GiB(27.34%) tps: 23,014 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 3.4834 global_avg_mtp_loss: 16.6862 +[titan] 2025-06-13 14:46:04,597 - root - INFO - lr: 1.5294e-04 gnorm: 1.17 [ 2:04:34< 0:52:27] +[titan] 2025-06-13 14:46:08,337 - root - INFO - step: 10560 loss: 19.8446 memory: 6.46GiB(27.34%) tps: 21,904 tflops: 22.04 mfu: 7.07% global_avg_ntp_loss: 3.3952 global_avg_mtp_loss: 16.4494 +[titan] 2025-06-13 14:46:08,338 - root - INFO - lr: 1.5273e-04 gnorm: 1.17 [ 2:04:38< 0:52:24] +[titan] 2025-06-13 14:46:12,647 - root - INFO - step: 10565 loss: 18.2619 memory: 6.46GiB(27.34%) tps: 19,010 tflops: 19.13 mfu: 6.13% 
global_avg_ntp_loss: 3.1203 global_avg_mtp_loss: 15.1416 +[titan] 2025-06-13 14:46:12,647 - root - INFO - lr: 1.5251e-04 gnorm: 1.82 [ 2:04:42< 0:52:21] +[titan] 2025-06-13 14:46:15,812 - root - INFO - step: 10570 loss: 19.0763 memory: 6.46GiB(27.34%) tps: 25,891 tflops: 26.06 mfu: 8.35% global_avg_ntp_loss: 3.2720 global_avg_mtp_loss: 15.8043 +[titan] 2025-06-13 14:46:15,812 - root - INFO - lr: 1.5230e-04 gnorm: 1.23 [ 2:04:45< 0:52:17] +[titan] 2025-06-13 14:46:19,568 - root - INFO - step: 10575 loss: 19.2683 memory: 6.46GiB(27.34%) tps: 21,811 tflops: 21.95 mfu: 7.04% global_avg_ntp_loss: 3.3276 global_avg_mtp_loss: 15.9407 +[titan] 2025-06-13 14:46:19,568 - root - INFO - lr: 1.5209e-04 gnorm: 1.20 [ 2:04:49< 0:52:13] +[titan] 2025-06-13 14:46:23,121 - root - INFO - step: 10580 loss: 18.6697 memory: 6.46GiB(27.34%) tps: 23,059 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.2147 global_avg_mtp_loss: 15.4550 +[titan] 2025-06-13 14:46:23,121 - root - INFO - lr: 1.5188e-04 gnorm: 1.16 [ 2:04:52< 0:52:10] +[titan] 2025-06-13 14:46:26,681 - root - INFO - step: 10585 loss: 17.3622 memory: 6.46GiB(27.34%) tps: 23,014 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 3.0316 global_avg_mtp_loss: 14.3305 +[titan] 2025-06-13 14:46:26,681 - root - INFO - lr: 1.5167e-04 gnorm: 1.36 [ 2:04:56< 0:52:06] +[titan] 2025-06-13 14:46:30,119 - root - INFO - step: 10590 loss: 17.4262 memory: 6.46GiB(27.34%) tps: 23,829 tflops: 23.98 mfu: 7.69% global_avg_ntp_loss: 2.9536 global_avg_mtp_loss: 14.4726 +[titan] 2025-06-13 14:46:30,120 - root - INFO - lr: 1.5146e-04 gnorm: 1.38 [ 2:04:59< 0:52:03] +[titan] 2025-06-13 14:46:33,903 - root - INFO - step: 10595 loss: 19.0011 memory: 6.46GiB(27.34%) tps: 21,651 tflops: 21.79 mfu: 6.98% global_avg_ntp_loss: 3.2442 global_avg_mtp_loss: 15.7569 +[titan] 2025-06-13 14:46:33,904 - root - INFO - lr: 1.5125e-04 gnorm: 1.59 [ 2:05:03< 0:51:59] +[titan] 2025-06-13 14:46:36,381 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:46:37,147 - root - INFO - step: 10600 loss: 20.1717 memory: 6.46GiB(27.34%) tps: 25,257 tflops: 25.42 mfu: 8.15% global_avg_ntp_loss: 3.5474 global_avg_mtp_loss: 16.6243 +[titan] 2025-06-13 14:46:37,148 - root - INFO - lr: 1.5103e-04 gnorm: 1.39 [ 2:05:06< 0:51:56] +[titan] 2025-06-13 14:46:40,751 - root - INFO - step: 10605 loss: 17.5478 memory: 6.46GiB(27.34%) tps: 22,737 tflops: 22.88 mfu: 7.33% global_avg_ntp_loss: 3.0471 global_avg_mtp_loss: 14.5006 +[titan] 2025-06-13 14:46:40,751 - root - INFO - lr: 1.5082e-04 gnorm: 1.13 [ 2:05:10< 0:51:52] +[titan] 2025-06-13 14:46:43,849 - root - INFO - step: 10610 loss: 19.0807 memory: 6.46GiB(27.34%) tps: 26,440 tflops: 26.61 mfu: 8.53% global_avg_ntp_loss: 3.2939 global_avg_mtp_loss: 15.7868 +[titan] 2025-06-13 14:46:43,850 - root - INFO - lr: 1.5061e-04 gnorm: 1.39 [ 2:05:13< 0:51:48] +[titan] 2025-06-13 14:46:47,070 - root - INFO - step: 10615 loss: 20.0810 memory: 6.46GiB(27.34%) tps: 25,443 tflops: 25.61 mfu: 8.21% global_avg_ntp_loss: 3.4855 global_avg_mtp_loss: 16.5954 +[titan] 2025-06-13 14:46:47,070 - root - INFO - lr: 1.5040e-04 gnorm: 1.35 [ 2:05:16< 0:51:45] +[titan] 2025-06-13 14:46:51,236 - root - INFO - step: 10620 loss: 19.3503 memory: 6.46GiB(27.34%) tps: 19,666 tflops: 19.79 mfu: 6.34% global_avg_ntp_loss: 3.3149 global_avg_mtp_loss: 16.0354 +[titan] 2025-06-13 14:46:51,236 - root - INFO - lr: 1.5019e-04 gnorm: 1.23 [ 2:05:21< 0:51:41] +[titan] 2025-06-13 14:46:55,878 - root - INFO - step: 10625 loss: 18.2100 memory: 6.46GiB(27.34%) tps: 17,649 tflops: 17.76 mfu: 5.69% global_avg_ntp_loss: 3.1906 global_avg_mtp_loss: 15.0194 +[titan] 2025-06-13 14:46:55,878 - root - INFO - lr: 1.4998e-04 gnorm: 1.62 [ 2:05:25< 0:51:38] +[titan] 2025-06-13 14:47:00,262 - root - INFO - step: 10630 loss: 16.1916 memory: 6.46GiB(27.34%) tps: 18,686 tflops: 18.80 mfu: 6.03% global_avg_ntp_loss: 2.7275 global_avg_mtp_loss: 13.4641 +[titan] 2025-06-13 14:47:00,263 - root - INFO - lr: 1.4977e-04 gnorm: 
2.25 [ 2:05:30< 0:51:35] +[titan] 2025-06-13 14:47:04,436 - root - INFO - step: 10635 loss: 19.8887 memory: 6.46GiB(27.34%) tps: 19,631 tflops: 19.76 mfu: 6.33% global_avg_ntp_loss: 3.4181 global_avg_mtp_loss: 16.4706 +[titan] 2025-06-13 14:47:04,436 - root - INFO - lr: 1.4956e-04 gnorm: 1.14 [ 2:05:34< 0:51:32] +[titan] 2025-06-13 14:47:08,187 - root - INFO - step: 10640 loss: 19.7296 memory: 6.46GiB(27.34%) tps: 21,842 tflops: 21.98 mfu: 7.05% global_avg_ntp_loss: 3.3841 global_avg_mtp_loss: 16.3455 +[titan] 2025-06-13 14:47:08,187 - root - INFO - lr: 1.4935e-04 gnorm: 1.19 [ 2:05:37< 0:51:28] +[titan] 2025-06-13 14:47:13,406 - root - INFO - step: 10645 loss: 19.6440 memory: 6.46GiB(27.34%) tps: 15,699 tflops: 15.80 mfu: 5.06% global_avg_ntp_loss: 3.4296 global_avg_mtp_loss: 16.2144 +[titan] 2025-06-13 14:47:13,406 - root - INFO - lr: 1.4915e-04 gnorm: 1.45 [ 2:05:43< 0:51:26] +[titan] 2025-06-13 14:47:15,881 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:47:16,451 - root - INFO - step: 10650 loss: 19.1528 memory: 6.46GiB(27.34%) tps: 26,902 tflops: 27.07 mfu: 8.68% global_avg_ntp_loss: 3.2471 global_avg_mtp_loss: 15.9056 +[titan] 2025-06-13 14:47:16,452 - root - INFO - lr: 1.4894e-04 gnorm: 1.23 [ 2:05:46< 0:51:22] +[titan] 2025-06-13 14:47:20,231 - root - INFO - step: 10655 loss: 19.5462 memory: 6.46GiB(27.34%) tps: 21,676 tflops: 21.81 mfu: 6.99% global_avg_ntp_loss: 3.4540 global_avg_mtp_loss: 16.0922 +[titan] 2025-06-13 14:47:20,231 - root - INFO - lr: 1.4873e-04 gnorm: 1.56 [ 2:05:50< 0:51:18] +[titan] 2025-06-13 14:47:23,376 - root - INFO - step: 10660 loss: 17.9569 memory: 6.46GiB(27.34%) tps: 26,054 tflops: 26.22 mfu: 8.40% global_avg_ntp_loss: 3.0872 global_avg_mtp_loss: 14.8698 +[titan] 2025-06-13 14:47:23,376 - root - INFO - lr: 1.4852e-04 gnorm: 1.29 [ 2:05:53< 0:51:15] +[titan] 2025-06-13 14:47:27,066 - root - INFO - step: 10665 loss: 18.1045 memory: 6.46GiB(27.34%) tps: 22,202 tflops: 22.34 mfu: 7.16% 
global_avg_ntp_loss: 3.0757 global_avg_mtp_loss: 15.0288 +[titan] 2025-06-13 14:47:27,066 - root - INFO - lr: 1.4831e-04 gnorm: 1.55 [ 2:05:56< 0:51:11] +[titan] 2025-06-13 14:47:31,190 - root - INFO - step: 10670 loss: 19.2546 memory: 6.46GiB(27.34%) tps: 19,868 tflops: 20.00 mfu: 6.41% global_avg_ntp_loss: 3.3171 global_avg_mtp_loss: 15.9375 +[titan] 2025-06-13 14:47:31,190 - root - INFO - lr: 1.4810e-04 gnorm: 1.31 [ 2:06:00< 0:51:08] +[titan] 2025-06-13 14:47:34,609 - root - INFO - step: 10675 loss: 18.4554 memory: 6.46GiB(27.34%) tps: 23,960 tflops: 24.11 mfu: 7.73% global_avg_ntp_loss: 3.1711 global_avg_mtp_loss: 15.2843 +[titan] 2025-06-13 14:47:34,609 - root - INFO - lr: 1.4789e-04 gnorm: 1.28 [ 2:06:04< 0:51:04] +[titan] 2025-06-13 14:47:38,030 - root - INFO - step: 10680 loss: 20.3406 memory: 6.46GiB(27.34%) tps: 23,950 tflops: 24.10 mfu: 7.73% global_avg_ntp_loss: 3.4988 global_avg_mtp_loss: 16.8418 +[titan] 2025-06-13 14:47:38,030 - root - INFO - lr: 1.4768e-04 gnorm: 1.20 [ 2:06:07< 0:51:01] +[titan] 2025-06-13 14:47:42,060 - root - INFO - step: 10685 loss: 19.4232 memory: 6.46GiB(27.34%) tps: 20,331 tflops: 20.46 mfu: 6.56% global_avg_ntp_loss: 3.4034 global_avg_mtp_loss: 16.0198 +[titan] 2025-06-13 14:47:42,060 - root - INFO - lr: 1.4748e-04 gnorm: 1.24 [ 2:06:11< 0:50:57] +[titan] 2025-06-13 14:47:45,585 - root - INFO - step: 10690 loss: 19.1119 memory: 6.46GiB(27.34%) tps: 23,243 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.3038 global_avg_mtp_loss: 15.8081 +[titan] 2025-06-13 14:47:45,585 - root - INFO - lr: 1.4727e-04 gnorm: 1.17 [ 2:06:15< 0:50:54] +[titan] 2025-06-13 14:47:49,556 - root - INFO - step: 10695 loss: 19.0122 memory: 6.46GiB(27.34%) tps: 20,628 tflops: 20.76 mfu: 6.65% global_avg_ntp_loss: 3.2572 global_avg_mtp_loss: 15.7549 +[titan] 2025-06-13 14:47:49,557 - root - INFO - lr: 1.4706e-04 gnorm: 1.19 [ 2:06:19< 0:50:50] +[titan] 2025-06-13 14:47:52,099 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:47:52,643 - root - INFO - step: 10700 loss: 19.3993 memory: 6.46GiB(27.34%) tps: 26,542 tflops: 26.71 mfu: 8.56% global_avg_ntp_loss: 3.3658 global_avg_mtp_loss: 16.0334 +[titan] 2025-06-13 14:47:52,643 - root - INFO - lr: 1.4685e-04 gnorm: 1.26 [ 2:06:22< 0:50:47] +[titan] 2025-06-13 14:47:55,826 - root - INFO - step: 10705 loss: 17.9045 memory: 6.46GiB(27.34%) tps: 25,740 tflops: 25.90 mfu: 8.30% global_avg_ntp_loss: 3.0548 global_avg_mtp_loss: 14.8497 +[titan] 2025-06-13 14:47:55,826 - root - INFO - lr: 1.4665e-04 gnorm: 1.38 [ 2:06:25< 0:50:43] +[titan] 2025-06-13 14:47:59,366 - root - INFO - step: 10710 loss: 18.5740 memory: 6.46GiB(27.34%) tps: 23,142 tflops: 23.29 mfu: 7.46% global_avg_ntp_loss: 3.1863 global_avg_mtp_loss: 15.3877 +[titan] 2025-06-13 14:47:59,367 - root - INFO - lr: 1.4644e-04 gnorm: 1.17 [ 2:06:29< 0:50:39] +[titan] 2025-06-13 14:48:03,128 - root - INFO - step: 10715 loss: 18.3130 memory: 6.46GiB(27.34%) tps: 21,778 tflops: 21.92 mfu: 7.02% global_avg_ntp_loss: 3.1067 global_avg_mtp_loss: 15.2063 +[titan] 2025-06-13 14:48:03,129 - root - INFO - lr: 1.4623e-04 gnorm: 1.31 [ 2:06:32< 0:50:36] +[titan] 2025-06-13 14:48:06,592 - root - INFO - step: 10720 loss: 18.6704 memory: 6.46GiB(27.34%) tps: 23,655 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.1940 global_avg_mtp_loss: 15.4763 +[titan] 2025-06-13 14:48:06,592 - root - INFO - lr: 1.4602e-04 gnorm: 1.30 [ 2:06:36< 0:50:32] +[titan] 2025-06-13 14:48:09,859 - root - INFO - step: 10725 loss: 19.3637 memory: 6.46GiB(27.34%) tps: 25,079 tflops: 25.24 mfu: 8.09% global_avg_ntp_loss: 3.2925 global_avg_mtp_loss: 16.0712 +[titan] 2025-06-13 14:48:09,859 - root - INFO - lr: 1.4582e-04 gnorm: 1.13 [ 2:06:39< 0:50:29] +[titan] 2025-06-13 14:48:13,447 - root - INFO - step: 10730 loss: 19.8596 memory: 6.46GiB(27.34%) tps: 22,831 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 3.4230 global_avg_mtp_loss: 16.4366 +[titan] 2025-06-13 14:48:13,448 - root - INFO - lr: 1.4561e-04 gnorm: 
1.21 [ 2:06:43< 0:50:25] +[titan] 2025-06-13 14:48:16,591 - root - INFO - step: 10735 loss: 19.1650 memory: 6.46GiB(27.34%) tps: 26,061 tflops: 26.23 mfu: 8.41% global_avg_ntp_loss: 3.3409 global_avg_mtp_loss: 15.8242 +[titan] 2025-06-13 14:48:16,591 - root - INFO - lr: 1.4540e-04 gnorm: 1.36 [ 2:06:46< 0:50:22] +[titan] 2025-06-13 14:48:20,161 - root - INFO - step: 10740 loss: 20.2425 memory: 6.46GiB(27.34%) tps: 22,951 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.5004 global_avg_mtp_loss: 16.7421 +[titan] 2025-06-13 14:48:20,161 - root - INFO - lr: 1.4520e-04 gnorm: 1.16 [ 2:06:49< 0:50:18] +[titan] 2025-06-13 14:48:23,204 - root - INFO - step: 10745 loss: 16.0390 memory: 6.46GiB(27.34%) tps: 26,919 tflops: 27.09 mfu: 8.68% global_avg_ntp_loss: 2.8120 global_avg_mtp_loss: 13.2270 +[titan] 2025-06-13 14:48:23,205 - root - INFO - lr: 1.4499e-04 gnorm: 1.13 [ 2:06:52< 0:50:14] +[titan] 2025-06-13 14:48:26,074 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:48:26,744 - root - INFO - step: 10750 loss: 18.4040 memory: 6.46GiB(27.34%) tps: 23,148 tflops: 23.30 mfu: 7.47% global_avg_ntp_loss: 3.1294 global_avg_mtp_loss: 15.2745 +[titan] 2025-06-13 14:48:26,744 - root - INFO - lr: 1.4479e-04 gnorm: 1.47 [ 2:06:56< 0:50:11] +[titan] 2025-06-13 14:48:28,157 - root - INFO - Dumping profiler traces at step 10752 +[titan] 2025-06-13 14:48:28,252 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 14:48:30,562 - root - INFO - step: 10755 loss: 18.6567 memory: 6.46GiB(27.34%) tps: 21,459 tflops: 21.60 mfu: 6.92% global_avg_ntp_loss: 3.2120 global_avg_mtp_loss: 15.4447 +[titan] 2025-06-13 14:48:30,563 - root - INFO - lr: 1.4458e-04 gnorm: 1.32 [ 2:07:00< 0:50:07] +[titan] 2025-06-13 14:48:33,993 - root - INFO - step: 10760 loss: 18.1279 memory: 6.46GiB(27.34%) tps: 23,881 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.1404 global_avg_mtp_loss: 14.9876 +[titan] 2025-06-13 14:48:33,993 - root - INFO 
- lr: 1.4438e-04 gnorm: 1.28 [ 2:07:03< 0:50:04] +[titan] 2025-06-13 14:48:37,265 - root - INFO - step: 10765 loss: 17.8355 memory: 6.46GiB(27.34%) tps: 25,042 tflops: 25.20 mfu: 8.08% global_avg_ntp_loss: 3.0384 global_avg_mtp_loss: 14.7971 +[titan] 2025-06-13 14:48:37,265 - root - INFO - lr: 1.4417e-04 gnorm: 1.26 [ 2:07:07< 0:50:00] +[titan] 2025-06-13 14:48:40,597 - root - INFO - step: 10770 loss: 19.4171 memory: 6.46GiB(27.34%) tps: 24,587 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.3414 global_avg_mtp_loss: 16.0757 +[titan] 2025-06-13 14:48:40,597 - root - INFO - lr: 1.4396e-04 gnorm: 1.31 [ 2:07:10< 0:49:56] +[titan] 2025-06-13 14:48:44,112 - root - INFO - step: 10775 loss: 16.8671 memory: 6.46GiB(27.34%) tps: 23,311 tflops: 23.46 mfu: 7.52% global_avg_ntp_loss: 2.8869 global_avg_mtp_loss: 13.9802 +[titan] 2025-06-13 14:48:44,112 - root - INFO - lr: 1.4376e-04 gnorm: 1.19 [ 2:07:13< 0:49:53] +[titan] 2025-06-13 14:48:47,480 - root - INFO - step: 10780 loss: 19.4393 memory: 6.46GiB(27.34%) tps: 24,325 tflops: 24.48 mfu: 7.85% global_avg_ntp_loss: 3.3403 global_avg_mtp_loss: 16.0990 +[titan] 2025-06-13 14:48:47,480 - root - INFO - lr: 1.4355e-04 gnorm: 1.22 [ 2:07:17< 0:49:49] +[titan] 2025-06-13 14:48:51,181 - root - INFO - step: 10785 loss: 18.9440 memory: 6.46GiB(27.34%) tps: 22,137 tflops: 22.28 mfu: 7.14% global_avg_ntp_loss: 3.2693 global_avg_mtp_loss: 15.6748 +[titan] 2025-06-13 14:48:51,181 - root - INFO - lr: 1.4335e-04 gnorm: 1.32 [ 2:07:20< 0:49:46] +[titan] 2025-06-13 14:48:54,500 - root - INFO - step: 10790 loss: 18.9748 memory: 6.46GiB(27.34%) tps: 24,685 tflops: 24.84 mfu: 7.96% global_avg_ntp_loss: 3.2636 global_avg_mtp_loss: 15.7112 +[titan] 2025-06-13 14:48:54,500 - root - INFO - lr: 1.4314e-04 gnorm: 1.25 [ 2:07:24< 0:49:42] +[titan] 2025-06-13 14:48:58,470 - root - INFO - step: 10795 loss: 20.9804 memory: 6.46GiB(27.34%) tps: 20,639 tflops: 20.77 mfu: 6.66% global_avg_ntp_loss: 3.6997 global_avg_mtp_loss: 17.2807 +[titan] 2025-06-13 
14:48:58,470 - root - INFO - lr: 1.4294e-04 gnorm: 1.29 [ 2:07:28< 0:49:39] +[titan] 2025-06-13 14:49:01,382 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:49:02,048 - root - INFO - step: 10800 loss: 18.8837 memory: 6.46GiB(27.34%) tps: 22,899 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.2387 global_avg_mtp_loss: 15.6450 +[titan] 2025-06-13 14:49:02,048 - root - INFO - lr: 1.4274e-04 gnorm: 1.25 [ 2:07:31< 0:49:35] +[titan] 2025-06-13 14:49:05,710 - root - INFO - step: 10805 loss: 18.6822 memory: 6.46GiB(27.34%) tps: 22,372 tflops: 22.51 mfu: 7.22% global_avg_ntp_loss: 3.2041 global_avg_mtp_loss: 15.4782 +[titan] 2025-06-13 14:49:05,710 - root - INFO - lr: 1.4253e-04 gnorm: 1.19 [ 2:07:35< 0:49:32] +[titan] 2025-06-13 14:49:09,099 - root - INFO - step: 10810 loss: 20.0934 memory: 6.46GiB(27.34%) tps: 24,174 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.5207 global_avg_mtp_loss: 16.5727 +[titan] 2025-06-13 14:49:09,099 - root - INFO - lr: 1.4233e-04 gnorm: 1.27 [ 2:07:38< 0:49:28] +[titan] 2025-06-13 14:49:12,685 - root - INFO - step: 10815 loss: 20.9418 memory: 6.46GiB(27.34%) tps: 22,851 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.7169 global_avg_mtp_loss: 17.2249 +[titan] 2025-06-13 14:49:12,685 - root - INFO - lr: 1.4212e-04 gnorm: 1.28 [ 2:07:42< 0:49:25] +[titan] 2025-06-13 14:49:16,079 - root - INFO - step: 10820 loss: 18.9219 memory: 6.46GiB(27.34%) tps: 24,141 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.2267 global_avg_mtp_loss: 15.6952 +[titan] 2025-06-13 14:49:16,079 - root - INFO - lr: 1.4192e-04 gnorm: 1.25 [ 2:07:45< 0:49:21] +[titan] 2025-06-13 14:49:19,708 - root - INFO - step: 10825 loss: 19.4945 memory: 6.46GiB(27.34%) tps: 22,572 tflops: 22.72 mfu: 7.28% global_avg_ntp_loss: 3.3225 global_avg_mtp_loss: 16.1720 +[titan] 2025-06-13 14:49:19,709 - root - INFO - lr: 1.4172e-04 gnorm: 1.20 [ 2:07:49< 0:49:17] +[titan] 2025-06-13 14:49:25,947 - root - INFO - step: 10830 loss: 19.0875 memory: 
6.46GiB(27.34%) tps: 13,133 tflops: 13.22 mfu: 4.24% global_avg_ntp_loss: 3.2584 global_avg_mtp_loss: 15.8291 +[titan] 2025-06-13 14:49:25,947 - root - INFO - lr: 1.4151e-04 gnorm: 1.43 [ 2:07:55< 0:49:15] +[titan] 2025-06-13 14:49:29,188 - root - INFO - step: 10835 loss: 19.4302 memory: 6.46GiB(27.34%) tps: 25,279 tflops: 25.44 mfu: 8.15% global_avg_ntp_loss: 3.3669 global_avg_mtp_loss: 16.0633 +[titan] 2025-06-13 14:49:29,188 - root - INFO - lr: 1.4131e-04 gnorm: 1.26 [ 2:07:58< 0:49:11] +[titan] 2025-06-13 14:49:32,705 - root - INFO - step: 10840 loss: 18.7022 memory: 6.46GiB(27.34%) tps: 23,295 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.2272 global_avg_mtp_loss: 15.4751 +[titan] 2025-06-13 14:49:32,705 - root - INFO - lr: 1.4111e-04 gnorm: 1.37 [ 2:08:02< 0:49:08] +[titan] 2025-06-13 14:49:36,169 - root - INFO - step: 10845 loss: 20.6008 memory: 6.46GiB(27.34%) tps: 23,654 tflops: 23.80 mfu: 7.63% global_avg_ntp_loss: 3.5516 global_avg_mtp_loss: 17.0491 +[titan] 2025-06-13 14:49:36,169 - root - INFO - lr: 1.4090e-04 gnorm: 1.22 [ 2:08:05< 0:49:04] +[titan] 2025-06-13 14:49:38,824 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:49:39,642 - root - INFO - step: 10850 loss: 19.5311 memory: 6.46GiB(27.34%) tps: 23,586 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.3274 global_avg_mtp_loss: 16.2037 +[titan] 2025-06-13 14:49:39,643 - root - INFO - lr: 1.4070e-04 gnorm: 1.17 [ 2:08:09< 0:49:01] +[titan] 2025-06-13 14:49:43,036 - root - INFO - step: 10855 loss: 17.7138 memory: 6.46GiB(27.34%) tps: 24,144 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.0542 global_avg_mtp_loss: 14.6596 +[titan] 2025-06-13 14:49:43,036 - root - INFO - lr: 1.4050e-04 gnorm: 1.42 [ 2:08:12< 0:48:57] +[titan] 2025-06-13 14:49:46,411 - root - INFO - step: 10860 loss: 19.4232 memory: 6.46GiB(27.34%) tps: 24,273 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.3949 global_avg_mtp_loss: 16.0283 +[titan] 2025-06-13 14:49:46,411 - root - INFO - lr: 1.4030e-04 gnorm: 1.40 [ 2:08:16< 0:48:53] +[titan] 2025-06-13 14:49:49,728 - root - INFO - step: 10865 loss: 19.1455 memory: 6.46GiB(27.34%) tps: 24,705 tflops: 24.86 mfu: 7.97% global_avg_ntp_loss: 3.2839 global_avg_mtp_loss: 15.8616 +[titan] 2025-06-13 14:49:49,728 - root - INFO - lr: 1.4010e-04 gnorm: 1.25 [ 2:08:19< 0:48:50] +[titan] 2025-06-13 14:49:53,358 - root - INFO - step: 10870 loss: 19.3446 memory: 6.46GiB(27.34%) tps: 22,570 tflops: 22.71 mfu: 7.28% global_avg_ntp_loss: 3.4430 global_avg_mtp_loss: 15.9017 +[titan] 2025-06-13 14:49:53,358 - root - INFO - lr: 1.3989e-04 gnorm: 1.28 [ 2:08:23< 0:48:46] +[titan] 2025-06-13 14:49:56,824 - root - INFO - step: 10875 loss: 19.2673 memory: 6.46GiB(27.34%) tps: 23,636 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 3.2828 global_avg_mtp_loss: 15.9845 +[titan] 2025-06-13 14:49:56,824 - root - INFO - lr: 1.3969e-04 gnorm: 1.23 [ 2:08:26< 0:48:43] +[titan] 2025-06-13 14:50:00,331 - root - INFO - step: 10880 loss: 18.8837 memory: 6.46GiB(27.34%) tps: 23,362 tflops: 23.51 mfu: 7.54% global_avg_ntp_loss: 3.2035 global_avg_mtp_loss: 15.6802 +[titan] 2025-06-13 14:50:00,331 - root - INFO - lr: 1.3949e-04 gnorm: 
1.25 [ 2:08:30< 0:48:39] +[titan] 2025-06-13 14:50:03,779 - root - INFO - step: 10885 loss: 18.6921 memory: 6.46GiB(27.34%) tps: 23,762 tflops: 23.91 mfu: 7.66% global_avg_ntp_loss: 3.1985 global_avg_mtp_loss: 15.4936 +[titan] 2025-06-13 14:50:03,780 - root - INFO - lr: 1.3929e-04 gnorm: 1.22 [ 2:08:33< 0:48:36] +[titan] 2025-06-13 14:50:07,147 - root - INFO - step: 10890 loss: 18.6458 memory: 6.46GiB(27.34%) tps: 24,332 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 3.1904 global_avg_mtp_loss: 15.4553 +[titan] 2025-06-13 14:50:07,147 - root - INFO - lr: 1.3909e-04 gnorm: 1.24 [ 2:08:36< 0:48:32] +[titan] 2025-06-13 14:50:10,440 - root - INFO - step: 10895 loss: 16.9028 memory: 6.46GiB(27.34%) tps: 24,879 tflops: 25.04 mfu: 8.02% global_avg_ntp_loss: 2.8714 global_avg_mtp_loss: 14.0314 +[titan] 2025-06-13 14:50:10,440 - root - INFO - lr: 1.3889e-04 gnorm: 1.39 [ 2:08:40< 0:48:28] +[titan] 2025-06-13 14:50:13,226 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:50:13,881 - root - INFO - step: 10900 loss: 19.8391 memory: 6.46GiB(27.34%) tps: 23,807 tflops: 23.96 mfu: 7.68% global_avg_ntp_loss: 3.4571 global_avg_mtp_loss: 16.3820 +[titan] 2025-06-13 14:50:13,882 - root - INFO - lr: 1.3869e-04 gnorm: 1.21 [ 2:08:43< 0:48:25] +[titan] 2025-06-13 14:50:17,542 - root - INFO - step: 10905 loss: 19.5819 memory: 6.46GiB(27.34%) tps: 22,380 tflops: 22.52 mfu: 7.22% global_avg_ntp_loss: 3.4095 global_avg_mtp_loss: 16.1724 +[titan] 2025-06-13 14:50:17,542 - root - INFO - lr: 1.3848e-04 gnorm: 1.24 [ 2:08:47< 0:48:21] +[titan] 2025-06-13 14:50:21,876 - root - INFO - step: 10910 loss: 19.8284 memory: 6.46GiB(27.34%) tps: 18,905 tflops: 19.03 mfu: 6.10% global_avg_ntp_loss: 3.3809 global_avg_mtp_loss: 16.4475 +[titan] 2025-06-13 14:50:21,876 - root - INFO - lr: 1.3828e-04 gnorm: 1.34 [ 2:08:51< 0:48:18] +[titan] 2025-06-13 14:50:25,448 - root - INFO - step: 10915 loss: 18.6997 memory: 6.46GiB(27.34%) tps: 22,937 tflops: 23.08 mfu: 7.40% 
global_avg_ntp_loss: 3.1866 global_avg_mtp_loss: 15.5131 +[titan] 2025-06-13 14:50:25,448 - root - INFO - lr: 1.3808e-04 gnorm: 1.55 [ 2:08:55< 0:48:14] +[titan] 2025-06-13 14:50:28,749 - root - INFO - step: 10920 loss: 20.5765 memory: 6.46GiB(27.34%) tps: 24,822 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 3.5812 global_avg_mtp_loss: 16.9953 +[titan] 2025-06-13 14:50:28,749 - root - INFO - lr: 1.3788e-04 gnorm: 1.28 [ 2:08:58< 0:48:11] +[titan] 2025-06-13 14:50:32,209 - root - INFO - step: 10925 loss: 16.9454 memory: 6.46GiB(27.34%) tps: 23,679 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 2.9047 global_avg_mtp_loss: 14.0407 +[titan] 2025-06-13 14:50:32,209 - root - INFO - lr: 1.3768e-04 gnorm: 1.61 [ 2:09:01< 0:48:07] +[titan] 2025-06-13 14:50:35,663 - root - INFO - step: 10930 loss: 18.2224 memory: 6.46GiB(27.34%) tps: 23,717 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.1337 global_avg_mtp_loss: 15.0887 +[titan] 2025-06-13 14:50:35,664 - root - INFO - lr: 1.3748e-04 gnorm: 1.25 [ 2:09:05< 0:48:04] +[titan] 2025-06-13 14:50:39,473 - root - INFO - step: 10935 loss: 19.9006 memory: 6.46GiB(27.34%) tps: 21,507 tflops: 21.64 mfu: 6.94% global_avg_ntp_loss: 3.3813 global_avg_mtp_loss: 16.5193 +[titan] 2025-06-13 14:50:39,473 - root - INFO - lr: 1.3728e-04 gnorm: 1.22 [ 2:09:09< 0:48:00] +[titan] 2025-06-13 14:50:42,930 - root - INFO - step: 10940 loss: 19.5626 memory: 6.46GiB(27.34%) tps: 23,696 tflops: 23.85 mfu: 7.64% global_avg_ntp_loss: 3.4298 global_avg_mtp_loss: 16.1329 +[titan] 2025-06-13 14:50:42,931 - root - INFO - lr: 1.3708e-04 gnorm: 1.32 [ 2:09:12< 0:47:57] +[titan] 2025-06-13 14:50:46,661 - root - INFO - step: 10945 loss: 19.6488 memory: 6.46GiB(27.34%) tps: 21,960 tflops: 22.10 mfu: 7.08% global_avg_ntp_loss: 3.3915 global_avg_mtp_loss: 16.2573 +[titan] 2025-06-13 14:50:46,661 - root - INFO - lr: 1.3688e-04 gnorm: 1.16 [ 2:09:16< 0:47:53] +[titan] 2025-06-13 14:50:49,468 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:50:50,033 - root - INFO - step: 10950 loss: 19.2777 memory: 6.46GiB(27.34%) tps: 24,299 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.2867 global_avg_mtp_loss: 15.9910 +[titan] 2025-06-13 14:50:50,033 - root - INFO - lr: 1.3669e-04 gnorm: 1.20 [ 2:09:19< 0:47:50] +[titan] 2025-06-13 14:50:53,704 - root - INFO - step: 10955 loss: 19.5026 memory: 6.46GiB(27.34%) tps: 22,320 tflops: 22.46 mfu: 7.20% global_avg_ntp_loss: 3.3631 global_avg_mtp_loss: 16.1396 +[titan] 2025-06-13 14:50:53,704 - root - INFO - lr: 1.3649e-04 gnorm: 1.18 [ 2:09:23< 0:47:46] +[titan] 2025-06-13 14:50:56,980 - root - INFO - step: 10960 loss: 19.0017 memory: 6.46GiB(27.34%) tps: 25,007 tflops: 25.17 mfu: 8.07% global_avg_ntp_loss: 3.2707 global_avg_mtp_loss: 15.7310 +[titan] 2025-06-13 14:50:56,980 - root - INFO - lr: 1.3629e-04 gnorm: 1.16 [ 2:09:26< 0:47:42] +[titan] 2025-06-13 14:51:00,682 - root - INFO - step: 10965 loss: 17.8852 memory: 6.46GiB(27.34%) tps: 22,130 tflops: 22.27 mfu: 7.14% global_avg_ntp_loss: 3.0589 global_avg_mtp_loss: 14.8264 +[titan] 2025-06-13 14:51:00,682 - root - INFO - lr: 1.3609e-04 gnorm: 1.52 [ 2:09:30< 0:47:39] +[titan] 2025-06-13 14:51:04,407 - root - INFO - step: 10970 loss: 18.1815 memory: 6.46GiB(27.34%) tps: 21,993 tflops: 22.13 mfu: 7.09% global_avg_ntp_loss: 3.1029 global_avg_mtp_loss: 15.0787 +[titan] 2025-06-13 14:51:04,408 - root - INFO - lr: 1.3589e-04 gnorm: 1.53 [ 2:09:34< 0:47:35] +[titan] 2025-06-13 14:51:07,688 - root - INFO - step: 10975 loss: 17.4107 memory: 6.46GiB(27.34%) tps: 24,975 tflops: 25.13 mfu: 8.06% global_avg_ntp_loss: 2.9197 global_avg_mtp_loss: 14.4911 +[titan] 2025-06-13 14:51:07,688 - root - INFO - lr: 1.3569e-04 gnorm: 1.50 [ 2:09:37< 0:47:32] +[titan] 2025-06-13 14:51:11,604 - root - INFO - step: 10980 loss: 18.1654 memory: 6.46GiB(27.34%) tps: 20,924 tflops: 21.06 mfu: 6.75% global_avg_ntp_loss: 3.1261 global_avg_mtp_loss: 15.0392 +[titan] 2025-06-13 14:51:11,604 - root - INFO - lr: 1.3549e-04 gnorm: 
1.36 [ 2:09:41< 0:47:28] +[titan] 2025-06-13 14:51:15,898 - root - INFO - step: 10985 loss: 18.7038 memory: 6.46GiB(27.34%) tps: 19,078 tflops: 19.20 mfu: 6.15% global_avg_ntp_loss: 3.2154 global_avg_mtp_loss: 15.4883 +[titan] 2025-06-13 14:51:15,898 - root - INFO - lr: 1.3530e-04 gnorm: 1.22 [ 2:09:45< 0:47:25] +[titan] 2025-06-13 14:51:19,033 - root - INFO - step: 10990 loss: 19.9996 memory: 6.46GiB(27.34%) tps: 26,138 tflops: 26.30 mfu: 8.43% global_avg_ntp_loss: 3.4356 global_avg_mtp_loss: 16.5640 +[titan] 2025-06-13 14:51:19,033 - root - INFO - lr: 1.3510e-04 gnorm: 1.25 [ 2:09:48< 0:47:21] +[titan] 2025-06-13 14:51:22,850 - root - INFO - step: 10995 loss: 17.8101 memory: 6.46GiB(27.34%) tps: 21,462 tflops: 21.60 mfu: 6.92% global_avg_ntp_loss: 3.0662 global_avg_mtp_loss: 14.7438 +[titan] 2025-06-13 14:51:22,851 - root - INFO - lr: 1.3490e-04 gnorm: 1.20 [ 2:09:52< 0:47:18] +[titan] 2025-06-13 14:51:25,414 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:51:26,099 - root - INFO - step: 11000 loss: 19.8657 memory: 6.46GiB(27.34%) tps: 25,221 tflops: 25.38 mfu: 8.14% global_avg_ntp_loss: 3.4399 global_avg_mtp_loss: 16.4257 +[titan] 2025-06-13 14:51:26,099 - root - INFO - lr: 1.3470e-04 gnorm: 1.18 [ 2:09:55< 0:47:14] +[titan] 2025-06-13 14:51:29,944 - root - INFO - step: 11005 loss: 21.3098 memory: 6.46GiB(27.34%) tps: 21,308 tflops: 21.44 mfu: 6.87% global_avg_ntp_loss: 3.9121 global_avg_mtp_loss: 17.3978 +[titan] 2025-06-13 14:51:29,944 - root - INFO - lr: 1.3451e-04 gnorm: 1.76 [ 2:09:59< 0:47:11] +[titan] 2025-06-13 14:51:33,611 - root - INFO - step: 11010 loss: 20.0061 memory: 6.46GiB(27.34%) tps: 22,341 tflops: 22.48 mfu: 7.21% global_avg_ntp_loss: 3.4495 global_avg_mtp_loss: 16.5566 +[titan] 2025-06-13 14:51:33,611 - root - INFO - lr: 1.3431e-04 gnorm: 1.16 [ 2:10:03< 0:47:07] +[titan] 2025-06-13 14:51:36,817 - root - INFO - step: 11015 loss: 17.6368 memory: 6.46GiB(27.34%) tps: 25,554 tflops: 25.72 mfu: 8.24% 
global_avg_ntp_loss: 3.0237 global_avg_mtp_loss: 14.6131 +[titan] 2025-06-13 14:51:36,817 - root - INFO - lr: 1.3411e-04 gnorm: 1.42 [ 2:10:06< 0:47:04] +[titan] 2025-06-13 14:51:40,170 - root - INFO - step: 11020 loss: 19.5463 memory: 6.46GiB(27.34%) tps: 24,438 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.3960 global_avg_mtp_loss: 16.1503 +[titan] 2025-06-13 14:51:40,170 - root - INFO - lr: 1.3392e-04 gnorm: 1.30 [ 2:10:09< 0:47:00] +[titan] 2025-06-13 14:51:43,563 - root - INFO - step: 11025 loss: 20.0213 memory: 6.46GiB(27.34%) tps: 24,148 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.5526 global_avg_mtp_loss: 16.4688 +[titan] 2025-06-13 14:51:43,563 - root - INFO - lr: 1.3372e-04 gnorm: 1.54 [ 2:10:13< 0:46:57] +[titan] 2025-06-13 14:51:47,153 - root - INFO - step: 11030 loss: 19.2777 memory: 6.46GiB(27.34%) tps: 22,822 tflops: 22.97 mfu: 7.36% global_avg_ntp_loss: 3.3378 global_avg_mtp_loss: 15.9400 +[titan] 2025-06-13 14:51:47,153 - root - INFO - lr: 1.3352e-04 gnorm: 1.20 [ 2:10:16< 0:46:53] +[titan] 2025-06-13 14:51:50,373 - root - INFO - step: 11035 loss: 18.4794 memory: 6.46GiB(27.34%) tps: 25,443 tflops: 25.61 mfu: 8.21% global_avg_ntp_loss: 3.1645 global_avg_mtp_loss: 15.3149 +[titan] 2025-06-13 14:51:50,373 - root - INFO - lr: 1.3333e-04 gnorm: 1.54 [ 2:10:20< 0:46:49] +[titan] 2025-06-13 14:51:53,818 - root - INFO - step: 11040 loss: 19.5353 memory: 6.46GiB(27.34%) tps: 23,779 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 3.3353 global_avg_mtp_loss: 16.2000 +[titan] 2025-06-13 14:51:53,819 - root - INFO - lr: 1.3313e-04 gnorm: 1.12 [ 2:10:23< 0:46:46] +[titan] 2025-06-13 14:51:57,397 - root - INFO - step: 11045 loss: 19.8119 memory: 6.46GiB(27.34%) tps: 22,895 tflops: 23.04 mfu: 7.38% global_avg_ntp_loss: 3.4250 global_avg_mtp_loss: 16.3869 +[titan] 2025-06-13 14:51:57,397 - root - INFO - lr: 1.3293e-04 gnorm: 1.17 [ 2:10:27< 0:46:42] +[titan] 2025-06-13 14:51:59,974 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:52:00,516 - root - INFO - step: 11050 loss: 19.5868 memory: 6.46GiB(27.34%) tps: 26,271 tflops: 26.44 mfu: 8.47% global_avg_ntp_loss: 3.3667 global_avg_mtp_loss: 16.2202 +[titan] 2025-06-13 14:52:00,516 - root - INFO - lr: 1.3274e-04 gnorm: 1.33 [ 2:10:30< 0:46:39] +[titan] 2025-06-13 14:52:04,242 - root - INFO - step: 11055 loss: 19.0968 memory: 6.46GiB(27.34%) tps: 21,990 tflops: 22.13 mfu: 7.09% global_avg_ntp_loss: 3.3114 global_avg_mtp_loss: 15.7854 +[titan] 2025-06-13 14:52:04,242 - root - INFO - lr: 1.3254e-04 gnorm: 1.25 [ 2:10:34< 0:46:35] +[titan] 2025-06-13 14:52:07,885 - root - INFO - step: 11060 loss: 18.0182 memory: 6.46GiB(27.34%) tps: 22,485 tflops: 22.63 mfu: 7.25% global_avg_ntp_loss: 3.1253 global_avg_mtp_loss: 14.8929 +[titan] 2025-06-13 14:52:07,886 - root - INFO - lr: 1.3235e-04 gnorm: 1.18 [ 2:10:37< 0:46:32] +[titan] 2025-06-13 14:52:11,845 - root - INFO - step: 11065 loss: 18.5556 memory: 6.46GiB(27.34%) tps: 20,693 tflops: 20.82 mfu: 6.67% global_avg_ntp_loss: 3.1590 global_avg_mtp_loss: 15.3966 +[titan] 2025-06-13 14:52:11,845 - root - INFO - lr: 1.3215e-04 gnorm: 1.34 [ 2:10:41< 0:46:28] +[titan] 2025-06-13 14:52:15,440 - root - INFO - step: 11070 loss: 19.9944 memory: 6.46GiB(27.34%) tps: 22,788 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.4342 global_avg_mtp_loss: 16.5603 +[titan] 2025-06-13 14:52:15,441 - root - INFO - lr: 1.3196e-04 gnorm: 1.22 [ 2:10:45< 0:46:25] +[titan] 2025-06-13 14:52:19,292 - root - INFO - step: 11075 loss: 19.4073 memory: 6.46GiB(27.34%) tps: 21,271 tflops: 21.41 mfu: 6.86% global_avg_ntp_loss: 3.3336 global_avg_mtp_loss: 16.0737 +[titan] 2025-06-13 14:52:19,292 - root - INFO - lr: 1.3176e-04 gnorm: 1.20 [ 2:10:49< 0:46:21] +[titan] 2025-06-13 14:52:22,550 - root - INFO - step: 11080 loss: 20.1311 memory: 6.46GiB(27.34%) tps: 25,150 tflops: 25.31 mfu: 8.11% global_avg_ntp_loss: 3.4784 global_avg_mtp_loss: 16.6527 +[titan] 2025-06-13 14:52:22,550 - root - INFO - lr: 1.3157e-04 gnorm: 
1.26 [ 2:10:52< 0:46:18] +[titan] 2025-06-13 14:52:25,968 - root - INFO - step: 11085 loss: 18.6643 memory: 6.46GiB(27.34%) tps: 23,971 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.1937 global_avg_mtp_loss: 15.4705 +[titan] 2025-06-13 14:52:25,968 - root - INFO - lr: 1.3137e-04 gnorm: 1.33 [ 2:10:55< 0:46:14] +[titan] 2025-06-13 14:52:29,208 - root - INFO - step: 11090 loss: 19.2123 memory: 6.46GiB(27.34%) tps: 25,287 tflops: 25.45 mfu: 8.16% global_avg_ntp_loss: 3.2893 global_avg_mtp_loss: 15.9230 +[titan] 2025-06-13 14:52:29,208 - root - INFO - lr: 1.3118e-04 gnorm: 1.25 [ 2:10:58< 0:46:10] +[titan] 2025-06-13 14:52:32,702 - root - INFO - step: 11095 loss: 18.4340 memory: 6.46GiB(27.34%) tps: 23,450 tflops: 23.60 mfu: 7.56% global_avg_ntp_loss: 3.1766 global_avg_mtp_loss: 15.2574 +[titan] 2025-06-13 14:52:32,702 - root - INFO - lr: 1.3099e-04 gnorm: 1.24 [ 2:11:02< 0:46:07] +[titan] 2025-06-13 14:52:35,523 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:52:36,081 - root - INFO - step: 11100 loss: 20.0716 memory: 6.46GiB(27.34%) tps: 24,249 tflops: 24.40 mfu: 7.82% global_avg_ntp_loss: 3.4520 global_avg_mtp_loss: 16.6196 +[titan] 2025-06-13 14:52:36,081 - root - INFO - lr: 1.3079e-04 gnorm: 1.22 [ 2:11:05< 0:46:03] +[titan] 2025-06-13 14:52:39,540 - root - INFO - step: 11105 loss: 19.9072 memory: 6.46GiB(27.34%) tps: 23,684 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 3.4348 global_avg_mtp_loss: 16.4724 +[titan] 2025-06-13 14:52:39,540 - root - INFO - lr: 1.3060e-04 gnorm: 1.65 [ 2:11:09< 0:46:00] +[titan] 2025-06-13 14:52:42,991 - root - INFO - step: 11110 loss: 17.2593 memory: 6.46GiB(27.34%) tps: 23,744 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 2.9257 global_avg_mtp_loss: 14.3336 +[titan] 2025-06-13 14:52:42,991 - root - INFO - lr: 1.3041e-04 gnorm: 1.43 [ 2:11:12< 0:45:56] +[titan] 2025-06-13 14:52:46,486 - root - INFO - step: 11115 loss: 19.9838 memory: 6.46GiB(27.34%) tps: 23,440 tflops: 23.59 mfu: 7.56% 
global_avg_ntp_loss: 3.4704 global_avg_mtp_loss: 16.5134 +[titan] 2025-06-13 14:52:46,487 - root - INFO - lr: 1.3021e-04 gnorm: 1.29 [ 2:11:16< 0:45:52] +[titan] 2025-06-13 14:52:49,893 - root - INFO - step: 11120 loss: 19.6168 memory: 6.46GiB(27.34%) tps: 24,047 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.3743 global_avg_mtp_loss: 16.2425 +[titan] 2025-06-13 14:52:49,894 - root - INFO - lr: 1.3002e-04 gnorm: 1.23 [ 2:11:19< 0:45:49] +[titan] 2025-06-13 14:52:53,226 - root - INFO - step: 11125 loss: 19.3835 memory: 6.46GiB(27.34%) tps: 24,583 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.4053 global_avg_mtp_loss: 15.9781 +[titan] 2025-06-13 14:52:53,226 - root - INFO - lr: 1.2983e-04 gnorm: 1.40 [ 2:11:22< 0:45:45] +[titan] 2025-06-13 14:52:56,581 - root - INFO - step: 11130 loss: 16.4130 memory: 6.46GiB(27.34%) tps: 24,419 tflops: 24.57 mfu: 7.88% global_avg_ntp_loss: 2.8272 global_avg_mtp_loss: 13.5857 +[titan] 2025-06-13 14:52:56,582 - root - INFO - lr: 1.2963e-04 gnorm: 1.65 [ 2:11:26< 0:45:42] +[titan] 2025-06-13 14:53:00,326 - root - INFO - step: 11135 loss: 18.6834 memory: 6.46GiB(27.34%) tps: 21,880 tflops: 22.02 mfu: 7.06% global_avg_ntp_loss: 3.1976 global_avg_mtp_loss: 15.4858 +[titan] 2025-06-13 14:53:00,326 - root - INFO - lr: 1.2944e-04 gnorm: 1.29 [ 2:11:30< 0:45:38] +[titan] 2025-06-13 14:53:03,421 - root - INFO - step: 11140 loss: 19.1392 memory: 6.46GiB(27.34%) tps: 26,469 tflops: 26.64 mfu: 8.54% global_avg_ntp_loss: 3.2841 global_avg_mtp_loss: 15.8552 +[titan] 2025-06-13 14:53:03,422 - root - INFO - lr: 1.2925e-04 gnorm: 1.31 [ 2:11:33< 0:45:34] +[titan] 2025-06-13 14:53:07,054 - root - INFO - step: 11145 loss: 19.1480 memory: 6.46GiB(27.34%) tps: 22,553 tflops: 22.70 mfu: 7.27% global_avg_ntp_loss: 3.3180 global_avg_mtp_loss: 15.8301 +[titan] 2025-06-13 14:53:07,054 - root - INFO - lr: 1.2906e-04 gnorm: 1.22 [ 2:11:36< 0:45:31] +[titan] 2025-06-13 14:53:10,190 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:53:10,772 - root - INFO - step: 11150 loss: 19.5933 memory: 6.46GiB(27.34%) tps: 22,035 tflops: 22.18 mfu: 7.11% global_avg_ntp_loss: 3.3932 global_avg_mtp_loss: 16.2001 +[titan] 2025-06-13 14:53:10,773 - root - INFO - lr: 1.2886e-04 gnorm: 1.24 [ 2:11:40< 0:45:27] +[titan] 2025-06-13 14:53:14,060 - root - INFO - step: 11155 loss: 19.6334 memory: 6.46GiB(27.34%) tps: 24,921 tflops: 25.08 mfu: 8.04% global_avg_ntp_loss: 3.3450 global_avg_mtp_loss: 16.2885 +[titan] 2025-06-13 14:53:14,060 - root - INFO - lr: 1.2867e-04 gnorm: 1.27 [ 2:11:43< 0:45:24] +[titan] 2025-06-13 14:53:17,776 - root - INFO - step: 11160 loss: 18.6271 memory: 6.46GiB(27.34%) tps: 22,050 tflops: 22.19 mfu: 7.11% global_avg_ntp_loss: 3.2166 global_avg_mtp_loss: 15.4104 +[titan] 2025-06-13 14:53:17,776 - root - INFO - lr: 1.2848e-04 gnorm: 1.43 [ 2:11:47< 0:45:20] +[titan] 2025-06-13 14:53:21,123 - root - INFO - step: 11165 loss: 19.7850 memory: 6.46GiB(27.34%) tps: 24,478 tflops: 24.63 mfu: 7.90% global_avg_ntp_loss: 3.4605 global_avg_mtp_loss: 16.3245 +[titan] 2025-06-13 14:53:21,123 - root - INFO - lr: 1.2829e-04 gnorm: 1.25 [ 2:11:50< 0:45:17] +[titan] 2025-06-13 14:53:24,627 - root - INFO - step: 11170 loss: 17.6438 memory: 6.46GiB(27.34%) tps: 23,384 tflops: 23.53 mfu: 7.54% global_avg_ntp_loss: 3.0250 global_avg_mtp_loss: 14.6188 +[titan] 2025-06-13 14:53:24,627 - root - INFO - lr: 1.2810e-04 gnorm: 1.31 [ 2:11:54< 0:45:13] +[titan] 2025-06-13 14:53:28,082 - root - INFO - step: 11175 loss: 18.4386 memory: 6.46GiB(27.34%) tps: 23,715 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.1448 global_avg_mtp_loss: 15.2938 +[titan] 2025-06-13 14:53:28,082 - root - INFO - lr: 1.2791e-04 gnorm: 1.40 [ 2:11:57< 0:45:10] +[titan] 2025-06-13 14:53:31,184 - root - INFO - step: 11180 loss: 16.1126 memory: 6.46GiB(27.34%) tps: 26,411 tflops: 26.58 mfu: 8.52% global_avg_ntp_loss: 2.7130 global_avg_mtp_loss: 13.3996 +[titan] 2025-06-13 14:53:31,184 - root - INFO - lr: 1.2772e-04 gnorm: 
1.54 [ 2:12:00< 0:45:06] +[titan] 2025-06-13 14:53:34,441 - root - INFO - step: 11185 loss: 19.8790 memory: 6.46GiB(27.34%) tps: 25,153 tflops: 25.31 mfu: 8.11% global_avg_ntp_loss: 3.4210 global_avg_mtp_loss: 16.4579 +[titan] 2025-06-13 14:53:34,441 - root - INFO - lr: 1.2752e-04 gnorm: 1.25 [ 2:12:04< 0:45:02] +[titan] 2025-06-13 14:53:38,137 - root - INFO - step: 11190 loss: 19.1059 memory: 6.46GiB(27.34%) tps: 22,167 tflops: 22.31 mfu: 7.15% global_avg_ntp_loss: 3.2726 global_avg_mtp_loss: 15.8332 +[titan] 2025-06-13 14:53:38,137 - root - INFO - lr: 1.2733e-04 gnorm: 1.25 [ 2:12:07< 0:44:59] +[titan] 2025-06-13 14:53:41,380 - root - INFO - step: 11195 loss: 19.4891 memory: 6.46GiB(27.34%) tps: 25,263 tflops: 25.42 mfu: 8.15% global_avg_ntp_loss: 3.3219 global_avg_mtp_loss: 16.1671 +[titan] 2025-06-13 14:53:41,381 - root - INFO - lr: 1.2714e-04 gnorm: 1.26 [ 2:12:11< 0:44:55] +[titan] 2025-06-13 14:53:44,341 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:53:44,964 - root - INFO - step: 11200 loss: 19.7786 memory: 6.46GiB(27.34%) tps: 22,863 tflops: 23.01 mfu: 7.37% global_avg_ntp_loss: 3.4606 global_avg_mtp_loss: 16.3180 +[titan] 2025-06-13 14:53:44,964 - root - INFO - lr: 1.2695e-04 gnorm: 1.28 [ 2:12:14< 0:44:52] +[titan] 2025-06-13 14:53:48,795 - root - INFO - step: 11205 loss: 19.4209 memory: 6.46GiB(27.34%) tps: 21,386 tflops: 21.52 mfu: 6.90% global_avg_ntp_loss: 3.3395 global_avg_mtp_loss: 16.0814 +[titan] 2025-06-13 14:53:48,795 - root - INFO - lr: 1.2676e-04 gnorm: 1.20 [ 2:12:18< 0:44:48] +[titan] 2025-06-13 14:53:52,369 - root - INFO - step: 11210 loss: 18.2649 memory: 6.46GiB(27.34%) tps: 22,921 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 3.1880 global_avg_mtp_loss: 15.0769 +[titan] 2025-06-13 14:53:52,369 - root - INFO - lr: 1.2657e-04 gnorm: 1.22 [ 2:12:22< 0:44:45] +[titan] 2025-06-13 14:53:56,331 - root - INFO - step: 11215 loss: 19.3543 memory: 6.46GiB(27.34%) tps: 20,681 tflops: 20.81 mfu: 6.67% 
global_avg_ntp_loss: 3.3238 global_avg_mtp_loss: 16.0304 +[titan] 2025-06-13 14:53:56,331 - root - INFO - lr: 1.2638e-04 gnorm: 1.16 [ 2:12:26< 0:44:41] +[titan] 2025-06-13 14:53:59,738 - root - INFO - step: 11220 loss: 19.8788 memory: 6.46GiB(27.34%) tps: 24,045 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.4040 global_avg_mtp_loss: 16.4748 +[titan] 2025-06-13 14:53:59,738 - root - INFO - lr: 1.2619e-04 gnorm: 1.18 [ 2:12:29< 0:44:38] +[titan] 2025-06-13 14:54:03,228 - root - INFO - step: 11225 loss: 19.9252 memory: 6.46GiB(27.34%) tps: 23,475 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 3.4266 global_avg_mtp_loss: 16.4986 +[titan] 2025-06-13 14:54:03,229 - root - INFO - lr: 1.2601e-04 gnorm: 1.34 [ 2:12:32< 0:44:34] +[titan] 2025-06-13 14:54:06,639 - root - INFO - step: 11230 loss: 19.2308 memory: 6.46GiB(27.34%) tps: 24,021 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.3236 global_avg_mtp_loss: 15.9072 +[titan] 2025-06-13 14:54:06,639 - root - INFO - lr: 1.2582e-04 gnorm: 1.31 [ 2:12:36< 0:44:31] +[titan] 2025-06-13 14:54:10,865 - root - INFO - step: 11235 loss: 19.8491 memory: 6.46GiB(27.34%) tps: 19,386 tflops: 19.51 mfu: 6.25% global_avg_ntp_loss: 3.4200 global_avg_mtp_loss: 16.4290 +[titan] 2025-06-13 14:54:10,866 - root - INFO - lr: 1.2563e-04 gnorm: 1.23 [ 2:12:40< 0:44:27] +[titan] 2025-06-13 14:54:14,341 - root - INFO - step: 11240 loss: 17.5653 memory: 6.46GiB(27.34%) tps: 23,573 tflops: 23.72 mfu: 7.60% global_avg_ntp_loss: 3.0181 global_avg_mtp_loss: 14.5473 +[titan] 2025-06-13 14:54:14,341 - root - INFO - lr: 1.2544e-04 gnorm: 1.62 [ 2:12:44< 0:44:24] +[titan] 2025-06-13 14:54:17,840 - root - INFO - step: 11245 loss: 18.3074 memory: 6.46GiB(27.34%) tps: 23,411 tflops: 23.56 mfu: 7.55% global_avg_ntp_loss: 3.1614 global_avg_mtp_loss: 15.1460 +[titan] 2025-06-13 14:54:17,841 - root - INFO - lr: 1.2525e-04 gnorm: 1.29 [ 2:12:47< 0:44:20] +[titan] 2025-06-13 14:54:20,991 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:54:21,640 - root - INFO - step: 11250 loss: 19.1006 memory: 6.46GiB(27.34%) tps: 21,565 tflops: 21.70 mfu: 6.96% global_avg_ntp_loss: 3.2801 global_avg_mtp_loss: 15.8206 +[titan] 2025-06-13 14:54:21,640 - root - INFO - lr: 1.2506e-04 gnorm: 1.34 [ 2:12:51< 0:44:17] +[titan] 2025-06-13 14:54:24,819 - root - INFO - step: 11255 loss: 19.4048 memory: 6.46GiB(27.34%) tps: 25,771 tflops: 25.94 mfu: 8.31% global_avg_ntp_loss: 3.3007 global_avg_mtp_loss: 16.1040 +[titan] 2025-06-13 14:54:24,819 - root - INFO - lr: 1.2487e-04 gnorm: 1.32 [ 2:12:54< 0:44:13] +[titan] 2025-06-13 14:54:28,396 - root - INFO - step: 11260 loss: 19.3801 memory: 6.46GiB(27.34%) tps: 22,902 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.3343 global_avg_mtp_loss: 16.0458 +[titan] 2025-06-13 14:54:28,397 - root - INFO - lr: 1.2469e-04 gnorm: 1.14 [ 2:12:58< 0:44:09] +[titan] 2025-06-13 14:54:31,040 - root - INFO - Dumping profiler traces at step 11264 +[titan] 2025-06-13 14:54:31,136 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 14:54:31,681 - root - INFO - step: 11265 loss: 19.3300 memory: 6.46GiB(27.34%) tps: 24,942 tflops: 25.10 mfu: 8.05% global_avg_ntp_loss: 3.3670 global_avg_mtp_loss: 15.9630 +[titan] 2025-06-13 14:54:31,681 - root - INFO - lr: 1.2450e-04 gnorm: 1.35 [ 2:13:01< 0:44:06] +[titan] 2025-06-13 14:54:36,059 - root - INFO - step: 11270 loss: 19.7201 memory: 6.46GiB(27.34%) tps: 18,713 tflops: 18.83 mfu: 6.04% global_avg_ntp_loss: 3.3877 global_avg_mtp_loss: 16.3324 +[titan] 2025-06-13 14:54:36,060 - root - INFO - lr: 1.2431e-04 gnorm: 1.26 [ 2:13:05< 0:44:03] +[titan] 2025-06-13 14:54:40,078 - root - INFO - step: 11275 loss: 20.0189 memory: 6.46GiB(27.34%) tps: 20,384 tflops: 20.51 mfu: 6.58% global_avg_ntp_loss: 3.4748 global_avg_mtp_loss: 16.5441 +[titan] 2025-06-13 14:54:40,079 - root - INFO - lr: 1.2412e-04 gnorm: 1.25 [ 2:13:09< 0:43:59] +[titan] 2025-06-13 14:54:43,837 - root - INFO - step: 11280 loss: 19.4803 
memory: 6.46GiB(27.34%) tps: 21,799 tflops: 21.94 mfu: 7.03% global_avg_ntp_loss: 3.3177 global_avg_mtp_loss: 16.1626 +[titan] 2025-06-13 14:54:43,837 - root - INFO - lr: 1.2394e-04 gnorm: 1.27 [ 2:13:13< 0:43:56] +[titan] 2025-06-13 14:54:47,978 - root - INFO - step: 11285 loss: 19.1385 memory: 6.46GiB(27.34%) tps: 19,784 tflops: 19.91 mfu: 6.38% global_avg_ntp_loss: 3.2967 global_avg_mtp_loss: 15.8419 +[titan] 2025-06-13 14:54:47,978 - root - INFO - lr: 1.2375e-04 gnorm: 1.28 [ 2:13:17< 0:43:52] +[titan] 2025-06-13 14:54:51,687 - root - INFO - step: 11290 loss: 20.3827 memory: 6.46GiB(27.34%) tps: 22,089 tflops: 22.23 mfu: 7.12% global_avg_ntp_loss: 3.5582 global_avg_mtp_loss: 16.8245 +[titan] 2025-06-13 14:54:51,687 - root - INFO - lr: 1.2356e-04 gnorm: 1.34 [ 2:13:21< 0:43:49] +[titan] 2025-06-13 14:54:55,397 - root - INFO - step: 11295 loss: 17.8358 memory: 6.46GiB(27.34%) tps: 22,082 tflops: 22.22 mfu: 7.12% global_avg_ntp_loss: 3.0393 global_avg_mtp_loss: 14.7965 +[titan] 2025-06-13 14:54:55,398 - root - INFO - lr: 1.2338e-04 gnorm: 1.35 [ 2:13:25< 0:43:45] +[titan] 2025-06-13 14:54:58,054 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:54:59,080 - root - INFO - step: 11300 loss: 20.1518 memory: 6.46GiB(27.34%) tps: 22,246 tflops: 22.39 mfu: 7.18% global_avg_ntp_loss: 3.4898 global_avg_mtp_loss: 16.6620 +[titan] 2025-06-13 14:54:59,081 - root - INFO - lr: 1.2319e-04 gnorm: 1.16 [ 2:13:28< 0:43:42] +[titan] 2025-06-13 14:55:02,726 - root - INFO - step: 11305 loss: 18.3561 memory: 6.46GiB(27.34%) tps: 22,475 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.2101 global_avg_mtp_loss: 15.1460 +[titan] 2025-06-13 14:55:02,726 - root - INFO - lr: 1.2300e-04 gnorm: 1.30 [ 2:13:32< 0:43:38] +[titan] 2025-06-13 14:55:06,295 - root - INFO - step: 11310 loss: 19.3400 memory: 6.46GiB(27.34%) tps: 22,952 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.2990 global_avg_mtp_loss: 16.0410 +[titan] 2025-06-13 14:55:06,295 - root - INFO - lr: 1.2282e-04 gnorm: 1.21 [ 2:13:36< 0:43:35] +[titan] 2025-06-13 14:55:09,778 - root - INFO - step: 11315 loss: 20.4489 memory: 6.46GiB(27.34%) tps: 23,528 tflops: 23.68 mfu: 7.59% global_avg_ntp_loss: 3.5212 global_avg_mtp_loss: 16.9276 +[titan] 2025-06-13 14:55:09,778 - root - INFO - lr: 1.2263e-04 gnorm: 1.28 [ 2:13:39< 0:43:31] +[titan] 2025-06-13 14:55:13,093 - root - INFO - step: 11320 loss: 18.2370 memory: 6.46GiB(27.34%) tps: 24,708 tflops: 24.87 mfu: 7.97% global_avg_ntp_loss: 3.0839 global_avg_mtp_loss: 15.1530 +[titan] 2025-06-13 14:55:13,094 - root - INFO - lr: 1.2245e-04 gnorm: 1.29 [ 2:13:42< 0:43:28] +[titan] 2025-06-13 14:55:16,516 - root - INFO - step: 11325 loss: 19.3873 memory: 6.46GiB(27.34%) tps: 23,940 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.3305 global_avg_mtp_loss: 16.0567 +[titan] 2025-06-13 14:55:16,516 - root - INFO - lr: 1.2226e-04 gnorm: 1.23 [ 2:13:46< 0:43:24] +[titan] 2025-06-13 14:55:20,130 - root - INFO - step: 11330 loss: 19.8957 memory: 6.46GiB(27.34%) tps: 22,667 tflops: 22.81 mfu: 7.31% global_avg_ntp_loss: 3.4395 global_avg_mtp_loss: 16.4562 +[titan] 2025-06-13 14:55:20,131 - root - INFO - lr: 1.2207e-04 gnorm: 
1.24 [ 2:13:49< 0:43:21] +[titan] 2025-06-13 14:55:23,285 - root - INFO - step: 11335 loss: 17.5045 memory: 6.46GiB(27.34%) tps: 25,974 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.0606 global_avg_mtp_loss: 14.4439 +[titan] 2025-06-13 14:55:23,285 - root - INFO - lr: 1.2189e-04 gnorm: 1.35 [ 2:13:53< 0:43:17] +[titan] 2025-06-13 14:55:26,564 - root - INFO - step: 11340 loss: 17.1155 memory: 6.46GiB(27.34%) tps: 24,989 tflops: 25.15 mfu: 8.06% global_avg_ntp_loss: 2.9462 global_avg_mtp_loss: 14.1693 +[titan] 2025-06-13 14:55:26,564 - root - INFO - lr: 1.2170e-04 gnorm: 1.35 [ 2:13:56< 0:43:13] +[titan] 2025-06-13 14:55:30,082 - root - INFO - step: 11345 loss: 19.2306 memory: 6.46GiB(27.34%) tps: 23,286 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.2633 global_avg_mtp_loss: 15.9673 +[titan] 2025-06-13 14:55:30,082 - root - INFO - lr: 1.2152e-04 gnorm: 1.28 [ 2:13:59< 0:43:10] +[titan] 2025-06-13 14:55:32,771 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:55:33,459 - root - INFO - step: 11350 loss: 19.6184 memory: 6.46GiB(27.34%) tps: 24,264 tflops: 24.42 mfu: 7.83% global_avg_ntp_loss: 3.3571 global_avg_mtp_loss: 16.2614 +[titan] 2025-06-13 14:55:33,459 - root - INFO - lr: 1.2134e-04 gnorm: 1.17 [ 2:14:03< 0:43:06] +[titan] 2025-06-13 14:55:36,890 - root - INFO - step: 11355 loss: 18.6890 memory: 6.46GiB(27.34%) tps: 23,880 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.1384 global_avg_mtp_loss: 15.5505 +[titan] 2025-06-13 14:55:36,890 - root - INFO - lr: 1.2115e-04 gnorm: 1.38 [ 2:14:06< 0:43:02] +[titan] 2025-06-13 14:55:40,278 - root - INFO - step: 11360 loss: 19.0388 memory: 6.46GiB(27.34%) tps: 24,178 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.2656 global_avg_mtp_loss: 15.7731 +[titan] 2025-06-13 14:55:40,279 - root - INFO - lr: 1.2097e-04 gnorm: 1.33 [ 2:14:10< 0:42:59] +[titan] 2025-06-13 14:55:43,618 - root - INFO - step: 11365 loss: 16.3699 memory: 6.46GiB(27.34%) tps: 24,531 tflops: 24.69 mfu: 7.91% 
global_avg_ntp_loss: 2.7629 global_avg_mtp_loss: 13.6070 +[titan] 2025-06-13 14:55:43,618 - root - INFO - lr: 1.2078e-04 gnorm: 1.56 [ 2:14:13< 0:42:55] +[titan] 2025-06-13 14:55:47,251 - root - INFO - step: 11370 loss: 19.8988 memory: 6.46GiB(27.34%) tps: 22,551 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.4694 global_avg_mtp_loss: 16.4294 +[titan] 2025-06-13 14:55:47,252 - root - INFO - lr: 1.2060e-04 gnorm: 1.40 [ 2:14:16< 0:42:52] +[titan] 2025-06-13 14:55:50,958 - root - INFO - step: 11375 loss: 18.9712 memory: 6.46GiB(27.34%) tps: 22,106 tflops: 22.25 mfu: 7.13% global_avg_ntp_loss: 3.2648 global_avg_mtp_loss: 15.7064 +[titan] 2025-06-13 14:55:50,958 - root - INFO - lr: 1.2042e-04 gnorm: 1.21 [ 2:14:20< 0:42:48] +[titan] 2025-06-13 14:55:54,304 - root - INFO - step: 11380 loss: 17.3099 memory: 6.46GiB(27.34%) tps: 24,486 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 2.9866 global_avg_mtp_loss: 14.3233 +[titan] 2025-06-13 14:55:54,304 - root - INFO - lr: 1.2023e-04 gnorm: 1.64 [ 2:14:24< 0:42:45] +[titan] 2025-06-13 14:55:58,030 - root - INFO - step: 11385 loss: 18.3927 memory: 6.46GiB(27.34%) tps: 21,988 tflops: 22.13 mfu: 7.09% global_avg_ntp_loss: 3.1577 global_avg_mtp_loss: 15.2350 +[titan] 2025-06-13 14:55:58,030 - root - INFO - lr: 1.2005e-04 gnorm: 1.27 [ 2:14:27< 0:42:41] +[titan] 2025-06-13 14:56:01,888 - root - INFO - step: 11390 loss: 19.3940 memory: 6.46GiB(27.34%) tps: 21,237 tflops: 21.37 mfu: 6.85% global_avg_ntp_loss: 3.3120 global_avg_mtp_loss: 16.0820 +[titan] 2025-06-13 14:56:01,888 - root - INFO - lr: 1.1987e-04 gnorm: 1.23 [ 2:14:31< 0:42:38] +[titan] 2025-06-13 14:56:05,111 - root - INFO - step: 11395 loss: 16.1814 memory: 6.46GiB(27.34%) tps: 25,422 tflops: 25.58 mfu: 8.20% global_avg_ntp_loss: 2.7902 global_avg_mtp_loss: 13.3912 +[titan] 2025-06-13 14:56:05,111 - root - INFO - lr: 1.1968e-04 gnorm: 1.48 [ 2:14:34< 0:42:34] +[titan] 2025-06-13 14:56:07,983 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:56:08,557 - root - INFO - step: 11400 loss: 19.0557 memory: 6.46GiB(27.34%) tps: 23,772 tflops: 23.92 mfu: 7.67% global_avg_ntp_loss: 3.2741 global_avg_mtp_loss: 15.7815 +[titan] 2025-06-13 14:56:08,558 - root - INFO - lr: 1.1950e-04 gnorm: 1.15 [ 2:14:38< 0:42:31] +[titan] 2025-06-13 14:56:11,894 - root - INFO - step: 11405 loss: 19.8673 memory: 6.46GiB(27.34%) tps: 24,555 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.4012 global_avg_mtp_loss: 16.4661 +[titan] 2025-06-13 14:56:11,894 - root - INFO - lr: 1.1932e-04 gnorm: 1.36 [ 2:14:41< 0:42:27] +[titan] 2025-06-13 14:56:15,432 - root - INFO - step: 11410 loss: 19.4585 memory: 6.46GiB(27.34%) tps: 23,160 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.3557 global_avg_mtp_loss: 16.1028 +[titan] 2025-06-13 14:56:15,432 - root - INFO - lr: 1.1914e-04 gnorm: 1.29 [ 2:14:45< 0:42:23] +[titan] 2025-06-13 14:56:19,321 - root - INFO - step: 11415 loss: 19.8835 memory: 6.46GiB(27.34%) tps: 21,068 tflops: 21.20 mfu: 6.80% global_avg_ntp_loss: 3.4523 global_avg_mtp_loss: 16.4312 +[titan] 2025-06-13 14:56:19,321 - root - INFO - lr: 1.1896e-04 gnorm: 1.22 [ 2:14:49< 0:42:20] +[titan] 2025-06-13 14:56:22,779 - root - INFO - step: 11420 loss: 17.7038 memory: 6.46GiB(27.34%) tps: 23,694 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 3.0132 global_avg_mtp_loss: 14.6906 +[titan] 2025-06-13 14:56:22,779 - root - INFO - lr: 1.1877e-04 gnorm: 1.30 [ 2:14:52< 0:42:16] +[titan] 2025-06-13 14:56:26,854 - root - INFO - step: 11425 loss: 19.7394 memory: 6.46GiB(27.34%) tps: 20,105 tflops: 20.23 mfu: 6.49% global_avg_ntp_loss: 3.3921 global_avg_mtp_loss: 16.3473 +[titan] 2025-06-13 14:56:26,854 - root - INFO - lr: 1.1859e-04 gnorm: 1.23 [ 2:14:56< 0:42:13] +[titan] 2025-06-13 14:56:30,013 - root - INFO - step: 11430 loss: 19.5772 memory: 6.46GiB(27.34%) tps: 25,934 tflops: 26.10 mfu: 8.37% global_avg_ntp_loss: 3.3559 global_avg_mtp_loss: 16.2213 +[titan] 2025-06-13 14:56:30,014 - root - INFO - lr: 1.1841e-04 gnorm: 
1.49 [ 2:14:59< 0:42:09] +[titan] 2025-06-13 14:56:33,160 - root - INFO - step: 11435 loss: 18.7849 memory: 6.46GiB(27.34%) tps: 26,034 tflops: 26.20 mfu: 8.40% global_avg_ntp_loss: 3.2476 global_avg_mtp_loss: 15.5373 +[titan] 2025-06-13 14:56:33,161 - root - INFO - lr: 1.1823e-04 gnorm: 1.33 [ 2:15:02< 0:42:06] +[titan] 2025-06-13 14:56:36,895 - root - INFO - step: 11440 loss: 19.0818 memory: 6.46GiB(27.34%) tps: 21,939 tflops: 22.08 mfu: 7.08% global_avg_ntp_loss: 3.3084 global_avg_mtp_loss: 15.7734 +[titan] 2025-06-13 14:56:36,895 - root - INFO - lr: 1.1805e-04 gnorm: 1.24 [ 2:15:06< 0:42:02] +[titan] 2025-06-13 14:56:40,557 - root - INFO - step: 11445 loss: 20.2509 memory: 6.46GiB(27.34%) tps: 22,374 tflops: 22.52 mfu: 7.22% global_avg_ntp_loss: 3.5273 global_avg_mtp_loss: 16.7236 +[titan] 2025-06-13 14:56:40,557 - root - INFO - lr: 1.1787e-04 gnorm: 1.44 [ 2:15:10< 0:41:59] +[titan] 2025-06-13 14:56:43,088 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:56:43,651 - root - INFO - step: 11450 loss: 19.0087 memory: 6.46GiB(27.34%) tps: 26,480 tflops: 26.65 mfu: 8.54% global_avg_ntp_loss: 3.2459 global_avg_mtp_loss: 15.7628 +[titan] 2025-06-13 14:56:43,651 - root - INFO - lr: 1.1769e-04 gnorm: 1.17 [ 2:15:13< 0:41:55] +[titan] 2025-06-13 14:56:47,302 - root - INFO - step: 11455 loss: 17.8600 memory: 6.46GiB(27.34%) tps: 22,438 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 3.0118 global_avg_mtp_loss: 14.8482 +[titan] 2025-06-13 14:56:47,303 - root - INFO - lr: 1.1751e-04 gnorm: 1.47 [ 2:15:17< 0:41:51] +[titan] 2025-06-13 14:56:51,326 - root - INFO - step: 11460 loss: 19.3925 memory: 6.46GiB(27.34%) tps: 20,361 tflops: 20.49 mfu: 6.57% global_avg_ntp_loss: 3.2977 global_avg_mtp_loss: 16.0948 +[titan] 2025-06-13 14:56:51,326 - root - INFO - lr: 1.1733e-04 gnorm: 1.41 [ 2:15:21< 0:41:48] +[titan] 2025-06-13 14:56:54,567 - root - INFO - step: 11465 loss: 19.5756 memory: 6.46GiB(27.34%) tps: 25,277 tflops: 25.44 mfu: 8.15% 
global_avg_ntp_loss: 3.3318 global_avg_mtp_loss: 16.2438 +[titan] 2025-06-13 14:56:54,568 - root - INFO - lr: 1.1715e-04 gnorm: 1.29 [ 2:15:24< 0:41:44] +[titan] 2025-06-13 14:56:57,981 - root - INFO - step: 11470 loss: 19.1918 memory: 6.46GiB(27.34%) tps: 24,006 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.2930 global_avg_mtp_loss: 15.8988 +[titan] 2025-06-13 14:56:57,981 - root - INFO - lr: 1.1697e-04 gnorm: 1.21 [ 2:15:27< 0:41:41] +[titan] 2025-06-13 14:57:01,872 - root - INFO - step: 11475 loss: 19.2821 memory: 6.46GiB(27.34%) tps: 21,055 tflops: 21.19 mfu: 6.79% global_avg_ntp_loss: 3.3062 global_avg_mtp_loss: 15.9759 +[titan] 2025-06-13 14:57:01,872 - root - INFO - lr: 1.1679e-04 gnorm: 1.16 [ 2:15:31< 0:41:37] +[titan] 2025-06-13 14:57:05,334 - root - INFO - step: 11480 loss: 18.1394 memory: 6.46GiB(27.34%) tps: 23,666 tflops: 23.82 mfu: 7.63% global_avg_ntp_loss: 3.0578 global_avg_mtp_loss: 15.0815 +[titan] 2025-06-13 14:57:05,334 - root - INFO - lr: 1.1661e-04 gnorm: 1.42 [ 2:15:35< 0:41:34] +[titan] 2025-06-13 14:57:08,855 - root - INFO - step: 11485 loss: 19.9391 memory: 6.46GiB(27.34%) tps: 23,270 tflops: 23.42 mfu: 7.51% global_avg_ntp_loss: 3.4460 global_avg_mtp_loss: 16.4931 +[titan] 2025-06-13 14:57:08,855 - root - INFO - lr: 1.1643e-04 gnorm: 1.29 [ 2:15:38< 0:41:30] +[titan] 2025-06-13 14:57:12,214 - root - INFO - step: 11490 loss: 20.4999 memory: 6.46GiB(27.34%) tps: 24,391 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 3.5748 global_avg_mtp_loss: 16.9251 +[titan] 2025-06-13 14:57:12,214 - root - INFO - lr: 1.1625e-04 gnorm: 1.27 [ 2:15:41< 0:41:27] +[titan] 2025-06-13 14:57:15,848 - root - INFO - step: 11495 loss: 20.0698 memory: 6.46GiB(27.34%) tps: 22,544 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.4495 global_avg_mtp_loss: 16.6203 +[titan] 2025-06-13 14:57:15,848 - root - INFO - lr: 1.1607e-04 gnorm: 1.18 [ 2:15:45< 0:41:23] +[titan] 2025-06-13 14:57:18,620 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:57:19,584 - root - INFO - step: 11500 loss: 18.6696 memory: 6.46GiB(27.34%) tps: 21,932 tflops: 22.07 mfu: 7.07% global_avg_ntp_loss: 3.2295 global_avg_mtp_loss: 15.4401 +[titan] 2025-06-13 14:57:19,584 - root - INFO - lr: 1.1589e-04 gnorm: 1.38 [ 2:15:49< 0:41:20] +[titan] 2025-06-13 14:57:22,931 - root - INFO - step: 11505 loss: 19.9110 memory: 6.46GiB(27.34%) tps: 24,474 tflops: 24.63 mfu: 7.89% global_avg_ntp_loss: 3.3895 global_avg_mtp_loss: 16.5215 +[titan] 2025-06-13 14:57:22,932 - root - INFO - lr: 1.1571e-04 gnorm: 1.20 [ 2:15:52< 0:41:16] +[titan] 2025-06-13 14:57:26,382 - root - INFO - step: 11510 loss: 19.0870 memory: 6.46GiB(27.34%) tps: 23,742 tflops: 23.89 mfu: 7.66% global_avg_ntp_loss: 3.2559 global_avg_mtp_loss: 15.8311 +[titan] 2025-06-13 14:57:26,383 - root - INFO - lr: 1.1554e-04 gnorm: 1.58 [ 2:15:56< 0:41:13] +[titan] 2025-06-13 14:57:29,881 - root - INFO - step: 11515 loss: 19.5996 memory: 6.46GiB(27.34%) tps: 23,414 tflops: 23.56 mfu: 7.55% global_avg_ntp_loss: 3.4103 global_avg_mtp_loss: 16.1892 +[titan] 2025-06-13 14:57:29,882 - root - INFO - lr: 1.1536e-04 gnorm: 1.25 [ 2:15:59< 0:41:09] +[titan] 2025-06-13 14:57:33,470 - root - INFO - step: 11520 loss: 19.6137 memory: 6.46GiB(27.34%) tps: 22,833 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 3.4156 global_avg_mtp_loss: 16.1982 +[titan] 2025-06-13 14:57:33,470 - root - INFO - lr: 1.1518e-04 gnorm: 1.45 [ 2:16:03< 0:41:05] +[titan] 2025-06-13 14:57:36,955 - root - INFO - step: 11525 loss: 18.8721 memory: 6.46GiB(27.34%) tps: 23,511 tflops: 23.66 mfu: 7.58% global_avg_ntp_loss: 3.2651 global_avg_mtp_loss: 15.6070 +[titan] 2025-06-13 14:57:36,955 - root - INFO - lr: 1.1500e-04 gnorm: 1.34 [ 2:16:06< 0:41:02] +[titan] 2025-06-13 14:57:40,429 - root - INFO - step: 11530 loss: 17.7834 memory: 6.46GiB(27.34%) tps: 23,580 tflops: 23.73 mfu: 7.61% global_avg_ntp_loss: 3.0040 global_avg_mtp_loss: 14.7794 +[titan] 2025-06-13 14:57:40,429 - root - INFO - lr: 1.1482e-04 gnorm: 
1.50 [ 2:16:10< 0:40:58] +[titan] 2025-06-13 14:57:44,266 - root - INFO - step: 11535 loss: 19.7834 memory: 6.46GiB(27.34%) tps: 21,354 tflops: 21.49 mfu: 6.89% global_avg_ntp_loss: 3.3996 global_avg_mtp_loss: 16.3838 +[titan] 2025-06-13 14:57:44,266 - root - INFO - lr: 1.1465e-04 gnorm: 1.27 [ 2:16:13< 0:40:55] +[titan] 2025-06-13 14:57:47,668 - root - INFO - step: 11540 loss: 20.0887 memory: 6.46GiB(27.34%) tps: 24,079 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.5048 global_avg_mtp_loss: 16.5840 +[titan] 2025-06-13 14:57:47,669 - root - INFO - lr: 1.1447e-04 gnorm: 1.29 [ 2:16:17< 0:40:51] +[titan] 2025-06-13 14:57:51,356 - root - INFO - step: 11545 loss: 19.9310 memory: 6.46GiB(27.34%) tps: 22,216 tflops: 22.36 mfu: 7.17% global_avg_ntp_loss: 3.4068 global_avg_mtp_loss: 16.5242 +[titan] 2025-06-13 14:57:51,357 - root - INFO - lr: 1.1429e-04 gnorm: 1.20 [ 2:16:21< 0:40:48] +[titan] 2025-06-13 14:57:54,087 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:57:54,881 - root - INFO - step: 11550 loss: 19.2161 memory: 6.46GiB(27.34%) tps: 23,245 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.2876 global_avg_mtp_loss: 15.9285 +[titan] 2025-06-13 14:57:54,881 - root - INFO - lr: 1.1412e-04 gnorm: 1.24 [ 2:16:24< 0:40:44] +[titan] 2025-06-13 14:57:58,193 - root - INFO - step: 11555 loss: 20.1209 memory: 6.46GiB(27.34%) tps: 24,737 tflops: 24.89 mfu: 7.98% global_avg_ntp_loss: 3.4798 global_avg_mtp_loss: 16.6411 +[titan] 2025-06-13 14:57:58,194 - root - INFO - lr: 1.1394e-04 gnorm: 1.21 [ 2:16:27< 0:40:41] +[titan] 2025-06-13 14:58:02,162 - root - INFO - step: 11560 loss: 15.7614 memory: 6.46GiB(27.34%) tps: 20,643 tflops: 20.77 mfu: 6.66% global_avg_ntp_loss: 2.6838 global_avg_mtp_loss: 13.0776 +[titan] 2025-06-13 14:58:02,163 - root - INFO - lr: 1.1376e-04 gnorm: 1.43 [ 2:16:31< 0:40:37] +[titan] 2025-06-13 14:58:05,821 - root - INFO - step: 11565 loss: 17.8152 memory: 6.46GiB(27.34%) tps: 22,394 tflops: 22.54 mfu: 7.22% 
global_avg_ntp_loss: 3.0661 global_avg_mtp_loss: 14.7491 +[titan] 2025-06-13 14:58:05,821 - root - INFO - lr: 1.1359e-04 gnorm: 1.23 [ 2:16:35< 0:40:34] +[titan] 2025-06-13 14:58:09,310 - root - INFO - step: 11570 loss: 18.6923 memory: 6.46GiB(27.34%) tps: 23,479 tflops: 23.63 mfu: 7.57% global_avg_ntp_loss: 3.1874 global_avg_mtp_loss: 15.5048 +[titan] 2025-06-13 14:58:09,311 - root - INFO - lr: 1.1341e-04 gnorm: 1.31 [ 2:16:39< 0:40:30] +[titan] 2025-06-13 14:58:13,013 - root - INFO - step: 11575 loss: 20.2102 memory: 6.46GiB(27.34%) tps: 22,127 tflops: 22.27 mfu: 7.14% global_avg_ntp_loss: 3.5178 global_avg_mtp_loss: 16.6923 +[titan] 2025-06-13 14:58:13,013 - root - INFO - lr: 1.1324e-04 gnorm: 1.24 [ 2:16:42< 0:40:27] +[titan] 2025-06-13 14:58:16,529 - root - INFO - step: 11580 loss: 19.5577 memory: 6.46GiB(27.34%) tps: 23,301 tflops: 23.45 mfu: 7.52% global_avg_ntp_loss: 3.3613 global_avg_mtp_loss: 16.1965 +[titan] 2025-06-13 14:58:16,529 - root - INFO - lr: 1.1306e-04 gnorm: 1.25 [ 2:16:46< 0:40:23] +[titan] 2025-06-13 14:58:19,629 - root - INFO - step: 11585 loss: 19.5344 memory: 6.46GiB(27.34%) tps: 26,430 tflops: 26.60 mfu: 8.53% global_avg_ntp_loss: 3.3815 global_avg_mtp_loss: 16.1528 +[titan] 2025-06-13 14:58:19,629 - root - INFO - lr: 1.1289e-04 gnorm: 1.34 [ 2:16:49< 0:40:19] +[titan] 2025-06-13 14:58:23,622 - root - INFO - step: 11590 loss: 17.4644 memory: 6.46GiB(27.34%) tps: 20,518 tflops: 20.65 mfu: 6.62% global_avg_ntp_loss: 3.0091 global_avg_mtp_loss: 14.4553 +[titan] 2025-06-13 14:58:23,622 - root - INFO - lr: 1.1271e-04 gnorm: 1.17 [ 2:16:53< 0:40:16] +[titan] 2025-06-13 14:58:26,999 - root - INFO - step: 11595 loss: 18.6246 memory: 6.46GiB(27.34%) tps: 24,257 tflops: 24.41 mfu: 7.82% global_avg_ntp_loss: 3.1509 global_avg_mtp_loss: 15.4737 +[titan] 2025-06-13 14:58:27,000 - root - INFO - lr: 1.1254e-04 gnorm: 1.26 [ 2:16:56< 0:40:12] +[titan] 2025-06-13 14:58:30,437 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:58:31,000 - root - INFO - step: 11600 loss: 18.3116 memory: 6.46GiB(27.34%) tps: 20,479 tflops: 20.61 mfu: 6.61% global_avg_ntp_loss: 3.1412 global_avg_mtp_loss: 15.1704 +[titan] 2025-06-13 14:58:31,000 - root - INFO - lr: 1.1236e-04 gnorm: 1.31 [ 2:17:00< 0:40:09] +[titan] 2025-06-13 14:58:35,027 - root - INFO - step: 11605 loss: 20.0354 memory: 6.46GiB(27.34%) tps: 20,343 tflops: 20.47 mfu: 6.56% global_avg_ntp_loss: 3.4426 global_avg_mtp_loss: 16.5928 +[titan] 2025-06-13 14:58:35,028 - root - INFO - lr: 1.1219e-04 gnorm: 1.22 [ 2:17:04< 0:40:06] +[titan] 2025-06-13 14:58:38,442 - root - INFO - step: 11610 loss: 18.5929 memory: 6.46GiB(27.34%) tps: 23,996 tflops: 24.15 mfu: 7.74% global_avg_ntp_loss: 3.2149 global_avg_mtp_loss: 15.3780 +[titan] 2025-06-13 14:58:38,442 - root - INFO - lr: 1.1201e-04 gnorm: 2.19 [ 2:17:08< 0:40:02] +[titan] 2025-06-13 14:58:41,654 - root - INFO - step: 11615 loss: 19.5197 memory: 6.46GiB(27.34%) tps: 25,501 tflops: 25.66 mfu: 8.23% global_avg_ntp_loss: 3.3617 global_avg_mtp_loss: 16.1580 +[titan] 2025-06-13 14:58:41,655 - root - INFO - lr: 1.1184e-04 gnorm: 1.32 [ 2:17:11< 0:39:58] +[titan] 2025-06-13 14:58:45,174 - root - INFO - step: 11620 loss: 19.3952 memory: 6.46GiB(27.34%) tps: 23,278 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.3339 global_avg_mtp_loss: 16.0613 +[titan] 2025-06-13 14:58:45,174 - root - INFO - lr: 1.1167e-04 gnorm: 1.20 [ 2:17:14< 0:39:55] +[titan] 2025-06-13 14:58:48,448 - root - INFO - step: 11625 loss: 19.0946 memory: 6.46GiB(27.34%) tps: 25,028 tflops: 25.19 mfu: 8.07% global_avg_ntp_loss: 3.2797 global_avg_mtp_loss: 15.8150 +[titan] 2025-06-13 14:58:48,448 - root - INFO - lr: 1.1149e-04 gnorm: 1.21 [ 2:17:18< 0:39:51] +[titan] 2025-06-13 14:58:52,502 - root - INFO - step: 11630 loss: 19.2266 memory: 6.46GiB(27.34%) tps: 20,208 tflops: 20.34 mfu: 6.52% global_avg_ntp_loss: 3.3025 global_avg_mtp_loss: 15.9240 +[titan] 2025-06-13 14:58:52,502 - root - INFO - lr: 1.1132e-04 gnorm: 
1.31 [ 2:17:22< 0:39:48] +[titan] 2025-06-13 14:58:56,300 - root - INFO - step: 11635 loss: 18.8672 memory: 6.46GiB(27.34%) tps: 21,569 tflops: 21.71 mfu: 6.96% global_avg_ntp_loss: 3.2476 global_avg_mtp_loss: 15.6196 +[titan] 2025-06-13 14:58:56,300 - root - INFO - lr: 1.1115e-04 gnorm: 1.25 [ 2:17:26< 0:39:44] +[titan] 2025-06-13 14:58:59,486 - root - INFO - step: 11640 loss: 19.8767 memory: 6.46GiB(27.34%) tps: 25,714 tflops: 25.88 mfu: 8.29% global_avg_ntp_loss: 3.4551 global_avg_mtp_loss: 16.4216 +[titan] 2025-06-13 14:58:59,487 - root - INFO - lr: 1.1097e-04 gnorm: 1.35 [ 2:17:29< 0:39:41] +[titan] 2025-06-13 14:59:03,096 - root - INFO - step: 11645 loss: 19.7558 memory: 6.46GiB(27.34%) tps: 22,697 tflops: 22.84 mfu: 7.32% global_avg_ntp_loss: 3.4505 global_avg_mtp_loss: 16.3053 +[titan] 2025-06-13 14:59:03,096 - root - INFO - lr: 1.1080e-04 gnorm: 1.35 [ 2:17:32< 0:39:37] +[titan] 2025-06-13 14:59:05,523 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 14:59:06,251 - root - INFO - step: 11650 loss: 20.7434 memory: 6.46GiB(27.34%) tps: 25,969 tflops: 26.13 mfu: 8.38% global_avg_ntp_loss: 3.6098 global_avg_mtp_loss: 17.1337 +[titan] 2025-06-13 14:59:06,252 - root - INFO - lr: 1.1063e-04 gnorm: 1.29 [ 2:17:35< 0:39:34] +[titan] 2025-06-13 14:59:09,558 - root - INFO - step: 11655 loss: 19.0733 memory: 6.46GiB(27.34%) tps: 24,777 tflops: 24.94 mfu: 7.99% global_avg_ntp_loss: 3.2462 global_avg_mtp_loss: 15.8270 +[titan] 2025-06-13 14:59:09,558 - root - INFO - lr: 1.1046e-04 gnorm: 1.54 [ 2:17:39< 0:39:30] +[titan] 2025-06-13 14:59:13,285 - root - INFO - step: 11660 loss: 19.8783 memory: 6.46GiB(27.34%) tps: 21,982 tflops: 22.12 mfu: 7.09% global_avg_ntp_loss: 3.4195 global_avg_mtp_loss: 16.4588 +[titan] 2025-06-13 14:59:13,285 - root - INFO - lr: 1.1028e-04 gnorm: 1.24 [ 2:17:42< 0:39:26] +[titan] 2025-06-13 14:59:16,738 - root - INFO - step: 11665 loss: 19.3589 memory: 6.46GiB(27.34%) tps: 23,732 tflops: 23.88 mfu: 7.65% 
global_avg_ntp_loss: 3.3025 global_avg_mtp_loss: 16.0564 +[titan] 2025-06-13 14:59:16,738 - root - INFO - lr: 1.1011e-04 gnorm: 1.21 [ 2:17:46< 0:39:23] +[titan] 2025-06-13 14:59:20,295 - root - INFO - step: 11670 loss: 19.6747 memory: 6.46GiB(27.34%) tps: 23,030 tflops: 23.18 mfu: 7.43% global_avg_ntp_loss: 3.3722 global_avg_mtp_loss: 16.3024 +[titan] 2025-06-13 14:59:20,295 - root - INFO - lr: 1.0994e-04 gnorm: 1.25 [ 2:17:49< 0:39:19] +[titan] 2025-06-13 14:59:23,739 - root - INFO - step: 11675 loss: 20.7228 memory: 6.46GiB(27.34%) tps: 23,787 tflops: 23.94 mfu: 7.67% global_avg_ntp_loss: 3.6854 global_avg_mtp_loss: 17.0373 +[titan] 2025-06-13 14:59:23,740 - root - INFO - lr: 1.0977e-04 gnorm: 1.31 [ 2:17:53< 0:39:16] +[titan] 2025-06-13 14:59:27,200 - root - INFO - step: 11680 loss: 18.7508 memory: 6.46GiB(27.34%) tps: 23,673 tflops: 23.82 mfu: 7.64% global_avg_ntp_loss: 3.1923 global_avg_mtp_loss: 15.5585 +[titan] 2025-06-13 14:59:27,201 - root - INFO - lr: 1.0960e-04 gnorm: 1.24 [ 2:17:56< 0:39:12] +[titan] 2025-06-13 14:59:30,404 - root - INFO - step: 11685 loss: 19.3819 memory: 6.46GiB(27.34%) tps: 25,574 tflops: 25.74 mfu: 8.25% global_avg_ntp_loss: 3.3555 global_avg_mtp_loss: 16.0264 +[titan] 2025-06-13 14:59:30,404 - root - INFO - lr: 1.0943e-04 gnorm: 1.29 [ 2:18:00< 0:39:09] +[titan] 2025-06-13 14:59:33,809 - root - INFO - step: 11690 loss: 19.2620 memory: 6.46GiB(27.34%) tps: 24,062 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 3.2892 global_avg_mtp_loss: 15.9728 +[titan] 2025-06-13 14:59:33,810 - root - INFO - lr: 1.0926e-04 gnorm: 1.27 [ 2:18:03< 0:39:05] +[titan] 2025-06-13 14:59:37,167 - root - INFO - step: 11695 loss: 18.3897 memory: 6.46GiB(27.34%) tps: 24,402 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 3.1092 global_avg_mtp_loss: 15.2805 +[titan] 2025-06-13 14:59:37,167 - root - INFO - lr: 1.0909e-04 gnorm: 1.35 [ 2:18:06< 0:39:01] +[titan] 2025-06-13 14:59:40,562 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 14:59:41,284 - root - INFO - step: 11700 loss: 19.2989 memory: 6.46GiB(27.34%) tps: 19,901 tflops: 20.03 mfu: 6.42% global_avg_ntp_loss: 3.3253 global_avg_mtp_loss: 15.9736 +[titan] 2025-06-13 14:59:41,284 - root - INFO - lr: 1.0892e-04 gnorm: 1.15 [ 2:18:10< 0:38:58] +[titan] 2025-06-13 14:59:45,028 - root - INFO - step: 11705 loss: 18.2385 memory: 6.46GiB(27.34%) tps: 21,883 tflops: 22.02 mfu: 7.06% global_avg_ntp_loss: 3.0725 global_avg_mtp_loss: 15.1661 +[titan] 2025-06-13 14:59:45,028 - root - INFO - lr: 1.0875e-04 gnorm: 1.28 [ 2:18:14< 0:38:54] +[titan] 2025-06-13 14:59:48,534 - root - INFO - step: 11710 loss: 18.8274 memory: 6.46GiB(27.34%) tps: 23,366 tflops: 23.51 mfu: 7.54% global_avg_ntp_loss: 3.1932 global_avg_mtp_loss: 15.6342 +[titan] 2025-06-13 14:59:48,535 - root - INFO - lr: 1.0858e-04 gnorm: 1.27 [ 2:18:18< 0:38:51] +[titan] 2025-06-13 14:59:52,098 - root - INFO - step: 11715 loss: 20.1162 memory: 6.46GiB(27.34%) tps: 22,990 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.4022 global_avg_mtp_loss: 16.7140 +[titan] 2025-06-13 14:59:52,098 - root - INFO - lr: 1.0841e-04 gnorm: 1.49 [ 2:18:21< 0:38:47] +[titan] 2025-06-13 14:59:55,628 - root - INFO - step: 11720 loss: 18.6194 memory: 6.46GiB(27.34%) tps: 23,211 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.1514 global_avg_mtp_loss: 15.4680 +[titan] 2025-06-13 14:59:55,628 - root - INFO - lr: 1.0824e-04 gnorm: 1.20 [ 2:18:25< 0:38:44] +[titan] 2025-06-13 14:59:59,390 - root - INFO - step: 11725 loss: 18.7300 memory: 6.46GiB(27.34%) tps: 21,780 tflops: 21.92 mfu: 7.03% global_avg_ntp_loss: 3.1836 global_avg_mtp_loss: 15.5464 +[titan] 2025-06-13 14:59:59,390 - root - INFO - lr: 1.0807e-04 gnorm: 1.38 [ 2:18:29< 0:38:40] +[titan] 2025-06-13 15:00:02,971 - root - INFO - step: 11730 loss: 19.5450 memory: 6.46GiB(27.34%) tps: 22,878 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 3.3920 global_avg_mtp_loss: 16.1531 +[titan] 2025-06-13 15:00:02,971 - root - INFO - lr: 1.0790e-04 gnorm: 
1.25 [ 2:18:32< 0:38:37] +[titan] 2025-06-13 15:00:06,495 - root - INFO - step: 11735 loss: 20.0939 memory: 6.46GiB(27.34%) tps: 23,247 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.4830 global_avg_mtp_loss: 16.6108 +[titan] 2025-06-13 15:00:06,496 - root - INFO - lr: 1.0773e-04 gnorm: 1.30 [ 2:18:36< 0:38:33] +[titan] 2025-06-13 15:00:09,784 - root - INFO - step: 11740 loss: 17.5041 memory: 6.46GiB(27.34%) tps: 24,914 tflops: 25.07 mfu: 8.04% global_avg_ntp_loss: 3.0592 global_avg_mtp_loss: 14.4449 +[titan] 2025-06-13 15:00:09,784 - root - INFO - lr: 1.0756e-04 gnorm: 1.52 [ 2:18:39< 0:38:30] +[titan] 2025-06-13 15:00:13,217 - root - INFO - step: 11745 loss: 16.3222 memory: 6.46GiB(27.34%) tps: 23,863 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 2.8469 global_avg_mtp_loss: 13.4753 +[titan] 2025-06-13 15:00:13,218 - root - INFO - lr: 1.0739e-04 gnorm: 1.34 [ 2:18:42< 0:38:26] +[titan] 2025-06-13 15:00:15,820 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:00:16,527 - root - INFO - step: 11750 loss: 19.2706 memory: 6.46GiB(27.34%) tps: 24,751 tflops: 24.91 mfu: 7.98% global_avg_ntp_loss: 3.3678 global_avg_mtp_loss: 15.9028 +[titan] 2025-06-13 15:00:16,528 - root - INFO - lr: 1.0722e-04 gnorm: 1.41 [ 2:18:46< 0:38:22] +[titan] 2025-06-13 15:00:20,012 - root - INFO - step: 11755 loss: 19.8123 memory: 6.46GiB(27.34%) tps: 23,511 tflops: 23.66 mfu: 7.58% global_avg_ntp_loss: 3.4876 global_avg_mtp_loss: 16.3247 +[titan] 2025-06-13 15:00:20,013 - root - INFO - lr: 1.0705e-04 gnorm: 1.27 [ 2:18:49< 0:38:19] +[titan] 2025-06-13 15:00:23,734 - root - INFO - step: 11760 loss: 19.4014 memory: 6.46GiB(27.34%) tps: 22,013 tflops: 22.15 mfu: 7.10% global_avg_ntp_loss: 3.3302 global_avg_mtp_loss: 16.0712 +[titan] 2025-06-13 15:00:23,734 - root - INFO - lr: 1.0689e-04 gnorm: 1.32 [ 2:18:53< 0:38:15] +[titan] 2025-06-13 15:00:27,152 - root - INFO - step: 11765 loss: 20.0624 memory: 6.46GiB(27.34%) tps: 23,968 tflops: 24.12 mfu: 7.73% 
global_avg_ntp_loss: 3.4417 global_avg_mtp_loss: 16.6206 +[titan] 2025-06-13 15:00:27,153 - root - INFO - lr: 1.0672e-04 gnorm: 1.27 [ 2:18:56< 0:38:12] +[titan] 2025-06-13 15:00:30,715 - root - INFO - step: 11770 loss: 19.2428 memory: 6.46GiB(27.34%) tps: 22,996 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.3143 global_avg_mtp_loss: 15.9285 +[titan] 2025-06-13 15:00:30,716 - root - INFO - lr: 1.0655e-04 gnorm: 1.17 [ 2:19:00< 0:38:08] +[titan] 2025-06-13 15:00:34,053 - root - INFO - step: 11775 loss: 19.7669 memory: 6.46GiB(27.34%) tps: 24,549 tflops: 24.71 mfu: 7.92% global_avg_ntp_loss: 3.3966 global_avg_mtp_loss: 16.3703 +[titan] 2025-06-13 15:00:34,053 - root - INFO - lr: 1.0638e-04 gnorm: 1.44 [ 2:19:03< 0:38:05] +[titan] 2025-06-13 15:00:35,043 - root - INFO - Dumping profiler traces at step 11776 +[titan] 2025-06-13 15:00:35,141 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 15:00:37,590 - root - INFO - step: 11780 loss: 18.2103 memory: 6.46GiB(27.34%) tps: 23,162 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.0696 global_avg_mtp_loss: 15.1406 +[titan] 2025-06-13 15:00:37,591 - root - INFO - lr: 1.0622e-04 gnorm: 1.31 [ 2:19:07< 0:38:01] +[titan] 2025-06-13 15:00:41,083 - root - INFO - step: 11785 loss: 18.6332 memory: 6.46GiB(27.34%) tps: 23,461 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 3.1886 global_avg_mtp_loss: 15.4445 +[titan] 2025-06-13 15:00:41,083 - root - INFO - lr: 1.0605e-04 gnorm: 1.57 [ 2:19:10< 0:37:58] +[titan] 2025-06-13 15:00:44,677 - root - INFO - step: 11790 loss: 20.0579 memory: 6.46GiB(27.34%) tps: 22,791 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.4325 global_avg_mtp_loss: 16.6254 +[titan] 2025-06-13 15:00:44,678 - root - INFO - lr: 1.0588e-04 gnorm: 1.17 [ 2:19:14< 0:37:54] +[titan] 2025-06-13 15:00:48,021 - root - INFO - step: 11795 loss: 18.8358 memory: 6.46GiB(27.34%) tps: 24,502 tflops: 24.66 mfu: 7.90% global_avg_ntp_loss: 3.2589 global_avg_mtp_loss: 15.5769 +[titan] 2025-06-13 
15:00:48,021 - root - INFO - lr: 1.0572e-04 gnorm: 1.21 [ 2:19:17< 0:37:51] +[titan] 2025-06-13 15:00:50,719 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:00:51,269 - root - INFO - step: 11800 loss: 17.2625 memory: 6.46GiB(27.34%) tps: 25,222 tflops: 25.38 mfu: 8.14% global_avg_ntp_loss: 2.9253 global_avg_mtp_loss: 14.3373 +[titan] 2025-06-13 15:00:51,270 - root - INFO - lr: 1.0555e-04 gnorm: 1.39 [ 2:19:20< 0:37:47] +[titan] 2025-06-13 15:00:54,587 - root - INFO - step: 11805 loss: 19.6621 memory: 6.46GiB(27.34%) tps: 24,699 tflops: 24.86 mfu: 7.97% global_avg_ntp_loss: 3.3711 global_avg_mtp_loss: 16.2910 +[titan] 2025-06-13 15:00:54,587 - root - INFO - lr: 1.0539e-04 gnorm: 1.27 [ 2:19:24< 0:37:43] +[titan] 2025-06-13 15:00:57,615 - root - INFO - step: 11810 loss: 17.9775 memory: 6.46GiB(27.34%) tps: 27,052 tflops: 27.22 mfu: 8.73% global_avg_ntp_loss: 3.0534 global_avg_mtp_loss: 14.9242 +[titan] 2025-06-13 15:00:57,615 - root - INFO - lr: 1.0522e-04 gnorm: 1.16 [ 2:19:27< 0:37:40] +[titan] 2025-06-13 15:01:00,685 - root - INFO - step: 11815 loss: 19.1253 memory: 6.46GiB(27.34%) tps: 26,689 tflops: 26.86 mfu: 8.61% global_avg_ntp_loss: 3.2687 global_avg_mtp_loss: 15.8566 +[titan] 2025-06-13 15:01:00,685 - root - INFO - lr: 1.0505e-04 gnorm: 1.27 [ 2:19:30< 0:37:36] +[titan] 2025-06-13 15:01:04,280 - root - INFO - step: 11820 loss: 19.7358 memory: 6.46GiB(27.34%) tps: 22,789 tflops: 22.93 mfu: 7.35% global_avg_ntp_loss: 3.3686 global_avg_mtp_loss: 16.3672 +[titan] 2025-06-13 15:01:04,280 - root - INFO - lr: 1.0489e-04 gnorm: 1.18 [ 2:19:33< 0:37:32] +[titan] 2025-06-13 15:01:07,741 - root - INFO - step: 11825 loss: 17.2264 memory: 6.46GiB(27.34%) tps: 23,674 tflops: 23.82 mfu: 7.64% global_avg_ntp_loss: 2.9182 global_avg_mtp_loss: 14.3081 +[titan] 2025-06-13 15:01:07,741 - root - INFO - lr: 1.0472e-04 gnorm: 1.36 [ 2:19:37< 0:37:29] +[titan] 2025-06-13 15:01:11,641 - root - INFO - step: 11830 loss: 19.4223 memory: 
6.46GiB(27.34%) tps: 21,010 tflops: 21.14 mfu: 6.78% global_avg_ntp_loss: 3.3506 global_avg_mtp_loss: 16.0717 +[titan] 2025-06-13 15:01:11,641 - root - INFO - lr: 1.0456e-04 gnorm: 1.21 [ 2:19:41< 0:37:25] +[titan] 2025-06-13 15:01:15,123 - root - INFO - step: 11835 loss: 18.6043 memory: 6.46GiB(27.34%) tps: 23,526 tflops: 23.68 mfu: 7.59% global_avg_ntp_loss: 3.2023 global_avg_mtp_loss: 15.4020 +[titan] 2025-06-13 15:01:15,123 - root - INFO - lr: 1.0439e-04 gnorm: 1.24 [ 2:19:44< 0:37:22] +[titan] 2025-06-13 15:01:18,583 - root - INFO - step: 11840 loss: 17.9838 memory: 6.46GiB(27.34%) tps: 23,679 tflops: 23.83 mfu: 7.64% global_avg_ntp_loss: 3.0859 global_avg_mtp_loss: 14.8979 +[titan] 2025-06-13 15:01:18,583 - root - INFO - lr: 1.0423e-04 gnorm: 1.37 [ 2:19:48< 0:37:18] +[titan] 2025-06-13 15:01:22,147 - root - INFO - step: 11845 loss: 18.8286 memory: 6.46GiB(27.34%) tps: 22,986 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.2770 global_avg_mtp_loss: 15.5516 +[titan] 2025-06-13 15:01:22,148 - root - INFO - lr: 1.0407e-04 gnorm: 1.27 [ 2:19:51< 0:37:15] +[titan] 2025-06-13 15:01:25,601 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:01:26,272 - root - INFO - step: 11850 loss: 18.4238 memory: 6.46GiB(27.34%) tps: 19,862 tflops: 19.99 mfu: 6.41% global_avg_ntp_loss: 3.1456 global_avg_mtp_loss: 15.2782 +[titan] 2025-06-13 15:01:26,272 - root - INFO - lr: 1.0390e-04 gnorm: 1.25 [ 2:19:55< 0:37:11] +[titan] 2025-06-13 15:01:29,772 - root - INFO - step: 11855 loss: 18.9946 memory: 6.46GiB(27.34%) tps: 23,413 tflops: 23.56 mfu: 7.55% global_avg_ntp_loss: 3.2449 global_avg_mtp_loss: 15.7496 +[titan] 2025-06-13 15:01:29,772 - root - INFO - lr: 1.0374e-04 gnorm: 1.31 [ 2:19:59< 0:37:08] +[titan] 2025-06-13 15:01:33,407 - root - INFO - step: 11860 loss: 19.6019 memory: 6.46GiB(27.34%) tps: 22,534 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.3439 global_avg_mtp_loss: 16.2580 +[titan] 2025-06-13 15:01:33,408 - root - INFO - lr: 1.0357e-04 gnorm: 1.19 [ 2:20:03< 0:37:04] +[titan] 2025-06-13 15:01:36,995 - root - INFO - step: 11865 loss: 19.4519 memory: 6.46GiB(27.34%) tps: 22,837 tflops: 22.98 mfu: 7.37% global_avg_ntp_loss: 3.3737 global_avg_mtp_loss: 16.0782 +[titan] 2025-06-13 15:01:36,995 - root - INFO - lr: 1.0341e-04 gnorm: 1.60 [ 2:20:06< 0:37:01] +[titan] 2025-06-13 15:01:40,490 - root - INFO - step: 11870 loss: 19.4788 memory: 6.46GiB(27.34%) tps: 23,439 tflops: 23.59 mfu: 7.56% global_avg_ntp_loss: 3.3254 global_avg_mtp_loss: 16.1534 +[titan] 2025-06-13 15:01:40,491 - root - INFO - lr: 1.0325e-04 gnorm: 1.19 [ 2:20:10< 0:36:57] +[titan] 2025-06-13 15:01:43,806 - root - INFO - step: 11875 loss: 19.3649 memory: 6.46GiB(27.34%) tps: 24,710 tflops: 24.87 mfu: 7.97% global_avg_ntp_loss: 3.3079 global_avg_mtp_loss: 16.0570 +[titan] 2025-06-13 15:01:43,807 - root - INFO - lr: 1.0308e-04 gnorm: 1.52 [ 2:20:13< 0:36:54] +[titan] 2025-06-13 15:01:48,710 - root - INFO - step: 11880 loss: 19.1147 memory: 6.46GiB(27.34%) tps: 16,706 tflops: 16.81 mfu: 5.39% global_avg_ntp_loss: 3.2935 global_avg_mtp_loss: 15.8212 +[titan] 2025-06-13 15:01:48,711 - root - INFO - lr: 1.0292e-04 gnorm: 
1.53 [ 2:20:18< 0:36:50] +[titan] 2025-06-13 15:01:51,899 - root - INFO - step: 11885 loss: 19.7904 memory: 6.46GiB(27.34%) tps: 25,691 tflops: 25.85 mfu: 8.29% global_avg_ntp_loss: 3.4229 global_avg_mtp_loss: 16.3675 +[titan] 2025-06-13 15:01:51,900 - root - INFO - lr: 1.0276e-04 gnorm: 1.53 [ 2:20:21< 0:36:47] +[titan] 2025-06-13 15:01:55,850 - root - INFO - step: 11890 loss: 19.4225 memory: 6.46GiB(27.34%) tps: 20,739 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 3.3731 global_avg_mtp_loss: 16.0493 +[titan] 2025-06-13 15:01:55,850 - root - INFO - lr: 1.0260e-04 gnorm: 1.29 [ 2:20:25< 0:36:43] +[titan] 2025-06-13 15:01:59,249 - root - INFO - step: 11895 loss: 18.8202 memory: 6.46GiB(27.34%) tps: 24,106 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 3.2441 global_avg_mtp_loss: 15.5761 +[titan] 2025-06-13 15:01:59,249 - root - INFO - lr: 1.0243e-04 gnorm: 1.40 [ 2:20:28< 0:36:40] +[titan] 2025-06-13 15:02:02,178 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:02:02,907 - root - INFO - step: 11900 loss: 18.2072 memory: 6.46GiB(27.34%) tps: 22,395 tflops: 22.54 mfu: 7.22% global_avg_ntp_loss: 3.1264 global_avg_mtp_loss: 15.0808 +[titan] 2025-06-13 15:02:02,907 - root - INFO - lr: 1.0227e-04 gnorm: 1.53 [ 2:20:32< 0:36:36] +[titan] 2025-06-13 15:02:06,702 - root - INFO - step: 11905 loss: 20.4008 memory: 6.46GiB(27.34%) tps: 21,587 tflops: 21.72 mfu: 6.96% global_avg_ntp_loss: 3.5389 global_avg_mtp_loss: 16.8618 +[titan] 2025-06-13 15:02:06,703 - root - INFO - lr: 1.0211e-04 gnorm: 1.33 [ 2:20:36< 0:36:33] +[titan] 2025-06-13 15:02:10,348 - root - INFO - step: 11910 loss: 19.2627 memory: 6.46GiB(27.34%) tps: 22,472 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.2824 global_avg_mtp_loss: 15.9802 +[titan] 2025-06-13 15:02:10,349 - root - INFO - lr: 1.0195e-04 gnorm: 1.33 [ 2:20:40< 0:36:29] +[titan] 2025-06-13 15:02:14,030 - root - INFO - step: 11915 loss: 17.4674 memory: 6.46GiB(27.34%) tps: 22,252 tflops: 22.39 mfu: 7.18% 
global_avg_ntp_loss: 2.9813 global_avg_mtp_loss: 14.4861 +[titan] 2025-06-13 15:02:14,031 - root - INFO - lr: 1.0179e-04 gnorm: 1.23 [ 2:20:43< 0:36:26] +[titan] 2025-06-13 15:02:17,629 - root - INFO - step: 11920 loss: 19.7109 memory: 6.46GiB(27.34%) tps: 22,769 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.3666 global_avg_mtp_loss: 16.3442 +[titan] 2025-06-13 15:02:17,629 - root - INFO - lr: 1.0163e-04 gnorm: 1.19 [ 2:20:47< 0:36:22] +[titan] 2025-06-13 15:02:20,880 - root - INFO - step: 11925 loss: 18.6414 memory: 6.46GiB(27.34%) tps: 25,200 tflops: 25.36 mfu: 8.13% global_avg_ntp_loss: 3.1700 global_avg_mtp_loss: 15.4714 +[titan] 2025-06-13 15:02:20,880 - root - INFO - lr: 1.0147e-04 gnorm: 1.24 [ 2:20:50< 0:36:19] +[titan] 2025-06-13 15:02:24,224 - root - INFO - step: 11930 loss: 19.2289 memory: 6.46GiB(27.34%) tps: 24,500 tflops: 24.66 mfu: 7.90% global_avg_ntp_loss: 3.3148 global_avg_mtp_loss: 15.9140 +[titan] 2025-06-13 15:02:24,224 - root - INFO - lr: 1.0131e-04 gnorm: 1.20 [ 2:20:53< 0:36:15] +[titan] 2025-06-13 15:02:27,629 - root - INFO - step: 11935 loss: 15.1503 memory: 6.46GiB(27.34%) tps: 24,062 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 2.5846 global_avg_mtp_loss: 12.5657 +[titan] 2025-06-13 15:02:27,629 - root - INFO - lr: 1.0115e-04 gnorm: 1.25 [ 2:20:57< 0:36:11] +[titan] 2025-06-13 15:02:31,429 - root - INFO - step: 11940 loss: 18.9064 memory: 6.46GiB(27.34%) tps: 21,561 tflops: 21.70 mfu: 6.95% global_avg_ntp_loss: 3.2327 global_avg_mtp_loss: 15.6737 +[titan] 2025-06-13 15:02:31,429 - root - INFO - lr: 1.0099e-04 gnorm: 1.29 [ 2:21:01< 0:36:08] +[titan] 2025-06-13 15:02:34,944 - root - INFO - step: 11945 loss: 19.9306 memory: 6.46GiB(27.34%) tps: 23,310 tflops: 23.46 mfu: 7.52% global_avg_ntp_loss: 3.3968 global_avg_mtp_loss: 16.5338 +[titan] 2025-06-13 15:02:34,944 - root - INFO - lr: 1.0083e-04 gnorm: 1.29 [ 2:21:04< 0:36:04] +[titan] 2025-06-13 15:02:37,463 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:02:38,102 - root - INFO - step: 11950 loss: 18.0171 memory: 6.46GiB(27.34%) tps: 25,942 tflops: 26.11 mfu: 8.37% global_avg_ntp_loss: 3.0470 global_avg_mtp_loss: 14.9701 +[titan] 2025-06-13 15:02:38,111 - root - INFO - lr: 1.0067e-04 gnorm: 1.60 [ 2:21:07< 0:36:01] +[titan] 2025-06-13 15:02:41,447 - root - INFO - step: 11955 loss: 17.7574 memory: 6.46GiB(27.34%) tps: 24,562 tflops: 24.72 mfu: 7.92% global_avg_ntp_loss: 3.0374 global_avg_mtp_loss: 14.7200 +[titan] 2025-06-13 15:02:41,447 - root - INFO - lr: 1.0051e-04 gnorm: 1.26 [ 2:21:11< 0:35:57] +[titan] 2025-06-13 15:02:45,078 - root - INFO - step: 11960 loss: 19.2561 memory: 6.46GiB(27.34%) tps: 22,565 tflops: 22.71 mfu: 7.28% global_avg_ntp_loss: 3.2898 global_avg_mtp_loss: 15.9663 +[titan] 2025-06-13 15:02:45,078 - root - INFO - lr: 1.0035e-04 gnorm: 1.22 [ 2:21:14< 0:35:54] +[titan] 2025-06-13 15:02:48,472 - root - INFO - step: 11965 loss: 18.6638 memory: 6.46GiB(27.34%) tps: 24,142 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.2065 global_avg_mtp_loss: 15.4573 +[titan] 2025-06-13 15:02:48,472 - root - INFO - lr: 1.0019e-04 gnorm: 1.33 [ 2:21:18< 0:35:50] +[titan] 2025-06-13 15:02:52,158 - root - INFO - step: 11970 loss: 19.6085 memory: 6.46GiB(27.34%) tps: 22,223 tflops: 22.36 mfu: 7.17% global_avg_ntp_loss: 3.3775 global_avg_mtp_loss: 16.2310 +[titan] 2025-06-13 15:02:52,159 - root - INFO - lr: 1.0003e-04 gnorm: 1.29 [ 2:21:21< 0:35:47] +[titan] 2025-06-13 15:02:55,521 - root - INFO - step: 11975 loss: 19.8135 memory: 6.46GiB(27.34%) tps: 24,368 tflops: 24.52 mfu: 7.86% global_avg_ntp_loss: 3.4502 global_avg_mtp_loss: 16.3633 +[titan] 2025-06-13 15:02:55,521 - root - INFO - lr: 9.9871e-05 gnorm: 1.28 [ 2:21:25< 0:35:43] +[titan] 2025-06-13 15:02:59,033 - root - INFO - step: 11980 loss: 19.6078 memory: 6.46GiB(27.34%) tps: 23,326 tflops: 23.47 mfu: 7.52% global_avg_ntp_loss: 3.3829 global_avg_mtp_loss: 16.2249 +[titan] 2025-06-13 15:02:59,033 - root - INFO - lr: 9.9712e-05 gnorm: 
1.40 [ 2:21:28< 0:35:39] +[titan] 2025-06-13 15:03:02,340 - root - INFO - step: 11985 loss: 17.9287 memory: 6.46GiB(27.34%) tps: 24,773 tflops: 24.93 mfu: 7.99% global_avg_ntp_loss: 3.0370 global_avg_mtp_loss: 14.8917 +[titan] 2025-06-13 15:03:02,341 - root - INFO - lr: 9.9554e-05 gnorm: 1.42 [ 2:21:31< 0:35:36] +[titan] 2025-06-13 15:03:06,282 - root - INFO - step: 11990 loss: 16.6631 memory: 6.46GiB(27.34%) tps: 20,787 tflops: 20.92 mfu: 6.71% global_avg_ntp_loss: 2.8290 global_avg_mtp_loss: 13.8341 +[titan] 2025-06-13 15:03:06,282 - root - INFO - lr: 9.9396e-05 gnorm: 1.71 [ 2:21:35< 0:35:32] +[titan] 2025-06-13 15:03:09,673 - root - INFO - step: 11995 loss: 17.1689 memory: 6.46GiB(27.34%) tps: 24,158 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 2.9019 global_avg_mtp_loss: 14.2670 +[titan] 2025-06-13 15:03:09,674 - root - INFO - lr: 9.9239e-05 gnorm: 1.46 [ 2:21:39< 0:35:29] +[titan] 2025-06-13 15:03:12,294 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:03:12,923 - root - INFO - step: 12000 loss: 18.9110 memory: 6.46GiB(27.34%) tps: 25,211 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.2355 global_avg_mtp_loss: 15.6755 +[titan] 2025-06-13 15:03:12,924 - root - INFO - lr: 9.9081e-05 gnorm: 1.72 [ 2:21:42< 0:35:25] +[titan] 2025-06-13 15:03:16,834 - root - INFO - step: 12005 loss: 18.6509 memory: 6.46GiB(27.34%) tps: 20,950 tflops: 21.08 mfu: 6.76% global_avg_ntp_loss: 3.2025 global_avg_mtp_loss: 15.4485 +[titan] 2025-06-13 15:03:16,835 - root - INFO - lr: 9.8924e-05 gnorm: 1.21 [ 2:21:46< 0:35:22] +[titan] 2025-06-13 15:03:20,450 - root - INFO - step: 12010 loss: 19.7900 memory: 6.46GiB(27.34%) tps: 22,660 tflops: 22.80 mfu: 7.31% global_avg_ntp_loss: 3.4242 global_avg_mtp_loss: 16.3658 +[titan] 2025-06-13 15:03:20,451 - root - INFO - lr: 9.8767e-05 gnorm: 1.26 [ 2:21:50< 0:35:18] +[titan] 2025-06-13 15:03:23,907 - root - INFO - step: 12015 loss: 19.8750 memory: 6.46GiB(27.34%) tps: 23,700 tflops: 23.85 mfu: 7.64% 
global_avg_ntp_loss: 3.4204 global_avg_mtp_loss: 16.4546 +[titan] 2025-06-13 15:03:23,908 - root - INFO - lr: 9.8610e-05 gnorm: 1.34 [ 2:21:53< 0:35:15] +[titan] 2025-06-13 15:03:27,386 - root - INFO - step: 12020 loss: 19.8536 memory: 6.46GiB(27.34%) tps: 23,552 tflops: 23.70 mfu: 7.60% global_avg_ntp_loss: 3.4306 global_avg_mtp_loss: 16.4230 +[titan] 2025-06-13 15:03:27,386 - root - INFO - lr: 9.8453e-05 gnorm: 1.21 [ 2:21:57< 0:35:11] +[titan] 2025-06-13 15:03:30,934 - root - INFO - step: 12025 loss: 18.9762 memory: 6.46GiB(27.34%) tps: 23,091 tflops: 23.24 mfu: 7.45% global_avg_ntp_loss: 3.2176 global_avg_mtp_loss: 15.7586 +[titan] 2025-06-13 15:03:30,935 - root - INFO - lr: 9.8297e-05 gnorm: 1.22 [ 2:22:00< 0:35:08] +[titan] 2025-06-13 15:03:34,273 - root - INFO - step: 12030 loss: 18.8514 memory: 6.46GiB(27.34%) tps: 24,538 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.2207 global_avg_mtp_loss: 15.6307 +[titan] 2025-06-13 15:03:34,274 - root - INFO - lr: 9.8141e-05 gnorm: 1.25 [ 2:22:03< 0:35:04] +[titan] 2025-06-13 15:03:37,744 - root - INFO - step: 12035 loss: 18.5634 memory: 6.46GiB(27.34%) tps: 23,607 tflops: 23.76 mfu: 7.61% global_avg_ntp_loss: 3.2042 global_avg_mtp_loss: 15.3592 +[titan] 2025-06-13 15:03:37,744 - root - INFO - lr: 9.7985e-05 gnorm: 1.18 [ 2:22:07< 0:35:00] +[titan] 2025-06-13 15:03:41,397 - root - INFO - step: 12040 loss: 20.0963 memory: 6.46GiB(27.34%) tps: 22,428 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 3.4182 global_avg_mtp_loss: 16.6781 +[titan] 2025-06-13 15:03:41,397 - root - INFO - lr: 9.7829e-05 gnorm: 1.17 [ 2:22:11< 0:34:57] +[titan] 2025-06-13 15:03:45,022 - root - INFO - step: 12045 loss: 19.6725 memory: 6.46GiB(27.34%) tps: 22,603 tflops: 22.75 mfu: 7.29% global_avg_ntp_loss: 3.4049 global_avg_mtp_loss: 16.2676 +[titan] 2025-06-13 15:03:45,022 - root - INFO - lr: 9.7674e-05 gnorm: 1.46 [ 2:22:14< 0:34:53] +[titan] 2025-06-13 15:03:47,895 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:03:48,515 - root - INFO - step: 12050 loss: 18.6970 memory: 6.46GiB(27.34%) tps: 23,453 tflops: 23.60 mfu: 7.57% global_avg_ntp_loss: 3.2164 global_avg_mtp_loss: 15.4806 +[titan] 2025-06-13 15:03:48,515 - root - INFO - lr: 9.7518e-05 gnorm: 1.49 [ 2:22:18< 0:34:50] +[titan] 2025-06-13 15:03:52,123 - root - INFO - step: 12055 loss: 18.5058 memory: 6.46GiB(27.34%) tps: 22,706 tflops: 22.85 mfu: 7.32% global_avg_ntp_loss: 3.1492 global_avg_mtp_loss: 15.3566 +[titan] 2025-06-13 15:03:52,124 - root - INFO - lr: 9.7363e-05 gnorm: 1.28 [ 2:22:21< 0:34:46] +[titan] 2025-06-13 15:03:55,704 - root - INFO - step: 12060 loss: 19.1970 memory: 6.46GiB(27.34%) tps: 22,884 tflops: 23.03 mfu: 7.38% global_avg_ntp_loss: 3.2839 global_avg_mtp_loss: 15.9131 +[titan] 2025-06-13 15:03:55,704 - root - INFO - lr: 9.7209e-05 gnorm: 1.33 [ 2:22:25< 0:34:43] +[titan] 2025-06-13 15:03:59,001 - root - INFO - step: 12065 loss: 19.8593 memory: 6.46GiB(27.34%) tps: 24,844 tflops: 25.00 mfu: 8.01% global_avg_ntp_loss: 3.3859 global_avg_mtp_loss: 16.4734 +[titan] 2025-06-13 15:03:59,002 - root - INFO - lr: 9.7054e-05 gnorm: 1.23 [ 2:22:28< 0:34:39] +[titan] 2025-06-13 15:04:02,594 - root - INFO - step: 12070 loss: 17.8336 memory: 6.46GiB(27.34%) tps: 22,806 tflops: 22.95 mfu: 7.36% global_avg_ntp_loss: 3.0740 global_avg_mtp_loss: 14.7596 +[titan] 2025-06-13 15:04:02,594 - root - INFO - lr: 9.6900e-05 gnorm: 1.57 [ 2:22:32< 0:34:36] +[titan] 2025-06-13 15:04:05,786 - root - INFO - step: 12075 loss: 19.8594 memory: 6.46GiB(27.34%) tps: 25,665 tflops: 25.83 mfu: 8.28% global_avg_ntp_loss: 3.4577 global_avg_mtp_loss: 16.4017 +[titan] 2025-06-13 15:04:05,786 - root - INFO - lr: 9.6746e-05 gnorm: 1.39 [ 2:22:35< 0:34:32] +[titan] 2025-06-13 15:04:09,756 - root - INFO - step: 12080 loss: 16.4009 memory: 6.46GiB(27.34%) tps: 20,637 tflops: 20.77 mfu: 6.66% global_avg_ntp_loss: 2.7936 global_avg_mtp_loss: 13.6073 +[titan] 2025-06-13 15:04:09,756 - root - INFO - lr: 9.6592e-05 gnorm: 
1.90 [ 2:22:39< 0:34:28] +[titan] 2025-06-13 15:04:13,324 - root - INFO - step: 12085 loss: 19.9692 memory: 6.46GiB(27.34%) tps: 22,964 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 3.4620 global_avg_mtp_loss: 16.5072 +[titan] 2025-06-13 15:04:13,324 - root - INFO - lr: 9.6438e-05 gnorm: 1.42 [ 2:22:42< 0:34:25] +[titan] 2025-06-13 15:04:16,900 - root - INFO - step: 12090 loss: 20.0297 memory: 6.46GiB(27.34%) tps: 22,907 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.4323 global_avg_mtp_loss: 16.5974 +[titan] 2025-06-13 15:04:16,901 - root - INFO - lr: 9.6284e-05 gnorm: 1.20 [ 2:22:46< 0:34:21] +[titan] 2025-06-13 15:04:20,589 - root - INFO - step: 12095 loss: 18.8510 memory: 6.46GiB(27.34%) tps: 22,212 tflops: 22.35 mfu: 7.16% global_avg_ntp_loss: 3.2310 global_avg_mtp_loss: 15.6200 +[titan] 2025-06-13 15:04:20,590 - root - INFO - lr: 9.6131e-05 gnorm: 1.27 [ 2:22:50< 0:34:18] +[titan] 2025-06-13 15:04:23,298 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:04:23,943 - root - INFO - step: 12100 loss: 17.5656 memory: 6.46GiB(27.34%) tps: 24,429 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.0105 global_avg_mtp_loss: 14.5551 +[titan] 2025-06-13 15:04:23,944 - root - INFO - lr: 9.5978e-05 gnorm: 1.21 [ 2:22:53< 0:34:14] +[titan] 2025-06-13 15:04:27,123 - root - INFO - step: 12105 loss: 18.9447 memory: 6.46GiB(27.34%) tps: 25,768 tflops: 25.93 mfu: 8.31% global_avg_ntp_loss: 3.2162 global_avg_mtp_loss: 15.7285 +[titan] 2025-06-13 15:04:27,123 - root - INFO - lr: 9.5825e-05 gnorm: 1.32 [ 2:22:56< 0:34:11] +[titan] 2025-06-13 15:04:30,512 - root - INFO - step: 12110 loss: 18.2344 memory: 6.46GiB(27.34%) tps: 24,172 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.0874 global_avg_mtp_loss: 15.1470 +[titan] 2025-06-13 15:04:30,513 - root - INFO - lr: 9.5673e-05 gnorm: 1.40 [ 2:23:00< 0:34:07] +[titan] 2025-06-13 15:04:34,074 - root - INFO - step: 12115 loss: 18.1541 memory: 6.46GiB(27.34%) tps: 23,004 tflops: 23.15 mfu: 7.42% 
global_avg_ntp_loss: 3.1301 global_avg_mtp_loss: 15.0240 +[titan] 2025-06-13 15:04:34,074 - root - INFO - lr: 9.5520e-05 gnorm: 1.15 [ 2:23:03< 0:34:04] +[titan] 2025-06-13 15:04:37,508 - root - INFO - step: 12120 loss: 19.3493 memory: 6.46GiB(27.34%) tps: 23,858 tflops: 24.01 mfu: 7.70% global_avg_ntp_loss: 3.3377 global_avg_mtp_loss: 16.0117 +[titan] 2025-06-13 15:04:37,508 - root - INFO - lr: 9.5368e-05 gnorm: 1.25 [ 2:23:07< 0:34:00] +[titan] 2025-06-13 15:04:41,106 - root - INFO - step: 12125 loss: 18.5518 memory: 6.46GiB(27.34%) tps: 22,771 tflops: 22.92 mfu: 7.34% global_avg_ntp_loss: 3.1910 global_avg_mtp_loss: 15.3607 +[titan] 2025-06-13 15:04:41,106 - root - INFO - lr: 9.5216e-05 gnorm: 1.25 [ 2:23:10< 0:33:56] +[titan] 2025-06-13 15:04:44,815 - root - INFO - step: 12130 loss: 19.8182 memory: 6.46GiB(27.34%) tps: 22,088 tflops: 22.23 mfu: 7.12% global_avg_ntp_loss: 3.4349 global_avg_mtp_loss: 16.3833 +[titan] 2025-06-13 15:04:44,816 - root - INFO - lr: 9.5065e-05 gnorm: 1.26 [ 2:23:14< 0:33:53] +[titan] 2025-06-13 15:04:48,093 - root - INFO - step: 12135 loss: 15.2099 memory: 6.46GiB(27.34%) tps: 24,999 tflops: 25.16 mfu: 8.06% global_avg_ntp_loss: 2.5824 global_avg_mtp_loss: 12.6275 +[titan] 2025-06-13 15:04:48,093 - root - INFO - lr: 9.4913e-05 gnorm: 1.26 [ 2:23:17< 0:33:49] +[titan] 2025-06-13 15:04:51,535 - root - INFO - step: 12140 loss: 18.2489 memory: 6.46GiB(27.34%) tps: 23,801 tflops: 23.95 mfu: 7.68% global_avg_ntp_loss: 3.1598 global_avg_mtp_loss: 15.0892 +[titan] 2025-06-13 15:04:51,535 - root - INFO - lr: 9.4762e-05 gnorm: 1.23 [ 2:23:21< 0:33:46] +[titan] 2025-06-13 15:04:54,783 - root - INFO - step: 12145 loss: 18.7403 memory: 6.46GiB(27.34%) tps: 25,224 tflops: 25.39 mfu: 8.14% global_avg_ntp_loss: 3.2058 global_avg_mtp_loss: 15.5345 +[titan] 2025-06-13 15:04:54,784 - root - INFO - lr: 9.4611e-05 gnorm: 1.32 [ 2:23:24< 0:33:42] +[titan] 2025-06-13 15:04:57,306 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:04:58,017 - root - INFO - step: 12150 loss: 19.3584 memory: 6.46GiB(27.34%) tps: 25,334 tflops: 25.50 mfu: 8.17% global_avg_ntp_loss: 3.2948 global_avg_mtp_loss: 16.0637 +[titan] 2025-06-13 15:04:58,017 - root - INFO - lr: 9.4460e-05 gnorm: 1.48 [ 2:23:27< 0:33:39] +[titan] 2025-06-13 15:05:01,214 - root - INFO - step: 12155 loss: 19.5651 memory: 6.46GiB(27.34%) tps: 25,629 tflops: 25.79 mfu: 8.27% global_avg_ntp_loss: 3.3849 global_avg_mtp_loss: 16.1802 +[titan] 2025-06-13 15:05:01,214 - root - INFO - lr: 9.4310e-05 gnorm: 1.28 [ 2:23:30< 0:33:35] +[titan] 2025-06-13 15:05:04,581 - root - INFO - step: 12160 loss: 19.6683 memory: 6.46GiB(27.34%) tps: 24,336 tflops: 24.49 mfu: 7.85% global_avg_ntp_loss: 3.3728 global_avg_mtp_loss: 16.2956 +[titan] 2025-06-13 15:05:04,581 - root - INFO - lr: 9.4159e-05 gnorm: 1.24 [ 2:23:34< 0:33:31] +[titan] 2025-06-13 15:05:08,086 - root - INFO - step: 12165 loss: 20.1390 memory: 6.46GiB(27.34%) tps: 23,375 tflops: 23.52 mfu: 7.54% global_avg_ntp_loss: 3.4636 global_avg_mtp_loss: 16.6754 +[titan] 2025-06-13 15:05:08,086 - root - INFO - lr: 9.4009e-05 gnorm: 1.19 [ 2:23:37< 0:33:28] +[titan] 2025-06-13 15:05:11,662 - root - INFO - step: 12170 loss: 18.8439 memory: 6.46GiB(27.34%) tps: 22,907 tflops: 23.05 mfu: 7.39% global_avg_ntp_loss: 3.2229 global_avg_mtp_loss: 15.6210 +[titan] 2025-06-13 15:05:11,662 - root - INFO - lr: 9.3860e-05 gnorm: 1.25 [ 2:23:41< 0:33:24] +[titan] 2025-06-13 15:05:14,780 - root - INFO - step: 12175 loss: 19.8582 memory: 6.46GiB(27.34%) tps: 26,276 tflops: 26.44 mfu: 8.48% global_avg_ntp_loss: 3.4099 global_avg_mtp_loss: 16.4483 +[titan] 2025-06-13 15:05:14,780 - root - INFO - lr: 9.3710e-05 gnorm: 1.35 [ 2:23:44< 0:33:21] +[titan] 2025-06-13 15:05:18,105 - root - INFO - step: 12180 loss: 19.2386 memory: 6.46GiB(27.34%) tps: 24,644 tflops: 24.80 mfu: 7.95% global_avg_ntp_loss: 3.2920 global_avg_mtp_loss: 15.9466 +[titan] 2025-06-13 15:05:18,105 - root - INFO - lr: 9.3561e-05 gnorm: 
1.21 [ 2:23:47< 0:33:17] +[titan] 2025-06-13 15:05:21,271 - root - INFO - step: 12185 loss: 17.6635 memory: 6.46GiB(27.34%) tps: 25,874 tflops: 26.04 mfu: 8.35% global_avg_ntp_loss: 3.0174 global_avg_mtp_loss: 14.6461 +[titan] 2025-06-13 15:05:21,272 - root - INFO - lr: 9.3411e-05 gnorm: 1.21 [ 2:23:50< 0:33:13] +[titan] 2025-06-13 15:05:24,698 - root - INFO - step: 12190 loss: 18.1805 memory: 6.46GiB(27.34%) tps: 23,908 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.1061 global_avg_mtp_loss: 15.0744 +[titan] 2025-06-13 15:05:24,698 - root - INFO - lr: 9.3262e-05 gnorm: 1.27 [ 2:23:54< 0:33:10] +[titan] 2025-06-13 15:05:28,549 - root - INFO - step: 12195 loss: 17.1691 memory: 6.46GiB(27.34%) tps: 21,277 tflops: 21.41 mfu: 6.86% global_avg_ntp_loss: 2.8668 global_avg_mtp_loss: 14.3023 +[titan] 2025-06-13 15:05:28,549 - root - INFO - lr: 9.3114e-05 gnorm: 1.58 [ 2:23:58< 0:33:06] +[titan] 2025-06-13 15:05:31,332 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:05:32,003 - root - INFO - step: 12200 loss: 17.6289 memory: 6.46GiB(27.34%) tps: 23,718 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.0393 global_avg_mtp_loss: 14.5896 +[titan] 2025-06-13 15:05:32,003 - root - INFO - lr: 9.2965e-05 gnorm: 1.46 [ 2:24:01< 0:33:03] +[titan] 2025-06-13 15:05:35,444 - root - INFO - step: 12205 loss: 16.5520 memory: 6.46GiB(27.34%) tps: 23,811 tflops: 23.96 mfu: 7.68% global_avg_ntp_loss: 2.7897 global_avg_mtp_loss: 13.7622 +[titan] 2025-06-13 15:05:35,444 - root - INFO - lr: 9.2817e-05 gnorm: 1.44 [ 2:24:05< 0:32:59] +[titan] 2025-06-13 15:05:38,617 - root - INFO - step: 12210 loss: 18.2242 memory: 6.46GiB(27.34%) tps: 25,825 tflops: 25.99 mfu: 8.33% global_avg_ntp_loss: 3.0462 global_avg_mtp_loss: 15.1781 +[titan] 2025-06-13 15:05:38,617 - root - INFO - lr: 9.2669e-05 gnorm: 1.48 [ 2:24:08< 0:32:56] +[titan] 2025-06-13 15:05:42,021 - root - INFO - step: 12215 loss: 18.1740 memory: 6.46GiB(27.34%) tps: 24,065 tflops: 24.22 mfu: 7.76% 
global_avg_ntp_loss: 3.1506 global_avg_mtp_loss: 15.0234 +[titan] 2025-06-13 15:05:42,021 - root - INFO - lr: 9.2521e-05 gnorm: 1.36 [ 2:24:11< 0:32:52] +[titan] 2025-06-13 15:05:45,434 - root - INFO - step: 12220 loss: 19.6583 memory: 6.46GiB(27.34%) tps: 24,006 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.3535 global_avg_mtp_loss: 16.3048 +[titan] 2025-06-13 15:05:45,434 - root - INFO - lr: 9.2374e-05 gnorm: 1.28 [ 2:24:15< 0:32:48] +[titan] 2025-06-13 15:05:49,060 - root - INFO - step: 12225 loss: 14.6754 memory: 6.46GiB(27.34%) tps: 22,593 tflops: 22.74 mfu: 7.29% global_avg_ntp_loss: 2.5036 global_avg_mtp_loss: 12.1719 +[titan] 2025-06-13 15:05:49,061 - root - INFO - lr: 9.2226e-05 gnorm: 1.60 [ 2:24:18< 0:32:45] +[titan] 2025-06-13 15:05:52,582 - root - INFO - step: 12230 loss: 20.5697 memory: 6.46GiB(27.34%) tps: 23,265 tflops: 23.41 mfu: 7.50% global_avg_ntp_loss: 3.5414 global_avg_mtp_loss: 17.0284 +[titan] 2025-06-13 15:05:52,582 - root - INFO - lr: 9.2079e-05 gnorm: 1.14 [ 2:24:22< 0:32:41] +[titan] 2025-06-13 15:05:56,282 - root - INFO - step: 12235 loss: 18.7422 memory: 6.46GiB(27.34%) tps: 22,141 tflops: 22.28 mfu: 7.14% global_avg_ntp_loss: 3.1977 global_avg_mtp_loss: 15.5445 +[titan] 2025-06-13 15:05:56,283 - root - INFO - lr: 9.1932e-05 gnorm: 1.19 [ 2:24:25< 0:32:38] +[titan] 2025-06-13 15:05:59,818 - root - INFO - step: 12240 loss: 19.2282 memory: 6.46GiB(27.34%) tps: 23,175 tflops: 23.32 mfu: 7.48% global_avg_ntp_loss: 3.3108 global_avg_mtp_loss: 15.9174 +[titan] 2025-06-13 15:05:59,818 - root - INFO - lr: 9.1786e-05 gnorm: 1.25 [ 2:24:29< 0:32:34] +[titan] 2025-06-13 15:06:03,200 - root - INFO - step: 12245 loss: 19.7201 memory: 6.46GiB(27.34%) tps: 24,220 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 3.3856 global_avg_mtp_loss: 16.3346 +[titan] 2025-06-13 15:06:03,201 - root - INFO - lr: 9.1639e-05 gnorm: 1.41 [ 2:24:32< 0:32:31] +[titan] 2025-06-13 15:06:05,919 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:06:06,455 - root - INFO - step: 12250 loss: 19.8762 memory: 6.46GiB(27.34%) tps: 25,177 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 3.4461 global_avg_mtp_loss: 16.4300 +[titan] 2025-06-13 15:06:06,455 - root - INFO - lr: 9.1493e-05 gnorm: 1.32 [ 2:24:36< 0:32:27] +[titan] 2025-06-13 15:06:10,000 - root - INFO - step: 12255 loss: 18.1214 memory: 6.46GiB(27.34%) tps: 23,109 tflops: 23.26 mfu: 7.45% global_avg_ntp_loss: 3.0957 global_avg_mtp_loss: 15.0257 +[titan] 2025-06-13 15:06:10,000 - root - INFO - lr: 9.1347e-05 gnorm: 1.24 [ 2:24:39< 0:32:24] +[titan] 2025-06-13 15:06:13,320 - root - INFO - step: 12260 loss: 17.8160 memory: 6.46GiB(27.34%) tps: 24,677 tflops: 24.83 mfu: 7.96% global_avg_ntp_loss: 3.0338 global_avg_mtp_loss: 14.7822 +[titan] 2025-06-13 15:06:13,320 - root - INFO - lr: 9.1201e-05 gnorm: 1.47 [ 2:24:42< 0:32:20] +[titan] 2025-06-13 15:06:17,465 - root - INFO - step: 12265 loss: 18.7016 memory: 6.46GiB(27.34%) tps: 19,765 tflops: 19.89 mfu: 6.38% global_avg_ntp_loss: 3.1845 global_avg_mtp_loss: 15.5171 +[titan] 2025-06-13 15:06:17,465 - root - INFO - lr: 9.1056e-05 gnorm: 1.35 [ 2:24:47< 0:32:17] +[titan] 2025-06-13 15:06:20,444 - root - INFO - step: 12270 loss: 19.4396 memory: 6.46GiB(27.34%) tps: 27,502 tflops: 27.68 mfu: 8.87% global_avg_ntp_loss: 3.3372 global_avg_mtp_loss: 16.1024 +[titan] 2025-06-13 15:06:20,444 - root - INFO - lr: 9.0911e-05 gnorm: 1.26 [ 2:24:50< 0:32:13] +[titan] 2025-06-13 15:06:24,086 - root - INFO - step: 12275 loss: 19.1999 memory: 6.46GiB(27.34%) tps: 22,497 tflops: 22.64 mfu: 7.26% global_avg_ntp_loss: 3.2593 global_avg_mtp_loss: 15.9406 +[titan] 2025-06-13 15:06:24,086 - root - INFO - lr: 9.0766e-05 gnorm: 1.37 [ 2:24:53< 0:32:09] +[titan] 2025-06-13 15:06:27,549 - root - INFO - step: 12280 loss: 19.5595 memory: 6.46GiB(27.34%) tps: 23,657 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.4026 global_avg_mtp_loss: 16.1569 +[titan] 2025-06-13 15:06:27,550 - root - INFO - lr: 9.0621e-05 gnorm: 
2.70 [ 2:24:57< 0:32:06] +[titan] 2025-06-13 15:06:31,119 - root - INFO - step: 12285 loss: 18.6857 memory: 6.46GiB(27.34%) tps: 22,951 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.1970 global_avg_mtp_loss: 15.4887 +[titan] 2025-06-13 15:06:31,120 - root - INFO - lr: 9.0476e-05 gnorm: 1.29 [ 2:25:00< 0:32:02] +[titan] 2025-06-13 15:06:33,148 - root - INFO - Dumping profiler traces at step 12288 +[titan] 2025-06-13 15:06:33,243 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 15:06:34,493 - root - INFO - step: 12290 loss: 19.5121 memory: 6.46GiB(27.34%) tps: 24,288 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.3333 global_avg_mtp_loss: 16.1788 +[titan] 2025-06-13 15:06:34,493 - root - INFO - lr: 9.0332e-05 gnorm: 1.28 [ 2:25:04< 0:31:59] +[titan] 2025-06-13 15:06:37,926 - root - INFO - step: 12295 loss: 17.9683 memory: 6.46GiB(27.34%) tps: 23,863 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 3.0042 global_avg_mtp_loss: 14.9641 +[titan] 2025-06-13 15:06:37,927 - root - INFO - lr: 9.0188e-05 gnorm: 1.43 [ 2:25:07< 0:31:55] +[titan] 2025-06-13 15:06:40,421 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:06:40,994 - root - INFO - step: 12300 loss: 19.6868 memory: 6.46GiB(27.34%) tps: 26,709 tflops: 26.88 mfu: 8.62% global_avg_ntp_loss: 3.3840 global_avg_mtp_loss: 16.3028 +[titan] 2025-06-13 15:06:40,994 - root - INFO - lr: 9.0044e-05 gnorm: 1.24 [ 2:25:10< 0:31:52] +[titan] 2025-06-13 15:06:44,097 - root - INFO - step: 12305 loss: 19.8463 memory: 6.46GiB(27.34%) tps: 26,399 tflops: 26.57 mfu: 8.52% global_avg_ntp_loss: 3.3723 global_avg_mtp_loss: 16.4739 +[titan] 2025-06-13 15:06:44,098 - root - INFO - lr: 8.9900e-05 gnorm: 1.23 [ 2:25:13< 0:31:48] +[titan] 2025-06-13 15:06:48,017 - root - INFO - step: 12310 loss: 19.9318 memory: 6.46GiB(27.34%) tps: 20,901 tflops: 21.03 mfu: 6.74% global_avg_ntp_loss: 3.4276 global_avg_mtp_loss: 16.5042 +[titan] 2025-06-13 15:06:48,018 - root - INFO - lr: 8.9757e-05 gnorm: 1.30 [ 2:25:17< 0:31:44] +[titan] 2025-06-13 15:06:51,479 - root - INFO - step: 12315 loss: 19.3223 memory: 6.46GiB(27.34%) tps: 23,666 tflops: 23.82 mfu: 7.63% global_avg_ntp_loss: 3.2933 global_avg_mtp_loss: 16.0290 +[titan] 2025-06-13 15:06:51,480 - root - INFO - lr: 8.9614e-05 gnorm: 1.18 [ 2:25:21< 0:31:41] +[titan] 2025-06-13 15:06:55,119 - root - INFO - step: 12320 loss: 18.8834 memory: 6.46GiB(27.34%) tps: 22,511 tflops: 22.65 mfu: 7.26% global_avg_ntp_loss: 3.1989 global_avg_mtp_loss: 15.6845 +[titan] 2025-06-13 15:06:55,119 - root - INFO - lr: 8.9471e-05 gnorm: 1.19 [ 2:25:24< 0:31:37] +[titan] 2025-06-13 15:06:58,242 - root - INFO - step: 12325 loss: 19.8615 memory: 6.46GiB(27.34%) tps: 26,237 tflops: 26.40 mfu: 8.46% global_avg_ntp_loss: 3.3918 global_avg_mtp_loss: 16.4696 +[titan] 2025-06-13 15:06:58,242 - root - INFO - lr: 8.9328e-05 gnorm: 1.29 [ 2:25:27< 0:31:34] +[titan] 2025-06-13 15:07:01,739 - root - INFO - step: 12330 loss: 19.2466 memory: 6.46GiB(27.34%) tps: 23,427 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.3257 global_avg_mtp_loss: 15.9209 +[titan] 2025-06-13 15:07:01,739 - root - INFO - lr: 8.9186e-05 gnorm: 
1.26 [ 2:25:31< 0:31:30] +[titan] 2025-06-13 15:07:05,131 - root - INFO - step: 12335 loss: 17.2805 memory: 6.46GiB(27.34%) tps: 24,155 tflops: 24.31 mfu: 7.79% global_avg_ntp_loss: 2.9272 global_avg_mtp_loss: 14.3533 +[titan] 2025-06-13 15:07:05,131 - root - INFO - lr: 8.9043e-05 gnorm: 1.40 [ 2:25:34< 0:31:27] +[titan] 2025-06-13 15:07:08,651 - root - INFO - step: 12340 loss: 19.5811 memory: 6.46GiB(27.34%) tps: 23,278 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.3582 global_avg_mtp_loss: 16.2229 +[titan] 2025-06-13 15:07:08,651 - root - INFO - lr: 8.8901e-05 gnorm: 1.36 [ 2:25:38< 0:31:23] +[titan] 2025-06-13 15:07:12,056 - root - INFO - step: 12345 loss: 19.7630 memory: 6.46GiB(27.34%) tps: 24,055 tflops: 24.21 mfu: 7.76% global_avg_ntp_loss: 3.3965 global_avg_mtp_loss: 16.3664 +[titan] 2025-06-13 15:07:12,057 - root - INFO - lr: 8.8760e-05 gnorm: 1.21 [ 2:25:41< 0:31:20] +[titan] 2025-06-13 15:07:14,840 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:07:15,514 - root - INFO - step: 12350 loss: 18.0061 memory: 6.46GiB(27.34%) tps: 23,698 tflops: 23.85 mfu: 7.64% global_avg_ntp_loss: 3.0813 global_avg_mtp_loss: 14.9248 +[titan] 2025-06-13 15:07:15,514 - root - INFO - lr: 8.8618e-05 gnorm: 1.35 [ 2:25:45< 0:31:16] +[titan] 2025-06-13 15:07:18,406 - root - INFO - step: 12355 loss: 18.8616 memory: 6.46GiB(27.34%) tps: 28,332 tflops: 28.51 mfu: 9.14% global_avg_ntp_loss: 3.1978 global_avg_mtp_loss: 15.6638 +[titan] 2025-06-13 15:07:18,406 - root - INFO - lr: 8.8477e-05 gnorm: 1.21 [ 2:25:48< 0:31:12] +[titan] 2025-06-13 15:07:21,988 - root - INFO - step: 12360 loss: 19.2075 memory: 6.46GiB(27.34%) tps: 22,870 tflops: 23.02 mfu: 7.38% global_avg_ntp_loss: 3.3718 global_avg_mtp_loss: 15.8357 +[titan] 2025-06-13 15:07:21,988 - root - INFO - lr: 8.8336e-05 gnorm: 4.51 [ 2:25:51< 0:31:09] +[titan] 2025-06-13 15:07:25,535 - root - INFO - step: 12365 loss: 19.3326 memory: 6.46GiB(27.34%) tps: 23,099 tflops: 23.25 mfu: 7.45% 
global_avg_ntp_loss: 3.2825 global_avg_mtp_loss: 16.0501 +[titan] 2025-06-13 15:07:25,535 - root - INFO - lr: 8.8195e-05 gnorm: 1.26 [ 2:25:55< 0:31:05] +[titan] 2025-06-13 15:07:28,907 - root - INFO - step: 12370 loss: 19.1165 memory: 6.46GiB(27.34%) tps: 24,297 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.2503 global_avg_mtp_loss: 15.8662 +[titan] 2025-06-13 15:07:28,907 - root - INFO - lr: 8.8054e-05 gnorm: 1.41 [ 2:25:58< 0:31:02] +[titan] 2025-06-13 15:07:32,138 - root - INFO - step: 12375 loss: 19.3744 memory: 6.46GiB(27.34%) tps: 25,356 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.3422 global_avg_mtp_loss: 16.0323 +[titan] 2025-06-13 15:07:32,138 - root - INFO - lr: 8.7914e-05 gnorm: 1.25 [ 2:26:01< 0:30:58] +[titan] 2025-06-13 15:07:36,009 - root - INFO - step: 12380 loss: 18.7371 memory: 6.46GiB(27.34%) tps: 21,166 tflops: 21.30 mfu: 6.83% global_avg_ntp_loss: 3.2141 global_avg_mtp_loss: 15.5229 +[titan] 2025-06-13 15:07:36,009 - root - INFO - lr: 8.7774e-05 gnorm: 1.35 [ 2:26:05< 0:30:55] +[titan] 2025-06-13 15:07:39,138 - root - INFO - step: 12385 loss: 18.9008 memory: 6.46GiB(27.34%) tps: 26,183 tflops: 26.35 mfu: 8.45% global_avg_ntp_loss: 3.2584 global_avg_mtp_loss: 15.6424 +[titan] 2025-06-13 15:07:39,138 - root - INFO - lr: 8.7634e-05 gnorm: 1.21 [ 2:26:08< 0:30:51] +[titan] 2025-06-13 15:07:43,072 - root - INFO - step: 12390 loss: 19.0232 memory: 6.46GiB(27.34%) tps: 20,828 tflops: 20.96 mfu: 6.72% global_avg_ntp_loss: 3.2532 global_avg_mtp_loss: 15.7701 +[titan] 2025-06-13 15:07:43,072 - root - INFO - lr: 8.7494e-05 gnorm: 1.18 [ 2:26:12< 0:30:48] +[titan] 2025-06-13 15:07:45,970 - root - INFO - step: 12395 loss: 19.4898 memory: 6.46GiB(27.34%) tps: 28,272 tflops: 28.45 mfu: 9.12% global_avg_ntp_loss: 3.3223 global_avg_mtp_loss: 16.1674 +[titan] 2025-06-13 15:07:45,970 - root - INFO - lr: 8.7355e-05 gnorm: 1.35 [ 2:26:15< 0:30:44] +[titan] 2025-06-13 15:07:49,199 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:07:49,667 - root - INFO - step: 12400 loss: 18.7711 memory: 6.46GiB(27.34%) tps: 22,158 tflops: 22.30 mfu: 7.15% global_avg_ntp_loss: 3.2000 global_avg_mtp_loss: 15.5711 +[titan] 2025-06-13 15:07:49,668 - root - INFO - lr: 8.7216e-05 gnorm: 1.48 [ 2:26:19< 0:30:40] +[titan] 2025-06-13 15:07:52,872 - root - INFO - step: 12405 loss: 20.4696 memory: 6.46GiB(27.34%) tps: 25,561 tflops: 25.72 mfu: 8.24% global_avg_ntp_loss: 3.5675 global_avg_mtp_loss: 16.9020 +[titan] 2025-06-13 15:07:52,873 - root - INFO - lr: 8.7077e-05 gnorm: 1.36 [ 2:26:22< 0:30:37] +[titan] 2025-06-13 15:07:56,235 - root - INFO - step: 12410 loss: 19.2319 memory: 6.46GiB(27.34%) tps: 24,365 tflops: 24.52 mfu: 7.86% global_avg_ntp_loss: 3.3322 global_avg_mtp_loss: 15.8997 +[titan] 2025-06-13 15:07:56,235 - root - INFO - lr: 8.6938e-05 gnorm: 1.43 [ 2:26:25< 0:30:33] +[titan] 2025-06-13 15:07:59,879 - root - INFO - step: 12415 loss: 19.4163 memory: 6.46GiB(27.34%) tps: 22,481 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.3235 global_avg_mtp_loss: 16.0928 +[titan] 2025-06-13 15:07:59,880 - root - INFO - lr: 8.6800e-05 gnorm: 1.32 [ 2:26:29< 0:30:30] +[titan] 2025-06-13 15:08:02,870 - root - INFO - step: 12420 loss: 18.2104 memory: 6.46GiB(27.34%) tps: 27,402 tflops: 27.58 mfu: 8.84% global_avg_ntp_loss: 3.1528 global_avg_mtp_loss: 15.0576 +[titan] 2025-06-13 15:08:02,870 - root - INFO - lr: 8.6662e-05 gnorm: 1.33 [ 2:26:32< 0:30:26] +[titan] 2025-06-13 15:08:06,464 - root - INFO - step: 12425 loss: 17.8564 memory: 6.46GiB(27.34%) tps: 22,796 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.1328 global_avg_mtp_loss: 14.7236 +[titan] 2025-06-13 15:08:06,464 - root - INFO - lr: 8.6524e-05 gnorm: 2.58 [ 2:26:36< 0:30:22] +[titan] 2025-06-13 15:08:09,836 - root - INFO - step: 12430 loss: 17.3323 memory: 6.46GiB(27.34%) tps: 24,295 tflops: 24.45 mfu: 7.84% global_avg_ntp_loss: 3.0306 global_avg_mtp_loss: 14.3017 +[titan] 2025-06-13 15:08:09,836 - root - INFO - lr: 8.6386e-05 gnorm: 
2.66 [ 2:26:39< 0:30:19] +[titan] 2025-06-13 15:08:13,176 - root - INFO - step: 12435 loss: 16.3913 memory: 6.46GiB(27.34%) tps: 24,530 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 2.7713 global_avg_mtp_loss: 13.6200 +[titan] 2025-06-13 15:08:13,176 - root - INFO - lr: 8.6248e-05 gnorm: 1.41 [ 2:26:42< 0:30:15] +[titan] 2025-06-13 15:08:16,829 - root - INFO - step: 12440 loss: 19.2362 memory: 6.46GiB(27.34%) tps: 22,426 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 3.3486 global_avg_mtp_loss: 15.8875 +[titan] 2025-06-13 15:08:16,829 - root - INFO - lr: 8.6111e-05 gnorm: 1.29 [ 2:26:46< 0:30:12] +[titan] 2025-06-13 15:08:19,983 - root - INFO - step: 12445 loss: 17.9578 memory: 6.46GiB(27.34%) tps: 25,977 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.0005 global_avg_mtp_loss: 14.9574 +[titan] 2025-06-13 15:08:19,983 - root - INFO - lr: 8.5974e-05 gnorm: 1.50 [ 2:26:49< 0:30:08] +[titan] 2025-06-13 15:08:22,580 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:08:23,211 - root - INFO - step: 12450 loss: 18.4364 memory: 6.46GiB(27.34%) tps: 25,382 tflops: 25.54 mfu: 8.19% global_avg_ntp_loss: 3.1624 global_avg_mtp_loss: 15.2740 +[titan] 2025-06-13 15:08:23,211 - root - INFO - lr: 8.5837e-05 gnorm: 1.27 [ 2:26:52< 0:30:05] +[titan] 2025-06-13 15:08:26,879 - root - INFO - step: 12455 loss: 19.5920 memory: 6.46GiB(27.34%) tps: 22,335 tflops: 22.48 mfu: 7.20% global_avg_ntp_loss: 3.3522 global_avg_mtp_loss: 16.2398 +[titan] 2025-06-13 15:08:26,879 - root - INFO - lr: 8.5701e-05 gnorm: 1.25 [ 2:26:56< 0:30:01] +[titan] 2025-06-13 15:08:30,087 - root - INFO - step: 12460 loss: 18.0669 memory: 6.46GiB(27.34%) tps: 25,539 tflops: 25.70 mfu: 8.24% global_avg_ntp_loss: 3.0965 global_avg_mtp_loss: 14.9704 +[titan] 2025-06-13 15:08:30,088 - root - INFO - lr: 8.5564e-05 gnorm: 1.26 [ 2:26:59< 0:29:57] +[titan] 2025-06-13 15:08:35,233 - root - INFO - step: 12465 loss: 18.6290 memory: 6.46GiB(27.34%) tps: 15,921 tflops: 16.02 mfu: 5.14% 
global_avg_ntp_loss: 3.1887 global_avg_mtp_loss: 15.4403 +[titan] 2025-06-13 15:08:35,233 - root - INFO - lr: 8.5428e-05 gnorm: 1.39 [ 2:27:04< 0:29:54] +[titan] 2025-06-13 15:08:38,802 - root - INFO - step: 12470 loss: 19.8103 memory: 6.46GiB(27.34%) tps: 22,955 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.3887 global_avg_mtp_loss: 16.4216 +[titan] 2025-06-13 15:08:38,803 - root - INFO - lr: 8.5292e-05 gnorm: 1.31 [ 2:27:08< 0:29:51] +[titan] 2025-06-13 15:08:42,363 - root - INFO - step: 12475 loss: 19.0326 memory: 6.46GiB(27.34%) tps: 23,009 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 3.2801 global_avg_mtp_loss: 15.7525 +[titan] 2025-06-13 15:08:42,363 - root - INFO - lr: 8.5157e-05 gnorm: 1.48 [ 2:27:11< 0:29:47] +[titan] 2025-06-13 15:08:46,091 - root - INFO - step: 12480 loss: 19.5490 memory: 6.46GiB(27.34%) tps: 21,978 tflops: 22.12 mfu: 7.09% global_avg_ntp_loss: 3.3359 global_avg_mtp_loss: 16.2131 +[titan] 2025-06-13 15:08:46,091 - root - INFO - lr: 8.5021e-05 gnorm: 1.21 [ 2:27:15< 0:29:44] +[titan] 2025-06-13 15:08:49,152 - root - INFO - step: 12485 loss: 19.6063 memory: 6.46GiB(27.34%) tps: 26,763 tflops: 26.93 mfu: 8.63% global_avg_ntp_loss: 3.3631 global_avg_mtp_loss: 16.2432 +[titan] 2025-06-13 15:08:49,152 - root - INFO - lr: 8.4886e-05 gnorm: 1.29 [ 2:27:18< 0:29:40] +[titan] 2025-06-13 15:08:52,560 - root - INFO - step: 12490 loss: 19.9273 memory: 6.46GiB(27.34%) tps: 24,040 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.4315 global_avg_mtp_loss: 16.4958 +[titan] 2025-06-13 15:08:52,560 - root - INFO - lr: 8.4751e-05 gnorm: 1.19 [ 2:27:22< 0:29:36] +[titan] 2025-06-13 15:08:56,333 - root - INFO - step: 12495 loss: 14.4332 memory: 6.46GiB(27.34%) tps: 21,716 tflops: 21.85 mfu: 7.00% global_avg_ntp_loss: 2.4372 global_avg_mtp_loss: 11.9960 +[titan] 2025-06-13 15:08:56,333 - root - INFO - lr: 8.4617e-05 gnorm: 1.46 [ 2:27:25< 0:29:33] +[titan] 2025-06-13 15:08:59,086 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:08:59,932 - root - INFO - step: 12500 loss: 19.6338 memory: 6.46GiB(27.34%) tps: 22,762 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.4136 global_avg_mtp_loss: 16.2202 +[titan] 2025-06-13 15:08:59,933 - root - INFO - lr: 8.4482e-05 gnorm: 1.18 [ 2:27:29< 0:29:29] +[titan] 2025-06-13 15:09:03,044 - root - INFO - step: 12505 loss: 20.2614 memory: 6.46GiB(27.34%) tps: 26,330 tflops: 26.50 mfu: 8.49% global_avg_ntp_loss: 3.4782 global_avg_mtp_loss: 16.7832 +[titan] 2025-06-13 15:09:03,045 - root - INFO - lr: 8.4348e-05 gnorm: 1.37 [ 2:27:32< 0:29:26] +[titan] 2025-06-13 15:09:06,591 - root - INFO - step: 12510 loss: 18.4537 memory: 6.46GiB(27.34%) tps: 23,103 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 3.1575 global_avg_mtp_loss: 15.2962 +[titan] 2025-06-13 15:09:06,591 - root - INFO - lr: 8.4214e-05 gnorm: 1.30 [ 2:27:36< 0:29:22] +[titan] 2025-06-13 15:09:09,984 - root - INFO - step: 12515 loss: 17.5236 memory: 6.46GiB(27.34%) tps: 24,148 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 2.9579 global_avg_mtp_loss: 14.5657 +[titan] 2025-06-13 15:09:09,984 - root - INFO - lr: 8.4080e-05 gnorm: 1.37 [ 2:27:39< 0:29:19] +[titan] 2025-06-13 15:09:13,471 - root - INFO - step: 12520 loss: 19.9337 memory: 6.46GiB(27.34%) tps: 23,493 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 3.4157 global_avg_mtp_loss: 16.5180 +[titan] 2025-06-13 15:09:13,472 - root - INFO - lr: 8.3947e-05 gnorm: 1.27 [ 2:27:43< 0:29:15] +[titan] 2025-06-13 15:09:16,945 - root - INFO - step: 12525 loss: 18.7822 memory: 6.46GiB(27.34%) tps: 23,588 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.2230 global_avg_mtp_loss: 15.5591 +[titan] 2025-06-13 15:09:16,945 - root - INFO - lr: 8.3814e-05 gnorm: 1.45 [ 2:27:46< 0:29:12] +[titan] 2025-06-13 15:09:20,482 - root - INFO - step: 12530 loss: 20.1685 memory: 6.46GiB(27.34%) tps: 23,158 tflops: 23.31 mfu: 7.47% global_avg_ntp_loss: 3.4375 global_avg_mtp_loss: 16.7310 +[titan] 2025-06-13 15:09:20,483 - root - INFO - lr: 8.3681e-05 gnorm: 
1.21 [ 2:27:50< 0:29:08] +[titan] 2025-06-13 15:09:25,083 - root - INFO - step: 12535 loss: 18.5291 memory: 6.46GiB(27.34%) tps: 17,809 tflops: 17.92 mfu: 5.74% global_avg_ntp_loss: 3.1398 global_avg_mtp_loss: 15.3893 +[titan] 2025-06-13 15:09:25,083 - root - INFO - lr: 8.3548e-05 gnorm: 1.39 [ 2:27:54< 0:29:05] +[titan] 2025-06-13 15:09:28,174 - root - INFO - step: 12540 loss: 19.3140 memory: 6.46GiB(27.34%) tps: 26,500 tflops: 26.67 mfu: 8.55% global_avg_ntp_loss: 3.2897 global_avg_mtp_loss: 16.0243 +[titan] 2025-06-13 15:09:28,175 - root - INFO - lr: 8.3416e-05 gnorm: 1.28 [ 2:27:57< 0:29:01] +[titan] 2025-06-13 15:09:31,606 - root - INFO - step: 12545 loss: 17.5641 memory: 6.46GiB(27.34%) tps: 23,876 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 2.9745 global_avg_mtp_loss: 14.5896 +[titan] 2025-06-13 15:09:31,606 - root - INFO - lr: 8.3283e-05 gnorm: 1.40 [ 2:28:01< 0:28:58] +[titan] 2025-06-13 15:09:34,542 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:09:35,205 - root - INFO - step: 12550 loss: 19.1093 memory: 6.46GiB(27.34%) tps: 22,766 tflops: 22.91 mfu: 7.34% global_avg_ntp_loss: 3.3124 global_avg_mtp_loss: 15.7969 +[titan] 2025-06-13 15:09:35,205 - root - INFO - lr: 8.3151e-05 gnorm: 1.30 [ 2:28:04< 0:28:54] +[titan] 2025-06-13 15:09:38,289 - root - INFO - step: 12555 loss: 19.6549 memory: 6.46GiB(27.34%) tps: 26,567 tflops: 26.74 mfu: 8.57% global_avg_ntp_loss: 3.3955 global_avg_mtp_loss: 16.2594 +[titan] 2025-06-13 15:09:38,289 - root - INFO - lr: 8.3020e-05 gnorm: 1.24 [ 2:28:07< 0:28:50] +[titan] 2025-06-13 15:09:41,859 - root - INFO - step: 12560 loss: 19.5545 memory: 6.46GiB(27.34%) tps: 22,945 tflops: 23.09 mfu: 7.40% global_avg_ntp_loss: 3.3918 global_avg_mtp_loss: 16.1627 +[titan] 2025-06-13 15:09:41,859 - root - INFO - lr: 8.2888e-05 gnorm: 1.14 [ 2:28:11< 0:28:47] +[titan] 2025-06-13 15:09:45,279 - root - INFO - step: 12565 loss: 19.3554 memory: 6.46GiB(27.34%) tps: 23,961 tflops: 24.11 mfu: 7.73% 
global_avg_ntp_loss: 3.2977 global_avg_mtp_loss: 16.0577 +[titan] 2025-06-13 15:09:45,279 - root - INFO - lr: 8.2757e-05 gnorm: 1.22 [ 2:28:14< 0:28:43] +[titan] 2025-06-13 15:09:48,977 - root - INFO - step: 12570 loss: 19.8418 memory: 6.46GiB(27.34%) tps: 22,151 tflops: 22.29 mfu: 7.14% global_avg_ntp_loss: 3.4010 global_avg_mtp_loss: 16.4408 +[titan] 2025-06-13 15:09:48,978 - root - INFO - lr: 8.2626e-05 gnorm: 1.21 [ 2:28:18< 0:28:40] +[titan] 2025-06-13 15:09:52,253 - root - INFO - step: 12575 loss: 18.4482 memory: 6.46GiB(27.34%) tps: 25,008 tflops: 25.17 mfu: 8.07% global_avg_ntp_loss: 3.1578 global_avg_mtp_loss: 15.2904 +[titan] 2025-06-13 15:09:52,254 - root - INFO - lr: 8.2495e-05 gnorm: 1.45 [ 2:28:21< 0:28:36] +[titan] 2025-06-13 15:09:55,675 - root - INFO - step: 12580 loss: 19.0383 memory: 6.46GiB(27.34%) tps: 23,942 tflops: 24.10 mfu: 7.72% global_avg_ntp_loss: 3.2830 global_avg_mtp_loss: 15.7553 +[titan] 2025-06-13 15:09:55,676 - root - INFO - lr: 8.2364e-05 gnorm: 1.22 [ 2:28:25< 0:28:33] +[titan] 2025-06-13 15:09:58,946 - root - INFO - step: 12585 loss: 19.3558 memory: 6.46GiB(27.34%) tps: 25,052 tflops: 25.21 mfu: 8.08% global_avg_ntp_loss: 3.3197 global_avg_mtp_loss: 16.0360 +[titan] 2025-06-13 15:09:58,946 - root - INFO - lr: 8.2234e-05 gnorm: 1.22 [ 2:28:28< 0:28:29] +[titan] 2025-06-13 15:10:02,200 - root - INFO - step: 12590 loss: 19.2170 memory: 6.46GiB(27.34%) tps: 25,180 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 3.2946 global_avg_mtp_loss: 15.9224 +[titan] 2025-06-13 15:10:02,200 - root - INFO - lr: 8.2104e-05 gnorm: 1.24 [ 2:28:31< 0:28:25] +[titan] 2025-06-13 15:10:05,764 - root - INFO - step: 12595 loss: 18.6329 memory: 6.46GiB(27.34%) tps: 22,988 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.1955 global_avg_mtp_loss: 15.4375 +[titan] 2025-06-13 15:10:05,764 - root - INFO - lr: 8.1974e-05 gnorm: 1.32 [ 2:28:35< 0:28:22] +[titan] 2025-06-13 15:10:08,219 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:10:08,948 - root - INFO - step: 12600 loss: 18.5413 memory: 6.46GiB(27.34%) tps: 25,730 tflops: 25.89 mfu: 8.30% global_avg_ntp_loss: 3.2181 global_avg_mtp_loss: 15.3232 +[titan] 2025-06-13 15:10:08,948 - root - INFO - lr: 8.1845e-05 gnorm: 1.67 [ 2:28:38< 0:28:18] +[titan] 2025-06-13 15:10:12,201 - root - INFO - step: 12605 loss: 18.0855 memory: 6.46GiB(27.34%) tps: 25,183 tflops: 25.34 mfu: 8.12% global_avg_ntp_loss: 3.0697 global_avg_mtp_loss: 15.0157 +[titan] 2025-06-13 15:10:12,201 - root - INFO - lr: 8.1715e-05 gnorm: 1.28 [ 2:28:41< 0:28:15] +[titan] 2025-06-13 15:10:15,491 - root - INFO - step: 12610 loss: 18.8635 memory: 6.46GiB(27.34%) tps: 24,904 tflops: 25.06 mfu: 8.03% global_avg_ntp_loss: 3.2289 global_avg_mtp_loss: 15.6346 +[titan] 2025-06-13 15:10:15,491 - root - INFO - lr: 8.1586e-05 gnorm: 1.23 [ 2:28:45< 0:28:11] +[titan] 2025-06-13 15:10:19,000 - root - INFO - step: 12615 loss: 19.0251 memory: 6.46GiB(27.34%) tps: 23,350 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 3.2360 global_avg_mtp_loss: 15.7890 +[titan] 2025-06-13 15:10:19,000 - root - INFO - lr: 8.1457e-05 gnorm: 1.28 [ 2:28:48< 0:28:08] +[titan] 2025-06-13 15:10:22,405 - root - INFO - step: 12620 loss: 19.0427 memory: 6.46GiB(27.34%) tps: 24,063 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 3.2294 global_avg_mtp_loss: 15.8134 +[titan] 2025-06-13 15:10:22,405 - root - INFO - lr: 8.1329e-05 gnorm: 1.58 [ 2:28:52< 0:28:04] +[titan] 2025-06-13 15:10:25,592 - root - INFO - step: 12625 loss: 19.7854 memory: 6.46GiB(27.34%) tps: 25,704 tflops: 25.87 mfu: 8.29% global_avg_ntp_loss: 3.3747 global_avg_mtp_loss: 16.4107 +[titan] 2025-06-13 15:10:25,592 - root - INFO - lr: 8.1200e-05 gnorm: 1.33 [ 2:28:55< 0:28:00] +[titan] 2025-06-13 15:10:28,924 - root - INFO - step: 12630 loss: 19.1097 memory: 6.46GiB(27.34%) tps: 24,590 tflops: 24.75 mfu: 7.93% global_avg_ntp_loss: 3.2253 global_avg_mtp_loss: 15.8844 +[titan] 2025-06-13 15:10:28,924 - root - INFO - lr: 8.1072e-05 gnorm: 
1.31 [ 2:28:58< 0:27:57] +[titan] 2025-06-13 15:10:32,530 - root - INFO - step: 12635 loss: 18.9320 memory: 6.46GiB(27.34%) tps: 22,720 tflops: 22.86 mfu: 7.33% global_avg_ntp_loss: 3.2835 global_avg_mtp_loss: 15.6485 +[titan] 2025-06-13 15:10:32,530 - root - INFO - lr: 8.0944e-05 gnorm: 1.50 [ 2:29:02< 0:27:53] +[titan] 2025-06-13 15:10:35,994 - root - INFO - step: 12640 loss: 19.4229 memory: 6.46GiB(27.34%) tps: 23,654 tflops: 23.81 mfu: 7.63% global_avg_ntp_loss: 3.3433 global_avg_mtp_loss: 16.0796 +[titan] 2025-06-13 15:10:35,994 - root - INFO - lr: 8.0817e-05 gnorm: 1.39 [ 2:29:05< 0:27:50] +[titan] 2025-06-13 15:10:39,546 - root - INFO - step: 12645 loss: 18.3161 memory: 6.46GiB(27.34%) tps: 23,067 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.0989 global_avg_mtp_loss: 15.2172 +[titan] 2025-06-13 15:10:39,546 - root - INFO - lr: 8.0689e-05 gnorm: 1.23 [ 2:29:09< 0:27:46] +[titan] 2025-06-13 15:10:42,178 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:10:42,991 - root - INFO - step: 12650 loss: 19.4318 memory: 6.46GiB(27.34%) tps: 23,776 tflops: 23.93 mfu: 7.67% global_avg_ntp_loss: 3.3441 global_avg_mtp_loss: 16.0877 +[titan] 2025-06-13 15:10:42,992 - root - INFO - lr: 8.0562e-05 gnorm: 1.29 [ 2:29:12< 0:27:43] +[titan] 2025-06-13 15:10:46,395 - root - INFO - step: 12655 loss: 17.8548 memory: 6.46GiB(27.34%) tps: 24,075 tflops: 24.23 mfu: 7.77% global_avg_ntp_loss: 3.0407 global_avg_mtp_loss: 14.8141 +[titan] 2025-06-13 15:10:46,395 - root - INFO - lr: 8.0435e-05 gnorm: 1.43 [ 2:29:15< 0:27:39] +[titan] 2025-06-13 15:10:49,559 - root - INFO - step: 12660 loss: 19.3951 memory: 6.46GiB(27.34%) tps: 25,888 tflops: 26.05 mfu: 8.35% global_avg_ntp_loss: 3.3085 global_avg_mtp_loss: 16.0865 +[titan] 2025-06-13 15:10:49,560 - root - INFO - lr: 8.0309e-05 gnorm: 1.21 [ 2:29:19< 0:27:35] +[titan] 2025-06-13 15:10:53,220 - root - INFO - step: 12665 loss: 20.8935 memory: 6.46GiB(27.34%) tps: 22,378 tflops: 22.52 mfu: 7.22% 
global_avg_ntp_loss: 3.5939 global_avg_mtp_loss: 17.2996 +[titan] 2025-06-13 15:10:53,221 - root - INFO - lr: 8.0182e-05 gnorm: 1.29 [ 2:29:22< 0:27:32] +[titan] 2025-06-13 15:10:57,015 - root - INFO - step: 12670 loss: 18.2682 memory: 6.46GiB(27.34%) tps: 21,590 tflops: 21.73 mfu: 6.96% global_avg_ntp_loss: 3.1138 global_avg_mtp_loss: 15.1544 +[titan] 2025-06-13 15:10:57,016 - root - INFO - lr: 8.0056e-05 gnorm: 1.36 [ 2:29:26< 0:27:28] +[titan] 2025-06-13 15:11:01,292 - root - INFO - step: 12675 loss: 17.9043 memory: 6.46GiB(27.34%) tps: 19,157 tflops: 19.28 mfu: 6.18% global_avg_ntp_loss: 3.0616 global_avg_mtp_loss: 14.8427 +[titan] 2025-06-13 15:11:01,293 - root - INFO - lr: 7.9930e-05 gnorm: 1.26 [ 2:29:30< 0:27:25] +[titan] 2025-06-13 15:11:05,078 - root - INFO - step: 12680 loss: 19.6249 memory: 6.46GiB(27.34%) tps: 21,639 tflops: 21.78 mfu: 6.98% global_avg_ntp_loss: 3.3736 global_avg_mtp_loss: 16.2513 +[titan] 2025-06-13 15:11:05,079 - root - INFO - lr: 7.9804e-05 gnorm: 1.26 [ 2:29:34< 0:27:22] +[titan] 2025-06-13 15:11:08,652 - root - INFO - step: 12685 loss: 19.6734 memory: 6.46GiB(27.34%) tps: 22,928 tflops: 23.07 mfu: 7.40% global_avg_ntp_loss: 3.3386 global_avg_mtp_loss: 16.3348 +[titan] 2025-06-13 15:11:08,652 - root - INFO - lr: 7.9679e-05 gnorm: 1.23 [ 2:29:38< 0:27:18] +[titan] 2025-06-13 15:11:12,169 - root - INFO - step: 12690 loss: 18.4597 memory: 6.46GiB(27.34%) tps: 23,296 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.0713 global_avg_mtp_loss: 15.3885 +[titan] 2025-06-13 15:11:12,169 - root - INFO - lr: 7.9554e-05 gnorm: 1.39 [ 2:29:41< 0:27:14] +[titan] 2025-06-13 15:11:15,404 - root - INFO - step: 12695 loss: 18.9711 memory: 6.46GiB(27.34%) tps: 25,323 tflops: 25.48 mfu: 8.17% global_avg_ntp_loss: 3.1978 global_avg_mtp_loss: 15.7733 +[titan] 2025-06-13 15:11:15,405 - root - INFO - lr: 7.9429e-05 gnorm: 1.27 [ 2:29:44< 0:27:11] +[titan] 2025-06-13 15:11:18,530 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:11:18,998 - root - INFO - step: 12700 loss: 18.1176 memory: 6.46GiB(27.34%) tps: 22,795 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.1055 global_avg_mtp_loss: 15.0121 +[titan] 2025-06-13 15:11:18,999 - root - INFO - lr: 7.9304e-05 gnorm: 1.20 [ 2:29:48< 0:27:07] +[titan] 2025-06-13 15:11:22,548 - root - INFO - step: 12705 loss: 19.3048 memory: 6.46GiB(27.34%) tps: 23,082 tflops: 23.23 mfu: 7.45% global_avg_ntp_loss: 3.2384 global_avg_mtp_loss: 16.0664 +[titan] 2025-06-13 15:11:22,548 - root - INFO - lr: 7.9180e-05 gnorm: 1.58 [ 2:29:52< 0:27:04] +[titan] 2025-06-13 15:11:25,782 - root - INFO - step: 12710 loss: 18.0012 memory: 6.46GiB(27.34%) tps: 25,333 tflops: 25.49 mfu: 8.17% global_avg_ntp_loss: 3.0263 global_avg_mtp_loss: 14.9749 +[titan] 2025-06-13 15:11:25,782 - root - INFO - lr: 7.9056e-05 gnorm: 1.29 [ 2:29:55< 0:27:00] +[titan] 2025-06-13 15:11:29,275 - root - INFO - step: 12715 loss: 19.0566 memory: 6.46GiB(27.34%) tps: 23,457 tflops: 23.61 mfu: 7.57% global_avg_ntp_loss: 3.2675 global_avg_mtp_loss: 15.7892 +[titan] 2025-06-13 15:11:29,275 - root - INFO - lr: 7.8932e-05 gnorm: 1.37 [ 2:29:58< 0:26:57] +[titan] 2025-06-13 15:11:32,793 - root - INFO - step: 12720 loss: 18.3542 memory: 6.46GiB(27.34%) tps: 23,287 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.1158 global_avg_mtp_loss: 15.2384 +[titan] 2025-06-13 15:11:32,793 - root - INFO - lr: 7.8808e-05 gnorm: 1.37 [ 2:30:02< 0:26:53] +[titan] 2025-06-13 15:11:35,936 - root - INFO - step: 12725 loss: 18.5624 memory: 6.46GiB(27.34%) tps: 26,064 tflops: 26.23 mfu: 8.41% global_avg_ntp_loss: 3.1886 global_avg_mtp_loss: 15.3737 +[titan] 2025-06-13 15:11:35,937 - root - INFO - lr: 7.8684e-05 gnorm: 1.24 [ 2:30:05< 0:26:50] +[titan] 2025-06-13 15:11:39,587 - root - INFO - step: 12730 loss: 18.4240 memory: 6.46GiB(27.34%) tps: 22,444 tflops: 22.59 mfu: 7.24% global_avg_ntp_loss: 3.1406 global_avg_mtp_loss: 15.2834 +[titan] 2025-06-13 15:11:39,587 - root - INFO - lr: 7.8561e-05 gnorm: 
1.22 [ 2:30:09< 0:26:46] +[titan] 2025-06-13 15:11:42,837 - root - INFO - step: 12735 loss: 20.1184 memory: 6.46GiB(27.34%) tps: 25,207 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.5052 global_avg_mtp_loss: 16.6133 +[titan] 2025-06-13 15:11:42,838 - root - INFO - lr: 7.8438e-05 gnorm: 1.45 [ 2:30:12< 0:26:42] +[titan] 2025-06-13 15:11:45,918 - root - INFO - step: 12740 loss: 17.9895 memory: 6.46GiB(27.34%) tps: 26,591 tflops: 26.76 mfu: 8.58% global_avg_ntp_loss: 3.0086 global_avg_mtp_loss: 14.9809 +[titan] 2025-06-13 15:11:45,919 - root - INFO - lr: 7.8315e-05 gnorm: 1.39 [ 2:30:15< 0:26:39] +[titan] 2025-06-13 15:11:49,492 - root - INFO - step: 12745 loss: 18.1868 memory: 6.46GiB(27.34%) tps: 22,926 tflops: 23.07 mfu: 7.39% global_avg_ntp_loss: 3.0689 global_avg_mtp_loss: 15.1179 +[titan] 2025-06-13 15:11:49,492 - root - INFO - lr: 7.8193e-05 gnorm: 1.41 [ 2:30:19< 0:26:35] +[titan] 2025-06-13 15:11:51,963 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:11:52,604 - root - INFO - step: 12750 loss: 19.6767 memory: 6.46GiB(27.34%) tps: 26,332 tflops: 26.50 mfu: 8.49% global_avg_ntp_loss: 3.3810 global_avg_mtp_loss: 16.2956 +[titan] 2025-06-13 15:11:52,604 - root - INFO - lr: 7.8071e-05 gnorm: 1.25 [ 2:30:22< 0:26:32] +[titan] 2025-06-13 15:11:56,070 - root - INFO - step: 12755 loss: 18.8902 memory: 6.46GiB(27.34%) tps: 23,637 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 3.2773 global_avg_mtp_loss: 15.6129 +[titan] 2025-06-13 15:11:56,070 - root - INFO - lr: 7.7949e-05 gnorm: 1.29 [ 2:30:25< 0:26:28] +[titan] 2025-06-13 15:11:59,309 - root - INFO - step: 12760 loss: 19.3234 memory: 6.46GiB(27.34%) tps: 25,296 tflops: 25.46 mfu: 8.16% global_avg_ntp_loss: 3.3118 global_avg_mtp_loss: 16.0116 +[titan] 2025-06-13 15:11:59,309 - root - INFO - lr: 7.7827e-05 gnorm: 1.30 [ 2:30:28< 0:26:25] +[titan] 2025-06-13 15:12:02,737 - root - INFO - step: 12765 loss: 19.5331 memory: 6.46GiB(27.34%) tps: 23,893 tflops: 24.05 mfu: 7.71% 
global_avg_ntp_loss: 3.3374 global_avg_mtp_loss: 16.1956 +[titan] 2025-06-13 15:12:02,738 - root - INFO - lr: 7.7706e-05 gnorm: 1.42 [ 2:30:32< 0:26:21] +[titan] 2025-06-13 15:12:06,249 - root - INFO - step: 12770 loss: 18.5101 memory: 6.46GiB(27.34%) tps: 23,334 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.1217 global_avg_mtp_loss: 15.3883 +[titan] 2025-06-13 15:12:06,249 - root - INFO - lr: 7.7584e-05 gnorm: 1.53 [ 2:30:35< 0:26:17] +[titan] 2025-06-13 15:12:09,720 - root - INFO - step: 12775 loss: 19.0245 memory: 6.46GiB(27.34%) tps: 23,603 tflops: 23.75 mfu: 7.61% global_avg_ntp_loss: 3.2662 global_avg_mtp_loss: 15.7582 +[titan] 2025-06-13 15:12:09,720 - root - INFO - lr: 7.7463e-05 gnorm: 1.23 [ 2:30:39< 0:26:14] +[titan] 2025-06-13 15:12:13,094 - root - INFO - step: 12780 loss: 19.1698 memory: 6.46GiB(27.34%) tps: 24,286 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.2859 global_avg_mtp_loss: 15.8839 +[titan] 2025-06-13 15:12:13,094 - root - INFO - lr: 7.7343e-05 gnorm: 1.23 [ 2:30:42< 0:26:10] +[titan] 2025-06-13 15:12:16,320 - root - INFO - step: 12785 loss: 17.4336 memory: 6.46GiB(27.34%) tps: 25,398 tflops: 25.56 mfu: 8.19% global_avg_ntp_loss: 3.0122 global_avg_mtp_loss: 14.4214 +[titan] 2025-06-13 15:12:16,320 - root - INFO - lr: 7.7222e-05 gnorm: 1.53 [ 2:30:45< 0:26:07] +[titan] 2025-06-13 15:12:19,471 - root - INFO - step: 12790 loss: 19.8308 memory: 6.46GiB(27.34%) tps: 25,994 tflops: 26.16 mfu: 8.38% global_avg_ntp_loss: 3.4467 global_avg_mtp_loss: 16.3842 +[titan] 2025-06-13 15:12:19,472 - root - INFO - lr: 7.7102e-05 gnorm: 1.24 [ 2:30:49< 0:26:03] +[titan] 2025-06-13 15:12:23,298 - root - INFO - step: 12795 loss: 20.2822 memory: 6.46GiB(27.34%) tps: 21,413 tflops: 21.55 mfu: 6.91% global_avg_ntp_loss: 3.4823 global_avg_mtp_loss: 16.8000 +[titan] 2025-06-13 15:12:23,298 - root - INFO - lr: 7.6982e-05 gnorm: 1.26 [ 2:30:52< 0:26:00] +[titan] 2025-06-13 15:12:26,067 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:12:26,729 - root - INFO - step: 12800 loss: 17.5779 memory: 6.46GiB(27.34%) tps: 23,873 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.0117 global_avg_mtp_loss: 14.5662 +[titan] 2025-06-13 15:12:26,730 - root - INFO - lr: 7.6862e-05 gnorm: 1.78 [ 2:30:56< 0:25:56] +[titan] 2025-06-13 15:12:26,854 - root - INFO - Dumping profiler traces at step 12800 +[titan] 2025-06-13 15:12:26,946 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 15:12:30,296 - root - INFO - step: 12805 loss: 19.4907 memory: 6.46GiB(27.34%) tps: 22,969 tflops: 23.12 mfu: 7.41% global_avg_ntp_loss: 3.3467 global_avg_mtp_loss: 16.1440 +[titan] 2025-06-13 15:12:30,297 - root - INFO - lr: 7.6743e-05 gnorm: 1.23 [ 2:30:59< 0:25:53] +[titan] 2025-06-13 15:12:33,832 - root - INFO - step: 12810 loss: 19.1129 memory: 6.46GiB(27.34%) tps: 23,174 tflops: 23.32 mfu: 7.47% global_avg_ntp_loss: 3.3351 global_avg_mtp_loss: 15.7778 +[titan] 2025-06-13 15:12:33,832 - root - INFO - lr: 7.6624e-05 gnorm: 1.44 [ 2:31:03< 0:25:49] +[titan] 2025-06-13 15:12:37,303 - root - INFO - step: 12815 loss: 19.5558 memory: 6.46GiB(27.34%) tps: 23,603 tflops: 23.75 mfu: 7.61% global_avg_ntp_loss: 3.3418 global_avg_mtp_loss: 16.2139 +[titan] 2025-06-13 15:12:37,303 - root - INFO - lr: 7.6505e-05 gnorm: 1.14 [ 2:31:06< 0:25:45] +[titan] 2025-06-13 15:12:40,474 - root - INFO - step: 12820 loss: 19.0128 memory: 6.46GiB(27.34%) tps: 25,833 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.3055 global_avg_mtp_loss: 15.7073 +[titan] 2025-06-13 15:12:40,475 - root - INFO - lr: 7.6386e-05 gnorm: 1.31 [ 2:31:10< 0:25:42] +[titan] 2025-06-13 15:12:44,405 - root - INFO - step: 12825 loss: 19.7345 memory: 6.46GiB(27.34%) tps: 20,846 tflops: 20.98 mfu: 6.72% global_avg_ntp_loss: 3.3736 global_avg_mtp_loss: 16.3609 +[titan] 2025-06-13 15:12:44,405 - root - INFO - lr: 7.6267e-05 gnorm: 1.20 [ 2:31:13< 0:25:38] +[titan] 2025-06-13 15:12:47,482 - root - INFO - step: 12830 loss: 20.2284 
memory: 6.46GiB(27.34%) tps: 26,625 tflops: 26.79 mfu: 8.59% global_avg_ntp_loss: 3.4952 global_avg_mtp_loss: 16.7331 +[titan] 2025-06-13 15:12:47,482 - root - INFO - lr: 7.6149e-05 gnorm: 1.49 [ 2:31:17< 0:25:35] +[titan] 2025-06-13 15:12:51,033 - root - INFO - step: 12835 loss: 19.7582 memory: 6.46GiB(27.34%) tps: 23,069 tflops: 23.22 mfu: 7.44% global_avg_ntp_loss: 3.3748 global_avg_mtp_loss: 16.3835 +[titan] 2025-06-13 15:12:51,034 - root - INFO - lr: 7.6031e-05 gnorm: 1.20 [ 2:31:20< 0:25:31] +[titan] 2025-06-13 15:12:54,335 - root - INFO - step: 12840 loss: 19.1765 memory: 6.46GiB(27.34%) tps: 24,819 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 3.2714 global_avg_mtp_loss: 15.9051 +[titan] 2025-06-13 15:12:54,335 - root - INFO - lr: 7.5913e-05 gnorm: 1.18 [ 2:31:23< 0:25:28] +[titan] 2025-06-13 15:12:57,900 - root - INFO - step: 12845 loss: 19.7621 memory: 6.46GiB(27.34%) tps: 22,980 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.4161 global_avg_mtp_loss: 16.3460 +[titan] 2025-06-13 15:12:57,900 - root - INFO - lr: 7.5796e-05 gnorm: 1.28 [ 2:31:27< 0:25:24] +[titan] 2025-06-13 15:13:00,265 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:13:00,806 - root - INFO - step: 12850 loss: 18.9769 memory: 6.46GiB(27.34%) tps: 28,192 tflops: 28.37 mfu: 9.09% global_avg_ntp_loss: 3.2583 global_avg_mtp_loss: 15.7186 +[titan] 2025-06-13 15:13:00,806 - root - INFO - lr: 7.5679e-05 gnorm: 1.23 [ 2:31:30< 0:25:20] +[titan] 2025-06-13 15:13:04,174 - root - INFO - step: 12855 loss: 18.3550 memory: 6.46GiB(27.34%) tps: 24,323 tflops: 24.48 mfu: 7.85% global_avg_ntp_loss: 3.1059 global_avg_mtp_loss: 15.2491 +[titan] 2025-06-13 15:13:04,175 - root - INFO - lr: 7.5562e-05 gnorm: 1.27 [ 2:31:33< 0:25:17] +[titan] 2025-06-13 15:13:07,476 - root - INFO - step: 12860 loss: 19.9385 memory: 6.46GiB(27.34%) tps: 24,813 tflops: 24.97 mfu: 8.00% global_avg_ntp_loss: 3.3734 global_avg_mtp_loss: 16.5651 +[titan] 2025-06-13 15:13:07,477 - root - INFO - lr: 7.5445e-05 gnorm: 1.38 [ 2:31:37< 0:25:13] +[titan] 2025-06-13 15:13:10,964 - root - INFO - step: 12865 loss: 19.4905 memory: 6.46GiB(27.34%) tps: 23,490 tflops: 23.64 mfu: 7.58% global_avg_ntp_loss: 3.3774 global_avg_mtp_loss: 16.1131 +[titan] 2025-06-13 15:13:10,965 - root - INFO - lr: 7.5328e-05 gnorm: 1.21 [ 2:31:40< 0:25:10] +[titan] 2025-06-13 15:13:14,737 - root - INFO - step: 12870 loss: 18.3376 memory: 6.46GiB(27.34%) tps: 21,718 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 3.1559 global_avg_mtp_loss: 15.1817 +[titan] 2025-06-13 15:13:14,737 - root - INFO - lr: 7.5212e-05 gnorm: 1.27 [ 2:31:44< 0:25:06] +[titan] 2025-06-13 15:13:17,922 - root - INFO - step: 12875 loss: 19.0602 memory: 6.46GiB(27.34%) tps: 25,723 tflops: 25.89 mfu: 8.30% global_avg_ntp_loss: 3.3519 global_avg_mtp_loss: 15.7084 +[titan] 2025-06-13 15:13:17,922 - root - INFO - lr: 7.5096e-05 gnorm: 1.34 [ 2:31:47< 0:25:03] +[titan] 2025-06-13 15:13:21,256 - root - INFO - step: 12880 loss: 18.5495 memory: 6.46GiB(27.34%) tps: 24,575 tflops: 24.73 mfu: 7.93% global_avg_ntp_loss: 3.1987 global_avg_mtp_loss: 15.3508 +[titan] 2025-06-13 15:13:21,256 - root - INFO - lr: 7.4980e-05 gnorm: 
1.23 [ 2:31:50< 0:24:59] +[titan] 2025-06-13 15:13:24,567 - root - INFO - step: 12885 loss: 20.6171 memory: 6.46GiB(27.34%) tps: 24,741 tflops: 24.90 mfu: 7.98% global_avg_ntp_loss: 3.5407 global_avg_mtp_loss: 17.0765 +[titan] 2025-06-13 15:13:24,568 - root - INFO - lr: 7.4865e-05 gnorm: 1.32 [ 2:31:54< 0:24:56] +[titan] 2025-06-13 15:13:27,966 - root - INFO - step: 12890 loss: 19.6084 memory: 6.46GiB(27.34%) tps: 24,110 tflops: 24.26 mfu: 7.78% global_avg_ntp_loss: 3.3645 global_avg_mtp_loss: 16.2439 +[titan] 2025-06-13 15:13:27,966 - root - INFO - lr: 7.4750e-05 gnorm: 1.19 [ 2:31:57< 0:24:52] +[titan] 2025-06-13 15:13:31,319 - root - INFO - step: 12895 loss: 19.9693 memory: 6.46GiB(27.34%) tps: 24,435 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.4548 global_avg_mtp_loss: 16.5145 +[titan] 2025-06-13 15:13:31,319 - root - INFO - lr: 7.4635e-05 gnorm: 1.25 [ 2:32:00< 0:24:48] +[titan] 2025-06-13 15:13:34,068 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:13:34,737 - root - INFO - step: 12900 loss: 18.4823 memory: 6.46GiB(27.34%) tps: 23,971 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.1281 global_avg_mtp_loss: 15.3542 +[titan] 2025-06-13 15:13:34,737 - root - INFO - lr: 7.4520e-05 gnorm: 1.41 [ 2:32:04< 0:24:45] +[titan] 2025-06-13 15:13:38,267 - root - INFO - step: 12905 loss: 18.8844 memory: 6.46GiB(27.34%) tps: 23,212 tflops: 23.36 mfu: 7.49% global_avg_ntp_loss: 3.2382 global_avg_mtp_loss: 15.6462 +[titan] 2025-06-13 15:13:38,267 - root - INFO - lr: 7.4406e-05 gnorm: 1.22 [ 2:32:07< 0:24:41] +[titan] 2025-06-13 15:13:41,919 - root - INFO - step: 12910 loss: 18.7030 memory: 6.46GiB(27.34%) tps: 22,433 tflops: 22.58 mfu: 7.24% global_avg_ntp_loss: 3.1382 global_avg_mtp_loss: 15.5648 +[titan] 2025-06-13 15:13:41,919 - root - INFO - lr: 7.4291e-05 gnorm: 1.39 [ 2:32:11< 0:24:38] +[titan] 2025-06-13 15:13:45,854 - root - INFO - step: 12915 loss: 18.2625 memory: 6.46GiB(27.34%) tps: 20,819 tflops: 20.95 mfu: 6.72% 
global_avg_ntp_loss: 3.1112 global_avg_mtp_loss: 15.1512 +[titan] 2025-06-13 15:13:45,854 - root - INFO - lr: 7.4177e-05 gnorm: 1.44 [ 2:32:15< 0:24:34] +[titan] 2025-06-13 15:13:49,182 - root - INFO - step: 12920 loss: 20.0667 memory: 6.46GiB(27.34%) tps: 24,621 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 3.5011 global_avg_mtp_loss: 16.5656 +[titan] 2025-06-13 15:13:49,182 - root - INFO - lr: 7.4064e-05 gnorm: 1.26 [ 2:32:18< 0:24:31] +[titan] 2025-06-13 15:13:52,535 - root - INFO - step: 12925 loss: 19.8088 memory: 6.46GiB(27.34%) tps: 24,430 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.4052 global_avg_mtp_loss: 16.4036 +[titan] 2025-06-13 15:13:52,536 - root - INFO - lr: 7.3950e-05 gnorm: 1.39 [ 2:32:22< 0:24:27] +[titan] 2025-06-13 15:13:56,215 - root - INFO - step: 12930 loss: 18.3568 memory: 6.46GiB(27.34%) tps: 22,267 tflops: 22.41 mfu: 7.18% global_avg_ntp_loss: 3.1317 global_avg_mtp_loss: 15.2251 +[titan] 2025-06-13 15:13:56,215 - root - INFO - lr: 7.3837e-05 gnorm: 1.39 [ 2:32:25< 0:24:24] +[titan] 2025-06-13 15:13:59,642 - root - INFO - step: 12935 loss: 17.5260 memory: 6.46GiB(27.34%) tps: 23,903 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.0062 global_avg_mtp_loss: 14.5198 +[titan] 2025-06-13 15:13:59,643 - root - INFO - lr: 7.3724e-05 gnorm: 1.38 [ 2:32:29< 0:24:20] +[titan] 2025-06-13 15:14:03,418 - root - INFO - step: 12940 loss: 17.7536 memory: 6.46GiB(27.34%) tps: 21,702 tflops: 21.84 mfu: 7.00% global_avg_ntp_loss: 3.0574 global_avg_mtp_loss: 14.6962 +[titan] 2025-06-13 15:14:03,418 - root - INFO - lr: 7.3611e-05 gnorm: 1.42 [ 2:32:32< 0:24:17] +[titan] 2025-06-13 15:14:06,987 - root - INFO - step: 12945 loss: 19.2035 memory: 6.46GiB(27.34%) tps: 22,956 tflops: 23.10 mfu: 7.40% global_avg_ntp_loss: 3.2905 global_avg_mtp_loss: 15.9130 +[titan] 2025-06-13 15:14:06,987 - root - INFO - lr: 7.3499e-05 gnorm: 1.27 [ 2:32:36< 0:24:13] +[titan] 2025-06-13 15:14:09,763 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:14:10,623 - root - INFO - step: 12950 loss: 18.5850 memory: 6.46GiB(27.34%) tps: 22,535 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.1923 global_avg_mtp_loss: 15.3927 +[titan] 2025-06-13 15:14:10,623 - root - INFO - lr: 7.3387e-05 gnorm: 1.34 [ 2:32:40< 0:24:10] +[titan] 2025-06-13 15:14:14,038 - root - INFO - step: 12955 loss: 19.0811 memory: 6.46GiB(27.34%) tps: 23,986 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 3.2627 global_avg_mtp_loss: 15.8183 +[titan] 2025-06-13 15:14:14,039 - root - INFO - lr: 7.3275e-05 gnorm: 1.35 [ 2:32:43< 0:24:06] +[titan] 2025-06-13 15:14:17,603 - root - INFO - step: 12960 loss: 17.2856 memory: 6.46GiB(27.34%) tps: 22,989 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 2.8963 global_avg_mtp_loss: 14.3894 +[titan] 2025-06-13 15:14:17,603 - root - INFO - lr: 7.3163e-05 gnorm: 1.53 [ 2:32:47< 0:24:02] +[titan] 2025-06-13 15:14:21,403 - root - INFO - step: 12965 loss: 19.3011 memory: 6.46GiB(27.34%) tps: 21,557 tflops: 21.69 mfu: 6.95% global_avg_ntp_loss: 3.2858 global_avg_mtp_loss: 16.0153 +[titan] 2025-06-13 15:14:21,404 - root - INFO - lr: 7.3052e-05 gnorm: 1.18 [ 2:32:50< 0:23:59] +[titan] 2025-06-13 15:14:24,731 - root - INFO - step: 12970 loss: 18.7516 memory: 6.46GiB(27.34%) tps: 24,619 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 3.1870 global_avg_mtp_loss: 15.5646 +[titan] 2025-06-13 15:14:24,732 - root - INFO - lr: 7.2941e-05 gnorm: 1.37 [ 2:32:54< 0:23:55] +[titan] 2025-06-13 15:14:27,868 - root - INFO - step: 12975 loss: 20.1475 memory: 6.46GiB(27.34%) tps: 26,121 tflops: 26.29 mfu: 8.43% global_avg_ntp_loss: 3.4753 global_avg_mtp_loss: 16.6721 +[titan] 2025-06-13 15:14:27,868 - root - INFO - lr: 7.2830e-05 gnorm: 1.29 [ 2:32:57< 0:23:52] +[titan] 2025-06-13 15:14:31,333 - root - INFO - step: 12980 loss: 19.0553 memory: 6.46GiB(27.34%) tps: 23,644 tflops: 23.79 mfu: 7.63% global_avg_ntp_loss: 3.2527 global_avg_mtp_loss: 15.8026 +[titan] 2025-06-13 15:14:31,333 - root - INFO - lr: 7.2719e-05 gnorm: 
1.34 [ 2:33:00< 0:23:48] +[titan] 2025-06-13 15:14:34,967 - root - INFO - step: 12985 loss: 18.4092 memory: 6.46GiB(27.34%) tps: 22,548 tflops: 22.69 mfu: 7.27% global_avg_ntp_loss: 3.2095 global_avg_mtp_loss: 15.1997 +[titan] 2025-06-13 15:14:34,967 - root - INFO - lr: 7.2609e-05 gnorm: 1.31 [ 2:33:04< 0:23:45] +[titan] 2025-06-13 15:14:38,584 - root - INFO - step: 12990 loss: 17.4178 memory: 6.46GiB(27.34%) tps: 22,651 tflops: 22.79 mfu: 7.31% global_avg_ntp_loss: 2.9588 global_avg_mtp_loss: 14.4589 +[titan] 2025-06-13 15:14:38,584 - root - INFO - lr: 7.2498e-05 gnorm: 1.42 [ 2:33:08< 0:23:41] +[titan] 2025-06-13 15:14:42,093 - root - INFO - step: 12995 loss: 19.6934 memory: 6.46GiB(27.34%) tps: 23,350 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 3.3656 global_avg_mtp_loss: 16.3278 +[titan] 2025-06-13 15:14:42,093 - root - INFO - lr: 7.2389e-05 gnorm: 1.28 [ 2:33:11< 0:23:38] +[titan] 2025-06-13 15:14:44,603 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:14:45,241 - root - INFO - step: 13000 loss: 18.3415 memory: 6.46GiB(27.34%) tps: 26,022 tflops: 26.19 mfu: 8.39% global_avg_ntp_loss: 3.1052 global_avg_mtp_loss: 15.2363 +[titan] 2025-06-13 15:14:45,241 - root - INFO - lr: 7.2279e-05 gnorm: 1.37 [ 2:33:14< 0:23:34] +[titan] 2025-06-13 15:14:48,786 - root - INFO - step: 13005 loss: 19.3866 memory: 6.46GiB(27.34%) tps: 23,114 tflops: 23.26 mfu: 7.46% global_avg_ntp_loss: 3.3878 global_avg_mtp_loss: 15.9987 +[titan] 2025-06-13 15:14:48,786 - root - INFO - lr: 7.2169e-05 gnorm: 1.30 [ 2:33:18< 0:23:31] +[titan] 2025-06-13 15:14:52,413 - root - INFO - step: 13010 loss: 18.1424 memory: 6.46GiB(27.34%) tps: 22,589 tflops: 22.73 mfu: 7.29% global_avg_ntp_loss: 3.1017 global_avg_mtp_loss: 15.0407 +[titan] 2025-06-13 15:14:52,413 - root - INFO - lr: 7.2060e-05 gnorm: 1.20 [ 2:33:21< 0:23:27] +[titan] 2025-06-13 15:14:55,805 - root - INFO - step: 13015 loss: 19.0997 memory: 6.46GiB(27.34%) tps: 24,148 tflops: 24.30 mfu: 7.79% 
global_avg_ntp_loss: 3.2703 global_avg_mtp_loss: 15.8294 +[titan] 2025-06-13 15:14:55,806 - root - INFO - lr: 7.1951e-05 gnorm: 1.37 [ 2:33:25< 0:23:23] +[titan] 2025-06-13 15:14:59,330 - root - INFO - step: 13020 loss: 18.3162 memory: 6.46GiB(27.34%) tps: 23,246 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.1294 global_avg_mtp_loss: 15.1868 +[titan] 2025-06-13 15:14:59,330 - root - INFO - lr: 7.1843e-05 gnorm: 1.54 [ 2:33:28< 0:23:20] +[titan] 2025-06-13 15:15:02,752 - root - INFO - step: 13025 loss: 19.4399 memory: 6.46GiB(27.34%) tps: 23,941 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.3333 global_avg_mtp_loss: 16.1066 +[titan] 2025-06-13 15:15:02,752 - root - INFO - lr: 7.1734e-05 gnorm: 1.22 [ 2:33:32< 0:23:16] +[titan] 2025-06-13 15:15:06,277 - root - INFO - step: 13030 loss: 20.0689 memory: 6.46GiB(27.34%) tps: 23,243 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.4544 global_avg_mtp_loss: 16.6146 +[titan] 2025-06-13 15:15:06,277 - root - INFO - lr: 7.1626e-05 gnorm: 1.32 [ 2:33:35< 0:23:13] +[titan] 2025-06-13 15:15:09,979 - root - INFO - step: 13035 loss: 18.4969 memory: 6.46GiB(27.34%) tps: 22,131 tflops: 22.27 mfu: 7.14% global_avg_ntp_loss: 3.1660 global_avg_mtp_loss: 15.3308 +[titan] 2025-06-13 15:15:09,979 - root - INFO - lr: 7.1519e-05 gnorm: 1.54 [ 2:33:39< 0:23:09] +[titan] 2025-06-13 15:15:13,350 - root - INFO - step: 13040 loss: 19.5210 memory: 6.46GiB(27.34%) tps: 24,306 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 3.3484 global_avg_mtp_loss: 16.1726 +[titan] 2025-06-13 15:15:13,350 - root - INFO - lr: 7.1411e-05 gnorm: 1.23 [ 2:33:42< 0:23:06] +[titan] 2025-06-13 15:15:16,424 - root - INFO - step: 13045 loss: 19.0616 memory: 6.46GiB(27.34%) tps: 26,648 tflops: 26.82 mfu: 8.60% global_avg_ntp_loss: 3.2579 global_avg_mtp_loss: 15.8037 +[titan] 2025-06-13 15:15:16,425 - root - INFO - lr: 7.1304e-05 gnorm: 1.25 [ 2:33:45< 0:23:02] +[titan] 2025-06-13 15:15:19,729 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:15:20,293 - root - INFO - step: 13050 loss: 19.7509 memory: 6.46GiB(27.34%) tps: 21,181 tflops: 21.32 mfu: 6.83% global_avg_ntp_loss: 3.3719 global_avg_mtp_loss: 16.3790 +[titan] 2025-06-13 15:15:20,293 - root - INFO - lr: 7.1196e-05 gnorm: 1.19 [ 2:33:49< 0:22:59] +[titan] 2025-06-13 15:15:24,064 - root - INFO - step: 13055 loss: 19.1345 memory: 6.46GiB(27.34%) tps: 21,723 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 3.2713 global_avg_mtp_loss: 15.8632 +[titan] 2025-06-13 15:15:24,064 - root - INFO - lr: 7.1090e-05 gnorm: 1.23 [ 2:33:53< 0:22:55] +[titan] 2025-06-13 15:15:27,363 - root - INFO - step: 13060 loss: 18.3220 memory: 6.46GiB(27.34%) tps: 24,834 tflops: 24.99 mfu: 8.01% global_avg_ntp_loss: 3.1184 global_avg_mtp_loss: 15.2036 +[titan] 2025-06-13 15:15:27,364 - root - INFO - lr: 7.0983e-05 gnorm: 1.30 [ 2:33:56< 0:22:52] +[titan] 2025-06-13 15:15:31,073 - root - INFO - step: 13065 loss: 19.7612 memory: 6.46GiB(27.34%) tps: 22,086 tflops: 22.23 mfu: 7.12% global_avg_ntp_loss: 3.3808 global_avg_mtp_loss: 16.3804 +[titan] 2025-06-13 15:15:31,073 - root - INFO - lr: 7.0877e-05 gnorm: 1.25 [ 2:34:00< 0:22:48] +[titan] 2025-06-13 15:15:34,542 - root - INFO - step: 13070 loss: 18.4244 memory: 6.46GiB(27.34%) tps: 23,619 tflops: 23.77 mfu: 7.62% global_avg_ntp_loss: 3.1452 global_avg_mtp_loss: 15.2792 +[titan] 2025-06-13 15:15:34,542 - root - INFO - lr: 7.0771e-05 gnorm: 1.26 [ 2:34:04< 0:22:45] +[titan] 2025-06-13 15:15:37,923 - root - INFO - step: 13075 loss: 19.5590 memory: 6.46GiB(27.34%) tps: 24,227 tflops: 24.38 mfu: 7.81% global_avg_ntp_loss: 3.3475 global_avg_mtp_loss: 16.2114 +[titan] 2025-06-13 15:15:37,924 - root - INFO - lr: 7.0665e-05 gnorm: 1.27 [ 2:34:07< 0:22:41] +[titan] 2025-06-13 15:15:41,995 - root - INFO - step: 13080 loss: 19.1608 memory: 6.46GiB(27.34%) tps: 20,120 tflops: 20.25 mfu: 6.49% global_avg_ntp_loss: 3.2599 global_avg_mtp_loss: 15.9009 +[titan] 2025-06-13 15:15:41,996 - root - INFO - lr: 7.0559e-05 gnorm: 
1.25 [ 2:34:11< 0:22:38] +[titan] 2025-06-13 15:15:45,373 - root - INFO - step: 13085 loss: 17.4212 memory: 6.46GiB(27.34%) tps: 24,255 tflops: 24.41 mfu: 7.82% global_avg_ntp_loss: 3.0589 global_avg_mtp_loss: 14.3623 +[titan] 2025-06-13 15:15:45,373 - root - INFO - lr: 7.0454e-05 gnorm: 1.24 [ 2:34:14< 0:22:34] +[titan] 2025-06-13 15:15:48,945 - root - INFO - step: 13090 loss: 19.0580 memory: 6.46GiB(27.34%) tps: 22,936 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 3.2985 global_avg_mtp_loss: 15.7595 +[titan] 2025-06-13 15:15:48,946 - root - INFO - lr: 7.0349e-05 gnorm: 1.38 [ 2:34:18< 0:22:30] +[titan] 2025-06-13 15:15:52,512 - root - INFO - step: 13095 loss: 19.2551 memory: 6.46GiB(27.34%) tps: 22,973 tflops: 23.12 mfu: 7.41% global_avg_ntp_loss: 3.2763 global_avg_mtp_loss: 15.9788 +[titan] 2025-06-13 15:15:52,512 - root - INFO - lr: 7.0244e-05 gnorm: 1.41 [ 2:34:22< 0:22:27] +[titan] 2025-06-13 15:15:55,251 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:15:56,096 - root - INFO - step: 13100 loss: 18.9637 memory: 6.46GiB(27.34%) tps: 22,860 tflops: 23.01 mfu: 7.37% global_avg_ntp_loss: 3.2514 global_avg_mtp_loss: 15.7123 +[titan] 2025-06-13 15:15:56,096 - root - INFO - lr: 7.0140e-05 gnorm: 1.38 [ 2:34:25< 0:22:23] +[titan] 2025-06-13 15:15:59,395 - root - INFO - step: 13105 loss: 19.3202 memory: 6.46GiB(27.34%) tps: 24,831 tflops: 24.99 mfu: 8.01% global_avg_ntp_loss: 3.3753 global_avg_mtp_loss: 15.9449 +[titan] 2025-06-13 15:15:59,396 - root - INFO - lr: 7.0035e-05 gnorm: 1.27 [ 2:34:28< 0:22:20] +[titan] 2025-06-13 15:16:02,849 - root - INFO - step: 13110 loss: 19.8843 memory: 6.46GiB(27.34%) tps: 23,722 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.3865 global_avg_mtp_loss: 16.4978 +[titan] 2025-06-13 15:16:02,849 - root - INFO - lr: 6.9931e-05 gnorm: 1.24 [ 2:34:32< 0:22:16] +[titan] 2025-06-13 15:16:06,423 - root - INFO - step: 13115 loss: 19.7386 memory: 6.46GiB(27.34%) tps: 22,921 tflops: 23.07 mfu: 7.39% 
global_avg_ntp_loss: 3.3649 global_avg_mtp_loss: 16.3738 +[titan] 2025-06-13 15:16:06,424 - root - INFO - lr: 6.9828e-05 gnorm: 1.20 [ 2:34:35< 0:22:13] +[titan] 2025-06-13 15:16:09,732 - root - INFO - step: 13120 loss: 17.0807 memory: 6.46GiB(27.34%) tps: 24,764 tflops: 24.92 mfu: 7.99% global_avg_ntp_loss: 2.9017 global_avg_mtp_loss: 14.1790 +[titan] 2025-06-13 15:16:09,732 - root - INFO - lr: 6.9724e-05 gnorm: 1.57 [ 2:34:39< 0:22:09] +[titan] 2025-06-13 15:16:13,119 - root - INFO - step: 13125 loss: 18.1531 memory: 6.46GiB(27.34%) tps: 24,189 tflops: 24.34 mfu: 7.80% global_avg_ntp_loss: 3.0961 global_avg_mtp_loss: 15.0569 +[titan] 2025-06-13 15:16:13,119 - root - INFO - lr: 6.9621e-05 gnorm: 1.53 [ 2:34:42< 0:22:06] +[titan] 2025-06-13 15:16:16,503 - root - INFO - step: 13130 loss: 18.8170 memory: 6.46GiB(27.34%) tps: 24,214 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 3.2806 global_avg_mtp_loss: 15.5364 +[titan] 2025-06-13 15:16:16,503 - root - INFO - lr: 6.9518e-05 gnorm: 1.24 [ 2:34:46< 0:22:02] +[titan] 2025-06-13 15:16:19,806 - root - INFO - step: 13135 loss: 19.2874 memory: 6.46GiB(27.34%) tps: 24,799 tflops: 24.96 mfu: 8.00% global_avg_ntp_loss: 3.2179 global_avg_mtp_loss: 16.0695 +[titan] 2025-06-13 15:16:19,807 - root - INFO - lr: 6.9415e-05 gnorm: 1.49 [ 2:34:49< 0:21:58] +[titan] 2025-06-13 15:16:22,961 - root - INFO - step: 13140 loss: 14.8394 memory: 6.46GiB(27.34%) tps: 25,977 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 2.5737 global_avg_mtp_loss: 12.2657 +[titan] 2025-06-13 15:16:22,961 - root - INFO - lr: 6.9313e-05 gnorm: 1.69 [ 2:34:52< 0:21:55] +[titan] 2025-06-13 15:16:26,325 - root - INFO - step: 13145 loss: 17.1733 memory: 6.46GiB(27.34%) tps: 24,353 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 2.8989 global_avg_mtp_loss: 14.2744 +[titan] 2025-06-13 15:16:26,325 - root - INFO - lr: 6.9211e-05 gnorm: 1.45 [ 2:34:55< 0:21:51] +[titan] 2025-06-13 15:16:29,097 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:16:29,773 - root - INFO - step: 13150 loss: 19.3101 memory: 6.46GiB(27.34%) tps: 23,758 tflops: 23.91 mfu: 7.66% global_avg_ntp_loss: 3.3090 global_avg_mtp_loss: 16.0011 +[titan] 2025-06-13 15:16:29,774 - root - INFO - lr: 6.9109e-05 gnorm: 1.21 [ 2:34:59< 0:21:48] +[titan] 2025-06-13 15:16:33,506 - root - INFO - step: 13155 loss: 16.9743 memory: 6.46GiB(27.34%) tps: 21,951 tflops: 22.09 mfu: 7.08% global_avg_ntp_loss: 2.8943 global_avg_mtp_loss: 14.0801 +[titan] 2025-06-13 15:16:33,506 - root - INFO - lr: 6.9007e-05 gnorm: 1.27 [ 2:35:03< 0:21:44] +[titan] 2025-06-13 15:16:36,874 - root - INFO - step: 13160 loss: 19.5046 memory: 6.46GiB(27.34%) tps: 24,322 tflops: 24.48 mfu: 7.85% global_avg_ntp_loss: 3.3266 global_avg_mtp_loss: 16.1780 +[titan] 2025-06-13 15:16:36,874 - root - INFO - lr: 6.8906e-05 gnorm: 1.18 [ 2:35:06< 0:21:41] +[titan] 2025-06-13 15:16:40,372 - root - INFO - step: 13165 loss: 20.4301 memory: 6.46GiB(27.34%) tps: 23,422 tflops: 23.57 mfu: 7.55% global_avg_ntp_loss: 3.4834 global_avg_mtp_loss: 16.9467 +[titan] 2025-06-13 15:16:40,372 - root - INFO - lr: 6.8805e-05 gnorm: 1.21 [ 2:35:09< 0:21:37] +[titan] 2025-06-13 15:16:43,547 - root - INFO - step: 13170 loss: 17.2013 memory: 6.46GiB(27.34%) tps: 25,808 tflops: 25.97 mfu: 8.32% global_avg_ntp_loss: 2.9202 global_avg_mtp_loss: 14.2811 +[titan] 2025-06-13 15:16:43,547 - root - INFO - lr: 6.8704e-05 gnorm: 1.38 [ 2:35:13< 0:21:34] +[titan] 2025-06-13 15:16:46,777 - root - INFO - step: 13175 loss: 18.1247 memory: 6.46GiB(27.34%) tps: 25,361 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.0745 global_avg_mtp_loss: 15.0502 +[titan] 2025-06-13 15:16:46,778 - root - INFO - lr: 6.8603e-05 gnorm: 1.19 [ 2:35:16< 0:21:30] +[titan] 2025-06-13 15:16:50,450 - root - INFO - step: 13180 loss: 19.7204 memory: 6.46GiB(27.34%) tps: 22,308 tflops: 22.45 mfu: 7.20% global_avg_ntp_loss: 3.3665 global_avg_mtp_loss: 16.3539 +[titan] 2025-06-13 15:16:50,450 - root - INFO - lr: 6.8503e-05 gnorm: 
1.25 [ 2:35:19< 0:21:26] +[titan] 2025-06-13 15:16:53,814 - root - INFO - step: 13185 loss: 19.2460 memory: 6.46GiB(27.34%) tps: 24,358 tflops: 24.51 mfu: 7.86% global_avg_ntp_loss: 3.2916 global_avg_mtp_loss: 15.9544 +[titan] 2025-06-13 15:16:53,814 - root - INFO - lr: 6.8402e-05 gnorm: 1.26 [ 2:35:23< 0:21:23] +[titan] 2025-06-13 15:16:57,434 - root - INFO - step: 13190 loss: 19.9652 memory: 6.46GiB(27.34%) tps: 22,633 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.4689 global_avg_mtp_loss: 16.4962 +[titan] 2025-06-13 15:16:57,434 - root - INFO - lr: 6.8303e-05 gnorm: 1.33 [ 2:35:26< 0:21:19] +[titan] 2025-06-13 15:17:00,970 - root - INFO - step: 13195 loss: 19.5560 memory: 6.46GiB(27.34%) tps: 23,170 tflops: 23.32 mfu: 7.47% global_avg_ntp_loss: 3.3310 global_avg_mtp_loss: 16.2250 +[titan] 2025-06-13 15:17:00,970 - root - INFO - lr: 6.8203e-05 gnorm: 1.29 [ 2:35:30< 0:21:16] +[titan] 2025-06-13 15:17:03,581 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:17:04,316 - root - INFO - step: 13200 loss: 19.1592 memory: 6.46GiB(27.34%) tps: 24,485 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 3.2392 global_avg_mtp_loss: 15.9200 +[titan] 2025-06-13 15:17:04,316 - root - INFO - lr: 6.8104e-05 gnorm: 1.33 [ 2:35:33< 0:21:12] +[titan] 2025-06-13 15:17:07,656 - root - INFO - step: 13205 loss: 18.7184 memory: 6.46GiB(27.34%) tps: 24,527 tflops: 24.68 mfu: 7.91% global_avg_ntp_loss: 3.1582 global_avg_mtp_loss: 15.5602 +[titan] 2025-06-13 15:17:07,657 - root - INFO - lr: 6.8005e-05 gnorm: 1.48 [ 2:35:37< 0:21:09] +[titan] 2025-06-13 15:17:11,128 - root - INFO - step: 13210 loss: 16.9827 memory: 6.46GiB(27.34%) tps: 23,599 tflops: 23.75 mfu: 7.61% global_avg_ntp_loss: 2.9106 global_avg_mtp_loss: 14.0721 +[titan] 2025-06-13 15:17:11,128 - root - INFO - lr: 6.7906e-05 gnorm: 1.33 [ 2:35:40< 0:21:05] +[titan] 2025-06-13 15:17:14,559 - root - INFO - step: 13215 loss: 17.2214 memory: 6.46GiB(27.34%) tps: 23,879 tflops: 24.03 mfu: 7.70% 
global_avg_ntp_loss: 2.9071 global_avg_mtp_loss: 14.3144 +[titan] 2025-06-13 15:17:14,559 - root - INFO - lr: 6.7807e-05 gnorm: 1.35 [ 2:35:44< 0:21:02] +[titan] 2025-06-13 15:17:18,057 - root - INFO - step: 13220 loss: 18.0766 memory: 6.46GiB(27.34%) tps: 23,420 tflops: 23.57 mfu: 7.55% global_avg_ntp_loss: 3.1005 global_avg_mtp_loss: 14.9761 +[titan] 2025-06-13 15:17:18,058 - root - INFO - lr: 6.7709e-05 gnorm: 1.50 [ 2:35:47< 0:20:58] +[titan] 2025-06-13 15:17:21,288 - root - INFO - step: 13225 loss: 19.8438 memory: 6.46GiB(27.34%) tps: 25,359 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.4020 global_avg_mtp_loss: 16.4418 +[titan] 2025-06-13 15:17:21,289 - root - INFO - lr: 6.7611e-05 gnorm: 1.45 [ 2:35:50< 0:20:55] +[titan] 2025-06-13 15:17:24,797 - root - INFO - step: 13230 loss: 20.0295 memory: 6.46GiB(27.34%) tps: 23,350 tflops: 23.50 mfu: 7.53% global_avg_ntp_loss: 3.4198 global_avg_mtp_loss: 16.6097 +[titan] 2025-06-13 15:17:24,797 - root - INFO - lr: 6.7513e-05 gnorm: 1.18 [ 2:35:54< 0:20:51] +[titan] 2025-06-13 15:17:28,147 - root - INFO - step: 13235 loss: 18.8147 memory: 6.46GiB(27.34%) tps: 24,454 tflops: 24.61 mfu: 7.89% global_avg_ntp_loss: 3.2221 global_avg_mtp_loss: 15.5927 +[titan] 2025-06-13 15:17:28,148 - root - INFO - lr: 6.7416e-05 gnorm: 1.27 [ 2:35:57< 0:20:47] +[titan] 2025-06-13 15:17:31,566 - root - INFO - step: 13240 loss: 18.5176 memory: 6.46GiB(27.34%) tps: 23,963 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.1577 global_avg_mtp_loss: 15.3599 +[titan] 2025-06-13 15:17:31,567 - root - INFO - lr: 6.7318e-05 gnorm: 1.26 [ 2:36:01< 0:20:44] +[titan] 2025-06-13 15:17:35,100 - root - INFO - step: 13245 loss: 18.3702 memory: 6.46GiB(27.34%) tps: 23,183 tflops: 23.33 mfu: 7.48% global_avg_ntp_loss: 3.1203 global_avg_mtp_loss: 15.2499 +[titan] 2025-06-13 15:17:35,101 - root - INFO - lr: 6.7221e-05 gnorm: 1.30 [ 2:36:04< 0:20:40] +[titan] 2025-06-13 15:17:37,701 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:17:38,269 - root - INFO - step: 13250 loss: 20.0219 memory: 6.46GiB(27.34%) tps: 25,855 tflops: 26.02 mfu: 8.34% global_avg_ntp_loss: 3.4450 global_avg_mtp_loss: 16.5769 +[titan] 2025-06-13 15:17:38,270 - root - INFO - lr: 6.7125e-05 gnorm: 1.20 [ 2:36:07< 0:20:37] +[titan] 2025-06-13 15:17:41,893 - root - INFO - step: 13255 loss: 17.9076 memory: 6.46GiB(27.34%) tps: 22,612 tflops: 22.76 mfu: 7.29% global_avg_ntp_loss: 3.0149 global_avg_mtp_loss: 14.8926 +[titan] 2025-06-13 15:17:41,893 - root - INFO - lr: 6.7028e-05 gnorm: 1.75 [ 2:36:11< 0:20:33] +[titan] 2025-06-13 15:17:45,319 - root - INFO - step: 13260 loss: 18.6683 memory: 6.46GiB(27.34%) tps: 23,912 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.1411 global_avg_mtp_loss: 15.5272 +[titan] 2025-06-13 15:17:45,319 - root - INFO - lr: 6.6932e-05 gnorm: 1.25 [ 2:36:14< 0:20:30] +[titan] 2025-06-13 15:17:48,713 - root - INFO - step: 13265 loss: 19.6643 memory: 6.46GiB(27.34%) tps: 24,140 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.4403 global_avg_mtp_loss: 16.2240 +[titan] 2025-06-13 15:17:48,713 - root - INFO - lr: 6.6836e-05 gnorm: 1.33 [ 2:36:18< 0:20:26] +[titan] 2025-06-13 15:17:52,303 - root - INFO - step: 13270 loss: 18.7827 memory: 6.46GiB(27.34%) tps: 22,816 tflops: 22.96 mfu: 7.36% global_avg_ntp_loss: 3.2241 global_avg_mtp_loss: 15.5587 +[titan] 2025-06-13 15:17:52,304 - root - INFO - lr: 6.6740e-05 gnorm: 1.26 [ 2:36:21< 0:20:23] +[titan] 2025-06-13 15:17:55,726 - root - INFO - step: 13275 loss: 19.4954 memory: 6.46GiB(27.34%) tps: 23,936 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.3324 global_avg_mtp_loss: 16.1630 +[titan] 2025-06-13 15:17:55,727 - root - INFO - lr: 6.6645e-05 gnorm: 1.22 [ 2:36:25< 0:20:19] +[titan] 2025-06-13 15:18:00,583 - root - INFO - step: 13280 loss: 20.0979 memory: 6.46GiB(27.34%) tps: 16,870 tflops: 16.98 mfu: 5.44% global_avg_ntp_loss: 3.4470 global_avg_mtp_loss: 16.6509 +[titan] 2025-06-13 15:18:00,583 - root - INFO - lr: 6.6550e-05 gnorm: 
1.32 [ 2:36:30< 0:20:16] +[titan] 2025-06-13 15:18:03,856 - root - INFO - step: 13285 loss: 20.2352 memory: 6.46GiB(27.34%) tps: 25,030 tflops: 25.19 mfu: 8.07% global_avg_ntp_loss: 3.5145 global_avg_mtp_loss: 16.7208 +[titan] 2025-06-13 15:18:03,856 - root - INFO - lr: 6.6455e-05 gnorm: 1.44 [ 2:36:33< 0:20:12] +[titan] 2025-06-13 15:18:06,920 - root - INFO - step: 13290 loss: 20.0964 memory: 6.46GiB(27.34%) tps: 26,742 tflops: 26.91 mfu: 8.63% global_avg_ntp_loss: 3.5189 global_avg_mtp_loss: 16.5775 +[titan] 2025-06-13 15:18:06,920 - root - INFO - lr: 6.6360e-05 gnorm: 1.28 [ 2:36:36< 0:20:09] +[titan] 2025-06-13 15:18:10,392 - root - INFO - step: 13295 loss: 18.8117 memory: 6.46GiB(27.34%) tps: 23,598 tflops: 23.75 mfu: 7.61% global_avg_ntp_loss: 3.2376 global_avg_mtp_loss: 15.5740 +[titan] 2025-06-13 15:18:10,392 - root - INFO - lr: 6.6266e-05 gnorm: 1.53 [ 2:36:39< 0:20:05] +[titan] 2025-06-13 15:18:13,131 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:18:13,805 - root - INFO - step: 13300 loss: 18.8067 memory: 6.46GiB(27.34%) tps: 24,001 tflops: 24.15 mfu: 7.74% global_avg_ntp_loss: 3.2389 global_avg_mtp_loss: 15.5678 +[titan] 2025-06-13 15:18:13,806 - root - INFO - lr: 6.6172e-05 gnorm: 1.36 [ 2:36:43< 0:20:01] +[titan] 2025-06-13 15:18:17,145 - root - INFO - step: 13305 loss: 19.4642 memory: 6.46GiB(27.34%) tps: 24,534 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.3339 global_avg_mtp_loss: 16.1303 +[titan] 2025-06-13 15:18:17,145 - root - INFO - lr: 6.6078e-05 gnorm: 1.34 [ 2:36:46< 0:19:58] +[titan] 2025-06-13 15:18:20,582 - root - INFO - step: 13310 loss: 18.7764 memory: 6.46GiB(27.34%) tps: 23,834 tflops: 23.99 mfu: 7.69% global_avg_ntp_loss: 3.1829 global_avg_mtp_loss: 15.5935 +[titan] 2025-06-13 15:18:20,583 - root - INFO - lr: 6.5984e-05 gnorm: 1.29 [ 2:36:50< 0:19:54] +[titan] 2025-06-13 15:18:21,962 - root - INFO - Dumping profiler traces at step 13312 +[titan] 2025-06-13 15:18:22,060 - root - INFO - 
Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 15:18:23,856 - root - INFO - step: 13315 loss: 19.2910 memory: 6.46GiB(27.34%) tps: 25,028 tflops: 25.19 mfu: 8.07% global_avg_ntp_loss: 3.2940 global_avg_mtp_loss: 15.9970 +[titan] 2025-06-13 15:18:23,856 - root - INFO - lr: 6.5891e-05 gnorm: 1.44 [ 2:36:53< 0:19:51] +[titan] 2025-06-13 15:18:26,924 - root - INFO - step: 13320 loss: 19.9833 memory: 6.46GiB(27.34%) tps: 26,706 tflops: 26.88 mfu: 8.61% global_avg_ntp_loss: 3.4200 global_avg_mtp_loss: 16.5634 +[titan] 2025-06-13 15:18:26,924 - root - INFO - lr: 6.5798e-05 gnorm: 1.39 [ 2:36:56< 0:19:47] +[titan] 2025-06-13 15:18:30,094 - root - INFO - step: 13325 loss: 16.0494 memory: 6.46GiB(27.34%) tps: 25,842 tflops: 26.01 mfu: 8.34% global_avg_ntp_loss: 2.7461 global_avg_mtp_loss: 13.3032 +[titan] 2025-06-13 15:18:30,095 - root - INFO - lr: 6.5705e-05 gnorm: 1.49 [ 2:36:59< 0:19:44] +[titan] 2025-06-13 15:18:33,304 - root - INFO - step: 13330 loss: 19.0522 memory: 6.46GiB(27.34%) tps: 25,528 tflops: 25.69 mfu: 8.23% global_avg_ntp_loss: 3.2670 global_avg_mtp_loss: 15.7852 +[titan] 2025-06-13 15:18:33,304 - root - INFO - lr: 6.5613e-05 gnorm: 1.28 [ 2:37:02< 0:19:40] +[titan] 2025-06-13 15:18:37,126 - root - INFO - step: 13335 loss: 18.0958 memory: 6.46GiB(27.34%) tps: 21,437 tflops: 21.57 mfu: 6.91% global_avg_ntp_loss: 3.1131 global_avg_mtp_loss: 14.9827 +[titan] 2025-06-13 15:18:37,126 - root - INFO - lr: 6.5520e-05 gnorm: 1.41 [ 2:37:06< 0:19:37] +[titan] 2025-06-13 15:18:40,272 - root - INFO - step: 13340 loss: 18.6005 memory: 6.46GiB(27.34%) tps: 26,043 tflops: 26.21 mfu: 8.40% global_avg_ntp_loss: 3.1847 global_avg_mtp_loss: 15.4158 +[titan] 2025-06-13 15:18:40,272 - root - INFO - lr: 6.5428e-05 gnorm: 1.31 [ 2:37:09< 0:19:33] +[titan] 2025-06-13 15:18:44,205 - root - INFO - step: 13345 loss: 18.1396 memory: 6.46GiB(27.34%) tps: 20,830 tflops: 20.96 mfu: 6.72% global_avg_ntp_loss: 3.1031 global_avg_mtp_loss: 15.0365 +[titan] 2025-06-13 
15:18:44,205 - root - INFO - lr: 6.5337e-05 gnorm: 1.58 [ 2:37:13< 0:19:29] +[titan] 2025-06-13 15:18:46,732 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:18:47,381 - root - INFO - step: 13350 loss: 18.2938 memory: 6.46GiB(27.34%) tps: 25,795 tflops: 25.96 mfu: 8.32% global_avg_ntp_loss: 3.1085 global_avg_mtp_loss: 15.1853 +[titan] 2025-06-13 15:18:47,381 - root - INFO - lr: 6.5245e-05 gnorm: 1.19 [ 2:37:16< 0:19:26] +[titan] 2025-06-13 15:18:50,689 - root - INFO - step: 13355 loss: 18.2548 memory: 6.46GiB(27.34%) tps: 24,766 tflops: 24.92 mfu: 7.99% global_avg_ntp_loss: 3.1119 global_avg_mtp_loss: 15.1428 +[titan] 2025-06-13 15:18:50,689 - root - INFO - lr: 6.5154e-05 gnorm: 1.25 [ 2:37:20< 0:19:22] +[titan] 2025-06-13 15:18:54,280 - root - INFO - step: 13360 loss: 18.9088 memory: 6.46GiB(27.34%) tps: 22,815 tflops: 22.96 mfu: 7.36% global_avg_ntp_loss: 3.2535 global_avg_mtp_loss: 15.6553 +[titan] 2025-06-13 15:18:54,281 - root - INFO - lr: 6.5063e-05 gnorm: 1.36 [ 2:37:23< 0:19:19] +[titan] 2025-06-13 15:18:57,452 - root - INFO - step: 13365 loss: 17.9939 memory: 6.46GiB(27.34%) tps: 25,833 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.0482 global_avg_mtp_loss: 14.9457 +[titan] 2025-06-13 15:18:57,452 - root - INFO - lr: 6.4972e-05 gnorm: 1.32 [ 2:37:26< 0:19:15] +[titan] 2025-06-13 15:19:00,556 - root - INFO - step: 13370 loss: 15.6550 memory: 6.46GiB(27.34%) tps: 26,393 tflops: 26.56 mfu: 8.51% global_avg_ntp_loss: 2.6169 global_avg_mtp_loss: 13.0381 +[titan] 2025-06-13 15:19:00,556 - root - INFO - lr: 6.4882e-05 gnorm: 1.71 [ 2:37:30< 0:19:12] +[titan] 2025-06-13 15:19:04,299 - root - INFO - step: 13375 loss: 17.9433 memory: 6.46GiB(27.34%) tps: 21,888 tflops: 22.03 mfu: 7.06% global_avg_ntp_loss: 3.1597 global_avg_mtp_loss: 14.7836 +[titan] 2025-06-13 15:19:04,299 - root - INFO - lr: 6.4792e-05 gnorm: 1.39 [ 2:37:33< 0:19:08] +[titan] 2025-06-13 15:19:07,657 - root - INFO - step: 13380 loss: 19.6268 memory: 
6.46GiB(27.34%) tps: 24,400 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 3.3820 global_avg_mtp_loss: 16.2448 +[titan] 2025-06-13 15:19:07,657 - root - INFO - lr: 6.4702e-05 gnorm: 1.30 [ 2:37:37< 0:19:05] +[titan] 2025-06-13 15:19:10,926 - root - INFO - step: 13385 loss: 18.9846 memory: 6.46GiB(27.34%) tps: 25,063 tflops: 25.22 mfu: 8.08% global_avg_ntp_loss: 3.2390 global_avg_mtp_loss: 15.7457 +[titan] 2025-06-13 15:19:10,926 - root - INFO - lr: 6.4612e-05 gnorm: 1.28 [ 2:37:40< 0:19:01] +[titan] 2025-06-13 15:19:14,374 - root - INFO - step: 13390 loss: 19.1947 memory: 6.46GiB(27.34%) tps: 23,760 tflops: 23.91 mfu: 7.66% global_avg_ntp_loss: 3.3259 global_avg_mtp_loss: 15.8688 +[titan] 2025-06-13 15:19:14,374 - root - INFO - lr: 6.4523e-05 gnorm: 1.29 [ 2:37:43< 0:18:57] +[titan] 2025-06-13 15:19:18,225 - root - INFO - step: 13395 loss: 19.4682 memory: 6.46GiB(27.34%) tps: 21,278 tflops: 21.41 mfu: 6.86% global_avg_ntp_loss: 3.3112 global_avg_mtp_loss: 16.1571 +[titan] 2025-06-13 15:19:18,225 - root - INFO - lr: 6.4434e-05 gnorm: 1.32 [ 2:37:47< 0:18:54] +[titan] 2025-06-13 15:19:21,230 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:19:21,751 - root - INFO - step: 13400 loss: 18.3574 memory: 6.46GiB(27.34%) tps: 23,231 tflops: 23.38 mfu: 7.49% global_avg_ntp_loss: 3.1744 global_avg_mtp_loss: 15.1831 +[titan] 2025-06-13 15:19:21,751 - root - INFO - lr: 6.4345e-05 gnorm: 1.33 [ 2:37:51< 0:18:50] +[titan] 2025-06-13 15:19:24,921 - root - INFO - step: 13405 loss: 17.0643 memory: 6.46GiB(27.34%) tps: 25,849 tflops: 26.01 mfu: 8.34% global_avg_ntp_loss: 2.8842 global_avg_mtp_loss: 14.1800 +[titan] 2025-06-13 15:19:24,921 - root - INFO - lr: 6.4257e-05 gnorm: 1.77 [ 2:37:54< 0:18:47] +[titan] 2025-06-13 15:19:27,980 - root - INFO - step: 13410 loss: 20.0426 memory: 6.46GiB(27.34%) tps: 26,785 tflops: 26.96 mfu: 8.64% global_avg_ntp_loss: 3.4623 global_avg_mtp_loss: 16.5803 +[titan] 2025-06-13 15:19:27,980 - root - INFO - lr: 6.4168e-05 gnorm: 1.73 [ 2:37:57< 0:18:43] +[titan] 2025-06-13 15:19:32,084 - root - INFO - step: 13415 loss: 19.4497 memory: 6.46GiB(27.34%) tps: 19,962 tflops: 20.09 mfu: 6.44% global_avg_ntp_loss: 3.3500 global_avg_mtp_loss: 16.0997 +[titan] 2025-06-13 15:19:32,084 - root - INFO - lr: 6.4080e-05 gnorm: 1.24 [ 2:38:01< 0:18:40] +[titan] 2025-06-13 15:19:35,851 - root - INFO - step: 13420 loss: 18.6406 memory: 6.46GiB(27.34%) tps: 21,747 tflops: 21.89 mfu: 7.01% global_avg_ntp_loss: 3.2341 global_avg_mtp_loss: 15.4065 +[titan] 2025-06-13 15:19:35,852 - root - INFO - lr: 6.3992e-05 gnorm: 1.33 [ 2:38:05< 0:18:36] +[titan] 2025-06-13 15:19:39,095 - root - INFO - step: 13425 loss: 18.2476 memory: 6.46GiB(27.34%) tps: 25,262 tflops: 25.42 mfu: 8.15% global_avg_ntp_loss: 3.0746 global_avg_mtp_loss: 15.1730 +[titan] 2025-06-13 15:19:39,095 - root - INFO - lr: 6.3905e-05 gnorm: 1.35 [ 2:38:08< 0:18:33] +[titan] 2025-06-13 15:19:42,450 - root - INFO - step: 13430 loss: 19.3794 memory: 6.46GiB(27.34%) tps: 24,419 tflops: 24.57 mfu: 7.88% global_avg_ntp_loss: 3.2905 global_avg_mtp_loss: 16.0889 +[titan] 2025-06-13 15:19:42,450 - root - INFO - lr: 6.3818e-05 gnorm: 
1.30 [ 2:38:11< 0:18:29] +[titan] 2025-06-13 15:19:45,866 - root - INFO - step: 13435 loss: 20.1162 memory: 6.46GiB(27.34%) tps: 23,985 tflops: 24.14 mfu: 7.74% global_avg_ntp_loss: 3.4283 global_avg_mtp_loss: 16.6879 +[titan] 2025-06-13 15:19:45,866 - root - INFO - lr: 6.3731e-05 gnorm: 1.19 [ 2:38:15< 0:18:26] +[titan] 2025-06-13 15:19:49,356 - root - INFO - step: 13440 loss: 19.9560 memory: 6.46GiB(27.34%) tps: 23,475 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 3.4304 global_avg_mtp_loss: 16.5257 +[titan] 2025-06-13 15:19:49,356 - root - INFO - lr: 6.3644e-05 gnorm: 1.35 [ 2:38:18< 0:18:22] +[titan] 2025-06-13 15:19:52,687 - root - INFO - step: 13445 loss: 18.1377 memory: 6.46GiB(27.34%) tps: 24,597 tflops: 24.75 mfu: 7.93% global_avg_ntp_loss: 3.1001 global_avg_mtp_loss: 15.0375 +[titan] 2025-06-13 15:19:52,687 - root - INFO - lr: 6.3558e-05 gnorm: 1.30 [ 2:38:22< 0:18:18] +[titan] 2025-06-13 15:19:55,716 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:19:56,423 - root - INFO - step: 13450 loss: 19.2931 memory: 6.46GiB(27.34%) tps: 21,930 tflops: 22.07 mfu: 7.07% global_avg_ntp_loss: 3.2983 global_avg_mtp_loss: 15.9948 +[titan] 2025-06-13 15:19:56,423 - root - INFO - lr: 6.3471e-05 gnorm: 2.14 [ 2:38:25< 0:18:15] +[titan] 2025-06-13 15:19:59,845 - root - INFO - step: 13455 loss: 16.6534 memory: 6.46GiB(27.34%) tps: 23,937 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 2.8198 global_avg_mtp_loss: 13.8336 +[titan] 2025-06-13 15:19:59,846 - root - INFO - lr: 6.3386e-05 gnorm: 1.49 [ 2:38:29< 0:18:11] +[titan] 2025-06-13 15:20:03,346 - root - INFO - step: 13460 loss: 19.1632 memory: 6.46GiB(27.34%) tps: 23,407 tflops: 23.56 mfu: 7.55% global_avg_ntp_loss: 3.2587 global_avg_mtp_loss: 15.9045 +[titan] 2025-06-13 15:20:03,346 - root - INFO - lr: 6.3300e-05 gnorm: 1.39 [ 2:38:32< 0:18:08] +[titan] 2025-06-13 15:20:06,721 - root - INFO - step: 13465 loss: 19.6698 memory: 6.46GiB(27.34%) tps: 24,275 tflops: 24.43 mfu: 7.83% 
global_avg_ntp_loss: 3.3770 global_avg_mtp_loss: 16.2928 +[titan] 2025-06-13 15:20:06,721 - root - INFO - lr: 6.3215e-05 gnorm: 1.30 [ 2:38:36< 0:18:04] +[titan] 2025-06-13 15:20:10,483 - root - INFO - step: 13470 loss: 18.0839 memory: 6.46GiB(27.34%) tps: 21,777 tflops: 21.92 mfu: 7.02% global_avg_ntp_loss: 3.0771 global_avg_mtp_loss: 15.0068 +[titan] 2025-06-13 15:20:10,483 - root - INFO - lr: 6.3129e-05 gnorm: 1.34 [ 2:38:39< 0:18:01] +[titan] 2025-06-13 15:20:14,025 - root - INFO - step: 13475 loss: 18.2218 memory: 6.46GiB(27.34%) tps: 23,131 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.1252 global_avg_mtp_loss: 15.0967 +[titan] 2025-06-13 15:20:14,025 - root - INFO - lr: 6.3045e-05 gnorm: 1.24 [ 2:38:43< 0:17:57] +[titan] 2025-06-13 15:20:17,632 - root - INFO - step: 13480 loss: 19.1661 memory: 6.46GiB(27.34%) tps: 22,713 tflops: 22.86 mfu: 7.33% global_avg_ntp_loss: 3.3205 global_avg_mtp_loss: 15.8457 +[titan] 2025-06-13 15:20:17,632 - root - INFO - lr: 6.2960e-05 gnorm: 1.29 [ 2:38:47< 0:17:54] +[titan] 2025-06-13 15:20:20,859 - root - INFO - step: 13485 loss: 19.7027 memory: 6.46GiB(27.34%) tps: 25,387 tflops: 25.55 mfu: 8.19% global_avg_ntp_loss: 3.3989 global_avg_mtp_loss: 16.3038 +[titan] 2025-06-13 15:20:20,859 - root - INFO - lr: 6.2876e-05 gnorm: 1.25 [ 2:38:50< 0:17:50] +[titan] 2025-06-13 15:20:24,265 - root - INFO - step: 13490 loss: 16.8835 memory: 6.46GiB(27.34%) tps: 24,056 tflops: 24.21 mfu: 7.76% global_avg_ntp_loss: 2.8580 global_avg_mtp_loss: 14.0255 +[titan] 2025-06-13 15:20:24,265 - root - INFO - lr: 6.2792e-05 gnorm: 1.77 [ 2:38:53< 0:17:47] +[titan] 2025-06-13 15:20:27,583 - root - INFO - step: 13495 loss: 19.7697 memory: 6.46GiB(27.34%) tps: 24,694 tflops: 24.85 mfu: 7.97% global_avg_ntp_loss: 3.4079 global_avg_mtp_loss: 16.3619 +[titan] 2025-06-13 15:20:27,583 - root - INFO - lr: 6.2708e-05 gnorm: 1.23 [ 2:38:57< 0:17:43] +[titan] 2025-06-13 15:20:30,266 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:20:31,035 - root - INFO - step: 13500 loss: 18.7183 memory: 6.46GiB(27.34%) tps: 23,731 tflops: 23.88 mfu: 7.65% global_avg_ntp_loss: 3.2095 global_avg_mtp_loss: 15.5088 +[titan] 2025-06-13 15:20:31,036 - root - INFO - lr: 6.2624e-05 gnorm: 1.32 [ 2:39:00< 0:17:40] +[titan] 2025-06-13 15:20:34,689 - root - INFO - step: 13505 loss: 19.5013 memory: 6.46GiB(27.34%) tps: 22,425 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 3.3163 global_avg_mtp_loss: 16.1850 +[titan] 2025-06-13 15:20:34,689 - root - INFO - lr: 6.2541e-05 gnorm: 1.32 [ 2:39:04< 0:17:36] +[titan] 2025-06-13 15:20:38,097 - root - INFO - step: 13510 loss: 18.8683 memory: 6.46GiB(27.34%) tps: 24,042 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.2466 global_avg_mtp_loss: 15.6218 +[titan] 2025-06-13 15:20:38,097 - root - INFO - lr: 6.2458e-05 gnorm: 1.29 [ 2:39:07< 0:17:32] +[titan] 2025-06-13 15:20:41,757 - root - INFO - step: 13515 loss: 18.9285 memory: 6.46GiB(27.34%) tps: 22,384 tflops: 22.53 mfu: 7.22% global_avg_ntp_loss: 3.2402 global_avg_mtp_loss: 15.6883 +[titan] 2025-06-13 15:20:41,757 - root - INFO - lr: 6.2376e-05 gnorm: 1.26 [ 2:39:11< 0:17:29] +[titan] 2025-06-13 15:20:45,141 - root - INFO - step: 13520 loss: 19.0448 memory: 6.46GiB(27.34%) tps: 24,208 tflops: 24.36 mfu: 7.81% global_avg_ntp_loss: 3.2475 global_avg_mtp_loss: 15.7973 +[titan] 2025-06-13 15:20:45,142 - root - INFO - lr: 6.2293e-05 gnorm: 1.63 [ 2:39:14< 0:17:25] +[titan] 2025-06-13 15:20:48,303 - root - INFO - step: 13525 loss: 20.0380 memory: 6.46GiB(27.34%) tps: 25,916 tflops: 26.08 mfu: 8.36% global_avg_ntp_loss: 3.4196 global_avg_mtp_loss: 16.6184 +[titan] 2025-06-13 15:20:48,303 - root - INFO - lr: 6.2211e-05 gnorm: 1.20 [ 2:39:17< 0:17:22] +[titan] 2025-06-13 15:20:51,723 - root - INFO - step: 13530 loss: 18.5762 memory: 6.46GiB(27.34%) tps: 23,957 tflops: 24.11 mfu: 7.73% global_avg_ntp_loss: 3.2050 global_avg_mtp_loss: 15.3712 +[titan] 2025-06-13 15:20:51,723 - root - INFO - lr: 6.2129e-05 gnorm: 
1.27 [ 2:39:21< 0:17:18] +[titan] 2025-06-13 15:20:55,412 - root - INFO - step: 13535 loss: 20.1961 memory: 6.46GiB(27.34%) tps: 22,205 tflops: 22.35 mfu: 7.16% global_avg_ntp_loss: 3.4815 global_avg_mtp_loss: 16.7146 +[titan] 2025-06-13 15:20:55,412 - root - INFO - lr: 6.2047e-05 gnorm: 1.25 [ 2:39:24< 0:17:15] +[titan] 2025-06-13 15:20:59,043 - root - INFO - step: 13540 loss: 19.1164 memory: 6.46GiB(27.34%) tps: 22,566 tflops: 22.71 mfu: 7.28% global_avg_ntp_loss: 3.2873 global_avg_mtp_loss: 15.8291 +[titan] 2025-06-13 15:20:59,043 - root - INFO - lr: 6.1966e-05 gnorm: 1.26 [ 2:39:28< 0:17:11] +[titan] 2025-06-13 15:21:02,334 - root - INFO - step: 13545 loss: 19.8790 memory: 6.46GiB(27.34%) tps: 24,896 tflops: 25.05 mfu: 8.03% global_avg_ntp_loss: 3.4109 global_avg_mtp_loss: 16.4681 +[titan] 2025-06-13 15:21:02,334 - root - INFO - lr: 6.1885e-05 gnorm: 1.25 [ 2:39:31< 0:17:08] +[titan] 2025-06-13 15:21:04,886 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:21:05,643 - root - INFO - step: 13550 loss: 18.5991 memory: 6.46GiB(27.34%) tps: 24,757 tflops: 24.92 mfu: 7.99% global_avg_ntp_loss: 3.1862 global_avg_mtp_loss: 15.4129 +[titan] 2025-06-13 15:21:05,643 - root - INFO - lr: 6.1804e-05 gnorm: 1.22 [ 2:39:35< 0:17:04] +[titan] 2025-06-13 15:21:09,075 - root - INFO - step: 13555 loss: 17.8078 memory: 6.46GiB(27.34%) tps: 23,873 tflops: 24.03 mfu: 7.70% global_avg_ntp_loss: 3.0256 global_avg_mtp_loss: 14.7822 +[titan] 2025-06-13 15:21:09,075 - root - INFO - lr: 6.1724e-05 gnorm: 1.48 [ 2:39:38< 0:17:01] +[titan] 2025-06-13 15:21:12,404 - root - INFO - step: 13560 loss: 20.0475 memory: 6.46GiB(27.34%) tps: 24,615 tflops: 24.77 mfu: 7.94% global_avg_ntp_loss: 3.4788 global_avg_mtp_loss: 16.5687 +[titan] 2025-06-13 15:21:12,404 - root - INFO - lr: 6.1643e-05 gnorm: 1.38 [ 2:39:41< 0:16:57] +[titan] 2025-06-13 15:21:16,066 - root - INFO - step: 13565 loss: 19.4972 memory: 6.46GiB(27.34%) tps: 22,369 tflops: 22.51 mfu: 7.22% 
global_avg_ntp_loss: 3.3337 global_avg_mtp_loss: 16.1635 +[titan] 2025-06-13 15:21:16,067 - root - INFO - lr: 6.1563e-05 gnorm: 1.31 [ 2:39:45< 0:16:54] +[titan] 2025-06-13 15:21:19,612 - root - INFO - step: 13570 loss: 18.5822 memory: 6.46GiB(27.34%) tps: 23,104 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 3.1791 global_avg_mtp_loss: 15.4032 +[titan] 2025-06-13 15:21:19,613 - root - INFO - lr: 6.1484e-05 gnorm: 1.21 [ 2:39:49< 0:16:50] +[titan] 2025-06-13 15:21:23,545 - root - INFO - step: 13575 loss: 19.0245 memory: 6.46GiB(27.34%) tps: 20,832 tflops: 20.97 mfu: 6.72% global_avg_ntp_loss: 3.2434 global_avg_mtp_loss: 15.7811 +[titan] 2025-06-13 15:21:23,545 - root - INFO - lr: 6.1404e-05 gnorm: 1.25 [ 2:39:53< 0:16:47] +[titan] 2025-06-13 15:21:26,541 - root - INFO - step: 13580 loss: 18.9527 memory: 6.46GiB(27.34%) tps: 27,347 tflops: 27.52 mfu: 8.82% global_avg_ntp_loss: 3.2115 global_avg_mtp_loss: 15.7412 +[titan] 2025-06-13 15:21:26,541 - root - INFO - lr: 6.1325e-05 gnorm: 1.29 [ 2:39:56< 0:16:43] +[titan] 2025-06-13 15:21:30,102 - root - INFO - step: 13585 loss: 19.5618 memory: 6.46GiB(27.34%) tps: 23,008 tflops: 23.16 mfu: 7.42% global_avg_ntp_loss: 3.3474 global_avg_mtp_loss: 16.2144 +[titan] 2025-06-13 15:21:30,102 - root - INFO - lr: 6.1246e-05 gnorm: 1.21 [ 2:39:59< 0:16:39] +[titan] 2025-06-13 15:21:34,059 - root - INFO - step: 13590 loss: 19.2329 memory: 6.46GiB(27.34%) tps: 20,707 tflops: 20.84 mfu: 6.68% global_avg_ntp_loss: 3.2607 global_avg_mtp_loss: 15.9722 +[titan] 2025-06-13 15:21:34,059 - root - INFO - lr: 6.1167e-05 gnorm: 1.28 [ 2:40:03< 0:16:36] +[titan] 2025-06-13 15:21:37,758 - root - INFO - step: 13595 loss: 18.4207 memory: 6.46GiB(27.34%) tps: 22,144 tflops: 22.29 mfu: 7.14% global_avg_ntp_loss: 3.0766 global_avg_mtp_loss: 15.3441 +[titan] 2025-06-13 15:21:37,759 - root - INFO - lr: 6.1089e-05 gnorm: 1.36 [ 2:40:07< 0:16:32] +[titan] 2025-06-13 15:21:40,518 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:21:41,382 - root - INFO - step: 13600 loss: 19.3703 memory: 6.46GiB(27.34%) tps: 22,609 tflops: 22.75 mfu: 7.29% global_avg_ntp_loss: 3.3153 global_avg_mtp_loss: 16.0549 +[titan] 2025-06-13 15:21:41,382 - root - INFO - lr: 6.1011e-05 gnorm: 1.27 [ 2:40:10< 0:16:29] +[titan] 2025-06-13 15:21:44,635 - root - INFO - step: 13605 loss: 19.5728 memory: 6.46GiB(27.34%) tps: 25,188 tflops: 25.35 mfu: 8.12% global_avg_ntp_loss: 3.3266 global_avg_mtp_loss: 16.2462 +[titan] 2025-06-13 15:21:44,635 - root - INFO - lr: 6.0933e-05 gnorm: 1.24 [ 2:40:14< 0:16:25] +[titan] 2025-06-13 15:21:48,339 - root - INFO - step: 13610 loss: 20.0944 memory: 6.46GiB(27.34%) tps: 22,121 tflops: 22.26 mfu: 7.14% global_avg_ntp_loss: 3.4708 global_avg_mtp_loss: 16.6235 +[titan] 2025-06-13 15:21:48,339 - root - INFO - lr: 6.0855e-05 gnorm: 1.26 [ 2:40:17< 0:16:22] +[titan] 2025-06-13 15:21:51,917 - root - INFO - step: 13615 loss: 19.6573 memory: 6.46GiB(27.34%) tps: 22,899 tflops: 23.04 mfu: 7.39% global_avg_ntp_loss: 3.3235 global_avg_mtp_loss: 16.3338 +[titan] 2025-06-13 15:21:51,917 - root - INFO - lr: 6.0778e-05 gnorm: 1.25 [ 2:40:21< 0:16:18] +[titan] 2025-06-13 15:21:55,479 - root - INFO - step: 13620 loss: 18.5136 memory: 6.46GiB(27.34%) tps: 23,001 tflops: 23.15 mfu: 7.42% global_avg_ntp_loss: 3.1463 global_avg_mtp_loss: 15.3672 +[titan] 2025-06-13 15:21:55,479 - root - INFO - lr: 6.0701e-05 gnorm: 1.40 [ 2:40:24< 0:16:15] +[titan] 2025-06-13 15:21:58,981 - root - INFO - step: 13625 loss: 20.1574 memory: 6.46GiB(27.34%) tps: 23,395 tflops: 23.54 mfu: 7.55% global_avg_ntp_loss: 3.4830 global_avg_mtp_loss: 16.6745 +[titan] 2025-06-13 15:21:58,981 - root - INFO - lr: 6.0624e-05 gnorm: 1.24 [ 2:40:28< 0:16:11] +[titan] 2025-06-13 15:22:02,364 - root - INFO - step: 13630 loss: 17.9604 memory: 6.46GiB(27.34%) tps: 24,215 tflops: 24.37 mfu: 7.81% global_avg_ntp_loss: 3.0766 global_avg_mtp_loss: 14.8838 +[titan] 2025-06-13 15:22:02,365 - root - INFO - lr: 6.0548e-05 gnorm: 
1.33 [ 2:40:31< 0:16:08] +[titan] 2025-06-13 15:22:05,983 - root - INFO - step: 13635 loss: 19.1828 memory: 6.46GiB(27.34%) tps: 22,641 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.2555 global_avg_mtp_loss: 15.9273 +[titan] 2025-06-13 15:22:05,983 - root - INFO - lr: 6.0471e-05 gnorm: 1.24 [ 2:40:35< 0:16:04] +[titan] 2025-06-13 15:22:09,498 - root - INFO - step: 13640 loss: 18.3048 memory: 6.46GiB(27.34%) tps: 23,313 tflops: 23.46 mfu: 7.52% global_avg_ntp_loss: 3.1040 global_avg_mtp_loss: 15.2008 +[titan] 2025-06-13 15:22:09,498 - root - INFO - lr: 6.0395e-05 gnorm: 1.26 [ 2:40:38< 0:16:01] +[titan] 2025-06-13 15:22:13,573 - root - INFO - step: 13645 loss: 19.3579 memory: 6.46GiB(27.34%) tps: 20,105 tflops: 20.23 mfu: 6.48% global_avg_ntp_loss: 3.2458 global_avg_mtp_loss: 16.1121 +[titan] 2025-06-13 15:22:13,573 - root - INFO - lr: 6.0320e-05 gnorm: 1.44 [ 2:40:43< 0:15:57] +[titan] 2025-06-13 15:22:16,545 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:22:17,098 - root - INFO - step: 13650 loss: 18.2360 memory: 6.46GiB(27.34%) tps: 23,237 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.0987 global_avg_mtp_loss: 15.1373 +[titan] 2025-06-13 15:22:17,099 - root - INFO - lr: 6.0244e-05 gnorm: 1.41 [ 2:40:46< 0:15:54] +[titan] 2025-06-13 15:22:20,397 - root - INFO - step: 13655 loss: 18.0980 memory: 6.46GiB(27.34%) tps: 24,835 tflops: 24.99 mfu: 8.01% global_avg_ntp_loss: 3.1023 global_avg_mtp_loss: 14.9957 +[titan] 2025-06-13 15:22:20,398 - root - INFO - lr: 6.0169e-05 gnorm: 1.42 [ 2:40:49< 0:15:50] +[titan] 2025-06-13 15:22:23,584 - root - INFO - step: 13660 loss: 18.1545 memory: 6.46GiB(27.34%) tps: 25,710 tflops: 25.87 mfu: 8.29% global_avg_ntp_loss: 3.1132 global_avg_mtp_loss: 15.0413 +[titan] 2025-06-13 15:22:23,584 - root - INFO - lr: 6.0094e-05 gnorm: 1.43 [ 2:40:53< 0:15:46] +[titan] 2025-06-13 15:22:26,792 - root - INFO - step: 13665 loss: 18.5952 memory: 6.46GiB(27.34%) tps: 25,542 tflops: 25.70 mfu: 8.24% 
global_avg_ntp_loss: 3.2203 global_avg_mtp_loss: 15.3749 +[titan] 2025-06-13 15:22:26,792 - root - INFO - lr: 6.0020e-05 gnorm: 1.39 [ 2:40:56< 0:15:43] +[titan] 2025-06-13 15:22:30,032 - root - INFO - step: 13670 loss: 18.3224 memory: 6.46GiB(27.34%) tps: 25,284 tflops: 25.45 mfu: 8.16% global_avg_ntp_loss: 3.1177 global_avg_mtp_loss: 15.2047 +[titan] 2025-06-13 15:22:30,032 - root - INFO - lr: 5.9945e-05 gnorm: 1.25 [ 2:40:59< 0:15:39] +[titan] 2025-06-13 15:22:33,238 - root - INFO - step: 13675 loss: 15.6034 memory: 6.46GiB(27.34%) tps: 25,555 tflops: 25.72 mfu: 8.24% global_avg_ntp_loss: 2.7056 global_avg_mtp_loss: 12.8978 +[titan] 2025-06-13 15:22:33,239 - root - INFO - lr: 5.9871e-05 gnorm: 1.32 [ 2:41:02< 0:15:36] +[titan] 2025-06-13 15:22:36,682 - root - INFO - step: 13680 loss: 18.4568 memory: 6.46GiB(27.34%) tps: 23,795 tflops: 23.95 mfu: 7.68% global_avg_ntp_loss: 3.1746 global_avg_mtp_loss: 15.2822 +[titan] 2025-06-13 15:22:36,682 - root - INFO - lr: 5.9797e-05 gnorm: 1.36 [ 2:41:06< 0:15:32] +[titan] 2025-06-13 15:22:40,199 - root - INFO - step: 13685 loss: 18.8090 memory: 6.46GiB(27.34%) tps: 23,294 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.1995 global_avg_mtp_loss: 15.6095 +[titan] 2025-06-13 15:22:40,199 - root - INFO - lr: 5.9724e-05 gnorm: 1.35 [ 2:41:09< 0:15:29] +[titan] 2025-06-13 15:22:44,029 - root - INFO - step: 13690 loss: 16.6716 memory: 6.46GiB(27.34%) tps: 21,394 tflops: 21.53 mfu: 6.90% global_avg_ntp_loss: 2.8698 global_avg_mtp_loss: 13.8018 +[titan] 2025-06-13 15:22:44,029 - root - INFO - lr: 5.9650e-05 gnorm: 1.36 [ 2:41:13< 0:15:25] +[titan] 2025-06-13 15:22:47,368 - root - INFO - step: 13695 loss: 19.9484 memory: 6.46GiB(27.34%) tps: 24,533 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.4388 global_avg_mtp_loss: 16.5095 +[titan] 2025-06-13 15:22:47,369 - root - INFO - lr: 5.9577e-05 gnorm: 1.33 [ 2:41:16< 0:15:22] +[titan] 2025-06-13 15:22:50,131 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:22:50,764 - root - INFO - step: 13700 loss: 19.1239 memory: 6.46GiB(27.34%) tps: 24,128 tflops: 24.28 mfu: 7.78% global_avg_ntp_loss: 3.2745 global_avg_mtp_loss: 15.8494 +[titan] 2025-06-13 15:22:50,765 - root - INFO - lr: 5.9505e-05 gnorm: 1.26 [ 2:41:20< 0:15:18] +[titan] 2025-06-13 15:22:54,394 - root - INFO - step: 13705 loss: 18.3846 memory: 6.46GiB(27.34%) tps: 22,573 tflops: 22.72 mfu: 7.28% global_avg_ntp_loss: 3.1123 global_avg_mtp_loss: 15.2723 +[titan] 2025-06-13 15:22:54,394 - root - INFO - lr: 5.9432e-05 gnorm: 1.27 [ 2:41:23< 0:15:15] +[titan] 2025-06-13 15:22:57,787 - root - INFO - step: 13710 loss: 19.7508 memory: 6.46GiB(27.34%) tps: 24,144 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.4571 global_avg_mtp_loss: 16.2937 +[titan] 2025-06-13 15:22:57,788 - root - INFO - lr: 5.9360e-05 gnorm: 1.42 [ 2:41:27< 0:15:11] +[titan] 2025-06-13 15:23:01,065 - root - INFO - step: 13715 loss: 18.6937 memory: 6.46GiB(27.34%) tps: 24,997 tflops: 25.16 mfu: 8.06% global_avg_ntp_loss: 3.1776 global_avg_mtp_loss: 15.5160 +[titan] 2025-06-13 15:23:01,065 - root - INFO - lr: 5.9288e-05 gnorm: 1.26 [ 2:41:30< 0:15:07] +[titan] 2025-06-13 15:23:04,523 - root - INFO - step: 13720 loss: 18.0432 memory: 6.46GiB(27.34%) tps: 23,695 tflops: 23.85 mfu: 7.64% global_avg_ntp_loss: 3.0918 global_avg_mtp_loss: 14.9515 +[titan] 2025-06-13 15:23:04,523 - root - INFO - lr: 5.9217e-05 gnorm: 1.21 [ 2:41:34< 0:15:04] +[titan] 2025-06-13 15:23:08,003 - root - INFO - step: 13725 loss: 18.1775 memory: 6.46GiB(27.34%) tps: 23,542 tflops: 23.69 mfu: 7.59% global_avg_ntp_loss: 3.1017 global_avg_mtp_loss: 15.0757 +[titan] 2025-06-13 15:23:08,003 - root - INFO - lr: 5.9145e-05 gnorm: 1.46 [ 2:41:37< 0:15:00] +[titan] 2025-06-13 15:23:11,624 - root - INFO - step: 13730 loss: 18.4990 memory: 6.46GiB(27.34%) tps: 22,625 tflops: 22.77 mfu: 7.30% global_avg_ntp_loss: 3.1052 global_avg_mtp_loss: 15.3938 +[titan] 2025-06-13 15:23:11,624 - root - INFO - lr: 5.9074e-05 gnorm: 
1.51 [ 2:41:41< 0:14:57] +[titan] 2025-06-13 15:23:14,982 - root - INFO - step: 13735 loss: 20.3398 memory: 6.46GiB(27.34%) tps: 24,397 tflops: 24.55 mfu: 7.87% global_avg_ntp_loss: 3.5349 global_avg_mtp_loss: 16.8050 +[titan] 2025-06-13 15:23:14,983 - root - INFO - lr: 5.9003e-05 gnorm: 1.26 [ 2:41:44< 0:14:53] +[titan] 2025-06-13 15:23:18,325 - root - INFO - step: 13740 loss: 18.6988 memory: 6.46GiB(27.34%) tps: 24,514 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 3.1982 global_avg_mtp_loss: 15.5006 +[titan] 2025-06-13 15:23:18,325 - root - INFO - lr: 5.8933e-05 gnorm: 1.34 [ 2:41:47< 0:14:50] +[titan] 2025-06-13 15:23:21,870 - root - INFO - step: 13745 loss: 19.3461 memory: 6.46GiB(27.34%) tps: 23,106 tflops: 23.25 mfu: 7.45% global_avg_ntp_loss: 3.3050 global_avg_mtp_loss: 16.0411 +[titan] 2025-06-13 15:23:21,871 - root - INFO - lr: 5.8862e-05 gnorm: 1.22 [ 2:41:51< 0:14:46] +[titan] 2025-06-13 15:23:24,838 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:23:25,471 - root - INFO - step: 13750 loss: 19.0884 memory: 6.46GiB(27.34%) tps: 22,754 tflops: 22.90 mfu: 7.34% global_avg_ntp_loss: 3.2704 global_avg_mtp_loss: 15.8179 +[titan] 2025-06-13 15:23:25,471 - root - INFO - lr: 5.8792e-05 gnorm: 1.31 [ 2:41:54< 0:14:43] +[titan] 2025-06-13 15:23:28,635 - root - INFO - step: 13755 loss: 20.0993 memory: 6.46GiB(27.34%) tps: 25,894 tflops: 26.06 mfu: 8.35% global_avg_ntp_loss: 3.4759 global_avg_mtp_loss: 16.6233 +[titan] 2025-06-13 15:23:28,635 - root - INFO - lr: 5.8723e-05 gnorm: 1.61 [ 2:41:58< 0:14:39] +[titan] 2025-06-13 15:23:32,480 - root - INFO - step: 13760 loss: 18.9724 memory: 6.46GiB(27.34%) tps: 21,309 tflops: 21.45 mfu: 6.87% global_avg_ntp_loss: 3.2177 global_avg_mtp_loss: 15.7547 +[titan] 2025-06-13 15:23:32,480 - root - INFO - lr: 5.8653e-05 gnorm: 1.31 [ 2:42:01< 0:14:36] +[titan] 2025-06-13 15:23:35,719 - root - INFO - step: 13765 loss: 18.2241 memory: 6.46GiB(27.34%) tps: 25,291 tflops: 25.45 mfu: 8.16% 
global_avg_ntp_loss: 3.0391 global_avg_mtp_loss: 15.1850 +[titan] 2025-06-13 15:23:35,720 - root - INFO - lr: 5.8584e-05 gnorm: 1.94 [ 2:42:05< 0:14:32] +[titan] 2025-06-13 15:23:39,138 - root - INFO - step: 13770 loss: 19.3588 memory: 6.46GiB(27.34%) tps: 23,966 tflops: 24.12 mfu: 7.73% global_avg_ntp_loss: 3.2970 global_avg_mtp_loss: 16.0618 +[titan] 2025-06-13 15:23:39,139 - root - INFO - lr: 5.8515e-05 gnorm: 1.28 [ 2:42:08< 0:14:29] +[titan] 2025-06-13 15:23:42,448 - root - INFO - step: 13775 loss: 19.5058 memory: 6.46GiB(27.34%) tps: 24,755 tflops: 24.91 mfu: 7.98% global_avg_ntp_loss: 3.3359 global_avg_mtp_loss: 16.1699 +[titan] 2025-06-13 15:23:42,448 - root - INFO - lr: 5.8446e-05 gnorm: 1.40 [ 2:42:11< 0:14:25] +[titan] 2025-06-13 15:23:45,650 - root - INFO - step: 13780 loss: 18.8877 memory: 6.46GiB(27.34%) tps: 25,583 tflops: 25.75 mfu: 8.25% global_avg_ntp_loss: 3.1963 global_avg_mtp_loss: 15.6913 +[titan] 2025-06-13 15:23:45,651 - root - INFO - lr: 5.8378e-05 gnorm: 1.35 [ 2:42:15< 0:14:21] +[titan] 2025-06-13 15:23:49,541 - root - INFO - step: 13785 loss: 19.5577 memory: 6.46GiB(27.34%) tps: 21,057 tflops: 21.19 mfu: 6.79% global_avg_ntp_loss: 3.3620 global_avg_mtp_loss: 16.1957 +[titan] 2025-06-13 15:23:49,542 - root - INFO - lr: 5.8310e-05 gnorm: 1.25 [ 2:42:19< 0:14:18] +[titan] 2025-06-13 15:23:53,312 - root - INFO - step: 13790 loss: 18.8738 memory: 6.46GiB(27.34%) tps: 21,725 tflops: 21.86 mfu: 7.01% global_avg_ntp_loss: 3.2404 global_avg_mtp_loss: 15.6334 +[titan] 2025-06-13 15:23:53,313 - root - INFO - lr: 5.8242e-05 gnorm: 1.30 [ 2:42:22< 0:14:14] +[titan] 2025-06-13 15:23:56,828 - root - INFO - step: 13795 loss: 16.6467 memory: 6.46GiB(27.34%) tps: 23,304 tflops: 23.45 mfu: 7.52% global_avg_ntp_loss: 2.8637 global_avg_mtp_loss: 13.7830 +[titan] 2025-06-13 15:23:56,828 - root - INFO - lr: 5.8174e-05 gnorm: 1.25 [ 2:42:26< 0:14:11] +[titan] 2025-06-13 15:23:59,676 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:24:00,362 - root - INFO - step: 13800 loss: 19.7938 memory: 6.46GiB(27.34%) tps: 23,183 tflops: 23.33 mfu: 7.48% global_avg_ntp_loss: 3.4440 global_avg_mtp_loss: 16.3498 +[titan] 2025-06-13 15:24:00,363 - root - INFO - lr: 5.8107e-05 gnorm: 1.27 [ 2:42:29< 0:14:07] +[titan] 2025-06-13 15:24:03,812 - root - INFO - step: 13805 loss: 16.0076 memory: 6.46GiB(27.34%) tps: 23,753 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 2.7184 global_avg_mtp_loss: 13.2891 +[titan] 2025-06-13 15:24:03,812 - root - INFO - lr: 5.8040e-05 gnorm: 1.28 [ 2:42:33< 0:14:04] +[titan] 2025-06-13 15:24:07,177 - root - INFO - step: 13810 loss: 19.8612 memory: 6.46GiB(27.34%) tps: 24,343 tflops: 24.50 mfu: 7.85% global_avg_ntp_loss: 3.3664 global_avg_mtp_loss: 16.4948 +[titan] 2025-06-13 15:24:07,178 - root - INFO - lr: 5.7973e-05 gnorm: 1.26 [ 2:42:36< 0:14:00] +[titan] 2025-06-13 15:24:10,682 - root - INFO - step: 13815 loss: 19.7494 memory: 6.46GiB(27.34%) tps: 23,381 tflops: 23.53 mfu: 7.54% global_avg_ntp_loss: 3.4252 global_avg_mtp_loss: 16.3242 +[titan] 2025-06-13 15:24:10,682 - root - INFO - lr: 5.7907e-05 gnorm: 1.31 [ 2:42:40< 0:13:57] +[titan] 2025-06-13 15:24:14,296 - root - INFO - step: 13820 loss: 16.2962 memory: 6.46GiB(27.34%) tps: 22,665 tflops: 22.81 mfu: 7.31% global_avg_ntp_loss: 2.7911 global_avg_mtp_loss: 13.5052 +[titan] 2025-06-13 15:24:14,297 - root - INFO - lr: 5.7841e-05 gnorm: 1.32 [ 2:42:43< 0:13:53] +[titan] 2025-06-13 15:24:17,280 - root - INFO - Dumping profiler traces at step 13824 +[titan] 2025-06-13 15:24:17,372 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 15:24:18,011 - root - INFO - step: 13825 loss: 18.4076 memory: 6.46GiB(27.34%) tps: 22,056 tflops: 22.20 mfu: 7.11% global_avg_ntp_loss: 3.1261 global_avg_mtp_loss: 15.2814 +[titan] 2025-06-13 15:24:18,011 - root - INFO - lr: 5.7775e-05 gnorm: 1.28 [ 2:42:47< 0:13:50] +[titan] 2025-06-13 15:24:21,392 - root - INFO - step: 13830 loss: 19.7032 
memory: 6.46GiB(27.34%) tps: 24,230 tflops: 24.38 mfu: 7.82% global_avg_ntp_loss: 3.4337 global_avg_mtp_loss: 16.2695 +[titan] 2025-06-13 15:24:21,393 - root - INFO - lr: 5.7709e-05 gnorm: 1.35 [ 2:42:50< 0:13:46] +[titan] 2025-06-13 15:24:24,604 - root - INFO - step: 13835 loss: 19.5089 memory: 6.46GiB(27.34%) tps: 25,511 tflops: 25.67 mfu: 8.23% global_avg_ntp_loss: 3.3135 global_avg_mtp_loss: 16.1954 +[titan] 2025-06-13 15:24:24,604 - root - INFO - lr: 5.7644e-05 gnorm: 1.20 [ 2:42:54< 0:13:43] +[titan] 2025-06-13 15:24:28,101 - root - INFO - step: 13840 loss: 19.2894 memory: 6.46GiB(27.34%) tps: 23,426 tflops: 23.58 mfu: 7.56% global_avg_ntp_loss: 3.3384 global_avg_mtp_loss: 15.9511 +[titan] 2025-06-13 15:24:28,102 - root - INFO - lr: 5.7579e-05 gnorm: 1.34 [ 2:42:57< 0:13:39] +[titan] 2025-06-13 15:24:31,434 - root - INFO - step: 13845 loss: 16.7820 memory: 6.46GiB(27.34%) tps: 24,588 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 2.8843 global_avg_mtp_loss: 13.8977 +[titan] 2025-06-13 15:24:31,434 - root - INFO - lr: 5.7514e-05 gnorm: 1.78 [ 2:43:00< 0:13:35] +[titan] 2025-06-13 15:24:33,834 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:24:34,482 - root - INFO - step: 13850 loss: 19.6698 memory: 6.46GiB(27.34%) tps: 26,879 tflops: 27.05 mfu: 8.67% global_avg_ntp_loss: 3.3743 global_avg_mtp_loss: 16.2955 +[titan] 2025-06-13 15:24:34,482 - root - INFO - lr: 5.7449e-05 gnorm: 1.37 [ 2:43:03< 0:13:32] +[titan] 2025-06-13 15:24:37,908 - root - INFO - step: 13855 loss: 19.4615 memory: 6.46GiB(27.34%) tps: 23,913 tflops: 24.07 mfu: 7.71% global_avg_ntp_loss: 3.2917 global_avg_mtp_loss: 16.1698 +[titan] 2025-06-13 15:24:37,908 - root - INFO - lr: 5.7385e-05 gnorm: 1.26 [ 2:43:07< 0:13:28] +[titan] 2025-06-13 15:24:41,286 - root - INFO - step: 13860 loss: 19.2117 memory: 6.46GiB(27.34%) tps: 24,252 tflops: 24.41 mfu: 7.82% global_avg_ntp_loss: 3.2679 global_avg_mtp_loss: 15.9438 +[titan] 2025-06-13 15:24:41,287 - root - INFO - lr: 5.7321e-05 gnorm: 1.22 [ 2:43:10< 0:13:25] +[titan] 2025-06-13 15:24:45,018 - root - INFO - step: 13865 loss: 18.6599 memory: 6.46GiB(27.34%) tps: 21,956 tflops: 22.10 mfu: 7.08% global_avg_ntp_loss: 3.1635 global_avg_mtp_loss: 15.4964 +[titan] 2025-06-13 15:24:45,018 - root - INFO - lr: 5.7257e-05 gnorm: 1.37 [ 2:43:14< 0:13:21] +[titan] 2025-06-13 15:24:48,603 - root - INFO - step: 13870 loss: 18.9440 memory: 6.46GiB(27.34%) tps: 22,854 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.2406 global_avg_mtp_loss: 15.7033 +[titan] 2025-06-13 15:24:48,603 - root - INFO - lr: 5.7194e-05 gnorm: 1.29 [ 2:43:18< 0:13:18] +[titan] 2025-06-13 15:24:52,175 - root - INFO - step: 13875 loss: 19.0328 memory: 6.46GiB(27.34%) tps: 22,939 tflops: 23.08 mfu: 7.40% global_avg_ntp_loss: 3.2224 global_avg_mtp_loss: 15.8104 +[titan] 2025-06-13 15:24:52,175 - root - INFO - lr: 5.7131e-05 gnorm: 1.28 [ 2:43:21< 0:13:14] +[titan] 2025-06-13 15:24:55,569 - root - INFO - step: 13880 loss: 19.4127 memory: 6.46GiB(27.34%) tps: 24,140 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.2893 global_avg_mtp_loss: 16.1233 +[titan] 2025-06-13 15:24:55,569 - root - INFO - lr: 5.7068e-05 gnorm: 
1.26 [ 2:43:25< 0:13:11] +[titan] 2025-06-13 15:24:58,942 - root - INFO - step: 13885 loss: 18.9009 memory: 6.46GiB(27.34%) tps: 24,285 tflops: 24.44 mfu: 7.83% global_avg_ntp_loss: 3.1897 global_avg_mtp_loss: 15.7112 +[titan] 2025-06-13 15:24:58,942 - root - INFO - lr: 5.7005e-05 gnorm: 1.33 [ 2:43:28< 0:13:07] +[titan] 2025-06-13 15:25:02,531 - root - INFO - step: 13890 loss: 17.7367 memory: 6.46GiB(27.34%) tps: 22,832 tflops: 22.98 mfu: 7.36% global_avg_ntp_loss: 3.0645 global_avg_mtp_loss: 14.6722 +[titan] 2025-06-13 15:25:02,531 - root - INFO - lr: 5.6943e-05 gnorm: 1.27 [ 2:43:32< 0:13:04] +[titan] 2025-06-13 15:25:06,253 - root - INFO - step: 13895 loss: 18.7445 memory: 6.46GiB(27.34%) tps: 22,011 tflops: 22.15 mfu: 7.10% global_avg_ntp_loss: 3.1850 global_avg_mtp_loss: 15.5594 +[titan] 2025-06-13 15:25:06,253 - root - INFO - lr: 5.6881e-05 gnorm: 1.34 [ 2:43:35< 0:13:00] +[titan] 2025-06-13 15:25:09,231 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:25:09,793 - root - INFO - step: 13900 loss: 20.1264 memory: 6.46GiB(27.34%) tps: 23,142 tflops: 23.29 mfu: 7.46% global_avg_ntp_loss: 3.4424 global_avg_mtp_loss: 16.6840 +[titan] 2025-06-13 15:25:09,794 - root - INFO - lr: 5.6819e-05 gnorm: 1.24 [ 2:43:39< 0:12:57] +[titan] 2025-06-13 15:25:13,233 - root - INFO - step: 13905 loss: 19.3601 memory: 6.46GiB(27.34%) tps: 23,817 tflops: 23.97 mfu: 7.68% global_avg_ntp_loss: 3.2857 global_avg_mtp_loss: 16.0744 +[titan] 2025-06-13 15:25:13,234 - root - INFO - lr: 5.6757e-05 gnorm: 1.30 [ 2:43:42< 0:12:53] +[titan] 2025-06-13 15:25:16,645 - root - INFO - step: 13910 loss: 18.3892 memory: 6.46GiB(27.34%) tps: 24,013 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.1461 global_avg_mtp_loss: 15.2430 +[titan] 2025-06-13 15:25:16,646 - root - INFO - lr: 5.6696e-05 gnorm: 1.48 [ 2:43:46< 0:12:49] +[titan] 2025-06-13 15:25:20,079 - root - INFO - step: 13915 loss: 19.5554 memory: 6.46GiB(27.34%) tps: 23,857 tflops: 24.01 mfu: 7.70% 
global_avg_ntp_loss: 3.3705 global_avg_mtp_loss: 16.1849 +[titan] 2025-06-13 15:25:20,080 - root - INFO - lr: 5.6635e-05 gnorm: 1.22 [ 2:43:49< 0:12:46] +[titan] 2025-06-13 15:25:23,754 - root - INFO - step: 13920 loss: 19.0221 memory: 6.46GiB(27.34%) tps: 22,297 tflops: 22.44 mfu: 7.19% global_avg_ntp_loss: 3.2655 global_avg_mtp_loss: 15.7566 +[titan] 2025-06-13 15:25:23,754 - root - INFO - lr: 5.6574e-05 gnorm: 1.62 [ 2:43:53< 0:12:42] +[titan] 2025-06-13 15:25:27,279 - root - INFO - step: 13925 loss: 19.8539 memory: 6.46GiB(27.34%) tps: 23,240 tflops: 23.39 mfu: 7.50% global_avg_ntp_loss: 3.4083 global_avg_mtp_loss: 16.4456 +[titan] 2025-06-13 15:25:27,279 - root - INFO - lr: 5.6514e-05 gnorm: 1.21 [ 2:43:56< 0:12:39] +[titan] 2025-06-13 15:25:30,847 - root - INFO - step: 13930 loss: 17.4078 memory: 6.46GiB(27.34%) tps: 22,963 tflops: 23.11 mfu: 7.41% global_avg_ntp_loss: 2.9356 global_avg_mtp_loss: 14.4722 +[titan] 2025-06-13 15:25:30,847 - root - INFO - lr: 5.6454e-05 gnorm: 1.42 [ 2:44:00< 0:12:35] +[titan] 2025-06-13 15:25:34,301 - root - INFO - step: 13935 loss: 19.3287 memory: 6.46GiB(27.34%) tps: 23,723 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.3308 global_avg_mtp_loss: 15.9979 +[titan] 2025-06-13 15:25:34,301 - root - INFO - lr: 5.6394e-05 gnorm: 1.26 [ 2:44:03< 0:12:32] +[titan] 2025-06-13 15:25:37,413 - root - INFO - step: 13940 loss: 17.2253 memory: 6.46GiB(27.34%) tps: 26,322 tflops: 26.49 mfu: 8.49% global_avg_ntp_loss: 2.8974 global_avg_mtp_loss: 14.3279 +[titan] 2025-06-13 15:25:37,413 - root - INFO - lr: 5.6334e-05 gnorm: 1.56 [ 2:44:06< 0:12:28] +[titan] 2025-06-13 15:25:40,972 - root - INFO - step: 13945 loss: 19.9885 memory: 6.46GiB(27.34%) tps: 23,024 tflops: 23.17 mfu: 7.43% global_avg_ntp_loss: 3.4444 global_avg_mtp_loss: 16.5441 +[titan] 2025-06-13 15:25:40,972 - root - INFO - lr: 5.6275e-05 gnorm: 1.27 [ 2:44:10< 0:12:25] +[titan] 2025-06-13 15:25:43,470 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:25:44,167 - root - INFO - step: 13950 loss: 19.3309 memory: 6.46GiB(27.34%) tps: 25,639 tflops: 25.80 mfu: 8.27% global_avg_ntp_loss: 3.3401 global_avg_mtp_loss: 15.9909 +[titan] 2025-06-13 15:25:44,168 - root - INFO - lr: 5.6216e-05 gnorm: 1.43 [ 2:44:13< 0:12:21] +[titan] 2025-06-13 15:25:47,329 - root - INFO - step: 13955 loss: 19.0107 memory: 6.46GiB(27.34%) tps: 25,916 tflops: 26.08 mfu: 8.36% global_avg_ntp_loss: 3.2427 global_avg_mtp_loss: 15.7680 +[titan] 2025-06-13 15:25:47,329 - root - INFO - lr: 5.6157e-05 gnorm: 1.29 [ 2:44:16< 0:12:18] +[titan] 2025-06-13 15:25:50,794 - root - INFO - step: 13960 loss: 17.7204 memory: 6.46GiB(27.34%) tps: 23,646 tflops: 23.80 mfu: 7.63% global_avg_ntp_loss: 3.0050 global_avg_mtp_loss: 14.7154 +[titan] 2025-06-13 15:25:50,794 - root - INFO - lr: 5.6099e-05 gnorm: 1.17 [ 2:44:20< 0:12:14] +[titan] 2025-06-13 15:25:54,301 - root - INFO - step: 13965 loss: 19.3824 memory: 6.46GiB(27.34%) tps: 23,359 tflops: 23.51 mfu: 7.53% global_avg_ntp_loss: 3.3155 global_avg_mtp_loss: 16.0669 +[titan] 2025-06-13 15:25:54,302 - root - INFO - lr: 5.6040e-05 gnorm: 1.21 [ 2:44:23< 0:12:11] +[titan] 2025-06-13 15:25:57,424 - root - INFO - step: 13970 loss: 20.0834 memory: 6.46GiB(27.34%) tps: 26,240 tflops: 26.41 mfu: 8.46% global_avg_ntp_loss: 3.4391 global_avg_mtp_loss: 16.6444 +[titan] 2025-06-13 15:25:57,424 - root - INFO - lr: 5.5982e-05 gnorm: 1.32 [ 2:44:26< 0:12:07] +[titan] 2025-06-13 15:26:00,768 - root - INFO - step: 13975 loss: 18.7365 memory: 6.46GiB(27.34%) tps: 24,495 tflops: 24.65 mfu: 7.90% global_avg_ntp_loss: 3.2165 global_avg_mtp_loss: 15.5200 +[titan] 2025-06-13 15:26:00,769 - root - INFO - lr: 5.5925e-05 gnorm: 1.21 [ 2:44:30< 0:12:03] +[titan] 2025-06-13 15:26:04,206 - root - INFO - step: 13980 loss: 19.0948 memory: 6.46GiB(27.34%) tps: 23,832 tflops: 23.98 mfu: 7.69% global_avg_ntp_loss: 3.2789 global_avg_mtp_loss: 15.8159 +[titan] 2025-06-13 15:26:04,206 - root - INFO - lr: 5.5867e-05 gnorm: 
1.40 [ 2:44:33< 0:12:00] +[titan] 2025-06-13 15:26:08,087 - root - INFO - step: 13985 loss: 16.4441 memory: 6.46GiB(27.34%) tps: 21,110 tflops: 21.24 mfu: 6.81% global_avg_ntp_loss: 2.7740 global_avg_mtp_loss: 13.6702 +[titan] 2025-06-13 15:26:08,087 - root - INFO - lr: 5.5810e-05 gnorm: 1.38 [ 2:44:37< 0:11:56] +[titan] 2025-06-13 15:26:11,440 - root - INFO - step: 13990 loss: 18.7824 memory: 6.46GiB(27.34%) tps: 24,435 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.1934 global_avg_mtp_loss: 15.5890 +[titan] 2025-06-13 15:26:11,440 - root - INFO - lr: 5.5753e-05 gnorm: 1.27 [ 2:44:40< 0:11:53] +[titan] 2025-06-13 15:26:15,100 - root - INFO - step: 13995 loss: 20.0511 memory: 6.46GiB(27.34%) tps: 22,385 tflops: 22.53 mfu: 7.22% global_avg_ntp_loss: 3.4490 global_avg_mtp_loss: 16.6021 +[titan] 2025-06-13 15:26:15,100 - root - INFO - lr: 5.5697e-05 gnorm: 1.25 [ 2:44:44< 0:11:49] +[titan] 2025-06-13 15:26:17,825 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:26:18,394 - root - INFO - step: 14000 loss: 19.6264 memory: 6.46GiB(27.34%) tps: 24,874 tflops: 25.03 mfu: 8.02% global_avg_ntp_loss: 3.3433 global_avg_mtp_loss: 16.2831 +[titan] 2025-06-13 15:26:18,394 - root - INFO - lr: 5.5640e-05 gnorm: 1.23 [ 2:44:47< 0:11:46] +[titan] 2025-06-13 15:26:21,922 - root - INFO - step: 14005 loss: 18.8916 memory: 6.46GiB(27.34%) tps: 23,225 tflops: 23.37 mfu: 7.49% global_avg_ntp_loss: 3.1811 global_avg_mtp_loss: 15.7104 +[titan] 2025-06-13 15:26:21,922 - root - INFO - lr: 5.5584e-05 gnorm: 1.42 [ 2:44:51< 0:11:42] +[titan] 2025-06-13 15:26:25,311 - root - INFO - step: 14010 loss: 18.2800 memory: 6.46GiB(27.34%) tps: 24,169 tflops: 24.32 mfu: 7.80% global_avg_ntp_loss: 3.1444 global_avg_mtp_loss: 15.1355 +[titan] 2025-06-13 15:26:25,312 - root - INFO - lr: 5.5529e-05 gnorm: 1.33 [ 2:44:54< 0:11:39] +[titan] 2025-06-13 15:26:28,886 - root - INFO - step: 14015 loss: 19.0691 memory: 6.46GiB(27.34%) tps: 22,923 tflops: 23.07 mfu: 7.39% 
global_avg_ntp_loss: 3.3184 global_avg_mtp_loss: 15.7507 +[titan] 2025-06-13 15:26:28,886 - root - INFO - lr: 5.5473e-05 gnorm: 1.41 [ 2:44:58< 0:11:35] +[titan] 2025-06-13 15:26:32,322 - root - INFO - step: 14020 loss: 18.2257 memory: 6.46GiB(27.34%) tps: 23,844 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.1243 global_avg_mtp_loss: 15.1014 +[titan] 2025-06-13 15:26:32,322 - root - INFO - lr: 5.5418e-05 gnorm: 1.50 [ 2:45:01< 0:11:32] +[titan] 2025-06-13 15:26:35,507 - root - INFO - step: 14025 loss: 18.5477 memory: 6.46GiB(27.34%) tps: 25,719 tflops: 25.88 mfu: 8.30% global_avg_ntp_loss: 3.1421 global_avg_mtp_loss: 15.4057 +[titan] 2025-06-13 15:26:35,508 - root - INFO - lr: 5.5363e-05 gnorm: 1.37 [ 2:45:04< 0:11:28] +[titan] 2025-06-13 15:26:38,962 - root - INFO - step: 14030 loss: 19.7579 memory: 6.46GiB(27.34%) tps: 23,717 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.4061 global_avg_mtp_loss: 16.3518 +[titan] 2025-06-13 15:26:38,962 - root - INFO - lr: 5.5308e-05 gnorm: 1.27 [ 2:45:08< 0:11:25] +[titan] 2025-06-13 15:26:42,521 - root - INFO - step: 14035 loss: 20.4714 memory: 6.46GiB(27.34%) tps: 23,021 tflops: 23.17 mfu: 7.43% global_avg_ntp_loss: 3.5597 global_avg_mtp_loss: 16.9118 +[titan] 2025-06-13 15:26:42,521 - root - INFO - lr: 5.5254e-05 gnorm: 1.47 [ 2:45:11< 0:11:21] +[titan] 2025-06-13 15:26:46,085 - root - INFO - step: 14040 loss: 20.0667 memory: 6.46GiB(27.34%) tps: 22,990 tflops: 23.14 mfu: 7.42% global_avg_ntp_loss: 3.4001 global_avg_mtp_loss: 16.6666 +[titan] 2025-06-13 15:26:46,085 - root - INFO - lr: 5.5200e-05 gnorm: 1.36 [ 2:45:15< 0:11:17] +[titan] 2025-06-13 15:26:49,133 - root - INFO - step: 14045 loss: 19.3018 memory: 6.46GiB(27.34%) tps: 26,875 tflops: 27.05 mfu: 8.67% global_avg_ntp_loss: 3.2874 global_avg_mtp_loss: 16.0144 +[titan] 2025-06-13 15:26:49,133 - root - INFO - lr: 5.5146e-05 gnorm: 1.32 [ 2:45:18< 0:11:14] +[titan] 2025-06-13 15:26:52,057 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:26:52,590 - root - INFO - step: 14050 loss: 18.6464 memory: 6.46GiB(27.34%) tps: 23,704 tflops: 23.86 mfu: 7.65% global_avg_ntp_loss: 3.1482 global_avg_mtp_loss: 15.4982 +[titan] 2025-06-13 15:26:52,590 - root - INFO - lr: 5.5093e-05 gnorm: 1.39 [ 2:45:22< 0:11:10] +[titan] 2025-06-13 15:26:55,839 - root - INFO - step: 14055 loss: 18.2218 memory: 6.46GiB(27.34%) tps: 25,210 tflops: 25.37 mfu: 8.13% global_avg_ntp_loss: 3.1070 global_avg_mtp_loss: 15.1148 +[titan] 2025-06-13 15:26:55,840 - root - INFO - lr: 5.5039e-05 gnorm: 1.28 [ 2:45:25< 0:11:07] +[titan] 2025-06-13 15:27:00,400 - root - INFO - step: 14060 loss: 18.0095 memory: 6.46GiB(27.34%) tps: 17,963 tflops: 18.08 mfu: 5.79% global_avg_ntp_loss: 3.1536 global_avg_mtp_loss: 14.8559 +[titan] 2025-06-13 15:27:00,401 - root - INFO - lr: 5.4986e-05 gnorm: 1.64 [ 2:45:29< 0:11:03] +[titan] 2025-06-13 15:27:03,520 - root - INFO - step: 14065 loss: 19.9544 memory: 6.46GiB(27.34%) tps: 26,268 tflops: 26.44 mfu: 8.47% global_avg_ntp_loss: 3.4325 global_avg_mtp_loss: 16.5219 +[titan] 2025-06-13 15:27:03,520 - root - INFO - lr: 5.4934e-05 gnorm: 1.29 [ 2:45:32< 0:11:00] +[titan] 2025-06-13 15:27:07,325 - root - INFO - step: 14070 loss: 18.3874 memory: 6.46GiB(27.34%) tps: 21,531 tflops: 21.67 mfu: 6.94% global_avg_ntp_loss: 3.1629 global_avg_mtp_loss: 15.2245 +[titan] 2025-06-13 15:27:07,325 - root - INFO - lr: 5.4881e-05 gnorm: 1.30 [ 2:45:36< 0:10:56] +[titan] 2025-06-13 15:27:11,110 - root - INFO - step: 14075 loss: 19.0463 memory: 6.46GiB(27.34%) tps: 21,641 tflops: 21.78 mfu: 6.98% global_avg_ntp_loss: 3.2899 global_avg_mtp_loss: 15.7565 +[titan] 2025-06-13 15:27:11,111 - root - INFO - lr: 5.4829e-05 gnorm: 1.32 [ 2:45:40< 0:10:53] +[titan] 2025-06-13 15:27:14,265 - root - INFO - step: 14080 loss: 18.2053 memory: 6.46GiB(27.34%) tps: 25,975 tflops: 26.14 mfu: 8.38% global_avg_ntp_loss: 3.0910 global_avg_mtp_loss: 15.1143 +[titan] 2025-06-13 15:27:14,265 - root - INFO - lr: 5.4777e-05 gnorm: 
1.67 [ 2:45:43< 0:10:49] +[titan] 2025-06-13 15:27:17,785 - root - INFO - step: 14085 loss: 18.6551 memory: 6.46GiB(27.34%) tps: 23,277 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.1726 global_avg_mtp_loss: 15.4825 +[titan] 2025-06-13 15:27:17,785 - root - INFO - lr: 5.4726e-05 gnorm: 1.32 [ 2:45:47< 0:10:46] +[titan] 2025-06-13 15:27:21,186 - root - INFO - step: 14090 loss: 19.6927 memory: 6.46GiB(27.34%) tps: 24,086 tflops: 24.24 mfu: 7.77% global_avg_ntp_loss: 3.3375 global_avg_mtp_loss: 16.3551 +[titan] 2025-06-13 15:27:21,187 - root - INFO - lr: 5.4674e-05 gnorm: 1.38 [ 2:45:50< 0:10:42] +[titan] 2025-06-13 15:27:24,610 - root - INFO - step: 14095 loss: 17.9341 memory: 6.46GiB(27.34%) tps: 23,933 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.0693 global_avg_mtp_loss: 14.8648 +[titan] 2025-06-13 15:27:24,610 - root - INFO - lr: 5.4623e-05 gnorm: 1.25 [ 2:45:54< 0:10:39] +[titan] 2025-06-13 15:27:27,374 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:27:28,034 - root - INFO - step: 14100 loss: 15.1069 memory: 6.46GiB(27.34%) tps: 23,925 tflops: 24.08 mfu: 7.72% global_avg_ntp_loss: 2.5960 global_avg_mtp_loss: 12.5109 +[titan] 2025-06-13 15:27:28,034 - root - INFO - lr: 5.4572e-05 gnorm: 1.33 [ 2:45:57< 0:10:35] +[titan] 2025-06-13 15:27:31,429 - root - INFO - step: 14105 loss: 17.9309 memory: 6.46GiB(27.34%) tps: 24,134 tflops: 24.29 mfu: 7.78% global_avg_ntp_loss: 3.0470 global_avg_mtp_loss: 14.8838 +[titan] 2025-06-13 15:27:31,429 - root - INFO - lr: 5.4522e-05 gnorm: 1.28 [ 2:46:00< 0:10:32] +[titan] 2025-06-13 15:27:35,199 - root - INFO - step: 14110 loss: 18.4875 memory: 6.46GiB(27.34%) tps: 21,735 tflops: 21.87 mfu: 7.01% global_avg_ntp_loss: 3.1387 global_avg_mtp_loss: 15.3488 +[titan] 2025-06-13 15:27:35,199 - root - INFO - lr: 5.4472e-05 gnorm: 1.37 [ 2:46:04< 0:10:28] +[titan] 2025-06-13 15:27:38,470 - root - INFO - step: 14115 loss: 19.3713 memory: 6.46GiB(27.34%) tps: 25,043 tflops: 25.20 mfu: 8.08% 
global_avg_ntp_loss: 3.3115 global_avg_mtp_loss: 16.0598 +[titan] 2025-06-13 15:27:38,470 - root - INFO - lr: 5.4422e-05 gnorm: 1.20 [ 2:46:07< 0:10:24] +[titan] 2025-06-13 15:27:41,773 - root - INFO - step: 14120 loss: 18.5746 memory: 6.46GiB(27.34%) tps: 24,806 tflops: 24.96 mfu: 8.00% global_avg_ntp_loss: 3.1275 global_avg_mtp_loss: 15.4470 +[titan] 2025-06-13 15:27:41,773 - root - INFO - lr: 5.4372e-05 gnorm: 1.63 [ 2:46:11< 0:10:21] +[titan] 2025-06-13 15:27:45,227 - root - INFO - step: 14125 loss: 19.1018 memory: 6.46GiB(27.34%) tps: 23,722 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.2382 global_avg_mtp_loss: 15.8636 +[titan] 2025-06-13 15:27:45,227 - root - INFO - lr: 5.4323e-05 gnorm: 1.25 [ 2:46:14< 0:10:17] +[titan] 2025-06-13 15:27:48,710 - root - INFO - step: 14130 loss: 19.1123 memory: 6.46GiB(27.34%) tps: 23,521 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.2503 global_avg_mtp_loss: 15.8620 +[titan] 2025-06-13 15:27:48,711 - root - INFO - lr: 5.4274e-05 gnorm: 1.29 [ 2:46:18< 0:10:14] +[titan] 2025-06-13 15:27:52,086 - root - INFO - step: 14135 loss: 19.4297 memory: 6.46GiB(27.34%) tps: 24,273 tflops: 24.43 mfu: 7.83% global_avg_ntp_loss: 3.3080 global_avg_mtp_loss: 16.1217 +[titan] 2025-06-13 15:27:52,086 - root - INFO - lr: 5.4225e-05 gnorm: 1.32 [ 2:46:21< 0:10:10] +[titan] 2025-06-13 15:27:55,072 - root - INFO - step: 14140 loss: 20.5994 memory: 6.46GiB(27.34%) tps: 27,438 tflops: 27.61 mfu: 8.85% global_avg_ntp_loss: 3.6057 global_avg_mtp_loss: 16.9937 +[titan] 2025-06-13 15:27:55,072 - root - INFO - lr: 5.4176e-05 gnorm: 1.40 [ 2:46:24< 0:10:07] +[titan] 2025-06-13 15:27:59,359 - root - INFO - step: 14145 loss: 17.9377 memory: 6.46GiB(27.34%) tps: 19,110 tflops: 19.23 mfu: 6.16% global_avg_ntp_loss: 3.0487 global_avg_mtp_loss: 14.8890 +[titan] 2025-06-13 15:27:59,359 - root - INFO - lr: 5.4128e-05 gnorm: 1.42 [ 2:46:28< 0:10:03] +[titan] 2025-06-13 15:28:01,872 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:28:02,408 - root - INFO - step: 14150 loss: 19.7134 memory: 6.46GiB(27.34%) tps: 26,869 tflops: 27.04 mfu: 8.67% global_avg_ntp_loss: 3.3434 global_avg_mtp_loss: 16.3700 +[titan] 2025-06-13 15:28:02,409 - root - INFO - lr: 5.4080e-05 gnorm: 1.32 [ 2:46:31< 0:10:00] +[titan] 2025-06-13 15:28:05,762 - root - INFO - step: 14155 loss: 19.2017 memory: 6.46GiB(27.34%) tps: 24,434 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.3249 global_avg_mtp_loss: 15.8769 +[titan] 2025-06-13 15:28:05,762 - root - INFO - lr: 5.4032e-05 gnorm: 1.41 [ 2:46:35< 0:09:56] +[titan] 2025-06-13 15:28:09,243 - root - INFO - step: 14160 loss: 19.4316 memory: 6.46GiB(27.34%) tps: 23,531 tflops: 23.68 mfu: 7.59% global_avg_ntp_loss: 3.3248 global_avg_mtp_loss: 16.1067 +[titan] 2025-06-13 15:28:09,244 - root - INFO - lr: 5.3985e-05 gnorm: 1.24 [ 2:46:38< 0:09:53] +[titan] 2025-06-13 15:28:12,965 - root - INFO - step: 14165 loss: 19.5344 memory: 6.46GiB(27.34%) tps: 22,011 tflops: 22.15 mfu: 7.10% global_avg_ntp_loss: 3.3378 global_avg_mtp_loss: 16.1965 +[titan] 2025-06-13 15:28:12,966 - root - INFO - lr: 5.3938e-05 gnorm: 1.14 [ 2:46:42< 0:09:49] +[titan] 2025-06-13 15:28:16,662 - root - INFO - step: 14170 loss: 18.8656 memory: 6.46GiB(27.34%) tps: 22,161 tflops: 22.30 mfu: 7.15% global_avg_ntp_loss: 3.2681 global_avg_mtp_loss: 15.5975 +[titan] 2025-06-13 15:28:16,663 - root - INFO - lr: 5.3891e-05 gnorm: 1.29 [ 2:46:46< 0:09:46] +[titan] 2025-06-13 15:28:20,074 - root - INFO - step: 14175 loss: 18.4366 memory: 6.46GiB(27.34%) tps: 24,020 tflops: 24.17 mfu: 7.75% global_avg_ntp_loss: 3.1145 global_avg_mtp_loss: 15.3221 +[titan] 2025-06-13 15:28:20,074 - root - INFO - lr: 5.3844e-05 gnorm: 1.25 [ 2:46:49< 0:09:42] +[titan] 2025-06-13 15:28:24,211 - root - INFO - step: 14180 loss: 19.3355 memory: 6.46GiB(27.34%) tps: 19,801 tflops: 19.93 mfu: 6.39% global_avg_ntp_loss: 3.3249 global_avg_mtp_loss: 16.0106 +[titan] 2025-06-13 15:28:24,212 - root - INFO - lr: 5.3798e-05 gnorm: 
1.22 [ 2:46:53< 0:09:39] +[titan] 2025-06-13 15:28:27,727 - root - INFO - step: 14185 loss: 17.7079 memory: 6.46GiB(27.34%) tps: 23,305 tflops: 23.45 mfu: 7.52% global_avg_ntp_loss: 2.9903 global_avg_mtp_loss: 14.7176 +[titan] 2025-06-13 15:28:27,727 - root - INFO - lr: 5.3752e-05 gnorm: 1.18 [ 2:46:57< 0:09:35] +[titan] 2025-06-13 15:28:31,694 - root - INFO - step: 14190 loss: 19.5553 memory: 6.46GiB(27.34%) tps: 20,652 tflops: 20.78 mfu: 6.66% global_avg_ntp_loss: 3.3257 global_avg_mtp_loss: 16.2296 +[titan] 2025-06-13 15:28:31,694 - root - INFO - lr: 5.3706e-05 gnorm: 1.17 [ 2:47:01< 0:09:32] +[titan] 2025-06-13 15:28:34,856 - root - INFO - step: 14195 loss: 19.6843 memory: 6.46GiB(27.34%) tps: 25,908 tflops: 26.07 mfu: 8.36% global_avg_ntp_loss: 3.3901 global_avg_mtp_loss: 16.2942 +[titan] 2025-06-13 15:28:34,856 - root - INFO - lr: 5.3661e-05 gnorm: 1.24 [ 2:47:04< 0:09:28] +[titan] 2025-06-13 15:28:37,709 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:28:38,358 - root - INFO - step: 14200 loss: 19.1392 memory: 6.46GiB(27.34%) tps: 23,395 tflops: 23.54 mfu: 7.55% global_avg_ntp_loss: 3.2748 global_avg_mtp_loss: 15.8645 +[titan] 2025-06-13 15:28:38,358 - root - INFO - lr: 5.3615e-05 gnorm: 1.25 [ 2:47:07< 0:09:24] +[titan] 2025-06-13 15:28:41,894 - root - INFO - step: 14205 loss: 19.0809 memory: 6.46GiB(27.34%) tps: 23,174 tflops: 23.32 mfu: 7.47% global_avg_ntp_loss: 3.2558 global_avg_mtp_loss: 15.8251 +[titan] 2025-06-13 15:28:41,894 - root - INFO - lr: 5.3570e-05 gnorm: 1.36 [ 2:47:11< 0:09:21] +[titan] 2025-06-13 15:28:45,555 - root - INFO - step: 14210 loss: 18.7918 memory: 6.46GiB(27.34%) tps: 22,377 tflops: 22.52 mfu: 7.22% global_avg_ntp_loss: 3.1770 global_avg_mtp_loss: 15.6148 +[titan] 2025-06-13 15:28:45,555 - root - INFO - lr: 5.3526e-05 gnorm: 1.20 [ 2:47:14< 0:09:17] +[titan] 2025-06-13 15:28:48,703 - root - INFO - step: 14215 loss: 19.3663 memory: 6.46GiB(27.34%) tps: 26,029 tflops: 26.19 mfu: 8.40% 
global_avg_ntp_loss: 3.3152 global_avg_mtp_loss: 16.0511 +[titan] 2025-06-13 15:28:48,703 - root - INFO - lr: 5.3481e-05 gnorm: 1.31 [ 2:47:18< 0:09:14] +[titan] 2025-06-13 15:28:51,873 - root - INFO - step: 14220 loss: 19.2286 memory: 6.46GiB(27.34%) tps: 25,843 tflops: 26.01 mfu: 8.34% global_avg_ntp_loss: 3.3206 global_avg_mtp_loss: 15.9079 +[titan] 2025-06-13 15:28:51,874 - root - INFO - lr: 5.3437e-05 gnorm: 1.40 [ 2:47:21< 0:09:10] +[titan] 2025-06-13 15:28:55,447 - root - INFO - step: 14225 loss: 19.5042 memory: 6.46GiB(27.34%) tps: 22,928 tflops: 23.07 mfu: 7.40% global_avg_ntp_loss: 3.3313 global_avg_mtp_loss: 16.1730 +[titan] 2025-06-13 15:28:55,447 - root - INFO - lr: 5.3393e-05 gnorm: 1.21 [ 2:47:24< 0:09:07] +[titan] 2025-06-13 15:28:58,618 - root - INFO - step: 14230 loss: 19.3598 memory: 6.46GiB(27.34%) tps: 25,835 tflops: 26.00 mfu: 8.33% global_avg_ntp_loss: 3.3262 global_avg_mtp_loss: 16.0336 +[titan] 2025-06-13 15:28:58,618 - root - INFO - lr: 5.3350e-05 gnorm: 1.18 [ 2:47:28< 0:09:03] +[titan] 2025-06-13 15:29:02,069 - root - INFO - step: 14235 loss: 19.2029 memory: 6.46GiB(27.34%) tps: 23,738 tflops: 23.89 mfu: 7.66% global_avg_ntp_loss: 3.3529 global_avg_mtp_loss: 15.8500 +[titan] 2025-06-13 15:29:02,070 - root - INFO - lr: 5.3307e-05 gnorm: 1.53 [ 2:47:31< 0:09:00] +[titan] 2025-06-13 15:29:05,086 - root - INFO - step: 14240 loss: 15.5983 memory: 6.46GiB(27.34%) tps: 27,163 tflops: 27.34 mfu: 8.76% global_avg_ntp_loss: 2.6959 global_avg_mtp_loss: 12.9024 +[titan] 2025-06-13 15:29:05,086 - root - INFO - lr: 5.3264e-05 gnorm: 1.18 [ 2:47:34< 0:08:56] +[titan] 2025-06-13 15:29:08,973 - root - INFO - step: 14245 loss: 19.4478 memory: 6.46GiB(27.34%) tps: 21,074 tflops: 21.21 mfu: 6.80% global_avg_ntp_loss: 3.3165 global_avg_mtp_loss: 16.1313 +[titan] 2025-06-13 15:29:08,974 - root - INFO - lr: 5.3221e-05 gnorm: 1.40 [ 2:47:38< 0:08:53] +[titan] 2025-06-13 15:29:12,261 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:29:13,103 - root - INFO - step: 14250 loss: 18.2189 memory: 6.46GiB(27.34%) tps: 19,840 tflops: 19.97 mfu: 6.40% global_avg_ntp_loss: 3.1130 global_avg_mtp_loss: 15.1059 +[titan] 2025-06-13 15:29:13,103 - root - INFO - lr: 5.3179e-05 gnorm: 1.27 [ 2:47:42< 0:08:49] +[titan] 2025-06-13 15:29:16,507 - root - INFO - step: 14255 loss: 19.4390 memory: 6.46GiB(27.34%) tps: 24,069 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 3.2792 global_avg_mtp_loss: 16.1598 +[titan] 2025-06-13 15:29:16,507 - root - INFO - lr: 5.3136e-05 gnorm: 1.23 [ 2:47:45< 0:08:46] +[titan] 2025-06-13 15:29:19,808 - root - INFO - step: 14260 loss: 18.6698 memory: 6.46GiB(27.34%) tps: 24,821 tflops: 24.98 mfu: 8.01% global_avg_ntp_loss: 3.1495 global_avg_mtp_loss: 15.5203 +[titan] 2025-06-13 15:29:19,808 - root - INFO - lr: 5.3095e-05 gnorm: 1.23 [ 2:47:49< 0:08:42] +[titan] 2025-06-13 15:29:23,523 - root - INFO - step: 14265 loss: 19.2127 memory: 6.46GiB(27.34%) tps: 22,051 tflops: 22.19 mfu: 7.11% global_avg_ntp_loss: 3.2812 global_avg_mtp_loss: 15.9315 +[titan] 2025-06-13 15:29:23,524 - root - INFO - lr: 5.3053e-05 gnorm: 1.38 [ 2:47:52< 0:08:39] +[titan] 2025-06-13 15:29:26,506 - root - INFO - step: 14270 loss: 19.4974 memory: 6.46GiB(27.34%) tps: 27,475 tflops: 27.65 mfu: 8.86% global_avg_ntp_loss: 3.3448 global_avg_mtp_loss: 16.1526 +[titan] 2025-06-13 15:29:26,506 - root - INFO - lr: 5.3012e-05 gnorm: 1.32 [ 2:47:55< 0:08:35] +[titan] 2025-06-13 15:29:29,905 - root - INFO - step: 14275 loss: 19.8923 memory: 6.46GiB(27.34%) tps: 24,101 tflops: 24.25 mfu: 7.77% global_avg_ntp_loss: 3.3686 global_avg_mtp_loss: 16.5238 +[titan] 2025-06-13 15:29:29,905 - root - INFO - lr: 5.2971e-05 gnorm: 1.34 [ 2:47:59< 0:08:31] +[titan] 2025-06-13 15:29:33,109 - root - INFO - step: 14280 loss: 18.2959 memory: 6.46GiB(27.34%) tps: 25,571 tflops: 25.73 mfu: 8.25% global_avg_ntp_loss: 3.0856 global_avg_mtp_loss: 15.2103 +[titan] 2025-06-13 15:29:33,109 - root - INFO - lr: 5.2930e-05 gnorm: 
1.46 [ 2:48:02< 0:08:28] +[titan] 2025-06-13 15:29:36,605 - root - INFO - step: 14285 loss: 19.3792 memory: 6.46GiB(27.34%) tps: 23,438 tflops: 23.59 mfu: 7.56% global_avg_ntp_loss: 3.3057 global_avg_mtp_loss: 16.0735 +[titan] 2025-06-13 15:29:36,605 - root - INFO - lr: 5.2889e-05 gnorm: 1.25 [ 2:48:06< 0:08:24] +[titan] 2025-06-13 15:29:39,986 - root - INFO - step: 14290 loss: 19.6644 memory: 6.46GiB(27.34%) tps: 24,229 tflops: 24.38 mfu: 7.82% global_avg_ntp_loss: 3.4464 global_avg_mtp_loss: 16.2180 +[titan] 2025-06-13 15:29:39,986 - root - INFO - lr: 5.2849e-05 gnorm: 1.34 [ 2:48:09< 0:08:21] +[titan] 2025-06-13 15:29:43,332 - root - INFO - step: 14295 loss: 19.1912 memory: 6.46GiB(27.34%) tps: 24,487 tflops: 24.64 mfu: 7.90% global_avg_ntp_loss: 3.2433 global_avg_mtp_loss: 15.9480 +[titan] 2025-06-13 15:29:43,332 - root - INFO - lr: 5.2809e-05 gnorm: 1.23 [ 2:48:12< 0:08:17] +[titan] 2025-06-13 15:29:46,057 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:29:46,654 - root - INFO - step: 14300 loss: 19.2049 memory: 6.46GiB(27.34%) tps: 24,661 tflops: 24.82 mfu: 7.95% global_avg_ntp_loss: 3.2420 global_avg_mtp_loss: 15.9629 +[titan] 2025-06-13 15:29:46,654 - root - INFO - lr: 5.2770e-05 gnorm: 1.28 [ 2:48:16< 0:08:14] +[titan] 2025-06-13 15:29:50,187 - root - INFO - step: 14305 loss: 19.3402 memory: 6.46GiB(27.34%) tps: 23,190 tflops: 23.34 mfu: 7.48% global_avg_ntp_loss: 3.2959 global_avg_mtp_loss: 16.0443 +[titan] 2025-06-13 15:29:50,189 - root - INFO - lr: 5.2730e-05 gnorm: 1.24 [ 2:48:19< 0:08:10] +[titan] 2025-06-13 15:29:53,531 - root - INFO - step: 14310 loss: 17.6051 memory: 6.46GiB(27.34%) tps: 24,507 tflops: 24.66 mfu: 7.91% global_avg_ntp_loss: 3.0684 global_avg_mtp_loss: 14.5368 +[titan] 2025-06-13 15:29:53,532 - root - INFO - lr: 5.2691e-05 gnorm: 1.40 [ 2:48:22< 0:08:07] +[titan] 2025-06-13 15:29:57,069 - root - INFO - step: 14315 loss: 18.4013 memory: 6.46GiB(27.34%) tps: 23,159 tflops: 23.31 mfu: 7.47% 
global_avg_ntp_loss: 3.0924 global_avg_mtp_loss: 15.3089 +[titan] 2025-06-13 15:29:57,069 - root - INFO - lr: 5.2653e-05 gnorm: 1.53 [ 2:48:26< 0:08:03] +[titan] 2025-06-13 15:30:00,402 - root - INFO - step: 14320 loss: 18.2890 memory: 6.46GiB(27.34%) tps: 24,586 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.0664 global_avg_mtp_loss: 15.2226 +[titan] 2025-06-13 15:30:00,402 - root - INFO - lr: 5.2614e-05 gnorm: 1.53 [ 2:48:29< 0:08:00] +[titan] 2025-06-13 15:30:03,808 - root - INFO - step: 14325 loss: 18.9933 memory: 6.46GiB(27.34%) tps: 24,050 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.2561 global_avg_mtp_loss: 15.7371 +[titan] 2025-06-13 15:30:03,808 - root - INFO - lr: 5.2576e-05 gnorm: 1.45 [ 2:48:33< 0:07:56] +[titan] 2025-06-13 15:30:07,275 - root - INFO - step: 14330 loss: 18.7874 memory: 6.46GiB(27.34%) tps: 23,635 tflops: 23.79 mfu: 7.62% global_avg_ntp_loss: 3.2547 global_avg_mtp_loss: 15.5327 +[titan] 2025-06-13 15:30:07,275 - root - INFO - lr: 5.2538e-05 gnorm: 1.35 [ 2:48:36< 0:07:53] +[titan] 2025-06-13 15:30:10,727 - root - INFO - step: 14335 loss: 16.2980 memory: 6.46GiB(27.34%) tps: 23,731 tflops: 23.88 mfu: 7.65% global_avg_ntp_loss: 2.7213 global_avg_mtp_loss: 13.5767 +[titan] 2025-06-13 15:30:10,728 - root - INFO - lr: 5.2500e-05 gnorm: 1.38 [ 2:48:40< 0:07:49] +[titan] 2025-06-13 15:30:11,391 - root - INFO - Dumping profiler traces at step 14336 +[titan] 2025-06-13 15:30:11,478 - root - INFO - Finished dumping profiler traces in 0.09 seconds +[titan] 2025-06-13 15:30:14,186 - root - INFO - step: 14340 loss: 19.0661 memory: 6.46GiB(27.34%) tps: 23,689 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 3.2571 global_avg_mtp_loss: 15.8090 +[titan] 2025-06-13 15:30:14,186 - root - INFO - lr: 5.2463e-05 gnorm: 1.23 [ 2:48:43< 0:07:45] +[titan] 2025-06-13 15:30:17,521 - root - INFO - step: 14345 loss: 18.7944 memory: 6.46GiB(27.34%) tps: 24,567 tflops: 24.72 mfu: 7.92% global_avg_ntp_loss: 3.1668 global_avg_mtp_loss: 15.6275 +[titan] 2025-06-13 
15:30:17,521 - root - INFO - lr: 5.2426e-05 gnorm: 1.28 [ 2:48:46< 0:07:42] +[titan] 2025-06-13 15:30:19,937 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:30:20,743 - root - INFO - step: 14350 loss: 18.5744 memory: 6.46GiB(27.34%) tps: 25,426 tflops: 25.59 mfu: 8.20% global_avg_ntp_loss: 3.1405 global_avg_mtp_loss: 15.4340 +[titan] 2025-06-13 15:30:20,744 - root - INFO - lr: 5.2389e-05 gnorm: 1.31 [ 2:48:50< 0:07:38] +[titan] 2025-06-13 15:30:24,254 - root - INFO - step: 14355 loss: 19.1547 memory: 6.46GiB(27.34%) tps: 23,339 tflops: 23.49 mfu: 7.53% global_avg_ntp_loss: 3.2418 global_avg_mtp_loss: 15.9129 +[titan] 2025-06-13 15:30:24,254 - root - INFO - lr: 5.2352e-05 gnorm: 1.30 [ 2:48:53< 0:07:35] +[titan] 2025-06-13 15:30:27,228 - root - INFO - step: 14360 loss: 18.3435 memory: 6.46GiB(27.34%) tps: 27,547 tflops: 27.72 mfu: 8.89% global_avg_ntp_loss: 3.1323 global_avg_mtp_loss: 15.2111 +[titan] 2025-06-13 15:30:27,228 - root - INFO - lr: 5.2316e-05 gnorm: 1.23 [ 2:48:56< 0:07:31] +[titan] 2025-06-13 15:30:30,535 - root - INFO - step: 14365 loss: 18.0105 memory: 6.46GiB(27.34%) tps: 24,780 tflops: 24.94 mfu: 7.99% global_avg_ntp_loss: 3.0690 global_avg_mtp_loss: 14.9415 +[titan] 2025-06-13 15:30:30,535 - root - INFO - lr: 5.2280e-05 gnorm: 1.22 [ 2:48:59< 0:07:28] +[titan] 2025-06-13 15:30:34,076 - root - INFO - step: 14370 loss: 19.4706 memory: 6.46GiB(27.34%) tps: 23,137 tflops: 23.28 mfu: 7.46% global_avg_ntp_loss: 3.3355 global_avg_mtp_loss: 16.1351 +[titan] 2025-06-13 15:30:34,076 - root - INFO - lr: 5.2244e-05 gnorm: 1.22 [ 2:49:03< 0:07:24] +[titan] 2025-06-13 15:30:37,389 - root - INFO - step: 14375 loss: 18.3810 memory: 6.46GiB(27.34%) tps: 24,726 tflops: 24.88 mfu: 7.98% global_avg_ntp_loss: 3.1181 global_avg_mtp_loss: 15.2629 +[titan] 2025-06-13 15:30:37,389 - root - INFO - lr: 5.2209e-05 gnorm: 1.33 [ 2:49:06< 0:07:21] +[titan] 2025-06-13 15:30:41,018 - root - INFO - step: 14380 loss: 20.0218 memory: 
6.46GiB(27.34%) tps: 22,576 tflops: 22.72 mfu: 7.28% global_avg_ntp_loss: 3.4547 global_avg_mtp_loss: 16.5670 +[titan] 2025-06-13 15:30:41,019 - root - INFO - lr: 5.2174e-05 gnorm: 1.27 [ 2:49:10< 0:07:17] +[titan] 2025-06-13 15:30:44,689 - root - INFO - step: 14385 loss: 18.2297 memory: 6.46GiB(27.34%) tps: 22,320 tflops: 22.46 mfu: 7.20% global_avg_ntp_loss: 3.0473 global_avg_mtp_loss: 15.1824 +[titan] 2025-06-13 15:30:44,689 - root - INFO - lr: 5.2139e-05 gnorm: 1.44 [ 2:49:14< 0:07:14] +[titan] 2025-06-13 15:30:48,078 - root - INFO - step: 14390 loss: 17.4861 memory: 6.46GiB(27.34%) tps: 24,176 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.0051 global_avg_mtp_loss: 14.4810 +[titan] 2025-06-13 15:30:48,078 - root - INFO - lr: 5.2104e-05 gnorm: 1.44 [ 2:49:17< 0:07:10] +[titan] 2025-06-13 15:30:51,643 - root - INFO - step: 14395 loss: 18.1642 memory: 6.46GiB(27.34%) tps: 22,980 tflops: 23.13 mfu: 7.41% global_avg_ntp_loss: 3.0807 global_avg_mtp_loss: 15.0836 +[titan] 2025-06-13 15:30:51,643 - root - INFO - lr: 5.2070e-05 gnorm: 1.29 [ 2:49:21< 0:07:07] +[titan] 2025-06-13 15:30:54,495 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:30:55,121 - root - INFO - step: 14400 loss: 19.0428 memory: 6.46GiB(27.34%) tps: 23,561 tflops: 23.71 mfu: 7.60% global_avg_ntp_loss: 3.2600 global_avg_mtp_loss: 15.7829 +[titan] 2025-06-13 15:30:55,121 - root - INFO - lr: 5.2036e-05 gnorm: 1.31 [ 2:49:24< 0:07:03] +[titan] 2025-06-13 15:30:58,834 - root - INFO - step: 14405 loss: 19.5908 memory: 6.46GiB(27.34%) tps: 22,061 tflops: 22.20 mfu: 7.12% global_avg_ntp_loss: 3.3611 global_avg_mtp_loss: 16.2296 +[titan] 2025-06-13 15:30:58,835 - root - INFO - lr: 5.2002e-05 gnorm: 1.28 [ 2:49:28< 0:07:00] +[titan] 2025-06-13 15:31:02,387 - root - INFO - step: 14410 loss: 19.0610 memory: 6.46GiB(27.34%) tps: 23,063 tflops: 23.21 mfu: 7.44% global_avg_ntp_loss: 3.2639 global_avg_mtp_loss: 15.7971 +[titan] 2025-06-13 15:31:02,387 - root - INFO - lr: 5.1969e-05 gnorm: 1.36 [ 2:49:31< 0:06:56] +[titan] 2025-06-13 15:31:06,079 - root - INFO - step: 14415 loss: 20.1836 memory: 6.46GiB(27.34%) tps: 22,190 tflops: 22.33 mfu: 7.16% global_avg_ntp_loss: 3.6463 global_avg_mtp_loss: 16.5373 +[titan] 2025-06-13 15:31:06,079 - root - INFO - lr: 5.1936e-05 gnorm: 1.65 [ 2:49:35< 0:06:52] +[titan] 2025-06-13 15:31:10,243 - root - INFO - step: 14420 loss: 18.0978 memory: 6.46GiB(27.34%) tps: 19,677 tflops: 19.80 mfu: 6.35% global_avg_ntp_loss: 3.0944 global_avg_mtp_loss: 15.0035 +[titan] 2025-06-13 15:31:10,243 - root - INFO - lr: 5.1903e-05 gnorm: 1.50 [ 2:49:39< 0:06:49] +[titan] 2025-06-13 15:31:13,568 - root - INFO - step: 14425 loss: 19.4075 memory: 6.46GiB(27.34%) tps: 24,635 tflops: 24.79 mfu: 7.95% global_avg_ntp_loss: 3.2734 global_avg_mtp_loss: 16.1342 +[titan] 2025-06-13 15:31:13,569 - root - INFO - lr: 5.1870e-05 gnorm: 1.27 [ 2:49:42< 0:06:45] +[titan] 2025-06-13 15:31:16,701 - root - INFO - step: 14430 loss: 17.9233 memory: 6.46GiB(27.34%) tps: 26,151 tflops: 26.32 mfu: 8.44% global_avg_ntp_loss: 3.1079 global_avg_mtp_loss: 14.8154 +[titan] 2025-06-13 15:31:16,702 - root - INFO - lr: 5.1838e-05 gnorm: 
1.67 [ 2:49:46< 0:06:42] +[titan] 2025-06-13 15:31:20,323 - root - INFO - step: 14435 loss: 15.0705 memory: 6.46GiB(27.34%) tps: 22,621 tflops: 22.77 mfu: 7.30% global_avg_ntp_loss: 2.5510 global_avg_mtp_loss: 12.5195 +[titan] 2025-06-13 15:31:20,323 - root - INFO - lr: 5.1806e-05 gnorm: 1.36 [ 2:49:49< 0:06:38] +[titan] 2025-06-13 15:31:23,667 - root - INFO - step: 14440 loss: 18.7927 memory: 6.46GiB(27.34%) tps: 24,505 tflops: 24.66 mfu: 7.90% global_avg_ntp_loss: 3.1812 global_avg_mtp_loss: 15.6115 +[titan] 2025-06-13 15:31:23,667 - root - INFO - lr: 5.1774e-05 gnorm: 1.60 [ 2:49:53< 0:06:35] +[titan] 2025-06-13 15:31:27,218 - root - INFO - step: 14445 loss: 19.5929 memory: 6.46GiB(27.34%) tps: 23,072 tflops: 23.22 mfu: 7.44% global_avg_ntp_loss: 3.3662 global_avg_mtp_loss: 16.2267 +[titan] 2025-06-13 15:31:27,218 - root - INFO - lr: 5.1742e-05 gnorm: 1.42 [ 2:49:56< 0:06:31] +[titan] 2025-06-13 15:31:30,195 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:31:30,736 - root - INFO - step: 14450 loss: 18.6961 memory: 6.46GiB(27.34%) tps: 23,284 tflops: 23.43 mfu: 7.51% global_avg_ntp_loss: 3.1787 global_avg_mtp_loss: 15.5174 +[titan] 2025-06-13 15:31:30,737 - root - INFO - lr: 5.1711e-05 gnorm: 1.19 [ 2:50:00< 0:06:28] +[titan] 2025-06-13 15:31:33,924 - root - INFO - step: 14455 loss: 18.5294 memory: 6.46GiB(27.34%) tps: 25,705 tflops: 25.87 mfu: 8.29% global_avg_ntp_loss: 3.1249 global_avg_mtp_loss: 15.4045 +[titan] 2025-06-13 15:31:33,924 - root - INFO - lr: 5.1680e-05 gnorm: 1.29 [ 2:50:03< 0:06:24] +[titan] 2025-06-13 15:31:37,374 - root - INFO - step: 14460 loss: 19.0671 memory: 6.46GiB(27.34%) tps: 23,748 tflops: 23.90 mfu: 7.66% global_avg_ntp_loss: 3.2259 global_avg_mtp_loss: 15.8412 +[titan] 2025-06-13 15:31:37,374 - root - INFO - lr: 5.1650e-05 gnorm: 1.30 [ 2:50:06< 0:06:21] +[titan] 2025-06-13 15:31:41,076 - root - INFO - step: 14465 loss: 20.2886 memory: 6.46GiB(27.34%) tps: 22,130 tflops: 22.27 mfu: 7.14% 
global_avg_ntp_loss: 3.5084 global_avg_mtp_loss: 16.7802 +[titan] 2025-06-13 15:31:41,076 - root - INFO - lr: 5.1619e-05 gnorm: 1.22 [ 2:50:10< 0:06:17] +[titan] 2025-06-13 15:31:44,387 - root - INFO - step: 14470 loss: 18.7169 memory: 6.46GiB(27.34%) tps: 24,744 tflops: 24.90 mfu: 7.98% global_avg_ntp_loss: 3.1645 global_avg_mtp_loss: 15.5524 +[titan] 2025-06-13 15:31:44,387 - root - INFO - lr: 5.1589e-05 gnorm: 1.26 [ 2:50:13< 0:06:14] +[titan] 2025-06-13 15:31:47,518 - root - INFO - step: 14475 loss: 18.1669 memory: 6.46GiB(27.34%) tps: 26,169 tflops: 26.34 mfu: 8.44% global_avg_ntp_loss: 3.1088 global_avg_mtp_loss: 15.0581 +[titan] 2025-06-13 15:31:47,518 - root - INFO - lr: 5.1559e-05 gnorm: 1.29 [ 2:50:16< 0:06:10] +[titan] 2025-06-13 15:31:51,105 - root - INFO - step: 14480 loss: 18.4689 memory: 6.46GiB(27.34%) tps: 22,839 tflops: 22.98 mfu: 7.37% global_avg_ntp_loss: 3.1111 global_avg_mtp_loss: 15.3577 +[titan] 2025-06-13 15:31:51,105 - root - INFO - lr: 5.1530e-05 gnorm: 1.28 [ 2:50:20< 0:06:07] +[titan] 2025-06-13 15:31:54,459 - root - INFO - step: 14485 loss: 18.4220 memory: 6.46GiB(27.34%) tps: 24,429 tflops: 24.58 mfu: 7.88% global_avg_ntp_loss: 3.1587 global_avg_mtp_loss: 15.2633 +[titan] 2025-06-13 15:31:54,460 - root - INFO - lr: 5.1501e-05 gnorm: 1.31 [ 2:50:23< 0:06:03] +[titan] 2025-06-13 15:31:58,000 - root - INFO - step: 14490 loss: 18.4282 memory: 6.46GiB(27.34%) tps: 23,141 tflops: 23.29 mfu: 7.46% global_avg_ntp_loss: 3.1441 global_avg_mtp_loss: 15.2841 +[titan] 2025-06-13 15:31:58,000 - root - INFO - lr: 5.1472e-05 gnorm: 1.29 [ 2:50:27< 0:05:59] +[titan] 2025-06-13 15:32:01,231 - root - INFO - step: 14495 loss: 18.2674 memory: 6.46GiB(27.34%) tps: 25,358 tflops: 25.52 mfu: 8.18% global_avg_ntp_loss: 3.1197 global_avg_mtp_loss: 15.1478 +[titan] 2025-06-13 15:32:01,231 - root - INFO - lr: 5.1443e-05 gnorm: 1.34 [ 2:50:30< 0:05:56] +[titan] 2025-06-13 15:32:04,049 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:32:04,810 - root - INFO - step: 14500 loss: 19.7463 memory: 6.46GiB(27.34%) tps: 22,889 tflops: 23.03 mfu: 7.38% global_avg_ntp_loss: 3.3875 global_avg_mtp_loss: 16.3588 +[titan] 2025-06-13 15:32:04,811 - root - INFO - lr: 5.1415e-05 gnorm: 1.29 [ 2:50:34< 0:05:52] +[titan] 2025-06-13 15:32:08,189 - root - INFO - step: 14505 loss: 18.4844 memory: 6.46GiB(27.34%) tps: 24,247 tflops: 24.40 mfu: 7.82% global_avg_ntp_loss: 3.1256 global_avg_mtp_loss: 15.3589 +[titan] 2025-06-13 15:32:08,190 - root - INFO - lr: 5.1386e-05 gnorm: 1.40 [ 2:50:37< 0:05:49] +[titan] 2025-06-13 15:32:11,464 - root - INFO - step: 14510 loss: 18.9813 memory: 6.46GiB(27.34%) tps: 25,021 tflops: 25.18 mfu: 8.07% global_avg_ntp_loss: 3.2307 global_avg_mtp_loss: 15.7506 +[titan] 2025-06-13 15:32:11,464 - root - INFO - lr: 5.1359e-05 gnorm: 1.27 [ 2:50:40< 0:05:45] +[titan] 2025-06-13 15:32:14,798 - root - INFO - step: 14515 loss: 19.4480 memory: 6.46GiB(27.34%) tps: 24,572 tflops: 24.73 mfu: 7.93% global_avg_ntp_loss: 3.3337 global_avg_mtp_loss: 16.1143 +[titan] 2025-06-13 15:32:14,799 - root - INFO - lr: 5.1331e-05 gnorm: 1.28 [ 2:50:44< 0:05:42] +[titan] 2025-06-13 15:32:18,228 - root - INFO - step: 14520 loss: 18.5908 memory: 6.46GiB(27.34%) tps: 23,887 tflops: 24.04 mfu: 7.71% global_avg_ntp_loss: 3.1488 global_avg_mtp_loss: 15.4420 +[titan] 2025-06-13 15:32:18,228 - root - INFO - lr: 5.1304e-05 gnorm: 1.34 [ 2:50:47< 0:05:38] +[titan] 2025-06-13 15:32:21,815 - root - INFO - step: 14525 loss: 18.5469 memory: 6.46GiB(27.34%) tps: 22,840 tflops: 22.99 mfu: 7.37% global_avg_ntp_loss: 3.2027 global_avg_mtp_loss: 15.3442 +[titan] 2025-06-13 15:32:21,815 - root - INFO - lr: 5.1277e-05 gnorm: 1.25 [ 2:50:51< 0:05:35] +[titan] 2025-06-13 15:32:25,244 - root - INFO - step: 14530 loss: 16.8393 memory: 6.46GiB(27.34%) tps: 23,896 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 2.8095 global_avg_mtp_loss: 14.0298 +[titan] 2025-06-13 15:32:25,245 - root - INFO - lr: 5.1250e-05 gnorm: 
1.36 [ 2:50:54< 0:05:31] +[titan] 2025-06-13 15:32:28,684 - root - INFO - step: 14535 loss: 18.8263 memory: 6.46GiB(27.34%) tps: 23,822 tflops: 23.97 mfu: 7.68% global_avg_ntp_loss: 3.1916 global_avg_mtp_loss: 15.6348 +[titan] 2025-06-13 15:32:28,684 - root - INFO - lr: 5.1224e-05 gnorm: 1.46 [ 2:50:58< 0:05:28] +[titan] 2025-06-13 15:32:32,428 - root - INFO - step: 14540 loss: 18.1756 memory: 6.46GiB(27.34%) tps: 21,881 tflops: 22.02 mfu: 7.06% global_avg_ntp_loss: 3.0445 global_avg_mtp_loss: 15.1311 +[titan] 2025-06-13 15:32:32,428 - root - INFO - lr: 5.1197e-05 gnorm: 1.41 [ 2:51:01< 0:05:24] +[titan] 2025-06-13 15:32:35,919 - root - INFO - step: 14545 loss: 18.5471 memory: 6.46GiB(27.34%) tps: 23,469 tflops: 23.62 mfu: 7.57% global_avg_ntp_loss: 3.1498 global_avg_mtp_loss: 15.3973 +[titan] 2025-06-13 15:32:35,919 - root - INFO - lr: 5.1172e-05 gnorm: 1.51 [ 2:51:05< 0:05:21] +[titan] 2025-06-13 15:32:38,974 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:32:39,604 - root - INFO - step: 14550 loss: 18.2247 memory: 6.46GiB(27.34%) tps: 22,232 tflops: 22.37 mfu: 7.17% global_avg_ntp_loss: 3.0590 global_avg_mtp_loss: 15.1657 +[titan] 2025-06-13 15:32:39,604 - root - INFO - lr: 5.1146e-05 gnorm: 1.38 [ 2:51:09< 0:05:17] +[titan] 2025-06-13 15:32:43,111 - root - INFO - step: 14555 loss: 18.6154 memory: 6.46GiB(27.34%) tps: 23,361 tflops: 23.51 mfu: 7.54% global_avg_ntp_loss: 3.1934 global_avg_mtp_loss: 15.4220 +[titan] 2025-06-13 15:32:43,112 - root - INFO - lr: 5.1121e-05 gnorm: 1.45 [ 2:51:12< 0:05:14] +[titan] 2025-06-13 15:32:46,595 - root - INFO - step: 14560 loss: 18.3393 memory: 6.46GiB(27.34%) tps: 23,516 tflops: 23.67 mfu: 7.59% global_avg_ntp_loss: 3.1537 global_avg_mtp_loss: 15.1856 +[titan] 2025-06-13 15:32:46,596 - root - INFO - lr: 5.1096e-05 gnorm: 1.35 [ 2:51:16< 0:05:10] +[titan] 2025-06-13 15:32:50,596 - root - INFO - step: 14565 loss: 19.1344 memory: 6.46GiB(27.34%) tps: 20,476 tflops: 20.61 mfu: 6.60% 
global_avg_ntp_loss: 3.2693 global_avg_mtp_loss: 15.8651 +[titan] 2025-06-13 15:32:50,597 - root - INFO - lr: 5.1071e-05 gnorm: 1.25 [ 2:51:20< 0:05:07] +[titan] 2025-06-13 15:32:53,920 - root - INFO - step: 14570 loss: 18.2664 memory: 6.46GiB(27.34%) tps: 24,648 tflops: 24.81 mfu: 7.95% global_avg_ntp_loss: 3.0841 global_avg_mtp_loss: 15.1822 +[titan] 2025-06-13 15:32:53,921 - root - INFO - lr: 5.1046e-05 gnorm: 1.34 [ 2:51:23< 0:05:03] +[titan] 2025-06-13 15:32:57,710 - root - INFO - step: 14575 loss: 20.4129 memory: 6.46GiB(27.34%) tps: 21,622 tflops: 21.76 mfu: 6.97% global_avg_ntp_loss: 3.5176 global_avg_mtp_loss: 16.8953 +[titan] 2025-06-13 15:32:57,710 - root - INFO - lr: 5.1022e-05 gnorm: 1.27 [ 2:51:27< 0:04:59] +[titan] 2025-06-13 15:33:00,777 - root - INFO - step: 14580 loss: 19.8770 memory: 6.46GiB(27.34%) tps: 26,717 tflops: 26.89 mfu: 8.62% global_avg_ntp_loss: 3.4072 global_avg_mtp_loss: 16.4698 +[titan] 2025-06-13 15:33:00,777 - root - INFO - lr: 5.0998e-05 gnorm: 1.32 [ 2:51:30< 0:04:56] +[titan] 2025-06-13 15:33:04,212 - root - INFO - step: 14585 loss: 18.3584 memory: 6.46GiB(27.34%) tps: 23,848 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.1008 global_avg_mtp_loss: 15.2577 +[titan] 2025-06-13 15:33:04,212 - root - INFO - lr: 5.0975e-05 gnorm: 1.43 [ 2:51:33< 0:04:52] +[titan] 2025-06-13 15:33:07,338 - root - INFO - step: 14590 loss: 18.1359 memory: 6.46GiB(27.34%) tps: 26,213 tflops: 26.38 mfu: 8.46% global_avg_ntp_loss: 3.0650 global_avg_mtp_loss: 15.0709 +[titan] 2025-06-13 15:33:07,338 - root - INFO - lr: 5.0951e-05 gnorm: 1.44 [ 2:51:36< 0:04:49] +[titan] 2025-06-13 15:33:10,760 - root - INFO - step: 14595 loss: 19.1792 memory: 6.46GiB(27.34%) tps: 23,937 tflops: 24.09 mfu: 7.72% global_avg_ntp_loss: 3.3298 global_avg_mtp_loss: 15.8495 +[titan] 2025-06-13 15:33:10,760 - root - INFO - lr: 5.0928e-05 gnorm: 1.32 [ 2:51:40< 0:04:45] +[titan] 2025-06-13 15:33:13,530 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:33:14,374 - root - INFO - step: 14600 loss: 17.8864 memory: 6.46GiB(27.34%) tps: 22,669 tflops: 22.81 mfu: 7.31% global_avg_ntp_loss: 3.0538 global_avg_mtp_loss: 14.8326 +[titan] 2025-06-13 15:33:14,375 - root - INFO - lr: 5.0906e-05 gnorm: 1.39 [ 2:51:43< 0:04:42] +[titan] 2025-06-13 15:33:17,717 - root - INFO - step: 14605 loss: 18.1292 memory: 6.46GiB(27.34%) tps: 24,512 tflops: 24.67 mfu: 7.91% global_avg_ntp_loss: 3.0609 global_avg_mtp_loss: 15.0683 +[titan] 2025-06-13 15:33:17,717 - root - INFO - lr: 5.0883e-05 gnorm: 1.32 [ 2:51:47< 0:04:38] +[titan] 2025-06-13 15:33:21,221 - root - INFO - step: 14610 loss: 19.1981 memory: 6.46GiB(27.34%) tps: 23,382 tflops: 23.53 mfu: 7.54% global_avg_ntp_loss: 3.2708 global_avg_mtp_loss: 15.9273 +[titan] 2025-06-13 15:33:21,221 - root - INFO - lr: 5.0861e-05 gnorm: 1.39 [ 2:51:50< 0:04:35] +[titan] 2025-06-13 15:33:24,808 - root - INFO - step: 14615 loss: 19.7179 memory: 6.46GiB(27.34%) tps: 22,835 tflops: 22.98 mfu: 7.37% global_avg_ntp_loss: 3.3729 global_avg_mtp_loss: 16.3451 +[titan] 2025-06-13 15:33:24,809 - root - INFO - lr: 5.0839e-05 gnorm: 1.32 [ 2:51:54< 0:04:31] +[titan] 2025-06-13 15:33:28,097 - root - INFO - step: 14620 loss: 16.6484 memory: 6.46GiB(27.34%) tps: 24,915 tflops: 25.07 mfu: 8.04% global_avg_ntp_loss: 2.8563 global_avg_mtp_loss: 13.7921 +[titan] 2025-06-13 15:33:28,097 - root - INFO - lr: 5.0817e-05 gnorm: 1.37 [ 2:51:57< 0:04:28] +[titan] 2025-06-13 15:33:31,716 - root - INFO - step: 14625 loss: 19.2003 memory: 6.46GiB(27.34%) tps: 22,639 tflops: 22.78 mfu: 7.30% global_avg_ntp_loss: 3.2373 global_avg_mtp_loss: 15.9630 +[titan] 2025-06-13 15:33:31,716 - root - INFO - lr: 5.0796e-05 gnorm: 1.35 [ 2:52:01< 0:04:24] +[titan] 2025-06-13 15:33:35,415 - root - INFO - step: 14630 loss: 18.2708 memory: 6.46GiB(27.34%) tps: 22,148 tflops: 22.29 mfu: 7.14% global_avg_ntp_loss: 3.1145 global_avg_mtp_loss: 15.1563 +[titan] 2025-06-13 15:33:35,415 - root - INFO - lr: 5.0775e-05 gnorm: 
1.32 [ 2:52:04< 0:04:21] +[titan] 2025-06-13 15:33:38,685 - root - INFO - step: 14635 loss: 18.9882 memory: 6.46GiB(27.34%) tps: 25,056 tflops: 25.22 mfu: 8.08% global_avg_ntp_loss: 3.2505 global_avg_mtp_loss: 15.7377 +[titan] 2025-06-13 15:33:38,685 - root - INFO - lr: 5.0754e-05 gnorm: 1.31 [ 2:52:08< 0:04:17] +[titan] 2025-06-13 15:33:41,946 - root - INFO - step: 14640 loss: 18.2677 memory: 6.46GiB(27.34%) tps: 25,122 tflops: 25.28 mfu: 8.10% global_avg_ntp_loss: 3.1647 global_avg_mtp_loss: 15.1029 +[titan] 2025-06-13 15:33:41,947 - root - INFO - lr: 5.0734e-05 gnorm: 1.48 [ 2:52:11< 0:04:14] +[titan] 2025-06-13 15:33:45,359 - root - INFO - step: 14645 loss: 20.2683 memory: 6.46GiB(27.34%) tps: 24,010 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.5224 global_avg_mtp_loss: 16.7459 +[titan] 2025-06-13 15:33:45,359 - root - INFO - lr: 5.0713e-05 gnorm: 1.33 [ 2:52:14< 0:04:10] +[titan] 2025-06-13 15:33:48,254 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:33:48,994 - root - INFO - step: 14650 loss: 19.4536 memory: 6.46GiB(27.34%) tps: 22,537 tflops: 22.68 mfu: 7.27% global_avg_ntp_loss: 3.3142 global_avg_mtp_loss: 16.1394 +[titan] 2025-06-13 15:33:48,994 - root - INFO - lr: 5.0694e-05 gnorm: 1.32 [ 2:52:18< 0:04:06] +[titan] 2025-06-13 15:33:52,326 - root - INFO - step: 14655 loss: 18.3276 memory: 6.46GiB(27.34%) tps: 24,587 tflops: 24.74 mfu: 7.93% global_avg_ntp_loss: 3.1302 global_avg_mtp_loss: 15.1974 +[titan] 2025-06-13 15:33:52,327 - root - INFO - lr: 5.0674e-05 gnorm: 1.30 [ 2:52:21< 0:04:03] +[titan] 2025-06-13 15:33:55,734 - root - INFO - step: 14660 loss: 19.1640 memory: 6.46GiB(27.34%) tps: 24,043 tflops: 24.20 mfu: 7.76% global_avg_ntp_loss: 3.2951 global_avg_mtp_loss: 15.8689 +[titan] 2025-06-13 15:33:55,734 - root - INFO - lr: 5.0654e-05 gnorm: 1.30 [ 2:52:25< 0:03:59] +[titan] 2025-06-13 15:33:59,149 - root - INFO - step: 14665 loss: 19.1488 memory: 6.46GiB(27.34%) tps: 23,989 tflops: 24.14 mfu: 7.74% 
global_avg_ntp_loss: 3.2285 global_avg_mtp_loss: 15.9203 +[titan] 2025-06-13 15:33:59,150 - root - INFO - lr: 5.0635e-05 gnorm: 1.28 [ 2:52:28< 0:03:56] +[titan] 2025-06-13 15:34:02,440 - root - INFO - step: 14670 loss: 18.1341 memory: 6.46GiB(27.34%) tps: 24,899 tflops: 25.06 mfu: 8.03% global_avg_ntp_loss: 3.0887 global_avg_mtp_loss: 15.0454 +[titan] 2025-06-13 15:34:02,440 - root - INFO - lr: 5.0617e-05 gnorm: 1.27 [ 2:52:31< 0:03:52] +[titan] 2025-06-13 15:34:05,852 - root - INFO - step: 14675 loss: 19.1176 memory: 6.46GiB(27.34%) tps: 24,010 tflops: 24.16 mfu: 7.74% global_avg_ntp_loss: 3.2404 global_avg_mtp_loss: 15.8772 +[titan] 2025-06-13 15:34:05,852 - root - INFO - lr: 5.0598e-05 gnorm: 1.30 [ 2:52:35< 0:03:49] +[titan] 2025-06-13 15:34:09,256 - root - INFO - step: 14680 loss: 19.0028 memory: 6.46GiB(27.34%) tps: 24,071 tflops: 24.22 mfu: 7.76% global_avg_ntp_loss: 3.2380 global_avg_mtp_loss: 15.7647 +[titan] 2025-06-13 15:34:09,256 - root - INFO - lr: 5.0580e-05 gnorm: 1.30 [ 2:52:38< 0:03:45] +[titan] 2025-06-13 15:34:12,850 - root - INFO - step: 14685 loss: 18.5371 memory: 6.46GiB(27.34%) tps: 22,798 tflops: 22.94 mfu: 7.35% global_avg_ntp_loss: 3.1785 global_avg_mtp_loss: 15.3587 +[titan] 2025-06-13 15:34:12,850 - root - INFO - lr: 5.0562e-05 gnorm: 1.62 [ 2:52:42< 0:03:42] +[titan] 2025-06-13 15:34:16,277 - root - INFO - step: 14690 loss: 18.6031 memory: 6.46GiB(27.34%) tps: 23,906 tflops: 24.06 mfu: 7.71% global_avg_ntp_loss: 3.1724 global_avg_mtp_loss: 15.4307 +[titan] 2025-06-13 15:34:16,277 - root - INFO - lr: 5.0544e-05 gnorm: 1.36 [ 2:52:45< 0:03:38] +[titan] 2025-06-13 15:34:20,228 - root - INFO - step: 14695 loss: 18.7767 memory: 6.46GiB(27.34%) tps: 20,737 tflops: 20.87 mfu: 6.69% global_avg_ntp_loss: 3.2140 global_avg_mtp_loss: 15.5627 +[titan] 2025-06-13 15:34:20,228 - root - INFO - lr: 5.0527e-05 gnorm: 1.33 [ 2:52:49< 0:03:35] +[titan] 2025-06-13 15:34:23,070 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:34:23,768 - root - INFO - step: 14700 loss: 18.4061 memory: 6.46GiB(27.34%) tps: 23,144 tflops: 23.29 mfu: 7.47% global_avg_ntp_loss: 3.0995 global_avg_mtp_loss: 15.3066 +[titan] 2025-06-13 15:34:23,768 - root - INFO - lr: 5.0510e-05 gnorm: 1.31 [ 2:52:53< 0:03:31] +[titan] 2025-06-13 15:34:27,154 - root - INFO - step: 14705 loss: 18.7921 memory: 6.46GiB(27.34%) tps: 24,194 tflops: 24.35 mfu: 7.80% global_avg_ntp_loss: 3.2246 global_avg_mtp_loss: 15.5676 +[titan] 2025-06-13 15:34:27,154 - root - INFO - lr: 5.0493e-05 gnorm: 1.38 [ 2:52:56< 0:03:28] +[titan] 2025-06-13 15:34:30,893 - root - INFO - step: 14710 loss: 19.0628 memory: 6.46GiB(27.34%) tps: 21,912 tflops: 22.05 mfu: 7.07% global_avg_ntp_loss: 3.2405 global_avg_mtp_loss: 15.8222 +[titan] 2025-06-13 15:34:30,894 - root - INFO - lr: 5.0476e-05 gnorm: 1.29 [ 2:53:00< 0:03:24] +[titan] 2025-06-13 15:34:34,302 - root - INFO - step: 14715 loss: 18.6360 memory: 6.46GiB(27.34%) tps: 24,038 tflops: 24.19 mfu: 7.75% global_avg_ntp_loss: 3.1823 global_avg_mtp_loss: 15.4537 +[titan] 2025-06-13 15:34:34,302 - root - INFO - lr: 5.0460e-05 gnorm: 1.22 [ 2:53:03< 0:03:21] +[titan] 2025-06-13 15:34:37,696 - root - INFO - step: 14720 loss: 19.1368 memory: 6.46GiB(27.34%) tps: 24,140 tflops: 24.29 mfu: 7.79% global_avg_ntp_loss: 3.2431 global_avg_mtp_loss: 15.8937 +[titan] 2025-06-13 15:34:37,696 - root - INFO - lr: 5.0444e-05 gnorm: 1.28 [ 2:53:07< 0:03:17] +[titan] 2025-06-13 15:34:40,951 - root - INFO - step: 14725 loss: 19.1444 memory: 6.46GiB(27.34%) tps: 25,167 tflops: 25.33 mfu: 8.12% global_avg_ntp_loss: 3.2915 global_avg_mtp_loss: 15.8529 +[titan] 2025-06-13 15:34:40,951 - root - INFO - lr: 5.0428e-05 gnorm: 1.26 [ 2:53:10< 0:03:14] +[titan] 2025-06-13 15:34:44,537 - root - INFO - step: 14730 loss: 18.9410 memory: 6.46GiB(27.34%) tps: 22,850 tflops: 23.00 mfu: 7.37% global_avg_ntp_loss: 3.2505 global_avg_mtp_loss: 15.6904 +[titan] 2025-06-13 15:34:44,537 - root - INFO - lr: 5.0413e-05 gnorm: 
1.33 [ 2:53:13< 0:03:10] +[titan] 2025-06-13 15:34:47,921 - root - INFO - step: 14735 loss: 15.5297 memory: 6.46GiB(27.34%) tps: 24,204 tflops: 24.36 mfu: 7.81% global_avg_ntp_loss: 2.6003 global_avg_mtp_loss: 12.9295 +[titan] 2025-06-13 15:34:47,922 - root - INFO - lr: 5.0398e-05 gnorm: 1.54 [ 2:53:17< 0:03:06] +[titan] 2025-06-13 15:34:51,380 - root - INFO - step: 14740 loss: 19.7592 memory: 6.46GiB(27.34%) tps: 23,689 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 3.4020 global_avg_mtp_loss: 16.3572 +[titan] 2025-06-13 15:34:51,380 - root - INFO - lr: 5.0383e-05 gnorm: 1.48 [ 2:53:20< 0:03:03] +[titan] 2025-06-13 15:34:54,586 - root - INFO - step: 14745 loss: 19.5425 memory: 6.46GiB(27.34%) tps: 25,552 tflops: 25.71 mfu: 8.24% global_avg_ntp_loss: 3.3115 global_avg_mtp_loss: 16.2310 +[titan] 2025-06-13 15:34:54,587 - root - INFO - lr: 5.0368e-05 gnorm: 1.30 [ 2:53:23< 0:02:59] +[titan] 2025-06-13 15:34:57,455 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:34:58,341 - root - INFO - step: 14750 loss: 17.6645 memory: 6.46GiB(27.34%) tps: 21,824 tflops: 21.96 mfu: 7.04% global_avg_ntp_loss: 3.0251 global_avg_mtp_loss: 14.6394 +[titan] 2025-06-13 15:34:58,341 - root - INFO - lr: 5.0354e-05 gnorm: 1.36 [ 2:53:27< 0:02:56] +[titan] 2025-06-13 15:35:01,876 - root - INFO - step: 14755 loss: 19.6918 memory: 6.46GiB(27.34%) tps: 23,176 tflops: 23.32 mfu: 7.48% global_avg_ntp_loss: 3.3719 global_avg_mtp_loss: 16.3200 +[titan] 2025-06-13 15:35:01,876 - root - INFO - lr: 5.0340e-05 gnorm: 1.30 [ 2:53:31< 0:02:52] +[titan] 2025-06-13 15:35:05,308 - root - INFO - step: 14760 loss: 17.9921 memory: 6.46GiB(27.34%) tps: 23,872 tflops: 24.02 mfu: 7.70% global_avg_ntp_loss: 3.1350 global_avg_mtp_loss: 14.8571 +[titan] 2025-06-13 15:35:05,308 - root - INFO - lr: 5.0326e-05 gnorm: 1.44 [ 2:53:34< 0:02:49] +[titan] 2025-06-13 15:35:08,615 - root - INFO - step: 14765 loss: 18.7502 memory: 6.46GiB(27.34%) tps: 24,772 tflops: 24.93 mfu: 7.99% 
global_avg_ntp_loss: 3.1892 global_avg_mtp_loss: 15.5610 +[titan] 2025-06-13 15:35:08,616 - root - INFO - lr: 5.0313e-05 gnorm: 1.32 [ 2:53:38< 0:02:45] +[titan] 2025-06-13 15:35:12,126 - root - INFO - step: 14770 loss: 19.7836 memory: 6.46GiB(27.34%) tps: 23,335 tflops: 23.48 mfu: 7.53% global_avg_ntp_loss: 3.4206 global_avg_mtp_loss: 16.3630 +[titan] 2025-06-13 15:35:12,127 - root - INFO - lr: 5.0300e-05 gnorm: 1.28 [ 2:53:41< 0:02:42] +[titan] 2025-06-13 15:35:15,555 - root - INFO - step: 14775 loss: 16.9907 memory: 6.46GiB(27.34%) tps: 23,893 tflops: 24.04 mfu: 7.71% global_avg_ntp_loss: 2.8600 global_avg_mtp_loss: 14.1307 +[titan] 2025-06-13 15:35:15,556 - root - INFO - lr: 5.0287e-05 gnorm: 1.37 [ 2:53:44< 0:02:38] +[titan] 2025-06-13 15:35:18,908 - root - INFO - step: 14780 loss: 18.6731 memory: 6.46GiB(27.34%) tps: 24,439 tflops: 24.59 mfu: 7.88% global_avg_ntp_loss: 3.2213 global_avg_mtp_loss: 15.4517 +[titan] 2025-06-13 15:35:18,908 - root - INFO - lr: 5.0274e-05 gnorm: 1.35 [ 2:53:48< 0:02:35] +[titan] 2025-06-13 15:35:22,511 - root - INFO - step: 14785 loss: 18.8922 memory: 6.46GiB(27.34%) tps: 22,740 tflops: 22.88 mfu: 7.33% global_avg_ntp_loss: 3.2273 global_avg_mtp_loss: 15.6649 +[titan] 2025-06-13 15:35:22,511 - root - INFO - lr: 5.0262e-05 gnorm: 1.22 [ 2:53:51< 0:02:31] +[titan] 2025-06-13 15:35:26,156 - root - INFO - step: 14790 loss: 18.0318 memory: 6.46GiB(27.34%) tps: 22,473 tflops: 22.62 mfu: 7.25% global_avg_ntp_loss: 3.0488 global_avg_mtp_loss: 14.9830 +[titan] 2025-06-13 15:35:26,157 - root - INFO - lr: 5.0250e-05 gnorm: 1.35 [ 2:53:55< 0:02:28] +[titan] 2025-06-13 15:35:29,967 - root - INFO - step: 14795 loss: 18.5930 memory: 6.46GiB(27.34%) tps: 21,500 tflops: 21.64 mfu: 6.93% global_avg_ntp_loss: 3.1499 global_avg_mtp_loss: 15.4430 +[titan] 2025-06-13 15:35:29,967 - root - INFO - lr: 5.0238e-05 gnorm: 1.23 [ 2:53:59< 0:02:24] +[titan] 2025-06-13 15:35:32,687 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:35:33,259 - root - INFO - step: 14800 loss: 17.4944 memory: 6.46GiB(27.34%) tps: 24,886 tflops: 25.05 mfu: 8.03% global_avg_ntp_loss: 2.9533 global_avg_mtp_loss: 14.5411 +[titan] 2025-06-13 15:35:33,260 - root - INFO - lr: 5.0227e-05 gnorm: 1.39 [ 2:54:02< 0:02:21] +[titan] 2025-06-13 15:35:36,995 - root - INFO - step: 14805 loss: 18.1252 memory: 6.46GiB(27.34%) tps: 21,933 tflops: 22.07 mfu: 7.07% global_avg_ntp_loss: 3.1128 global_avg_mtp_loss: 15.0124 +[titan] 2025-06-13 15:35:36,995 - root - INFO - lr: 5.0215e-05 gnorm: 1.26 [ 2:54:06< 0:02:17] +[titan] 2025-06-13 15:35:40,352 - root - INFO - step: 14810 loss: 17.9173 memory: 6.46GiB(27.34%) tps: 24,402 tflops: 24.56 mfu: 7.87% global_avg_ntp_loss: 2.9973 global_avg_mtp_loss: 14.9199 +[titan] 2025-06-13 15:35:40,353 - root - INFO - lr: 5.0204e-05 gnorm: 1.48 [ 2:54:09< 0:02:14] +[titan] 2025-06-13 15:35:43,790 - root - INFO - step: 14815 loss: 18.8740 memory: 6.46GiB(27.34%) tps: 23,832 tflops: 23.98 mfu: 7.69% global_avg_ntp_loss: 3.2388 global_avg_mtp_loss: 15.6351 +[titan] 2025-06-13 15:35:43,790 - root - INFO - lr: 5.0194e-05 gnorm: 1.41 [ 2:54:13< 0:02:10] +[titan] 2025-06-13 15:35:46,872 - root - INFO - step: 14820 loss: 19.3129 memory: 6.46GiB(27.34%) tps: 26,581 tflops: 26.75 mfu: 8.57% global_avg_ntp_loss: 3.2731 global_avg_mtp_loss: 16.0397 +[titan] 2025-06-13 15:35:46,873 - root - INFO - lr: 5.0183e-05 gnorm: 1.35 [ 2:54:16< 0:02:06] +[titan] 2025-06-13 15:35:50,804 - root - INFO - step: 14825 loss: 18.7904 memory: 6.46GiB(27.34%) tps: 20,840 tflops: 20.97 mfu: 6.72% global_avg_ntp_loss: 3.1942 global_avg_mtp_loss: 15.5962 +[titan] 2025-06-13 15:35:50,804 - root - INFO - lr: 5.0173e-05 gnorm: 1.25 [ 2:54:20< 0:02:03] +[titan] 2025-06-13 15:35:54,193 - root - INFO - step: 14830 loss: 19.0661 memory: 6.46GiB(27.34%) tps: 24,173 tflops: 24.33 mfu: 7.80% global_avg_ntp_loss: 3.2658 global_avg_mtp_loss: 15.8003 +[titan] 2025-06-13 15:35:54,193 - root - INFO - lr: 5.0164e-05 gnorm: 
1.44 [ 2:54:23< 0:01:59] +[titan] 2025-06-13 15:35:57,791 - root - INFO - step: 14835 loss: 18.2615 memory: 6.46GiB(27.34%) tps: 22,772 tflops: 22.92 mfu: 7.35% global_avg_ntp_loss: 3.1037 global_avg_mtp_loss: 15.1578 +[titan] 2025-06-13 15:35:57,791 - root - INFO - lr: 5.0154e-05 gnorm: 1.32 [ 2:54:27< 0:01:56] +[titan] 2025-06-13 15:36:00,855 - root - INFO - step: 14840 loss: 18.7027 memory: 6.46GiB(27.34%) tps: 26,736 tflops: 26.91 mfu: 8.62% global_avg_ntp_loss: 3.1941 global_avg_mtp_loss: 15.5086 +[titan] 2025-06-13 15:36:00,855 - root - INFO - lr: 5.0145e-05 gnorm: 1.39 [ 2:54:30< 0:01:52] +[titan] 2025-06-13 15:36:04,285 - root - INFO - step: 14845 loss: 19.4550 memory: 6.46GiB(27.34%) tps: 23,885 tflops: 24.04 mfu: 7.70% global_avg_ntp_loss: 3.3543 global_avg_mtp_loss: 16.1007 +[titan] 2025-06-13 15:36:04,286 - root - INFO - lr: 5.0136e-05 gnorm: 1.26 [ 2:54:33< 0:01:49] +[titan] 2025-06-13 15:36:06,507 - root - INFO - Dumping profiler traces at step 14848 +[titan] 2025-06-13 15:36:06,603 - root - INFO - Finished dumping profiler traces in 0.10 seconds +[titan] 2025-06-13 15:36:07,278 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:36:07,939 - root - INFO - step: 14850 loss: 17.3363 memory: 6.46GiB(27.34%) tps: 22,426 tflops: 22.57 mfu: 7.23% global_avg_ntp_loss: 2.8996 global_avg_mtp_loss: 14.4367 +[titan] 2025-06-13 15:36:07,939 - root - INFO - lr: 5.0127e-05 gnorm: 1.33 [ 2:54:37< 0:01:45] +[titan] 2025-06-13 15:36:11,175 - root - INFO - step: 14855 loss: 19.7931 memory: 6.46GiB(27.34%) tps: 25,314 tflops: 25.48 mfu: 8.17% global_avg_ntp_loss: 3.4279 global_avg_mtp_loss: 16.3652 +[titan] 2025-06-13 15:36:11,175 - root - INFO - lr: 5.0119e-05 gnorm: 1.52 [ 2:54:40< 0:01:42] +[titan] 2025-06-13 15:36:14,649 - root - INFO - step: 14860 loss: 18.8759 memory: 6.46GiB(27.34%) tps: 23,586 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.2216 global_avg_mtp_loss: 15.6543 +[titan] 2025-06-13 15:36:14,649 - root - INFO - lr: 5.0111e-05 gnorm: 1.23 [ 2:54:44< 0:01:38] +[titan] 2025-06-13 15:36:18,097 - root - INFO - step: 14865 loss: 19.5665 memory: 6.46GiB(27.34%) tps: 23,761 tflops: 23.91 mfu: 7.66% global_avg_ntp_loss: 3.3082 global_avg_mtp_loss: 16.2583 +[titan] 2025-06-13 15:36:18,097 - root - INFO - lr: 5.0103e-05 gnorm: 1.39 [ 2:54:47< 0:01:35] +[titan] 2025-06-13 15:36:21,725 - root - INFO - step: 14870 loss: 19.3908 memory: 6.46GiB(27.34%) tps: 22,583 tflops: 22.73 mfu: 7.28% global_avg_ntp_loss: 3.3054 global_avg_mtp_loss: 16.0854 +[titan] 2025-06-13 15:36:21,725 - root - INFO - lr: 5.0096e-05 gnorm: 1.39 [ 2:54:51< 0:01:31] +[titan] 2025-06-13 15:36:25,096 - root - INFO - step: 14875 loss: 17.8043 memory: 6.46GiB(27.34%) tps: 24,303 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 3.0640 global_avg_mtp_loss: 14.7403 +[titan] 2025-06-13 15:36:25,096 - root - INFO - lr: 5.0088e-05 gnorm: 1.30 [ 2:54:54< 0:01:28] +[titan] 2025-06-13 15:36:28,468 - root - INFO - step: 14880 loss: 18.2535 memory: 6.46GiB(27.34%) tps: 24,300 tflops: 24.46 mfu: 7.84% global_avg_ntp_loss: 3.0965 global_avg_mtp_loss: 15.1570 +[titan] 2025-06-13 15:36:28,468 - root - INFO - lr: 5.0082e-05 gnorm: 
1.24 [ 2:54:57< 0:01:24] +[titan] 2025-06-13 15:36:32,147 - root - INFO - step: 14885 loss: 19.7476 memory: 6.46GiB(27.34%) tps: 22,271 tflops: 22.41 mfu: 7.18% global_avg_ntp_loss: 3.3538 global_avg_mtp_loss: 16.3937 +[titan] 2025-06-13 15:36:32,147 - root - INFO - lr: 5.0075e-05 gnorm: 1.26 [ 2:55:01< 0:01:21] +[titan] 2025-06-13 15:36:35,440 - root - INFO - step: 14890 loss: 17.8268 memory: 6.46GiB(27.34%) tps: 24,879 tflops: 25.04 mfu: 8.02% global_avg_ntp_loss: 3.0474 global_avg_mtp_loss: 14.7795 +[titan] 2025-06-13 15:36:35,440 - root - INFO - lr: 5.0069e-05 gnorm: 1.39 [ 2:55:04< 0:01:17] +[titan] 2025-06-13 15:36:39,304 - root - INFO - step: 14895 loss: 19.1404 memory: 6.46GiB(27.34%) tps: 21,203 tflops: 21.34 mfu: 6.84% global_avg_ntp_loss: 3.2467 global_avg_mtp_loss: 15.8938 +[titan] 2025-06-13 15:36:39,304 - root - INFO - lr: 5.0062e-05 gnorm: 1.32 [ 2:55:08< 0:01:14] +[titan] 2025-06-13 15:36:42,149 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:36:42,697 - root - INFO - step: 14900 loss: 19.3225 memory: 6.46GiB(27.34%) tps: 24,151 tflops: 24.30 mfu: 7.79% global_avg_ntp_loss: 3.2664 global_avg_mtp_loss: 16.0562 +[titan] 2025-06-13 15:36:42,697 - root - INFO - lr: 5.0057e-05 gnorm: 1.29 [ 2:55:12< 0:01:10] +[titan] 2025-06-13 15:36:45,979 - root - INFO - step: 14905 loss: 18.7692 memory: 6.46GiB(27.34%) tps: 24,962 tflops: 25.12 mfu: 8.05% global_avg_ntp_loss: 3.2101 global_avg_mtp_loss: 15.5592 +[titan] 2025-06-13 15:36:45,979 - root - INFO - lr: 5.0051e-05 gnorm: 1.34 [ 2:55:15< 0:01:07] +[titan] 2025-06-13 15:36:49,691 - root - INFO - step: 14910 loss: 16.9041 memory: 6.46GiB(27.34%) tps: 22,068 tflops: 22.21 mfu: 7.12% global_avg_ntp_loss: 2.8682 global_avg_mtp_loss: 14.0359 +[titan] 2025-06-13 15:36:49,692 - root - INFO - lr: 5.0046e-05 gnorm: 1.49 [ 2:55:19< 0:01:03] +[titan] 2025-06-13 15:36:53,681 - root - INFO - step: 14915 loss: 20.2626 memory: 6.46GiB(27.34%) tps: 20,536 tflops: 20.67 mfu: 6.62% 
global_avg_ntp_loss: 3.4488 global_avg_mtp_loss: 16.8138 +[titan] 2025-06-13 15:36:53,681 - root - INFO - lr: 5.0041e-05 gnorm: 1.24 [ 2:55:23< 0:00:59] +[titan] 2025-06-13 15:36:56,667 - root - INFO - step: 14920 loss: 18.3838 memory: 6.46GiB(27.34%) tps: 27,435 tflops: 27.61 mfu: 8.85% global_avg_ntp_loss: 3.1854 global_avg_mtp_loss: 15.1984 +[titan] 2025-06-13 15:36:56,668 - root - INFO - lr: 5.0036e-05 gnorm: 1.38 [ 2:55:26< 0:00:56] +[titan] 2025-06-13 15:37:00,023 - root - INFO - step: 14925 loss: 19.3207 memory: 6.46GiB(27.34%) tps: 24,413 tflops: 24.57 mfu: 7.87% global_avg_ntp_loss: 3.3319 global_avg_mtp_loss: 15.9888 +[titan] 2025-06-13 15:37:00,024 - root - INFO - lr: 5.0032e-05 gnorm: 1.37 [ 2:55:29< 0:00:52] +[titan] 2025-06-13 15:37:03,496 - root - INFO - step: 14930 loss: 18.7751 memory: 6.46GiB(27.34%) tps: 23,591 tflops: 23.74 mfu: 7.61% global_avg_ntp_loss: 3.2286 global_avg_mtp_loss: 15.5465 +[titan] 2025-06-13 15:37:03,496 - root - INFO - lr: 5.0028e-05 gnorm: 1.42 [ 2:55:32< 0:00:49] +[titan] 2025-06-13 15:37:06,704 - root - INFO - step: 14935 loss: 17.7409 memory: 6.46GiB(27.34%) tps: 25,543 tflops: 25.71 mfu: 8.24% global_avg_ntp_loss: 3.0037 global_avg_mtp_loss: 14.7372 +[titan] 2025-06-13 15:37:06,704 - root - INFO - lr: 5.0024e-05 gnorm: 1.40 [ 2:55:36< 0:00:45] +[titan] 2025-06-13 15:37:10,190 - root - INFO - step: 14940 loss: 19.6094 memory: 6.46GiB(27.34%) tps: 23,499 tflops: 23.65 mfu: 7.58% global_avg_ntp_loss: 3.3256 global_avg_mtp_loss: 16.2838 +[titan] 2025-06-13 15:37:10,191 - root - INFO - lr: 5.0020e-05 gnorm: 1.27 [ 2:55:39< 0:00:42] +[titan] 2025-06-13 15:37:13,922 - root - INFO - step: 14945 loss: 18.7161 memory: 6.46GiB(27.34%) tps: 21,956 tflops: 22.10 mfu: 7.08% global_avg_ntp_loss: 3.1495 global_avg_mtp_loss: 15.5666 +[titan] 2025-06-13 15:37:13,922 - root - INFO - lr: 5.0017e-05 gnorm: 1.35 [ 2:55:43< 0:00:38] +[titan] 2025-06-13 15:37:16,694 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. 
+[titan] 2025-06-13 15:37:17,549 - root - INFO - step: 14950 loss: 19.1148 memory: 6.46GiB(27.34%) tps: 22,591 tflops: 22.73 mfu: 7.29% global_avg_ntp_loss: 3.2514 global_avg_mtp_loss: 15.8634 +[titan] 2025-06-13 15:37:17,549 - root - INFO - lr: 5.0014e-05 gnorm: 1.26 [ 2:55:46< 0:00:35] +[titan] 2025-06-13 15:37:21,067 - root - INFO - step: 14955 loss: 19.4986 memory: 6.46GiB(27.34%) tps: 23,287 tflops: 23.44 mfu: 7.51% global_avg_ntp_loss: 3.3668 global_avg_mtp_loss: 16.1319 +[titan] 2025-06-13 15:37:21,067 - root - INFO - lr: 5.0011e-05 gnorm: 1.31 [ 2:55:50< 0:00:31] +[titan] 2025-06-13 15:37:24,522 - root - INFO - step: 14960 loss: 17.9098 memory: 6.46GiB(27.34%) tps: 23,715 tflops: 23.87 mfu: 7.65% global_avg_ntp_loss: 3.0560 global_avg_mtp_loss: 14.8537 +[titan] 2025-06-13 15:37:24,522 - root - INFO - lr: 5.0009e-05 gnorm: 1.52 [ 2:55:53< 0:00:28] +[titan] 2025-06-13 15:37:27,837 - root - INFO - step: 14965 loss: 19.2633 memory: 6.46GiB(27.34%) tps: 24,709 tflops: 24.87 mfu: 7.97% global_avg_ntp_loss: 3.2865 global_avg_mtp_loss: 15.9768 +[titan] 2025-06-13 15:37:27,838 - root - INFO - lr: 5.0007e-05 gnorm: 1.25 [ 2:55:57< 0:00:24] +[titan] 2025-06-13 15:37:31,295 - root - INFO - step: 14970 loss: 19.6959 memory: 6.46GiB(27.34%) tps: 23,692 tflops: 23.84 mfu: 7.64% global_avg_ntp_loss: 3.4123 global_avg_mtp_loss: 16.2836 +[titan] 2025-06-13 15:37:31,296 - root - INFO - lr: 5.0005e-05 gnorm: 1.38 [ 2:56:00< 0:00:21] +[titan] 2025-06-13 15:37:34,604 - root - INFO - step: 14975 loss: 18.1812 memory: 6.46GiB(27.34%) tps: 24,767 tflops: 24.93 mfu: 7.99% global_avg_ntp_loss: 3.0455 global_avg_mtp_loss: 15.1357 +[titan] 2025-06-13 15:37:34,604 - root - INFO - lr: 5.0004e-05 gnorm: 1.43 [ 2:56:03< 0:00:17] +[titan] 2025-06-13 15:37:37,931 - root - INFO - step: 14980 loss: 18.8792 memory: 6.46GiB(27.34%) tps: 24,621 tflops: 24.78 mfu: 7.94% global_avg_ntp_loss: 3.2167 global_avg_mtp_loss: 15.6625 +[titan] 2025-06-13 15:37:37,932 - root - INFO - lr: 5.0002e-05 gnorm: 
1.22 [ 2:56:07< 0:00:14] +[titan] 2025-06-13 15:37:41,271 - root - INFO - step: 14985 loss: 19.5896 memory: 6.46GiB(27.34%) tps: 24,533 tflops: 24.69 mfu: 7.91% global_avg_ntp_loss: 3.4001 global_avg_mtp_loss: 16.1896 +[titan] 2025-06-13 15:37:41,271 - root - INFO - lr: 5.0001e-05 gnorm: 1.26 [ 2:56:10< 0:00:10] +[titan] 2025-06-13 15:37:44,792 - root - INFO - step: 14990 loss: 20.0679 memory: 6.46GiB(27.34%) tps: 23,269 tflops: 23.42 mfu: 7.51% global_avg_ntp_loss: 3.4984 global_avg_mtp_loss: 16.5695 +[titan] 2025-06-13 15:37:44,792 - root - INFO - lr: 5.0001e-05 gnorm: 1.46 [ 2:56:14< 0:00:07] +[titan] 2025-06-13 15:37:48,227 - root - INFO - step: 14995 loss: 19.9043 memory: 6.46GiB(27.34%) tps: 23,852 tflops: 24.00 mfu: 7.69% global_avg_ntp_loss: 3.4222 global_avg_mtp_loss: 16.4822 +[titan] 2025-06-13 15:37:48,227 - root - INFO - lr: 5.0000e-05 gnorm: 1.24 [ 2:56:17< 0:00:03] +[titan] 2025-06-13 15:37:50,842 - root - INFO - [GC] Peforming periodical GC collection. 0.00 seconds. +[titan] 2025-06-13 15:37:51,655 - root - INFO - step: 15000 loss: 19.6282 memory: 6.46GiB(27.34%) tps: 23,900 tflops: 24.05 mfu: 7.71% global_avg_ntp_loss: 3.3725 global_avg_mtp_loss: 16.2557 +[titan] 2025-06-13 15:37:51,655 - root - INFO - lr: 5.0000e-05 gnorm: 1.23 [ 2:56:21< 0:00:00] +[titan] 2025-06-13 15:37:51,655 - root - INFO - Saving the checkpoint (or staging if async is enabled). +[titan] 2025-06-13 15:37:51,655 - root - INFO - Saving a full checkpoint at last step, step 15000. +[titan] 2025-06-13 15:37:53,153 - root - INFO - [GC] GC collection invoked by checkpointer. 0.01 seconds. +[titan] 2025-06-13 15:37:53,153 - root - INFO - Finished saving the checkpoint (or staging if async is enabled)in 1.50 seconds. +[titan] 2025-06-13 15:37:53,153 - root - INFO - Training completed +[titan] 2025-06-13 15:37:53,153 - root - INFO - Destroying the purge thread. 
diff --git a/profile_trace/iteration_5632/rank0_trace.json b/profile_trace/iteration_5632/rank0_trace.json new file mode 100644 index 0000000000000000000000000000000000000000..be386e8e8559fc877ba41250bda48db5ffc3a754 --- /dev/null +++ b/profile_trace/iteration_5632/rank0_trace.json @@ -0,0 +1,109184 @@ + +{ + "schemaVersion": 1, + "deviceProperties": [ + { + "id": 0, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 1, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 2, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 3, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + } + ], + "cupti_version": 22, + "cuda_runtime_version": 12040, + "cuda_driver_version": 12040, + 
"distributedInfo": {"backend": "nccl", "rank": 0, "world_size": 4, "pg_count": 1, "pg_config": [{"pg_name": "0", "pg_desc": "default_pg", "backend_config": "cuda:nccl", "pg_size": 4, "ranks": [0, 1, 2, 3]}], "nccl_version": "2.21.5"}, + "record_shapes": 1, + "trace_id": "D7D26ABD626B44B8ADBB0618EFCC97D1", + "traceEvents": [ + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: DivBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866005198.432, "dur": 131.029, + "args": { + "External id": 86017,"Record function id": 0, "Sequence number": 1771204, "Fwd thread id": 1, "Ev Idx": 0 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "DivBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866005217.802, "dur": 99.719, + "args": { + "External id": 86018,"Sequence number": 1771204, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 1 + } + }, + { + "ph": "f", "id": 1, "pid": 5714, "tid": 6744, "ts": 6300866005217.802, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866005226.662, "dur": 87.590, + "args": { + "External id": 86019,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 2 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866005342.012, "dur": 274.959, + "args": { + "External id": 86020,"Record function id": 0, "Ev Idx": 3 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward", "pid": 5714, "tid": 6744, + "ts": 6300866005411.211, "dur": 118.170, + "args": { + "External id": 86021,"Record function id": 0, "Ev Idx": 4 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.9", "pid": 5714, 
"tid": 6744, + "ts": 6300866005448.281, "dur": 65.000, + "args": { + "External id": 86022,"Record function id": 0, "Ev Idx": 5 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866005535.871, "dur": 2.360, + "args": { + "External id": 86023,"Sequence number": 1771203, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 6 + } + }, + { + "ph": "f", "id": 2, "pid": 5714, "tid": 6744, "ts": 6300866005535.871, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866005544.261, "dur": 65.770, + "args": { + "External id": 86024,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866005554.211, "dur": 55.080, + "args": { + "External id": 86025,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 8 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866005569.911, "dur": 3.880, + "args": { + "External id": 86026,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 9 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866005631.151, "dur": 15461.935, + "args": { + "External id": 86027,"Record function id": 0, "Sequence number": 1771201, "Fwd thread id": 1, "Ev Idx": 10 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866005634.261, "dur": 15448.465, + "args": { + "External id": 86028,"Sequence number": 1771201, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 11 + } + }, + { + "ph": "f", "id": 3, "pid": 5714, "tid": 6744, "ts": 6300866005634.261, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866005702.411, "dur": 6.720, + "args": { + "External id": 86029,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 12 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866005715.501, "dur": 15268.985, + "args": { + "External id": 86030,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 13 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866005718.161, "dur": 15265.995, + "args": { + "External id": 86031,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 14 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866005722.451, "dur": 10.529, + "args": { + "External id": 86032,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], 
"Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 15 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866005735.451, "dur": 15247.845, + "args": { + "External id": 86033,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 16 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6300866020988.936, "dur": 0.440, + "args": { + "External id": 86034,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 17 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6300866020991.936, "dur": 3.440, + "args": { + "External id": 86035,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 18 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6300866020993.896, "dur": 1.260, + "args": { + "External id": 86036,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 19 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6300866021001.006, "dur": 32.340, + "args": { + "External id": 86037,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 20 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6300866021041.806, "dur": 31.640, + "args": { + "External id": 86038,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input 
Strides": [[]], "Input Dims": [[]], "Ev Idx": 21 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6300866021044.326, "dur": 28.860, + "args": { + "External id": 86039,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 22 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6300866021046.486, "dur": 26.270, + "args": { + "External id": 86040,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 23 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021105.286, "dur": 20.140, + "args": { + "External id": 86041,"Record function id": 0, "Sequence number": 1771200, "Fwd thread id": 1, "Ev Idx": 24 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021107.686, "dur": 13.970, + "args": { + "External id": 86042,"Sequence number": 1771200, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 25 + } + }, + { + "ph": "f", "id": 4, "pid": 5714, "tid": 6744, "ts": 6300866021107.686, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866021111.746, "dur": 9.640, + "args": { + "External id": 86043,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 26 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866021114.536, "dur": 6.550, + "args": { + "External id": 86044,"Record 
function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 27 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021130.156, "dur": 100.440, + "args": { + "External id": 86045,"Record function id": 0, "Sequence number": 1771199, "Fwd thread id": 1, "Ev Idx": 28 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021131.586, "dur": 90.070, + "args": { + "External id": 86046,"Sequence number": 1771199, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 29 + } + }, + { + "ph": "f", "id": 5, "pid": 5714, "tid": 6744, "ts": 6300866021131.586, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866021135.176, "dur": 85.889, + "args": { + "External id": 86047,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 30 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866021141.485, "dur": 41.680, + "args": { + "External id": 86048,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 31 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 
6300866021144.665, "dur": 7.491, + "args": { + "External id": 86049,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 32 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866021154.336, "dur": 28.449, + "args": { + "External id": 86050,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 33 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866021160.356, "dur": 21.480, + "args": { + "External id": 86051,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 34 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866021186.225, "dur": 4.851, + "args": { + "External id": 86052,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 35 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866021189.425, "dur": 1.100, + "args": { + "External id": 86053,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 36 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 
6300866021192.076, "dur": 27.989, + "args": { + "External id": 86054,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 37 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021239.605, "dur": 80.720, + "args": { + "External id": 86055,"Record function id": 0, "Sequence number": 1771198, "Fwd thread id": 1, "Ev Idx": 38 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021241.535, "dur": 72.620, + "args": { + "External id": 86056,"Sequence number": 1771198, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 39 + } + }, + { + "ph": "f", "id": 6, "pid": 5714, "tid": 6744, "ts": 6300866021241.535, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6300866021245.265, "dur": 68.410, + "args": { + "External id": 86057,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "3"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 40 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866021248.515, "dur": 25.300, + "args": { + "External id": 86058,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 41 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866021249.795, "dur": 6.960, + "args": { + "External id": 86059,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 42 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866021257.715, "dur": 15.790, + "args": { + "External id": 86060,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 43 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866021259.825, "dur": 12.730, + "args": { + "External id": 86061,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 44 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866021275.795, "dur": 8.950, + "args": { + "External id": 86062,"Record function id": 0, "Concrete Inputs": ["", "2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 45 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866021281.605, "dur": 1.570, + "args": { + "External id": 86063,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 46 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866021285.525, "dur": 27.210, + "args": { + "External id": 86064,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 47 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021329.185, "dur": 70.180, + "args": { + "External id": 86065,"Record function id": 0, "Sequence number": 1771197, "Fwd thread id": 1, "Ev Idx": 48 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021331.055, "dur": 62.910, + "args": { + "External id": 86066,"Sequence number": 1771197, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 49 + } + }, + { + "ph": "f", "id": 7, "pid": 5714, "tid": 6744, "ts": 6300866021331.055, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866021333.325, "dur": 60.180, + "args": { + "External id": 86067,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 50 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866021335.285, "dur": 25.200, + "args": { + "External id": 86068,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", 
"", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 51 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866021337.575, "dur": 5.530, + "args": { + "External id": 86069,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 52 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866021343.995, "dur": 16.180, + "args": { + "External id": 86070,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 53 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866021347.605, "dur": 11.680, + "args": { + "External id": 86071,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 54 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866021361.945, "dur": 3.480, + "args": { + "External id": 86072,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 55 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866021363.995, "dur": 1.000, + "args": { + "External id": 86073,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 56 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866021366.315, "dur": 26.310, + "args": { + "External id": 86074,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 57 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021408.685, "dur": 53.920, + "args": { + "External id": 86075,"Record function id": 0, "Sequence number": 1771196, "Fwd thread id": 1, "Ev Idx": 58 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021410.495, "dur": 46.720, + "args": { + "External id": 86076,"Sequence number": 1771196, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 59 + } + }, + { + "ph": "f", "id": 8, "pid": 5714, "tid": 6744, "ts": 6300866021410.495, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866021412.495, "dur": 44.380, + "args": { + "External id": 86077,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 60 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + 
"ts": 6300866021414.015, "dur": 20.740, + "args": { + "External id": 86078,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 61 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866021415.225, "dur": 5.180, + "args": { + "External id": 86079,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 62 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866021421.395, "dur": 13.070, + "args": { + "External id": 86080,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 63 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866021422.465, "dur": 11.150, + "args": { + "External id": 86081,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 64 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866021437.075, "dur": 4.340, + "args": { + "External id": 86082,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 65 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866021438.795, "dur": 2.210, + "args": { + "External id": 86083,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 66 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866021442.155, "dur": 14.000, + "args": { + "External id": 86084,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 67 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021470.575, "dur": 37.060, + "args": { + "External id": 86085,"Record function id": 0, "Sequence number": 1771195, "Fwd thread id": 1, "Ev Idx": 68 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866021472.535, "dur": 0.980, + "args": { + "External id": 86086,"Sequence number": 1771195, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 69 + } + }, + { + "ph": "f", "id": 9, "pid": 5714, "tid": 6744, "ts": 6300866021472.535, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866021475.825, "dur": 27.260, + "args": { + "External id": 86087,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 70 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866021477.545, 
"dur": 25.080, + "args": { + "External id": 86088,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 71 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866021485.055, "dur": 0.740, + "args": { + "External id": 86089,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 72 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866021515.285, "dur": 660.228, + "args": { + "External id": 86090,"Record function id": 0, "Sequence number": 1771193, "Fwd thread id": 1, "Ev Idx": 73 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866021517.235, "dur": 621.858, + "args": { + "External id": 86091,"Sequence number": 1771193, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 74 + } + }, + { + "ph": "f", "id": 10, "pid": 5714, "tid": 6744, "ts": 6300866021517.235, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866021545.555, "dur": 2.710, + "args": { + "External id": 86092,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 75 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866021550.395, "dur": 
527.588, + "args": { + "External id": 86093,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 76 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866021551.535, "dur": 526.159, + "args": { + "External id": 86094,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 77 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866021553.535, "dur": 6.460, + "args": { + "External id": 86095,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 78 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866021562.415, "dur": 514.788, + "args": { + "External id": 86096,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 79 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6300866022080.443, "dur": 0.271, + "args": { + "External id": 86097,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 80 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6300866022081.863, "dur": 2.040, + "args": { + "External id": 86098,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 81 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6300866022082.854, "dur": 0.860, + "args": { + "External id": 86099,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 82 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6300866022087.023, "dur": 20.160, + "args": { + "External id": 86100,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 83 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6300866022112.183, "dur": 19.890, + "args": { + "External id": 86101,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 84 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6300866022113.093, "dur": 18.730, + "args": { + "External id": 86102,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 85 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6300866022114.053, "dur": 17.330, + "args": { + "External id": 86103,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 86 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866022149.823, "dur": 21.740, + "args": { + "External id": 86104,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev 
Idx": 87 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022188.093, "dur": 13.850, + "args": { + "External id": 86105,"Record function id": 0, "Sequence number": 1771192, "Fwd thread id": 1, "Ev Idx": 88 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022190.753, "dur": 7.970, + "args": { + "External id": 86106,"Sequence number": 1771192, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 89 + } + }, + { + "ph": "f", "id": 11, "pid": 5714, "tid": 6744, "ts": 6300866022190.753, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866022193.653, "dur": 4.790, + "args": { + "External id": 86107,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 90 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866022194.863, "dur": 3.310, + "args": { + "External id": 86108,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 91 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022206.383, "dur": 64.220, + "args": { + "External id": 86109,"Record function id": 0, "Sequence number": 1771191, "Fwd thread id": 1, "Ev Idx": 92 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022207.733, "dur": 55.530, + "args": { + "External id": 
86110,"Sequence number": 1771191, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 93 + } + }, + { + "ph": "f", "id": 12, "pid": 5714, "tid": 6744, "ts": 6300866022207.733, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866022209.753, "dur": 52.930, + "args": { + "External id": 86111,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 94 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866022212.283, "dur": 24.510, + "args": { + "External id": 86112,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 95 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866022213.743, "dur": 5.980, + "args": { + "External id": 86113,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 96 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866022220.823, "dur": 15.650, + "args": { + "External id": 86114,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 97 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866022222.283, "dur": 13.300, + "args": { + "External id": 86115,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 98 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866022239.493, "dur": 3.810, + "args": { + "External id": 86116,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 99 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866022241.893, "dur": 0.950, + "args": { + "External id": 86117,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866022244.193, "dur": 17.580, + "args": { + "External id": 86118,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022278.573, "dur": 65.040, + "args": { + "External id": 86119,"Record function id": 0, "Sequence number": 1771190, "Fwd thread id": 1, "Ev Idx": 102 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022280.523, "dur": 58.120, + "args": { + "External id": 86120,"Sequence number": 1771190, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 103 + } + }, + { + "ph": "f", "id": 13, "pid": 5714, "tid": 6744, "ts": 6300866022280.523, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6300866022282.713, "dur": 55.500, + "args": { + "External id": 86121,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "2"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866022284.263, "dur": 32.070, + "args": { + "External id": 86122,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866022285.453, "dur": 6.490, + "args": { + "External id": 86123,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866022292.873, "dur": 23.170, + "args": { + "External id": 86124,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866022293.933, "dur": 21.110, + "args": { + "External id": 86125,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866022318.693, "dur": 5.020, + "args": { + "External id": 86126,"Record function id": 0, "Concrete Inputs": ["", "2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866022321.753, "dur": 1.250, + "args": { + "External id": 86127,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866022324.493, "dur": 12.990, + "args": { + "External id": 86128,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022351.783, "dur": 59.790, + "args": { + "External id": 86129,"Record function id": 0, "Sequence number": 1771189, "Fwd thread id": 1, "Ev 
Idx": 112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022353.633, "dur": 53.130, + "args": { + "External id": 86130,"Sequence number": 1771189, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 113 + } + }, + { + "ph": "f", "id": 14, "pid": 5714, "tid": 6744, "ts": 6300866022353.633, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866022355.663, "dur": 50.570, + "args": { + "External id": 86131,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866022357.363, "dur": 21.850, + "args": { + "External id": 86132,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866022358.493, "dur": 6.020, + "args": { + "External id": 86133,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866022365.303, "dur": 13.570, + 
"args": { + "External id": 86134,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866022367.453, "dur": 10.570, + "args": { + "External id": 86135,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866022381.743, "dur": 3.320, + "args": { + "External id": 86136,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866022383.703, "dur": 0.920, + "args": { + "External id": 86137,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866022385.863, "dur": 19.480, + "args": { + "External id": 86138,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022419.673, "dur": 66.420, + "args": { + "External id": 86139,"Record function id": 0, "Sequence number": 1771188, "Fwd thread id": 1, "Ev Idx": 122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022421.493, "dur": 44.820, + "args": { + "External id": 86140,"Sequence number": 1771188, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 123 + } + }, + { + "ph": "f", "id": 15, "pid": 5714, "tid": 6744, "ts": 6300866022421.493, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866022424.553, "dur": 41.309, + "args": { + "External id": 86141,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866022426.143, "dur": 19.990, + "args": { + "External id": 86142,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866022427.113, "dur": 4.680, + "args": { + "External id": 86143,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866022432.643, "dur": 13.230, + "args": { + "External id": 86144,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866022433.723, "dur": 11.330, + "args": { + "External id": 86145,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866022447.253, "dur": 2.870, + "args": { + "External id": 86146,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866022448.893, "dur": 0.900, + "args": { + "External id": 86147,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866022452.043, "dur": 13.030, + "args": { + "External id": 86148,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866022471.873, "dur": 11.540, + "args": { + "External id": 86149,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022494.762, "dur": 35.660, + "args": { + "External id": 86150,"Record function id": 0, "Sequence number": 1771187, "Fwd thread id": 1, "Ev Idx": 133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866022496.653, "dur": 1.249, + "args": { + "External id": 86151,"Sequence number": 1771187, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 134 + } + }, + { + "ph": "f", "id": 16, "pid": 5714, "tid": 6744, "ts": 6300866022496.653, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866022500.393, "dur": 25.960, + "args": { + "External id": 86152,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866022502.122, "dur": 23.751, + "args": { + "External id": 86153,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input 
Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866022509.942, "dur": 0.851, + "args": { + "External id": 86154,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866022537.482, "dur": 1078.698, + "args": { + "External id": 86155,"Record function id": 0, "Sequence number": 1771185, "Fwd thread id": 1, "Ev Idx": 138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866022539.493, "dur": 1048.577, + "args": { + "External id": 86156,"Sequence number": 1771185, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 139 + } + }, + { + "ph": "f", "id": 17, "pid": 5714, "tid": 6744, "ts": 6300866022539.493, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866022567.042, "dur": 2.540, + "args": { + "External id": 86157,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866022571.692, "dur": 952.208, + "args": { + "External id": 86158,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", 
"Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866022572.802, "dur": 950.788, + "args": { + "External id": 86159,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866022574.842, "dur": 7.590, + "args": { + "External id": 86160,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866022583.582, "dur": 939.438, + "args": { + "External id": 86161,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6300866023526.460, "dur": 0.200, + "args": { + "External id": 86162,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6300866023527.940, "dur": 3.310, + "args": { + "External id": 86163,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, 
"tid": 6744, + "ts": 6300866023530.290, "dur": 0.770, + "args": { + "External id": 86164,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6300866023534.430, "dur": 20.860, + "args": { + "External id": 86165,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6300866023560.390, "dur": 20.720, + "args": { + "External id": 86166,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6300866023561.270, "dur": 19.590, + "args": { + "External id": 86167,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6300866023562.320, "dur": 18.130, + "args": { + "External id": 86168,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866023596.720, "dur": 14.960, + "args": { + "External id": 86169,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 
6300866023628.700, "dur": 14.550, + "args": { + "External id": 86170,"Record function id": 0, "Sequence number": 1771184, "Fwd thread id": 1, "Ev Idx": 153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023631.300, "dur": 8.870, + "args": { + "External id": 86171,"Sequence number": 1771184, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 154 + } + }, + { + "ph": "f", "id": 18, "pid": 5714, "tid": 6744, "ts": 6300866023631.300, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866023634.270, "dur": 5.610, + "args": { + "External id": 86172,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866023636.390, "dur": 3.240, + "args": { + "External id": 86173,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023647.570, "dur": 62.440, + "args": { + "External id": 86174,"Record function id": 0, "Sequence number": 1771183, "Fwd thread id": 1, "Ev Idx": 157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023648.840, "dur": 53.540, + "args": { + "External id": 86175,"Sequence number": 1771183, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 158 + } + }, + { + "ph": "f", "id": 19, "pid": 5714, "tid": 6744, "ts": 6300866023648.840, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866023650.610, "dur": 51.230, + "args": { + "External id": 86176,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866023653.120, "dur": 24.480, + "args": { + "External id": 86177,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866023654.600, "dur": 5.840, + "args": { + "External id": 86178,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866023661.500, "dur": 15.770, + "args": { + "External id": 86179,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866023662.940, "dur": 13.440, + "args": { + 
"External id": 86180,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866023679.120, "dur": 3.540, + "args": { + "External id": 86181,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866023681.320, "dur": 0.890, + "args": { + "External id": 86182,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866023683.630, "dur": 17.210, + "args": { + "External id": 86183,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023718.400, "dur": 54.970, + "args": { + "External id": 86184,"Record function id": 0, "Sequence number": 1771182, "Fwd thread id": 1, "Ev Idx": 167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023721.360, "dur": 47.290, + "args": { + "External id": 
86185,"Sequence number": 1771182, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 168 + } + }, + { + "ph": "f", "id": 20, "pid": 5714, "tid": 6744, "ts": 6300866023721.360, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6300866023723.340, "dur": 44.940, + "args": { + "External id": 86186,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866023724.950, "dur": 23.270, + "args": { + "External id": 86187,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866023727.210, "dur": 5.370, + "args": { + "External id": 86188,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866023733.430, "dur": 14.480, + "args": { + "External id": 86189,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 172 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866023735.300, "dur": 11.690, + "args": { + "External id": 86190,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866023749.380, "dur": 5.140, + "args": { + "External id": 86191,"Record function id": 0, "Concrete Inputs": ["", "2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866023752.600, "dur": 1.200, + "args": { + "External id": 86192,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866023755.260, "dur": 12.290, + "args": { + "External id": 86193,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023780.650, "dur": 51.709, + "args": { + "External id": 86194,"Record function id": 0, "Sequence number": 1771181, "Fwd thread id": 1, "Ev Idx": 177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 
6300866023783.559, "dur": 44.000, + "args": { + "External id": 86195,"Sequence number": 1771181, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 178 + } + }, + { + "ph": "f", "id": 21, "pid": 5714, "tid": 6744, "ts": 6300866023783.559, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866023785.719, "dur": 41.391, + "args": { + "External id": 86196,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866023787.290, "dur": 19.480, + "args": { + "External id": 86197,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866023788.750, "dur": 4.829, + "args": { + "External id": 86198,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866023794.319, "dur": 12.151, + "args": { + "External id": 86199,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866023795.430, "dur": 10.240, + "args": { + "External id": 86200,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866023808.090, "dur": 2.940, + "args": { + "External id": 86201,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866023809.690, "dur": 0.940, + "args": { + "External id": 86202,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866023811.779, "dur": 14.440, + "args": { + "External id": 86203,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023840.059, 
"dur": 67.240, + "args": { + "External id": 86204,"Record function id": 0, "Sequence number": 1771180, "Fwd thread id": 1, "Ev Idx": 187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023843.250, "dur": 45.229, + "args": { + "External id": 86205,"Sequence number": 1771180, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 188 + } + }, + { + "ph": "f", "id": 22, "pid": 5714, "tid": 6744, "ts": 6300866023843.250, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866023845.299, "dur": 42.720, + "args": { + "External id": 86206,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866023846.790, "dur": 22.520, + "args": { + "External id": 86207,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866023849.010, "dur": 5.800, + "args": { + "External id": 86208,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 191 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866023855.659, "dur": 13.371, + "args": { + "External id": 86209,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866023857.250, "dur": 10.889, + "args": { + "External id": 86210,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866023870.479, "dur": 2.700, + "args": { + "External id": 86211,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866023872.059, "dur": 0.780, + "args": { + "External id": 86212,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866023873.879, "dur": 13.360, + "args": { + "External id": 86213,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": 
[[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866023893.849, "dur": 10.890, + "args": { + "External id": 86214,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023915.559, "dur": 36.830, + "args": { + "External id": 86215,"Record function id": 0, "Sequence number": 1771179, "Fwd thread id": 1, "Ev Idx": 198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866023917.389, "dur": 1.070, + "args": { + "External id": 86216,"Sequence number": 1771179, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 199 + } + }, + { + "ph": "f", "id": 23, "pid": 5714, "tid": 6744, "ts": 6300866023917.389, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866023920.139, "dur": 26.320, + "args": { + "External id": 86217,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866023923.129, "dur": 22.820, + "args": { + "External id": 86218,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 201 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866023929.649, "dur": 1.000, + "args": { + "External id": 86219,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866023959.189, "dur": 1100.098, + "args": { + "External id": 86220,"Record function id": 0, "Sequence number": 1771178, "Fwd thread id": 1, "Ev Idx": 203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866023968.269, "dur": 1062.138, + "args": { + "External id": 86221,"Sequence number": 1771178, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 204 + } + }, + { + "ph": "f", "id": 24, "pid": 5714, "tid": 6744, "ts": 6300866023968.269, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866023993.729, "dur": 2.810, + "args": { + "External id": 86222,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866023998.599, "dur": 968.138, + "args": { + "External id": 86223,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 206 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866023999.709, "dur": 966.678, + "args": { + "External id": 86224,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866024001.769, "dur": 5.740, + "args": { + "External id": 86225,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866024009.289, "dur": 956.458, + "args": { + "External id": 86226,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6300866024969.417, "dur": 0.190, + "args": { + "External id": 86227,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6300866024970.817, "dur": 2.030, + "args": { + "External id": 86228,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6300866024971.887, "dur": 0.760, + "args": { + "External id": 86229,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6300866024976.207, "dur": 20.980, + "args": { + "External id": 86230,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6300866025002.227, "dur": 21.290, + "args": { + "External id": 86231,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6300866025004.377, "dur": 18.910, + "args": { + "External id": 86232,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6300866025005.357, "dur": 17.570, + "args": { + "External id": 86233,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866025039.857, "dur": 15.010, + "args": { + "External id": 86234,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025070.777, "dur": 13.050, + "args": { + "External id": 86235,"Record function id": 0, "Sequence number": 1771177, "Fwd thread 
id": 1, "Ev Idx": 218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025073.357, "dur": 7.590, + "args": { + "External id": 86236,"Sequence number": 1771177, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 219 + } + }, + { + "ph": "f", "id": 25, "pid": 5714, "tid": 6744, "ts": 6300866025073.357, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866025076.157, "dur": 4.530, + "args": { + "External id": 86237,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866025077.167, "dur": 3.260, + "args": { + "External id": 86238,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025088.227, "dur": 64.920, + "args": { + "External id": 86239,"Record function id": 0, "Sequence number": 1771176, "Fwd thread id": 1, "Ev Idx": 222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025089.557, "dur": 55.859, + "args": { + "External id": 86240,"Sequence number": 1771176, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 223 + } + }, + { + "ph": "f", "id": 26, "pid": 5714, "tid": 6744, 
"ts": 6300866025089.557, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866025091.517, "dur": 53.359, + "args": { + "External id": 86241,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866025094.167, "dur": 25.580, + "args": { + "External id": 86242,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866025096.747, "dur": 6.129, + "args": { + "External id": 86243,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866025103.956, "dur": 15.471, + "args": { + "External id": 86244,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866025105.347, "dur": 13.160, + "args": { + "External id": 86245,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": 
[[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866025121.316, "dur": 3.700, + "args": { + "External id": 86246,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025123.616, "dur": 0.951, + "args": { + "External id": 86247,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866025125.927, "dur": 18.009, + "args": { + "External id": 86248,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025162.796, "dur": 54.680, + "args": { + "External id": 86249,"Record function id": 0, "Sequence number": 1771175, "Fwd thread id": 1, "Ev Idx": 232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025164.787, "dur": 48.199, + "args": { + "External id": 86250,"Sequence number": 1771175, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 233 + } + }, + { + "ph": "f", "id": 27, "pid": 5714, "tid": 6744, "ts": 6300866025164.787, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6300866025166.936, "dur": 45.600, + "args": { + "External id": 86251,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866025168.587, "dur": 23.969, + "args": { + "External id": 86252,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866025169.727, "dur": 6.349, + "args": { + "External id": 86253,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866025178.187, "dur": 14.029, + "args": { + "External id": 86254,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866025179.747, "dur": 11.549, + "args": { + "External id": 86255,"Record function id": 0, 
"Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866025193.786, "dur": 4.780, + "args": { + "External id": 86256,"Record function id": 0, "Concrete Inputs": ["", "2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025196.756, "dur": 1.110, + "args": { + "External id": 86257,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866025199.406, "dur": 12.410, + "args": { + "External id": 86258,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025224.866, "dur": 61.480, + "args": { + "External id": 86259,"Record function id": 0, "Sequence number": 1771174, "Fwd thread id": 1, "Ev Idx": 242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025226.606, "dur": 55.100, + "args": { + "External id": 86260,"Sequence number": 1771174, "Fwd thread id": 1, "Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 243 + } + }, + { + "ph": "f", "id": 28, "pid": 5714, "tid": 6744, "ts": 6300866025226.606, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866025228.536, "dur": 52.740, + "args": { + "External id": 86261,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866025230.356, "dur": 21.670, + "args": { + "External id": 86262,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866025232.536, "dur": 4.940, + "args": { + "External id": 86263,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866025239.586, "dur": 12.140, + "args": { + "External id": 86264,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866025240.696, "dur": 10.260, + "args": { + "External id": 86265,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866025253.406, "dur": 3.260, + "args": { + "External id": 86266,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025255.356, "dur": 0.910, + "args": { + "External id": 86267,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866025257.416, "dur": 22.940, + "args": { + "External id": 86268,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025294.046, "dur": 89.830, + "args": { + "External id": 86269,"Record function id": 0, "Sequence number": 1771173, "Fwd thread id": 1, "Ev Idx": 252 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025295.766, "dur": 66.740, + "args": { + "External id": 86270,"Sequence number": 1771173, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 253 + } + }, + { + "ph": "f", "id": 29, "pid": 5714, "tid": 6744, "ts": 6300866025295.766, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6300866025306.736, "dur": 55.270, + "args": { + "External id": 86271,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6300866025308.436, "dur": 32.290, + "args": { + "External id": 86272,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866025309.626, "dur": 5.690, + "args": { + "External id": 86273,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6300866025316.426, "dur": 23.930, + "args": { + "External id": 
86274,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6300866025325.946, "dur": 13.280, + "args": { + "External id": 86275,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866025342.276, "dur": 3.450, + "args": { + "External id": 86276,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025344.176, "dur": 1.140, + "args": { + "External id": 86277,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866025346.546, "dur": 14.700, + "args": { + "External id": 86278,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + 
"ts": 6300866025368.726, "dur": 11.730, + "args": { + "External id": 86279,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866025395.366, "dur": 329.269, + "args": { + "External id": 86280,"Record function id": 0, "Sequence number": 1771172, "Fwd thread id": 1, "Ev Idx": 263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866025398.226, "dur": 316.209, + "args": { + "External id": 86281,"Sequence number": 1771172, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 264 + } + }, + { + "ph": "f", "id": 30, "pid": 5714, "tid": 6744, "ts": 6300866025398.226, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866025554.015, "dur": 37.820, + "args": { + "External id": 86282,"kernel_hash": "c6pwrjtaatk26ciodo5pmvyk7s5bgtuynny44q5qcq4cspn3x4h2", "grid": "grid(131328,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "8", "2048", "4", "131328", "384"], "kernel_file": "/tmp/torchinductor_root/6p/c6pwrjtaatk26ciodo5pmvyk7s5bgtuynny44q5qcq4cspn3x4h2.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], [8192, 4, 1, 1], [131328, 131328, 131328, 1, 768], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 
768], [8, 2048, 4, 768], [8, 2048, 4, 1], [1, 1, 1, 768, 171], [], [], [], [], []], "Ev Idx": 265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866025617.275, "dur": 20.260, + "args": { + "External id": 86283,"kernel_hash": "cvi2geo3kp7he4ronsudo7p4b3n4w3af22ohl5mnz4aa62cpmlkv", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "171"], "kernel_file": "/tmp/torchinductor_root/vi/cvi2geo3kp7he4ronsudo7p4b3n4w3af22ohl5mnz4aa62cpmlkv.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[131328, 131328, 131328, 1, 768], [768, 768, 768, 1], [], []], "Input Dims": [[1, 1, 1, 768, 171], [1, 1, 1, 768], [], []], "Ev Idx": 266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866025661.195, "dur": 22.790, + "args": { + "External id": 86284,"kernel_hash": "clabhfcwwwl4deuomn7k5h33jcp5cek5y5pevfi3r5njb7ep6ocn", "grid": "grid(65536,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "65536", "768"], "kernel_file": "/tmp/torchinductor_root/la/clabhfcwwwl4deuomn7k5h33jcp5cek5y5pevfi3r5njb7ep6ocn.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [1], [6291456, 3072, 768, 1], [8192, 4, 1, 1], [6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [768], [8, 2048, 4, 768], [8, 2048, 4, 1], [8, 2048, 4, 768], [], []], "Ev Idx": 267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866025741.285, "dur": 16.840, + "args": { + "External id": 86285,"Record function id": 0, "Ev Idx": 268 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866025744.855, "dur": 10.800, + "args": { + "External id": 86286,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866025749.475, "dur": 5.320, + "args": { + "External id": 86287,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866025750.465, "dur": 4.090, + "args": { + "External id": 86288,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: StackBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025764.415, "dur": 32.720, + "args": { + "External id": 86289,"Record function id": 0, "Sequence number": 1771171, "Fwd thread id": 1, "Ev Idx": 272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "StackBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025765.945, "dur": 23.040, + "args": { + "External id": 86290,"Sequence number": 1771171, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 273 + } + }, + { + "ph": "f", "id": 31, "pid": 5714, "tid": 6744, "ts": 6300866025765.945, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866025768.035, "dur": 9.850, + "args": { + "External id": 86291,"Record function id": 0, "Concrete Inputs": ["", "-2", "0"], "Input type": ["c10::BFloat16", 
"Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025774.505, "dur": 1.570, + "args": { + "External id": 86292,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866025778.605, "dur": 3.850, + "args": { + "External id": 86293,"Record function id": 0, "Concrete Inputs": ["", "-2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025780.285, "dur": 1.550, + "args": { + "External id": 86294,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866025783.045, "dur": 2.520, + "args": { + "External id": 86295,"Record function id": 0, "Concrete Inputs": ["", "-2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025784.525, "dur": 0.310, + "args": { + "External id": 
86296,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6300866025786.075, "dur": 2.220, + "args": { + "External id": 86297,"Record function id": 0, "Concrete Inputs": ["", "-2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025787.465, "dur": 0.270, + "args": { + "External id": 86298,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025801.715, "dur": 6.040, + "args": { + "External id": 86299,"Record function id": 0, "Sequence number": 1771170, "Fwd thread id": 1, "Ev Idx": 282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866025803.125, "dur": 1.080, + "args": { + "External id": 86300,"Sequence number": 1771170, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 283 + } + }, + { + "ph": "f", "id": 32, "pid": 5714, "tid": 6744, "ts": 6300866025803.125, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866025812.425, "dur": 455.559, + "args": { + "External id": 86301,"Record function id": 0, "Sequence number": 1771169, "Fwd thread id": 1, "Ev Idx": 284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866025814.035, "dur": 440.489, + "args": { + "External id": 86302,"Sequence number": 1771169, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 285 + } + }, + { + "ph": "f", "id": 33, "pid": 5714, "tid": 6744, "ts": 6300866025814.035, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866025857.575, "dur": 11.950, + "args": { + "External id": 86303,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866025863.915, "dur": 5.010, + "args": { + "External id": 86304,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866025874.075, "dur": 7.160, + "args": { + "External id": 86305,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, 
"tid": 6744, + "ts": 6300866025877.615, "dur": 2.730, + "args": { + "External id": 86306,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025879.315, "dur": 0.720, + "args": { + "External id": 86307,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6300866025887.475, "dur": 80.980, + "args": { + "External id": 86308,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866025888.535, "dur": 3.080, + "args": { + "External id": 86309,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866025889.175, "dur": 1.860, + "args": { + "External id": 86310,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866025890.375, "dur": 0.480, + "args": { + "External id": 86311,"Record function id": 0, "Concrete 
Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6300866025893.615, "dur": 73.850, + "args": { + "External id": 86312,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866025896.545, "dur": 69.960, + "args": { + "External id": 86313,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6300866025974.585, "dur": 5.249, + "args": { + "External id": 86314,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866025977.105, "dur": 2.520, + "args": { + "External id": 86315,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866026015.894, "dur": 9.500, + "args": { + "External id": 86316,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", 
"", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866026027.025, "dur": 3.249, + "args": { + "External id": 86317,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866026031.414, "dur": 4.151, + "args": { + "External id": 86318,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026074.304, "dur": 4.110, + "args": { + "External id": 86319,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026075.334, "dur": 2.710, + "args": { + "External id": 86320,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6300866026101.134, "dur": 133.040, + "args": { + "External id": 86321,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], 
[2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866026108.644, "dur": 7.190, + "args": { + "External id": 86322,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026113.134, "dur": 1.250, + "args": { + "External id": 86323,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866026118.104, "dur": 5.560, + "args": { + "External id": 86324,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026122.384, "dur": 0.420, + "args": { + "External id": 86325,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866026125.524, "dur": 2.110, + "args": { + "External id": 86326,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], 
[]], "Ev Idx": 309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026126.904, "dur": 0.330, + "args": { + "External id": 86327,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866026128.474, "dur": 2.510, + "args": { + "External id": 86328,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026130.224, "dur": 0.280, + "args": { + "External id": 86329,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866026135.854, "dur": 3.600, + "args": { + "External id": 86330,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026137.634, "dur": 1.460, + "args": { + "External id": 86331,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input 
Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026141.744, "dur": 5.310, + "args": { + "External id": 86332,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866026145.244, "dur": 1.570, + "args": { + "External id": 86333,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866026148.134, "dur": 2.400, + "args": { + "External id": 86334,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026149.834, "dur": 0.330, + "args": { + "External id": 86335,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026151.384, "dur": 2.420, + "args": { + "External id": 86336,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 
2048, 1], []], "Ev Idx": 319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026152.524, "dur": 1.110, + "args": { + "External id": 86337,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866026155.424, "dur": 64.030, + "args": { + "External id": 86338,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026222.604, "dur": 2.230, + "args": { + "External id": 86339,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866026225.864, "dur": 3.380, + "args": { + "External id": 86340,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026228.094, "dur": 0.540, + "args": { + "External id": 86341,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 324 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026231.874, "dur": 0.800, + "args": { + "External id": 86342,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026281.864, "dur": 11.770, + "args": { + "External id": 86343,"Record function id": 0, "Ev Idx": 326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026285.234, "dur": 6.930, + "args": { + "External id": 86344,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866026287.584, "dur": 3.600, + "args": { + "External id": 86345,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866026288.504, "dur": 2.460, + "args": { + "External id": 86346,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026308.044, "dur": 10.550, + "args": { + "External id": 86347,"Record function id": 0, "Sequence number": 1771168, "Fwd thread id": 1, "Ev Idx": 330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + 
"ts": 6300866026310.124, "dur": 5.510, + "args": { + "External id": 86348,"Sequence number": 1771168, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 331 + } + }, + { + "ph": "f", "id": 34, "pid": 5714, "tid": 6744, "ts": 6300866026310.124, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026312.224, "dur": 3.130, + "args": { + "External id": 86349,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026313.254, "dur": 1.830, + "args": { + "External id": 86350,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026323.224, "dur": 113.900, + "args": { + "External id": 86351,"Record function id": 0, "Sequence number": 1771167, "Fwd thread id": 1, "Ev Idx": 334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026325.684, "dur": 101.000, + "args": { + "External id": 86352,"Sequence number": 1771167, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 335 + } + }, + { + "ph": "f", "id": 35, "pid": 5714, "tid": 6744, "ts": 6300866026325.684, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026329.214, "dur": 5.710, + "args": { + "External id": 86353,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026330.664, "dur": 3.460, + "args": { + "External id": 86354,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026332.594, "dur": 1.180, + "args": { + "External id": 86355,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866026336.254, "dur": 45.340, + "args": { + "External id": 86356,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026383.764, "dur": 7.580, + "args": { + "External id": 86357,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026385.024, "dur": 5.290, + "args": { + "External id": 86358,"Record function id": 0, "Concrete Inputs": 
["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026387.944, "dur": 2.050, + "args": { + "External id": 86359,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026393.364, "dur": 2.920, + "args": { + "External id": 86360,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026394.214, "dur": 1.510, + "args": { + "External id": 86361,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026395.194, "dur": 0.350, + "args": { + "External id": 86362,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866026397.054, "dur": 28.479, + "args": { + "External id": 86363,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input 
Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026447.344, "dur": 11.149, + "args": { + "External id": 86364,"Record function id": 0, "Sequence number": 1771166, "Fwd thread id": 1, "Ev Idx": 347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026449.553, "dur": 6.311, + "args": { + "External id": 86365,"Sequence number": 1771166, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 348 + } + }, + { + "ph": "f", "id": 36, "pid": 5714, "tid": 6744, "ts": 6300866026449.553, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026451.753, "dur": 3.871, + "args": { + "External id": 86366,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026452.853, "dur": 2.571, + "args": { + "External id": 86367,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026463.924, "dur": 10.369, + "args": { + "External id": 86368,"Record function id": 0, "Sequence number": 1771165, "Fwd thread id": 1, "Ev Idx": 351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026465.413, "dur": 
5.700, + "args": { + "External id": 86369,"Sequence number": 1771165, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 352 + } + }, + { + "ph": "f", "id": 37, "pid": 5714, "tid": 6744, "ts": 6300866026465.413, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026466.324, "dur": 4.509, + "args": { + "External id": 86370,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026467.513, "dur": 2.691, + "args": { + "External id": 86371,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026469.153, "dur": 0.760, + "args": { + "External id": 86372,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026479.944, "dur": 8.600, + "args": { + "External id": 86373,"Record function id": 0, "Ev Idx": 356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026482.153, "dur": 5.151, + "args": { + "External id": 86374,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866026484.024, "dur": 2.840, + "args": { + "External id": 86375,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866026484.824, "dur": 1.820, + "args": { + "External id": 86376,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026492.524, "dur": 9.120, + "args": { + "External id": 86377,"Record function id": 0, "Sequence number": 1771164, "Fwd thread id": 1, "Ev Idx": 360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026494.144, "dur": 4.620, + "args": { + "External id": 86378,"Sequence number": 1771164, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 361 + } + }, + { + "ph": "f", "id": 38, "pid": 5714, "tid": 6744, "ts": 6300866026494.144, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026496.553, "dur": 2.020, + "args": { + "External id": 86379,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", 
"pid": 5714, "tid": 6744, + "ts": 6300866026497.333, "dur": 1.051, + "args": { + "External id": 86380,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026505.593, "dur": 96.400, + "args": { + "External id": 86381,"Record function id": 0, "Sequence number": 1771163, "Fwd thread id": 1, "Ev Idx": 364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026506.944, "dur": 84.519, + "args": { + "External id": 86382,"Sequence number": 1771163, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 365 + } + }, + { + "ph": "f", "id": 39, "pid": 5714, "tid": 6744, "ts": 6300866026506.944, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026509.224, "dur": 4.479, + "args": { + "External id": 86383,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026509.893, "dur": 3.300, + "args": { + "External id": 86384,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026512.173, "dur": 0.760, + "args": { + "External id": 86385,"Record function 
id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866026514.743, "dur": 38.330, + "args": { + "External id": 86386,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026555.033, "dur": 5.350, + "args": { + "External id": 86387,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026555.893, "dur": 3.600, + "args": { + "External id": 86388,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026557.433, "dur": 1.810, + "args": { + "External id": 86389,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866026562.083, "dur": 4.460, + "args": { + "External id": 86390,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": 
[[768, 2048]], "Ev Idx": 373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026563.003, "dur": 2.840, + "args": { + "External id": 86391,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026565.123, "dur": 0.550, + "args": { + "External id": 86392,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866026567.333, "dur": 23.220, + "args": { + "External id": 86393,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026612.393, "dur": 38.940, + "args": { + "External id": 86394,"Record function id": 0, "Sequence number": 1771162, "Fwd thread id": 1, "Ev Idx": 377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026614.323, "dur": 6.170, + "args": { + "External id": 86395,"Sequence number": 1771162, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 378 + } + }, + { + "ph": "f", "id": 40, "pid": 5714, "tid": 6744, "ts": 6300866026614.323, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866026616.423, "dur": 3.870, + "args": { + "External id": 86396,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026617.523, "dur": 2.570, + "args": { + "External id": 86397,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866026624.723, "dur": 21.790, + "args": { + "External id": 86398,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026658.293, "dur": 13.950, + "args": { + "External id": 86399,"Record function id": 0, "Sequence number": 1771161, "Fwd thread id": 1, "Ev Idx": 382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026660.353, "dur": 8.750, + "args": { + "External id": 86400,"Sequence number": 1771161, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 383 + } + }, + { + "ph": "f", "id": 41, "pid": 5714, "tid": 6744, "ts": 6300866026660.353, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, 
"tid": 6744, + "ts": 6300866026661.623, "dur": 7.180, + "args": { + "External id": 86401,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866026664.283, "dur": 3.490, + "args": { + "External id": 86402,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026666.453, "dur": 1.010, + "args": { + "External id": 86403,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026677.973, "dur": 8.480, + "args": { + "External id": 86404,"Record function id": 0, "Ev Idx": 387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026680.003, "dur": 5.220, + "args": { + "External id": 86405,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866026681.893, "dur": 2.860, + "args": { + "External id": 86406,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 389 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866026682.673, "dur": 1.880, + "args": { + "External id": 86407,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026690.723, "dur": 98.750, + "args": { + "External id": 86408,"Record function id": 0, "Sequence number": 1771160, "Fwd thread id": 1, "Ev Idx": 391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026691.983, "dur": 45.030, + "args": { + "External id": 86409,"Sequence number": 1771160, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 392 + } + }, + { + "ph": "f", "id": 42, "pid": 5714, "tid": 6744, "ts": 6300866026691.983, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026695.313, "dur": 24.740, + "args": { + "External id": 86410,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026721.913, "dur": 14.580, + "args": { + "External id": 86411,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866026740.313, "dur": 35.000, + "args": { + 
"External id": 86412,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866026780.013, "dur": 2.470, + "args": { + "External id": 86413,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026798.993, "dur": 8.470, + "args": { + "External id": 86414,"Record function id": 0, "Ev Idx": 397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866026801.443, "dur": 4.790, + "args": { + "External id": 86415,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866026803.133, "dur": 2.590, + "args": { + "External id": 86416,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866026803.853, "dur": 1.660, + "args": { + "External id": 86417,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026811.803, 
"dur": 42.450, + "args": { + "External id": 86418,"Record function id": 0, "Sequence number": 1771159, "Fwd thread id": 1, "Ev Idx": 401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026813.243, "dur": 37.260, + "args": { + "External id": 86419,"Sequence number": 1771159, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 402 + } + }, + { + "ph": "f", "id": 43, "pid": 5714, "tid": 6744, "ts": 6300866026813.243, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866026816.443, "dur": 33.590, + "args": { + "External id": 86420,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866026819.043, "dur": 30.690, + "args": { + "External id": 86421,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866026822.883, "dur": 6.010, + "args": { + "External id": 86422,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], 
[]], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866026830.013, "dur": 19.010, + "args": { + "External id": 86423,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026860.743, "dur": 67.180, + "args": { + "External id": 86424,"Record function id": 0, "Sequence number": 1771158, "Fwd thread id": 1, "Ev Idx": 407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026862.443, "dur": 39.449, + "args": { + "External id": 86425,"Sequence number": 1771158, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 408 + } + }, + { + "ph": "f", "id": 44, "pid": 5714, "tid": 6744, "ts": 6300866026862.443, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026864.832, "dur": 18.871, + "args": { + "External id": 86426,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026885.283, "dur": 16.109, + "args": { + "External id": 86427,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], 
"Ev Idx": 410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866026905.223, "dur": 16.440, + "args": { + "External id": 86428,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026935.772, "dur": 82.190, + "args": { + "External id": 86429,"Record function id": 0, "Sequence number": 1771157, "Fwd thread id": 1, "Ev Idx": 412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866026937.663, "dur": 74.489, + "args": { + "External id": 86430,"Sequence number": 1771157, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 413 + } + }, + { + "ph": "f", "id": 45, "pid": 5714, "tid": 6744, "ts": 6300866026937.663, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866026942.952, "dur": 29.220, + "args": { + "External id": 86431,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866026946.423, "dur": 0.729, + "args": { + "External id": 86432,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 
6300866026949.132, "dur": 0.520, + "args": { + "External id": 86433,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026973.972, "dur": 24.020, + "args": { + "External id": 86434,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026979.052, "dur": 17.800, + "args": { + "External id": 86435,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866026998.982, "dur": 11.100, + "args": { + "External id": 86436,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866027025.832, "dur": 4.990, + "args": { + "External id": 86437,"Record function id": 0, "Sequence number": 1771156, "Fwd thread id": 1, "Ev Idx": 420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866027027.612, "dur": 0.500, + "args": { + "External id": 86438,"Sequence number": 1771156, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 
2048, 1]], "Ev Idx": 421 + } + }, + { + "ph": "f", "id": 46, "pid": 5714, "tid": 6744, "ts": 6300866027027.612, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866027034.752, "dur": 47.340, + "args": { + "External id": 86439,"Record function id": 0, "Sequence number": 1771155, "Fwd thread id": 1, "Ev Idx": 422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866027037.232, "dur": 40.090, + "args": { + "External id": 86440,"Sequence number": 1771155, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 423 + } + }, + { + "ph": "f", "id": 47, "pid": 5714, "tid": 6744, "ts": 6300866027037.232, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866027041.202, "dur": 8.180, + "args": { + "External id": 86441,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027046.192, "dur": 1.480, + "args": { + "External id": 86442,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866027051.152, "dur": 25.540, + "args": { + "External id": 86443,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": 
["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866027054.872, "dur": 20.830, + "args": { + "External id": 86444,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027089.362, "dur": 95.530, + "args": { + "External id": 86445,"Record function id": 0, "Sequence number": 1771154, "Fwd thread id": 1, "Ev Idx": 428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027091.472, "dur": 69.180, + "args": { + "External id": 86446,"Sequence number": 1771154, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 429 + } + }, + { + "ph": "f", "id": 48, "pid": 5714, "tid": 6744, "ts": 6300866027091.472, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866027093.522, "dur": 32.230, + "args": { + "External id": 86447,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866027094.892, "dur": 0.360, + "args": { + "External id": 86448,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 431 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866027096.262, "dur": 0.230, + "args": { + "External id": 86449,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866027102.392, "dur": 21.850, + "args": { + "External id": 86450,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866027126.852, "dur": 20.060, + "args": { + "External id": 86451,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866027131.012, "dur": 14.820, + "args": { + "External id": 86452,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866027147.792, "dur": 10.720, + "args": { + "External id": 86453,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866027167.522, "dur": 13.680, + 
"args": { + "External id": 86454,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027195.172, "dur": 60.310, + "args": { + "External id": 86455,"Record function id": 0, "Sequence number": 1771153, "Fwd thread id": 1, "Ev Idx": 438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027197.402, "dur": 29.350, + "args": { + "External id": 86456,"Sequence number": 1771153, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 439 + } + }, + { + "ph": "f", "id": 49, "pid": 5714, "tid": 6744, "ts": 6300866027197.402, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866027199.202, "dur": 27.140, + "args": { + "External id": 86457,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866027200.252, "dur": 25.810, + "args": { + "External id": 86458,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 441 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027204.122, "dur": 5.280, + "args": { + "External id": 86459,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866027210.392, "dur": 15.110, + "args": { + "External id": 86460,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866027231.952, "dur": 17.290, + "args": { + "External id": 86461,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027263.562, "dur": 6.830, + "args": { + "External id": 86462,"Record function id": 0, "Sequence number": 1771152, "Fwd thread id": 1, "Ev Idx": 445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027265.662, "dur": 1.110, + "args": { + "External id": 86463,"Sequence number": 1771152, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 446 + } + }, + { + "ph": "f", "id": 50, 
"pid": 5714, "tid": 6744, "ts": 6300866027265.662, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027274.552, "dur": 20.819, + "args": { + "External id": 86464,"Record function id": 0, "Sequence number": 1771151, "Fwd thread id": 1, "Ev Idx": 447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027283.772, "dur": 8.070, + "args": { + "External id": 86465,"Sequence number": 1771151, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 448 + } + }, + { + "ph": "f", "id": 51, "pid": 5714, "tid": 6744, "ts": 6300866027283.772, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866027285.882, "dur": 5.760, + "args": { + "External id": 86466,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866027288.172, "dur": 3.150, + "args": { + "External id": 86467,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027307.562, "dur": 108.209, + "args": { + "External id": 86468,"Record function id": 0, "Sequence number": 1771150, "Fwd thread id": 1, "Ev Idx": 451 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027309.142, "dur": 100.879, + "args": { + "External id": 86469,"Sequence number": 1771150, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 452 + } + }, + { + "ph": "f", "id": 52, "pid": 5714, "tid": 6744, "ts": 6300866027309.142, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866027311.922, "dur": 6.560, + "args": { + "External id": 86470,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866027313.422, "dur": 4.080, + "args": { + "External id": 86471,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027315.622, "dur": 1.480, + "args": { + "External id": 86472,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866027320.631, "dur": 46.160, + "args": { + "External id": 86473,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866027369.011, "dur": 5.731, + "args": { + "External id": 86474,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866027370.202, "dur": 3.600, + "args": { + "External id": 86475,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027371.871, "dur": 1.700, + "args": { + "External id": 86476,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866027376.622, "dur": 4.180, + "args": { + "External id": 86477,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866027377.631, "dur": 2.671, + "args": { + "External id": 86478,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027379.851, "dur": 0.291, + "args": { + "External id": 86479,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866027381.542, "dur": 27.569, + "args": { + "External id": 86480,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027425.331, "dur": 11.200, + "args": { + "External id": 86481,"Record function id": 0, "Sequence number": 1771149, "Fwd thread id": 1, "Ev Idx": 464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027427.331, "dur": 6.400, + "args": { + "External id": 86482,"Sequence number": 1771149, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 465 + } + }, + { + "ph": "f", "id": 53, "pid": 5714, "tid": 6744, "ts": 6300866027427.331, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866027429.721, "dur": 3.820, + "args": { + "External id": 86483,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866027430.851, "dur": 2.480, + "args": { + "External id": 86484,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], 
[]], "Input Dims": [[16384, 768], []], "Ev Idx": 467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027440.741, "dur": 10.220, + "args": { + "External id": 86485,"Record function id": 0, "Sequence number": 1771148, "Fwd thread id": 1, "Ev Idx": 468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027442.201, "dur": 6.470, + "args": { + "External id": 86486,"Sequence number": 1771148, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 469 + } + }, + { + "ph": "f", "id": 54, "pid": 5714, "tid": 6744, "ts": 6300866027442.201, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866027443.011, "dur": 5.440, + "args": { + "External id": 86487,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866027444.151, "dur": 3.670, + "args": { + "External id": 86488,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027446.741, "dur": 0.810, + "args": { + "External id": 86489,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866027456.741, "dur": 8.580, + "args": { + "External id": 86490,"Record function id": 0, "Ev Idx": 473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866027458.761, "dur": 5.290, + "args": { + "External id": 86491,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866027460.641, "dur": 2.970, + "args": { + "External id": 86492,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866027461.411, "dur": 1.990, + "args": { + "External id": 86493,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027469.351, "dur": 8.060, + "args": { + "External id": 86494,"Record function id": 0, "Sequence number": 1771147, "Fwd thread id": 1, "Ev Idx": 477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866027470.631, "dur": 4.100, + "args": { + "External id": 86495,"Sequence number": 1771147, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 478 + } + }, + { + "ph": "f", "id": 55, "pid": 5714, "tid": 6744, "ts": 6300866027470.631, + "cat": "fwdbwd", "name": "fwdbwd", 
"bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866027472.161, "dur": 2.410, + "args": { + "External id": 86496,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866027472.881, "dur": 1.490, + "args": { + "External id": 86497,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866027483.091, "dur": 315.650, + "args": { + "External id": 86498,"Record function id": 0, "Sequence number": 1771146, "Fwd thread id": 1, "Ev Idx": 481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866027484.961, "dur": 291.769, + "args": { + "External id": 86499,"Sequence number": 1771146, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 482 + } + }, + { + "ph": "f", "id": 56, "pid": 5714, "tid": 6744, "ts": 6300866027484.961, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866027503.291, "dur": 10.270, + "args": { + "External id": 86500,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 
64], [], [], [], [], []], "Ev Idx": 483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027506.261, "dur": 6.530, + "args": { + "External id": 86501,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866027516.141, "dur": 6.030, + "args": { + "External id": 86502,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027517.581, "dur": 4.320, + "args": { + "External id": 86503,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866027523.901, "dur": 5.780, + "args": { + "External id": 86504,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027525.071, "dur": 4.360, + 
"args": { + "External id": 86505,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866027551.561, "dur": 191.240, + "args": { + "External id": 86506,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866027645.211, "dur": 6.370, + "args": { + "External id": 86507,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866027653.591, "dur": 3.490, + "args": { + "External id": 86508,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866027758.461, "dur": 5.769, + "args": { + "External id": 86509,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866027769.201, "dur": 0.720, + "args": { + "External id": 86510,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866027773.101, "dur": 0.600, + "args": { + "External id": 86511,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866027810.641, "dur": 254.649, + "args": { + "External id": 86512,"Record function id": 0, "Sequence number": 1771145, "Fwd thread id": 1, "Ev Idx": 495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866027813.350, "dur": 242.460, + "args": { + "External id": 86513,"Sequence number": 1771145, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 496 + } + }, + { + "ph": 
"f", "id": 57, "pid": 5714, "tid": 6744, "ts": 6300866027813.350, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866027834.620, "dur": 38.990, + "args": { + "External id": 86514,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027838.650, "dur": 7.050, + "args": { + "External id": 86515,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866027846.950, "dur": 25.960, + "args": { + "External id": 86516,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866027884.950, "dur": 8.440, + "args": { + "External id": 86517,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866027888.040, "dur": 4.870, + "args": { + "External id": 86518,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866028078.430, "dur": 175.250, + "args": { + "External id": 86519,"Record function id": 0, "Sequence number": 1771144, "Fwd thread id": 1, "Ev Idx": 502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866028081.470, "dur": 163.710, + "args": { + "External id": 86520,"Sequence number": 1771144, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 503 + } + }, + { + "ph": "f", "id": 58, "pid": 5714, "tid": 6744, "ts": 6300866028081.470, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866028097.080, "dur": 33.790, + "args": { + "External id": 86521,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028100.640, "dur": 6.760, + "args": { + "External id": 86522,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866028108.630, "dur": 21.510, + "args": { + "External id": 86523,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866028139.920, "dur": 7.770, + "args": { + "External id": 86524,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028142.160, "dur": 5.110, + "args": { + "External id": 86525,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028265.969, "dur": 138.910, + "args": { + "External id": 86526,"Record function id": 0, "Sequence number": 1771143, "Fwd thread id": 1, "Ev Idx": 509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028383.609, "dur": 16.990, + "args": { + "External id": 86527,"Sequence number": 1771143, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 510 + } + }, + { + "ph": 
"f", "id": 59, "pid": 5714, "tid": 6744, "ts": 6300866028383.609, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028386.859, "dur": 13.440, + "args": { + "External id": 86528,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028395.739, "dur": 4.170, + "args": { + "External id": 86529,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028410.239, "dur": 7.570, + "args": { + "External id": 86530,"Record function id": 0, "Sequence number": 1771142, "Fwd thread id": 1, "Ev Idx": 513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028411.709, "dur": 3.470, + "args": { + "External id": 86531,"Sequence number": 1771142, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 514 + } + }, + { + "ph": "f", "id": 60, "pid": 5714, "tid": 6744, "ts": 6300866028411.709, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028413.139, "dur": 1.840, + "args": { + "External id": 86532,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028413.839, "dur": 0.950, + "args": { + "External id": 86533,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028421.619, "dur": 7.750, + "args": { + "External id": 86534,"Record function id": 0, "Sequence number": 1771141, "Fwd thread id": 1, "Ev Idx": 517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028422.849, "dur": 4.060, + "args": { + "External id": 86535,"Sequence number": 1771141, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 518 + } + }, + { + "ph": "f", "id": 61, "pid": 5714, "tid": 6744, "ts": 6300866028422.849, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028423.889, "dur": 2.870, + "args": { + "External id": 86536,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028425.769, "dur": 0.800, + "args": { + "External id": 86537,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input 
Dims": [[8, 2048, 12, 64], []], "Ev Idx": 520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028433.329, "dur": 6.900, + "args": { + "External id": 86538,"Record function id": 0, "Sequence number": 1771140, "Fwd thread id": 1, "Ev Idx": 521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028434.949, "dur": 2.870, + "args": { + "External id": 86539,"Sequence number": 1771140, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 522 + } + }, + { + "ph": "f", "id": 62, "pid": 5714, "tid": 6744, "ts": 6300866028434.949, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028435.989, "dur": 1.660, + "args": { + "External id": 86540,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028436.599, "dur": 0.860, + "args": { + "External id": 86541,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028444.129, "dur": 109.050, + "args": { + "External id": 86542,"Record function id": 0, "Sequence number": 1771139, "Fwd thread id": 1, "Ev Idx": 525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, 
+ "ts": 6300866028445.519, "dur": 97.750, + "args": { + "External id": 86543,"Sequence number": 1771139, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 526 + } + }, + { + "ph": "f", "id": 63, "pid": 5714, "tid": 6744, "ts": 6300866028445.519, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028448.669, "dur": 7.500, + "args": { + "External id": 86544,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028450.159, "dur": 5.360, + "args": { + "External id": 86545,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028452.969, "dur": 2.160, + "args": { + "External id": 86546,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866028457.419, "dur": 47.810, + "args": { + "External id": 86547,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 
6300866028507.489, "dur": 4.930, + "args": { + "External id": 86548,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028508.349, "dur": 3.140, + "args": { + "External id": 86549,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028510.039, "dur": 1.140, + "args": { + "External id": 86550,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028514.189, "dur": 3.950, + "args": { + "External id": 86551,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028516.029, "dur": 1.370, + "args": { + "External id": 86552,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028516.909, "dur": 0.310, + "args": { + "External id": 86553,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866028518.839, "dur": 23.520, + "args": { + "External id": 86554,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028562.829, "dur": 10.590, + "args": { + "External id": 86555,"Record function id": 0, "Sequence number": 1771138, "Fwd thread id": 1, "Ev Idx": 538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028565.009, "dur": 6.150, + "args": { + "External id": 86556,"Sequence number": 1771138, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 539 + } + }, + { + "ph": "f", "id": 64, "pid": 5714, "tid": 6744, "ts": 6300866028565.009, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028567.159, "dur": 3.800, + "args": { + "External id": 86557,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028568.229, "dur": 2.530, + "args": { + "External id": 86558,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev 
Idx": 541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028577.499, "dur": 10.720, + "args": { + "External id": 86559,"Record function id": 0, "Sequence number": 1771137, "Fwd thread id": 1, "Ev Idx": 542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028578.779, "dur": 6.760, + "args": { + "External id": 86560,"Sequence number": 1771137, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 543 + } + }, + { + "ph": "f", "id": 65, "pid": 5714, "tid": 6744, "ts": 6300866028578.779, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028580.879, "dur": 4.410, + "args": { + "External id": 86561,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028582.069, "dur": 2.580, + "args": { + "External id": 86562,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028583.599, "dur": 0.770, + "args": { + "External id": 86563,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866028594.039, "dur": 10.260, + "args": { + "External id": 86564,"Record function id": 0, "Ev Idx": 547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866028596.519, "dur": 6.460, + "args": { + "External id": 86565,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866028598.499, "dur": 3.980, + "args": { + "External id": 86566,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866028599.289, "dur": 2.980, + "args": { + "External id": 86567,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028609.709, "dur": 8.679, + "args": { + "External id": 86568,"Record function id": 0, "Sequence number": 1771136, "Fwd thread id": 1, "Ev Idx": 551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028611.309, "dur": 4.410, + "args": { + "External id": 86569,"Sequence number": 1771136, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 552 + } + }, + { + "ph": "f", "id": 66, "pid": 5714, "tid": 6744, "ts": 6300866028611.309, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028612.429, "dur": 3.079, + "args": { + "External id": 86570,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028614.119, "dur": 1.180, + "args": { + "External id": 86571,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028622.459, "dur": 91.289, + "args": { + "External id": 86572,"Record function id": 0, "Sequence number": 1771135, "Fwd thread id": 1, "Ev Idx": 555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028623.799, "dur": 80.609, + "args": { + "External id": 86573,"Sequence number": 1771135, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 556 + } + }, + { + "ph": "f", "id": 67, "pid": 5714, "tid": 6744, "ts": 6300866028623.799, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028626.248, "dur": 3.391, + "args": { + "External id": 86574,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028626.939, "dur": 2.240, + "args": { + 
"External id": 86575,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028628.248, "dur": 0.680, + "args": { + "External id": 86576,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866028630.639, "dur": 38.169, + "args": { + "External id": 86577,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028670.668, "dur": 5.831, + "args": { + "External id": 86578,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028672.868, "dur": 2.691, + "args": { + "External id": 86579,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028674.619, "dur": 0.709, + "args": { + "External id": 86580,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028678.199, "dur": 2.889, + "args": { + "External id": 86581,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028679.128, "dur": 1.480, + "args": { + "External id": 86582,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028680.128, "dur": 0.320, + "args": { + "External id": 86583,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866028681.839, "dur": 21.669, + "args": { + "External id": 86584,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028722.978, "dur": 36.030, + "args": { + "External id": 86585,"Record function id": 0, "Sequence number": 1771134, "Fwd thread id": 1, "Ev Idx": 568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028724.938, 
"dur": 6.810, + "args": { + "External id": 86586,"Sequence number": 1771134, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 569 + } + }, + { + "ph": "f", "id": 68, "pid": 5714, "tid": 6744, "ts": 6300866028724.938, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028727.788, "dur": 3.780, + "args": { + "External id": 86587,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028729.018, "dur": 2.350, + "args": { + "External id": 86588,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866028734.658, "dur": 20.400, + "args": { + "External id": 86589,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028766.048, "dur": 13.240, + "args": { + "External id": 86590,"Record function id": 0, "Sequence number": 1771133, "Fwd thread id": 1, "Ev Idx": 573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028767.998, "dur": 8.490, + "args": { + "External id": 86591,"Sequence 
number": 1771133, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 574 + } + }, + { + "ph": "f", "id": 69, "pid": 5714, "tid": 6744, "ts": 6300866028767.998, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028769.028, "dur": 7.160, + "args": { + "External id": 86592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028770.278, "dur": 4.860, + "args": { + "External id": 86593,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028773.118, "dur": 1.750, + "args": { + "External id": 86594,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866028784.808, "dur": 8.750, + "args": { + "External id": 86595,"Record function id": 0, "Ev Idx": 578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866028786.948, "dur": 5.350, + "args": { + "External id": 86596,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[768, 768]], "Ev Idx": 579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866028788.918, "dur": 2.870, + "args": { + "External id": 86597,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866028789.668, "dur": 1.910, + "args": { + "External id": 86598,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028797.558, "dur": 8.700, + "args": { + "External id": 86599,"Record function id": 0, "Sequence number": 1771132, "Fwd thread id": 1, "Ev Idx": 582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028799.268, "dur": 4.080, + "args": { + "External id": 86600,"Sequence number": 1771132, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 583 + } + }, + { + "ph": "f", "id": 70, "pid": 5714, "tid": 6744, "ts": 6300866028799.268, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866028800.408, "dur": 2.750, + "args": { + "External id": 86601,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028801.328, "dur": 
1.600, + "args": { + "External id": 86602,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028810.198, "dur": 96.420, + "args": { + "External id": 86603,"Record function id": 0, "Sequence number": 1771131, "Fwd thread id": 1, "Ev Idx": 586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028812.458, "dur": 83.560, + "args": { + "External id": 86604,"Sequence number": 1771131, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 587 + } + }, + { + "ph": "f", "id": 71, "pid": 5714, "tid": 6744, "ts": 6300866028812.458, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028814.598, "dur": 4.490, + "args": { + "External id": 86605,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028815.238, "dur": 3.310, + "args": { + "External id": 86606,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028817.478, "dur": 0.810, + "args": { + "External id": 86607,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866028820.048, "dur": 39.950, + "args": { + "External id": 86608,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028862.028, "dur": 5.740, + "args": { + "External id": 86609,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028863.018, "dur": 3.810, + "args": { + "External id": 86610,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028865.638, "dur": 0.940, + "args": { + "External id": 86611,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028869.618, "dur": 2.660, + "args": { + "External id": 86612,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028870.458, "dur": 1.330, + "args": { + "External id": 86613,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028871.368, "dur": 0.270, + "args": { + "External id": 86614,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866028872.998, "dur": 22.110, + "args": { + "External id": 86615,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028916.268, "dur": 31.840, + "args": { + "External id": 86616,"Record function id": 0, "Sequence number": 1771130, "Fwd thread id": 1, "Ev Idx": 599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028918.098, "dur": 7.040, + "args": { + "External id": 86617,"Sequence number": 1771130, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 600 + } + }, + { + "ph": "f", "id": 72, "pid": 5714, "tid": 6744, "ts": 6300866028918.098, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 
6300866028920.278, "dur": 4.680, + "args": { + "External id": 86618,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866028922.318, "dur": 2.430, + "args": { + "External id": 86619,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866028927.978, "dur": 16.400, + "args": { + "External id": 86620,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028954.348, "dur": 11.990, + "args": { + "External id": 86621,"Record function id": 0, "Sequence number": 1771129, "Fwd thread id": 1, "Ev Idx": 604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028956.168, "dur": 6.690, + "args": { + "External id": 86622,"Sequence number": 1771129, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 605 + } + }, + { + "ph": "f", "id": 73, "pid": 5714, "tid": 6744, "ts": 6300866028956.168, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866028957.098, "dur": 5.490, + "args": { + "External id": 
86623,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866028958.328, "dur": 3.310, + "args": { + "External id": 86624,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866028960.338, "dur": 1.000, + "args": { + "External id": 86625,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866028971.918, "dur": 10.310, + "args": { + "External id": 86626,"Record function id": 0, "Ev Idx": 609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866028973.868, "dur": 7.160, + "args": { + "External id": 86627,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866028975.768, "dur": 4.760, + "args": { + "External id": 86628,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866028977.488, "dur": 
2.830, + "args": { + "External id": 86629,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028986.148, "dur": 81.179, + "args": { + "External id": 86630,"Record function id": 0, "Sequence number": 1771128, "Fwd thread id": 1, "Ev Idx": 613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866028987.508, "dur": 36.750, + "args": { + "External id": 86631,"Sequence number": 1771128, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 614 + } + }, + { + "ph": "f", "id": 74, "pid": 5714, "tid": 6744, "ts": 6300866028987.508, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866028989.268, "dur": 20.190, + "args": { + "External id": 86632,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029011.098, "dur": 12.570, + "args": { + "External id": 86633,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866029027.608, "dur": 27.939, + "args": { + "External id": 86634,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", 
""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029058.727, "dur": 2.480, + "args": { + "External id": 86635,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866029077.138, "dur": 9.580, + "args": { + "External id": 86636,"Record function id": 0, "Ev Idx": 619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866029079.998, "dur": 5.329, + "args": { + "External id": 86637,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866029081.698, "dur": 3.100, + "args": { + "External id": 86638,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866029083.018, "dur": 1.580, + "args": { + "External id": 86639,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029091.707, "dur": 36.151, + "args": { + "External id": 86640,"Record function id": 0, "Sequence 
number": 1771127, "Fwd thread id": 1, "Ev Idx": 623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029093.078, "dur": 30.400, + "args": { + "External id": 86641,"Sequence number": 1771127, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 624 + } + }, + { + "ph": "f", "id": 75, "pid": 5714, "tid": 6744, "ts": 6300866029093.078, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866029094.718, "dur": 28.369, + "args": { + "External id": 86642,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866029095.867, "dur": 26.891, + "args": { + "External id": 86643,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029099.167, "dur": 5.971, + "args": { + "External id": 86644,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 627 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866029106.258, "dur": 15.880, + "args": { + "External id": 86645,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029134.727, "dur": 61.580, + "args": { + "External id": 86646,"Record function id": 0, "Sequence number": 1771126, "Fwd thread id": 1, "Ev Idx": 629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029136.458, "dur": 34.429, + "args": { + "External id": 86647,"Sequence number": 1771126, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 630 + } + }, + { + "ph": "f", "id": 76, "pid": 5714, "tid": 6744, "ts": 6300866029136.458, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029138.878, "dur": 16.019, + "args": { + "External id": 86648,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029157.187, "dur": 13.210, + "args": { + "External id": 86649,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 
5714, "tid": 6744, + "ts": 6300866029174.277, "dur": 16.610, + "args": { + "External id": 86650,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029203.967, "dur": 63.250, + "args": { + "External id": 86651,"Record function id": 0, "Sequence number": 1771125, "Fwd thread id": 1, "Ev Idx": 634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029205.847, "dur": 55.860, + "args": { + "External id": 86652,"Sequence number": 1771125, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 635 + } + }, + { + "ph": "f", "id": 77, "pid": 5714, "tid": 6744, "ts": 6300866029205.847, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866029208.547, "dur": 20.390, + "args": { + "External id": 86653,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866029210.417, "dur": 0.510, + "args": { + "External id": 86654,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866029211.977, "dur": 0.290, + "args": { + "External id": 86655,"Record function 
id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029230.217, "dur": 17.160, + "args": { + "External id": 86656,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029233.497, "dur": 12.770, + "args": { + "External id": 86657,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029248.267, "dur": 11.360, + "args": { + "External id": 86658,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866029275.117, "dur": 5.860, + "args": { + "External id": 86659,"Record function id": 0, "Sequence number": 1771124, "Fwd thread id": 1, "Ev Idx": 642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866029277.957, "dur": 0.480, + "args": { + "External id": 86660,"Sequence number": 1771124, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 643 + } + }, + { + "ph": "f", "id": 78, "pid": 5714, "tid": 6744, 
"ts": 6300866029277.957, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866029284.837, "dur": 74.190, + "args": { + "External id": 86661,"Record function id": 0, "Sequence number": 1771123, "Fwd thread id": 1, "Ev Idx": 644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866029286.117, "dur": 68.440, + "args": { + "External id": 86662,"Sequence number": 1771123, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 645 + } + }, + { + "ph": "f", "id": 79, "pid": 5714, "tid": 6744, "ts": 6300866029286.117, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866029288.677, "dur": 5.910, + "args": { + "External id": 86663,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029291.367, "dur": 1.580, + "args": { + "External id": 86664,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866029295.587, "dur": 58.180, + "args": { + "External id": 86665,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], 
[]], "Ev Idx": 648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866029330.957, "dur": 21.540, + "args": { + "External id": 86666,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029367.347, "dur": 92.370, + "args": { + "External id": 86667,"Record function id": 0, "Sequence number": 1771122, "Fwd thread id": 1, "Ev Idx": 650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029369.447, "dur": 69.850, + "args": { + "External id": 86668,"Sequence number": 1771122, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 651 + } + }, + { + "ph": "f", "id": 80, "pid": 5714, "tid": 6744, "ts": 6300866029369.447, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866029371.367, "dur": 31.350, + "args": { + "External id": 86669,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866029372.797, "dur": 0.400, + "args": { + "External id": 86670,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866029374.257, "dur": 
0.250, + "args": { + "External id": 86671,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866029380.737, "dur": 20.460, + "args": { + "External id": 86672,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029404.707, "dur": 19.760, + "args": { + "External id": 86673,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029408.557, "dur": 14.660, + "args": { + "External id": 86674,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866029426.597, "dur": 10.640, + "args": { + "External id": 86675,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866029445.387, "dur": 10.480, + "args": { + "External id": 86676,"Record function id": 0, "Concrete Inputs": ["", "", "1"], 
"Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029468.717, "dur": 45.360, + "args": { + "External id": 86677,"Record function id": 0, "Sequence number": 1771121, "Fwd thread id": 1, "Ev Idx": 660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029470.657, "dur": 27.300, + "args": { + "External id": 86678,"Sequence number": 1771121, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 661 + } + }, + { + "ph": "f", "id": 81, "pid": 5714, "tid": 6744, "ts": 6300866029470.657, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866029472.347, "dur": 25.210, + "args": { + "External id": 86679,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866029473.507, "dur": 23.779, + "args": { + "External id": 86680,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 
6744, + "ts": 6300866029477.357, "dur": 5.730, + "args": { + "External id": 86681,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866029484.187, "dur": 12.519, + "args": { + "External id": 86682,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866029502.717, "dur": 9.100, + "args": { + "External id": 86683,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029522.606, "dur": 6.640, + "args": { + "External id": 86684,"Record function id": 0, "Sequence number": 1771120, "Fwd thread id": 1, "Ev Idx": 667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029524.506, "dur": 1.151, + "args": { + "External id": 86685,"Sequence number": 1771120, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 668 + } + }, + { + "ph": "f", "id": 82, "pid": 5714, "tid": 6744, "ts": 6300866029524.506, + "cat": "fwdbwd", "name": "fwdbwd", "bp": 
"e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866029534.457, "dur": 319.629, + "args": { + "External id": 86686,"Record function id": 0, "Sequence number": 1771119, "Fwd thread id": 1, "Ev Idx": 669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866029536.277, "dur": 306.549, + "args": { + "External id": 86687,"Sequence number": 1771119, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 670 + } + }, + { + "ph": "f", "id": 83, "pid": 5714, "tid": 6744, "ts": 6300866029536.277, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866029564.566, "dur": 8.191, + "args": { + "External id": 86688,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866029568.486, "dur": 3.700, + "args": { + "External id": 86689,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866029575.417, "dur": 5.300, + "args": { + "External id": 86690,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 673 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866029576.817, "dur": 3.089, + "args": { + "External id": 86691,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029578.677, "dur": 0.920, + "args": { + "External id": 86692,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6300866029584.246, "dur": 43.600, + "args": { + "External id": 86693,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866029584.906, "dur": 2.540, + "args": { + "External id": 86694,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866029585.506, "dur": 1.531, + "args": { + "External id": 86695,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029586.546, "dur": 0.291, + "args": { + "External id": 
86696,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6300866029588.437, "dur": 38.489, + "args": { + "External id": 86697,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866029589.457, "dur": 36.609, + "args": { + "External id": 86698,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6300866029632.636, "dur": 5.150, + "args": { + "External id": 86699,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029635.146, "dur": 2.410, + "args": { + "External id": 86700,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866029660.816, "dur": 6.530, + "args": { + "External id": 86701,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], 
"Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866029668.646, "dur": 4.250, + "args": { + "External id": 86702,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866029673.866, "dur": 3.960, + "args": { + "External id": 86703,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866029707.696, "dur": 4.270, + "args": { + "External id": 86704,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029709.126, "dur": 2.460, + "args": { + "External id": 86705,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6300866029725.736, "dur": 99.480, + "args": { + "External id": 86706,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", 
""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866029729.356, "dur": 5.080, + "args": { + "External id": 86707,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029732.186, "dur": 1.270, + "args": { + "External id": 86708,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866029735.716, "dur": 3.430, + "args": { + "External id": 86709,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029738.116, "dur": 0.390, + "args": { + "External id": 86710,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866029740.536, "dur": 1.940, + "args": { + "External id": 86711,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], 
[]], "Input Dims": [[16384, 2048], []], "Ev Idx": 694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029741.746, "dur": 0.340, + "args": { + "External id": 86712,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866029743.416, "dur": 2.230, + "args": { + "External id": 86713,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029744.916, "dur": 0.350, + "args": { + "External id": 86714,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866029748.726, "dur": 2.350, + "args": { + "External id": 86715,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029750.036, "dur": 0.690, + "args": { + "External id": 86716,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 
1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866029752.036, "dur": 5.050, + "args": { + "External id": 86717,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866029755.336, "dur": 1.500, + "args": { + "External id": 86718,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866029759.186, "dur": 2.880, + "args": { + "External id": 86719,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029760.506, "dur": 1.190, + "args": { + "External id": 86720,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866029762.866, "dur": 2.440, + "args": { + "External id": 86721,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], 
[]], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029763.666, "dur": 1.480, + "args": { + "External id": 86722,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866029766.336, "dur": 45.990, + "args": { + "External id": 86723,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029815.146, "dur": 1.860, + "args": { + "External id": 86724,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866029817.936, "dur": 3.150, + "args": { + "External id": 86725,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029819.946, "dur": 0.460, + "args": { + "External id": 86726,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 
709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029823.446, "dur": 0.800, + "args": { + "External id": 86727,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866029867.926, "dur": 11.440, + "args": { + "External id": 86728,"Record function id": 0, "Ev Idx": 711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866029870.936, "dur": 6.980, + "args": { + "External id": 86729,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866029873.326, "dur": 3.610, + "args": { + "External id": 86730,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866029874.236, "dur": 2.470, + "args": { + "External id": 86731,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029883.976, "dur": 9.180, + "args": { + "External id": 86732,"Record function id": 0, "Sequence number": 1771118, "Fwd thread id": 1, "Ev Idx": 715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", 
"pid": 5714, "tid": 6744, + "ts": 6300866029885.606, "dur": 4.950, + "args": { + "External id": 86733,"Sequence number": 1771118, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 716 + } + }, + { + "ph": "f", "id": 84, "pid": 5714, "tid": 6744, "ts": 6300866029885.606, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866029887.366, "dur": 2.920, + "args": { + "External id": 86734,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866029888.386, "dur": 1.640, + "args": { + "External id": 86735,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029897.286, "dur": 103.190, + "args": { + "External id": 86736,"Record function id": 0, "Sequence number": 1771117, "Fwd thread id": 1, "Ev Idx": 719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866029898.646, "dur": 92.759, + "args": { + "External id": 86737,"Sequence number": 1771117, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 720 + } + }, + { + "ph": "f", "id": 85, "pid": 5714, "tid": 6744, "ts": 6300866029898.646, + "cat": "fwdbwd", "name": "fwdbwd", 
"bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866029902.406, "dur": 5.240, + "args": { + "External id": 86738,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866029903.826, "dur": 3.070, + "args": { + "External id": 86739,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029905.416, "dur": 1.090, + "args": { + "External id": 86740,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866029908.886, "dur": 42.639, + "args": { + "External id": 86741,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866029953.685, "dur": 5.660, + "args": { + "External id": 86742,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866029954.625, "dur": 3.760, + "args": { + "External id": 86743,"Record 
function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029957.265, "dur": 0.840, + "args": { + "External id": 86744,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866029961.336, "dur": 3.340, + "args": { + "External id": 86745,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866029962.256, "dur": 1.789, + "args": { + "External id": 86746,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866029963.296, "dur": 0.600, + "args": { + "External id": 86747,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866029965.376, "dur": 25.129, + "args": { + "External id": 86748,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030010.336, "dur": 11.340, + "args": { + "External id": 86749,"Record function id": 0, "Sequence number": 1771116, "Fwd thread id": 1, "Ev Idx": 732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030013.065, "dur": 6.291, + "args": { + "External id": 86750,"Sequence number": 1771116, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 733 + } + }, + { + "ph": "f", "id": 86, "pid": 5714, "tid": 6744, "ts": 6300866030013.065, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866030015.105, "dur": 4.051, + "args": { + "External id": 86751,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030016.345, "dur": 2.580, + "args": { + "External id": 86752,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030025.776, "dur": 10.779, + "args": { + "External id": 86753,"Record function id": 0, "Sequence number": 1771115, "Fwd thread id": 1, "Ev Idx": 736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 
6744, + "ts": 6300866030028.145, "dur": 5.720, + "args": { + "External id": 86754,"Sequence number": 1771115, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 737 + } + }, + { + "ph": "f", "id": 87, "pid": 5714, "tid": 6744, "ts": 6300866030028.145, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030028.985, "dur": 4.630, + "args": { + "External id": 86755,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030030.176, "dur": 2.789, + "args": { + "External id": 86756,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030031.936, "dur": 0.759, + "args": { + "External id": 86757,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030042.105, "dur": 8.060, + "args": { + "External id": 86758,"Record function id": 0, "Ev Idx": 741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030044.045, "dur": 4.930, + "args": { + "External id": 86759,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866030045.835, "dur": 2.650, + "args": { + "External id": 86760,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866030046.605, "dur": 1.690, + "args": { + "External id": 86761,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030054.265, "dur": 8.070, + "args": { + "External id": 86762,"Record function id": 0, "Sequence number": 1771114, "Fwd thread id": 1, "Ev Idx": 745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030055.885, "dur": 3.940, + "args": { + "External id": 86763,"Sequence number": 1771114, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 746 + } + }, + { + "ph": "f", "id": 88, "pid": 5714, "tid": 6744, "ts": 6300866030055.885, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866030056.935, "dur": 2.760, + "args": { + "External id": 86764,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 747 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030058.585, "dur": 0.920, + "args": { + "External id": 86765,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030066.415, "dur": 96.730, + "args": { + "External id": 86766,"Record function id": 0, "Sequence number": 1771113, "Fwd thread id": 1, "Ev Idx": 749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030067.755, "dur": 84.460, + "args": { + "External id": 86767,"Sequence number": 1771113, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 750 + } + }, + { + "ph": "f", "id": 89, "pid": 5714, "tid": 6744, "ts": 6300866030067.755, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030069.845, "dur": 3.530, + "args": { + "External id": 86768,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030070.535, "dur": 2.300, + "args": { + "External id": 86769,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030071.925, "dur": 0.690, + "args": { + 
"External id": 86770,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866030075.345, "dur": 38.260, + "args": { + "External id": 86771,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030115.615, "dur": 4.400, + "args": { + "External id": 86772,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030116.555, "dur": 2.570, + "args": { + "External id": 86773,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030118.015, "dur": 0.860, + "args": { + "External id": 86774,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030121.855, "dur": 4.950, + "args": { + "External id": 86775,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030122.725, "dur": 3.570, + "args": { + "External id": 86776,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030124.675, "dur": 1.460, + "args": { + "External id": 86777,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866030127.515, "dur": 23.780, + "args": { + "External id": 86778,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030172.365, "dur": 35.400, + "args": { + "External id": 86779,"Record function id": 0, "Sequence number": 1771112, "Fwd thread id": 1, "Ev Idx": 762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030174.425, "dur": 6.000, + "args": { + "External id": 86780,"Sequence number": 1771112, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 763 + } + }, + { + "ph": "f", "id": 90, "pid": 5714, "tid": 6744, "ts": 6300866030174.425, + "cat": "fwdbwd", 
"name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866030176.575, "dur": 3.650, + "args": { + "External id": 86781,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030177.585, "dur": 2.450, + "args": { + "External id": 86782,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866030183.395, "dur": 20.180, + "args": { + "External id": 86783,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030214.535, "dur": 12.770, + "args": { + "External id": 86784,"Record function id": 0, "Sequence number": 1771111, "Fwd thread id": 1, "Ev Idx": 767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030216.575, "dur": 7.770, + "args": { + "External id": 86785,"Sequence number": 1771111, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 768 + } + }, + { + "ph": "f", "id": 91, "pid": 5714, "tid": 6744, "ts": 6300866030216.575, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030218.385, "dur": 5.680, + "args": { + "External id": 86786,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030219.805, "dur": 3.190, + "args": { + "External id": 86787,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030221.695, "dur": 1.020, + "args": { + "External id": 86788,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030232.965, "dur": 8.160, + "args": { + "External id": 86789,"Record function id": 0, "Ev Idx": 772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030234.925, "dur": 4.900, + "args": { + "External id": 86790,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866030236.765, "dur": 2.570, + "args": { + "External id": 86791,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev 
Idx": 774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866030237.475, "dur": 1.650, + "args": { + "External id": 86792,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030245.865, "dur": 90.630, + "args": { + "External id": 86793,"Record function id": 0, "Sequence number": 1771110, "Fwd thread id": 1, "Ev Idx": 776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030247.175, "dur": 34.630, + "args": { + "External id": 86794,"Sequence number": 1771110, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 777 + } + }, + { + "ph": "f", "id": 92, "pid": 5714, "tid": 6744, "ts": 6300866030247.175, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030249.085, "dur": 18.470, + "args": { + "External id": 86795,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030269.115, "dur": 12.240, + "args": { + "External id": 86796,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 
6300866030284.945, "dur": 37.260, + "args": { + "External id": 86797,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030326.885, "dur": 3.950, + "args": { + "External id": 86798,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030346.865, "dur": 8.970, + "args": { + "External id": 86799,"Record function id": 0, "Ev Idx": 782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030349.395, "dur": 5.160, + "args": { + "External id": 86800,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866030351.375, "dur": 2.650, + "args": { + "External id": 86801,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866030352.095, "dur": 1.720, + "args": { + "External id": 86802,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 
5714, "tid": 6744, + "ts": 6300866030360.055, "dur": 37.509, + "args": { + "External id": 86803,"Record function id": 0, "Sequence number": 1771109, "Fwd thread id": 1, "Ev Idx": 786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030361.435, "dur": 31.760, + "args": { + "External id": 86804,"Sequence number": 1771109, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 787 + } + }, + { + "ph": "f", "id": 93, "pid": 5714, "tid": 6744, "ts": 6300866030361.435, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866030363.055, "dur": 29.709, + "args": { + "External id": 86805,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866030365.165, "dur": 27.319, + "args": { + "External id": 86806,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030368.435, "dur": 5.970, + "args": { + "External id": 86807,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", 
"Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866030375.535, "dur": 16.269, + "args": { + "External id": 86808,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030404.124, "dur": 59.160, + "args": { + "External id": 86809,"Record function id": 0, "Sequence number": 1771108, "Fwd thread id": 1, "Ev Idx": 792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030405.764, "dur": 33.491, + "args": { + "External id": 86810,"Sequence number": 1771108, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 793 + } + }, + { + "ph": "f", "id": 94, "pid": 5714, "tid": 6744, "ts": 6300866030405.764, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030407.964, "dur": 15.751, + "args": { + "External id": 86811,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030425.135, "dur": 13.560, + "args": { + "External id": 86812,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 
1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866030442.684, "dur": 15.840, + "args": { + "External id": 86813,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030471.104, "dur": 62.720, + "args": { + "External id": 86814,"Record function id": 0, "Sequence number": 1771107, "Fwd thread id": 1, "Ev Idx": 797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030473.024, "dur": 55.820, + "args": { + "External id": 86815,"Sequence number": 1771107, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 798 + } + }, + { + "ph": "f", "id": 95, "pid": 5714, "tid": 6744, "ts": 6300866030473.024, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866030476.874, "dur": 19.860, + "args": { + "External id": 86816,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866030478.484, "dur": 0.490, + "args": { + "External id": 86817,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 800 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866030480.044, "dur": 0.270, + "args": { + "External id": 86818,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030498.174, "dur": 17.810, + "args": { + "External id": 86819,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030501.564, "dur": 13.330, + "args": { + "External id": 86820,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030516.914, "dur": 10.120, + "args": { + "External id": 86821,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866030541.614, "dur": 4.500, + "args": { + "External id": 86822,"Record function id": 0, "Sequence number": 1771106, "Fwd thread id": 1, "Ev Idx": 805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866030543.534, "dur": 0.470, + "args": { + "External id": 86823,"Sequence number": 1771106, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], 
"Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 806 + } + }, + { + "ph": "f", "id": 96, "pid": 5714, "tid": 6744, "ts": 6300866030543.534, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866030549.944, "dur": 38.120, + "args": { + "External id": 86824,"Record function id": 0, "Sequence number": 1771105, "Fwd thread id": 1, "Ev Idx": 807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866030551.234, "dur": 32.870, + "args": { + "External id": 86825,"Sequence number": 1771105, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 808 + } + }, + { + "ph": "f", "id": 97, "pid": 5714, "tid": 6744, "ts": 6300866030551.234, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866030553.864, "dur": 5.600, + "args": { + "External id": 86826,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030556.314, "dur": 1.490, + "args": { + "External id": 86827,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866030561.454, "dur": 21.950, + "args": { + "External id": 86828,"Record function id": 
0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866030563.984, "dur": 18.460, + "args": { + "External id": 86829,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030594.954, "dur": 88.970, + "args": { + "External id": 86830,"Record function id": 0, "Sequence number": 1771104, "Fwd thread id": 1, "Ev Idx": 813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030596.754, "dur": 67.290, + "args": { + "External id": 86831,"Sequence number": 1771104, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 814 + } + }, + { + "ph": "f", "id": 98, "pid": 5714, "tid": 6744, "ts": 6300866030596.754, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866030598.634, "dur": 31.420, + "args": { + "External id": 86832,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866030599.794, "dur": 0.380, + "args": { + "External id": 86833,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev 
Idx": 816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866030601.064, "dur": 0.220, + "args": { + "External id": 86834,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866030607.144, "dur": 21.430, + "args": { + "External id": 86835,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030631.164, "dur": 17.900, + "args": { + "External id": 86836,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030633.854, "dur": 14.140, + "args": { + "External id": 86837,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866030650.144, "dur": 11.580, + "args": { + "External id": 86838,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + 
"ts": 6300866030669.984, "dur": 10.380, + "args": { + "External id": 86839,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030693.004, "dur": 54.720, + "args": { + "External id": 86840,"Record function id": 0, "Sequence number": 1771103, "Fwd thread id": 1, "Ev Idx": 823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030695.064, "dur": 27.160, + "args": { + "External id": 86841,"Sequence number": 1771103, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 824 + } + }, + { + "ph": "f", "id": 99, "pid": 5714, "tid": 6744, "ts": 6300866030695.064, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866030697.664, "dur": 24.130, + "args": { + "External id": 86842,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866030698.694, "dur": 22.770, + "args": { + "External id": 86843,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], 
[], [], [], [], [], []], "Ev Idx": 826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030702.124, "dur": 5.410, + "args": { + "External id": 86844,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866030708.544, "dur": 12.360, + "args": { + "External id": 86845,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866030727.154, "dur": 14.880, + "args": { + "External id": 86846,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030757.274, "dur": 20.530, + "args": { + "External id": 86847,"Record function id": 0, "Sequence number": 1771102, "Fwd thread id": 1, "Ev Idx": 830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030759.384, "dur": 1.410, + "args": { + "External id": 86848,"Sequence number": 1771102, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 
831 + } + }, + { + "ph": "f", "id": 100, "pid": 5714, "tid": 6744, "ts": 6300866030759.384, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866030763.874, "dur": 11.350, + "args": { + "External id": 86849,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030783.354, "dur": 10.710, + "args": { + "External id": 86850,"Record function id": 0, "Sequence number": 1771101, "Fwd thread id": 1, "Ev Idx": 833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030784.944, "dur": 6.380, + "args": { + "External id": 86851,"Sequence number": 1771101, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 834 + } + }, + { + "ph": "f", "id": 101, "pid": 5714, "tid": 6744, "ts": 6300866030784.944, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866030786.934, "dur": 4.180, + "args": { + "External id": 86852,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030787.814, "dur": 3.040, + "args": { + "External id": 86853,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030798.164, "dur": 97.319, + "args": { + "External id": 86854,"Record function id": 0, "Sequence number": 1771100, "Fwd thread id": 1, "Ev Idx": 837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030799.534, "dur": 89.509, + "args": { + "External id": 86855,"Sequence number": 1771100, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 838 + } + }, + { + "ph": "f", "id": 102, "pid": 5714, "tid": 6744, "ts": 6300866030799.534, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030803.154, "dur": 5.570, + "args": { + "External id": 86856,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030804.644, "dur": 3.190, + "args": { + "External id": 86857,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030806.224, "dur": 1.250, + "args": { + "External id": 86858,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 
768], [], [], []], "Ev Idx": 841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866030809.754, "dur": 42.869, + "args": { + "External id": 86859,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030854.734, "dur": 6.000, + "args": { + "External id": 86860,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030855.674, "dur": 4.140, + "args": { + "External id": 86861,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030858.154, "dur": 1.429, + "args": { + "External id": 86862,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030862.563, "dur": 2.711, + "args": { + "External id": 86863,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030863.483, "dur": 1.231, + "args": { + "External id": 
86864,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030864.223, "dur": 0.331, + "args": { + "External id": 86865,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866030865.974, "dur": 22.169, + "args": { + "External id": 86866,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030904.443, "dur": 10.580, + "args": { + "External id": 86867,"Record function id": 0, "Sequence number": 1771099, "Fwd thread id": 1, "Ev Idx": 850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030906.543, "dur": 5.520, + "args": { + "External id": 86868,"Sequence number": 1771099, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 851 + } + }, + { + "ph": "f", "id": 103, "pid": 5714, "tid": 6744, "ts": 6300866030906.543, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866030908.543, "dur": 3.351, + "args": { + "External id": 86869,"Record function id": 0, "Concrete Inputs": 
["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030909.423, "dur": 2.260, + "args": { + "External id": 86870,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030919.003, "dur": 9.260, + "args": { + "External id": 86871,"Record function id": 0, "Sequence number": 1771098, "Fwd thread id": 1, "Ev Idx": 854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030920.333, "dur": 5.890, + "args": { + "External id": 86872,"Sequence number": 1771098, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 855 + } + }, + { + "ph": "f", "id": 104, "pid": 5714, "tid": 6744, "ts": 6300866030920.333, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866030921.983, "dur": 4.020, + "args": { + "External id": 86873,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866030922.993, "dur": 2.400, + "args": { + "External id": 86874,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 857 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030924.583, "dur": 0.550, + "args": { + "External id": 86875,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030934.053, "dur": 8.450, + "args": { + "External id": 86876,"Record function id": 0, "Ev Idx": 859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866030935.943, "dur": 5.290, + "args": { + "External id": 86877,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866030937.863, "dur": 2.910, + "args": { + "External id": 86878,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866030938.653, "dur": 1.910, + "args": { + "External id": 86879,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030946.433, "dur": 8.220, + "args": { + "External id": 86880,"Record function id": 0, "Sequence number": 1771097, "Fwd thread id": 1, "Ev Idx": 863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866030947.683, "dur": 4.550, + "args": { + "External id": 86881,"Sequence number": 1771097, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 864 + } + }, + { + "ph": "f", "id": 105, "pid": 5714, "tid": 6744, "ts": 6300866030947.683, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866030948.803, "dur": 3.280, + "args": { + "External id": 86882,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866030950.583, "dur": 1.310, + "args": { + "External id": 86883,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866030959.753, "dur": 236.410, + "args": { + "External id": 86884,"Record function id": 0, "Sequence number": 1771096, "Fwd thread id": 1, "Ev Idx": 867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866030961.393, "dur": 216.390, + "args": { + "External id": 86885,"Sequence number": 1771096, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 868 + } + }, + { + "ph": "f", "id": 106, "pid": 5714, "tid": 6744, "ts": 
6300866030961.393, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866030974.173, "dur": 8.980, + "args": { + "External id": 86886,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030976.353, "dur": 6.250, + "args": { + "External id": 86887,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866030985.663, "dur": 6.380, + "args": { + "External id": 86888,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030987.153, "dur": 4.590, + "args": { + "External id": 86889,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866030995.063, 
"dur": 5.610, + "args": { + "External id": 86890,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866030995.933, "dur": 4.460, + "args": { + "External id": 86891,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866031017.323, "dur": 132.600, + "args": { + "External id": 86892,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866031070.393, "dur": 6.220, + "args": { + 
"External id": 86893,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866031079.123, "dur": 3.630, + "args": { + "External id": 86894,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866031162.393, "dur": 3.590, + "args": { + "External id": 86895,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866031170.723, "dur": 0.630, + "args": { + "External id": 86896,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866031174.273, "dur": 0.530, + "args": { + "External id": 86897,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866031207.563, "dur": 198.759, + "args": { + "External id": 86898,"Record function id": 0, "Sequence number": 1771095, "Fwd thread 
id": 1, "Ev Idx": 881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866031210.113, "dur": 186.259, + "args": { + "External id": 86899,"Sequence number": 1771095, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 882 + } + }, + { + "ph": "f", "id": 107, "pid": 5714, "tid": 6744, "ts": 6300866031210.113, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866031226.533, "dur": 37.380, + "args": { + "External id": 86900,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031229.953, "dur": 6.700, + "args": { + "External id": 86901,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866031237.863, "dur": 25.430, + "args": { + "External id": 86902,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866031273.353, "dur": 6.920, + "args": { + "External id": 86903,"Record 
function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031275.062, "dur": 4.751, + "args": { + "External id": 86904,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866031419.292, "dur": 174.610, + "args": { + "External id": 86905,"Record function id": 0, "Sequence number": 1771094, "Fwd thread id": 1, "Ev Idx": 888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866031422.242, "dur": 162.550, + "args": { + "External id": 86906,"Sequence number": 1771094, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 889 + } + }, + { + "ph": "f", "id": 108, "pid": 5714, "tid": 6744, "ts": 6300866031422.242, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866031437.302, "dur": 36.030, + "args": { + "External id": 86907,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031441.272, "dur": 7.080, + "args": { + "External id": 86908,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866031449.652, "dur": 23.060, + "args": { + "External id": 86909,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866031482.252, "dur": 7.550, + "args": { + "External id": 86910,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031484.312, "dur": 5.040, + "args": { + "External id": 86911,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031606.652, "dur": 14.010, + "args": { + "External id": 86912,"Record function id": 0, "Sequence number": 
1771093, "Fwd thread id": 1, "Ev Idx": 895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031609.212, "dur": 8.540, + "args": { + "External id": 86913,"Sequence number": 1771093, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 896 + } + }, + { + "ph": "f", "id": 109, "pid": 5714, "tid": 6744, "ts": 6300866031609.212, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031612.072, "dur": 5.390, + "args": { + "External id": 86914,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031613.292, "dur": 3.890, + "args": { + "External id": 86915,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031625.122, "dur": 9.080, + "args": { + "External id": 86916,"Record function id": 0, "Sequence number": 1771092, "Fwd thread id": 1, "Ev Idx": 899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031626.742, "dur": 5.240, + "args": { + "External id": 86917,"Sequence number": 1771092, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 
64]], "Ev Idx": 900 + } + }, + { + "ph": "f", "id": 110, "pid": 5714, "tid": 6744, "ts": 6300866031626.742, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031627.942, "dur": 3.850, + "args": { + "External id": 86918,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031630.652, "dur": 0.950, + "args": { + "External id": 86919,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031638.002, "dur": 6.650, + "args": { + "External id": 86920,"Record function id": 0, "Sequence number": 1771091, "Fwd thread id": 1, "Ev Idx": 903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031639.292, "dur": 3.160, + "args": { + "External id": 86921,"Sequence number": 1771091, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 904 + } + }, + { + "ph": "f", "id": 111, "pid": 5714, "tid": 6744, "ts": 6300866031639.292, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031640.422, "dur": 1.850, + "args": { + "External id": 86922,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031641.222, "dur": 0.860, + "args": { + "External id": 86923,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031649.692, "dur": 6.500, + "args": { + "External id": 86924,"Record function id": 0, "Sequence number": 1771090, "Fwd thread id": 1, "Ev Idx": 907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031651.042, "dur": 2.880, + "args": { + "External id": 86925,"Sequence number": 1771090, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 908 + } + }, + { + "ph": "f", "id": 112, "pid": 5714, "tid": 6744, "ts": 6300866031651.042, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031651.972, "dur": 1.780, + "args": { + "External id": 86926,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031652.682, "dur": 0.850, + "args": { + "External id": 86927,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031660.132, "dur": 107.249, + "args": { + "External id": 86928,"Record function id": 0, "Sequence number": 1771089, "Fwd thread id": 1, "Ev Idx": 911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031662.732, "dur": 95.080, + "args": { + "External id": 86929,"Sequence number": 1771089, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 912 + } + }, + { + "ph": "f", "id": 113, "pid": 5714, "tid": 6744, "ts": 6300866031662.732, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031665.862, "dur": 6.520, + "args": { + "External id": 86930,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031667.322, "dur": 4.380, + "args": { + "External id": 86931,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031669.092, "dur": 2.210, + "args": { + "External id": 86932,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 915 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866031673.442, "dur": 46.099, + "args": { + "External id": 86933,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031721.872, "dur": 6.360, + "args": { + "External id": 86934,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031723.021, "dur": 4.200, + "args": { + "External id": 86935,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031726.152, "dur": 0.829, + "args": { + "External id": 86936,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031730.021, "dur": 3.460, + "args": { + "External id": 86937,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031731.032, "dur": 1.840, + "args": { + "External id": 86938,"Record function id": 0, "Concrete Inputs": 
["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031732.381, "dur": 0.320, + "args": { + "External id": 86939,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866031734.212, "dur": 22.669, + "args": { + "External id": 86940,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031776.912, "dur": 11.220, + "args": { + "External id": 86941,"Record function id": 0, "Sequence number": 1771088, "Fwd thread id": 1, "Ev Idx": 924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031778.841, "dur": 7.051, + "args": { + "External id": 86942,"Sequence number": 1771088, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 925 + } + }, + { + "ph": "f", "id": 114, "pid": 5714, "tid": 6744, "ts": 6300866031778.841, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031781.081, "dur": 4.620, + "args": { + "External id": 86943,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031782.941, "dur": 2.531, + "args": { + "External id": 86944,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031792.711, "dur": 9.680, + "args": { + "External id": 86945,"Record function id": 0, "Sequence number": 1771087, "Fwd thread id": 1, "Ev Idx": 928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031794.071, "dur": 5.340, + "args": { + "External id": 86946,"Sequence number": 1771087, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 929 + } + }, + { + "ph": "f", "id": 115, "pid": 5714, "tid": 6744, "ts": 6300866031794.071, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031794.891, "dur": 4.290, + "args": { + "External id": 86947,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031795.951, "dur": 2.580, + "args": { + "External id": 86948,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 931 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031797.501, "dur": 0.750, + "args": { + "External id": 86949,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866031808.271, "dur": 10.570, + "args": { + "External id": 86950,"Record function id": 0, "Ev Idx": 933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866031810.311, "dur": 7.230, + "args": { + "External id": 86951,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866031812.261, "dur": 4.830, + "args": { + "External id": 86952,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866031814.061, "dur": 2.810, + "args": { + "External id": 86953,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031822.851, "dur": 7.410, + "args": { + "External id": 86954,"Record function id": 0, "Sequence number": 1771086, "Fwd thread id": 1, "Ev Idx": 937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031824.361, "dur": 3.710, + "args": { + "External id": 86955,"Sequence number": 1771086, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 938 + } + }, + { + "ph": "f", "id": 116, "pid": 5714, "tid": 6744, "ts": 6300866031824.361, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031825.651, "dur": 2.270, + "args": { + "External id": 86956,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031826.431, "dur": 1.290, + "args": { + "External id": 86957,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031834.051, "dur": 90.520, + "args": { + "External id": 86958,"Record function id": 0, "Sequence number": 1771085, "Fwd thread id": 1, "Ev Idx": 941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031835.361, "dur": 80.840, + "args": { + "External id": 86959,"Sequence number": 1771085, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 942 + } + }, + { + "ph": "f", "id": 117, "pid": 5714, "tid": 6744, "ts": 6300866031835.361, + "cat": "fwdbwd", "name": 
"fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031837.461, "dur": 4.360, + "args": { + "External id": 86960,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031838.121, "dur": 3.270, + "args": { + "External id": 86961,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031840.511, "dur": 0.650, + "args": { + "External id": 86962,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866031842.621, "dur": 38.320, + "args": { + "External id": 86963,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031882.851, "dur": 4.550, + "args": { + "External id": 86964,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031883.741, "dur": 2.650, + "args": { + "External id": 86965,"Record 
function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031885.311, "dur": 0.830, + "args": { + "External id": 86966,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031889.021, "dur": 3.600, + "args": { + "External id": 86967,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031890.721, "dur": 1.420, + "args": { + "External id": 86968,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031891.721, "dur": 0.260, + "args": { + "External id": 86969,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866031893.351, "dur": 21.920, + "args": { + "External id": 86970,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031934.111, "dur": 36.100, + "args": { + "External id": 86971,"Record function id": 0, "Sequence number": 1771084, "Fwd thread id": 1, "Ev Idx": 954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031936.161, "dur": 6.080, + "args": { + "External id": 86972,"Sequence number": 1771084, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 955 + } + }, + { + "ph": "f", "id": 118, "pid": 5714, "tid": 6744, "ts": 6300866031936.161, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866031938.391, "dur": 3.650, + "args": { + "External id": 86973,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866031939.471, "dur": 2.370, + "args": { + "External id": 86974,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866031945.111, "dur": 20.530, + "args": { + "External id": 86975,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 
768], []], "Ev Idx": 958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031977.191, "dur": 14.980, + "args": { + "External id": 86976,"Record function id": 0, "Sequence number": 1771083, "Fwd thread id": 1, "Ev Idx": 959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866031979.141, "dur": 10.280, + "args": { + "External id": 86977,"Sequence number": 1771083, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 960 + } + }, + { + "ph": "f", "id": 119, "pid": 5714, "tid": 6744, "ts": 6300866031979.141, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866031980.131, "dur": 8.990, + "args": { + "External id": 86978,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866031981.461, "dur": 6.550, + "args": { + "External id": 86979,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866031985.841, "dur": 1.840, + "args": { + "External id": 86980,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866031998.181, "dur": 9.480, + "args": { + "External id": 86981,"Record function id": 0, "Ev Idx": 964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866032000.261, "dur": 6.140, + "args": { + "External id": 86982,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866032002.121, "dur": 3.780, + "args": { + "External id": 86983,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866032003.811, "dur": 1.880, + "args": { + "External id": 86984,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032012.341, "dur": 8.340, + "args": { + "External id": 86985,"Record function id": 0, "Sequence number": 1771082, "Fwd thread id": 1, "Ev Idx": 968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032013.871, "dur": 3.980, + "args": { + "External id": 86986,"Sequence number": 1771082, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 969 + } + }, + { + "ph": "f", "id": 120, "pid": 5714, "tid": 6744, "ts": 6300866032013.871, + "cat": "fwdbwd", 
"name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866032015.101, "dur": 2.580, + "args": { + "External id": 86987,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866032015.971, "dur": 1.490, + "args": { + "External id": 86988,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032024.571, "dur": 93.130, + "args": { + "External id": 86989,"Record function id": 0, "Sequence number": 1771081, "Fwd thread id": 1, "Ev Idx": 972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032025.871, "dur": 81.680, + "args": { + "External id": 86990,"Sequence number": 1771081, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 973 + } + }, + { + "ph": "f", "id": 121, "pid": 5714, "tid": 6744, "ts": 6300866032025.871, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866032028.101, "dur": 4.230, + "args": { + "External id": 86991,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 
6300866032029.681, "dur": 2.160, + "args": { + "External id": 86992,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032030.881, "dur": 0.730, + "args": { + "External id": 86993,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866032033.251, "dur": 39.340, + "args": { + "External id": 86994,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866032074.631, "dur": 4.430, + "args": { + "External id": 86995,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866032075.551, "dur": 2.570, + "args": { + "External id": 86996,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032077.181, "dur": 0.710, + "args": { + "External id": 86997,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866032081.641, "dur": 2.780, + "args": { + "External id": 86998,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866032082.521, "dur": 1.520, + "args": { + "External id": 86999,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032083.651, "dur": 0.230, + "args": { + "External id": 87000,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866032085.161, "dur": 21.590, + "args": { + "External id": 87001,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032127.231, "dur": 30.949, + "args": { + "External id": 87002,"Record function id": 0, "Sequence number": 1771080, "Fwd thread id": 1, "Ev Idx": 985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 
5714, "tid": 6744, + "ts": 6300866032129.301, "dur": 6.070, + "args": { + "External id": 87003,"Sequence number": 1771080, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 986 + } + }, + { + "ph": "f", "id": 122, "pid": 5714, "tid": 6744, "ts": 6300866032129.301, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866032131.391, "dur": 3.769, + "args": { + "External id": 87004,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866032132.491, "dur": 2.460, + "args": { + "External id": 87005,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866032138.240, "dur": 16.291, + "args": { + "External id": 87006,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032164.660, "dur": 11.751, + "args": { + "External id": 87007,"Record function id": 0, "Sequence number": 1771079, "Fwd thread id": 1, "Ev Idx": 990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032167.280, "dur": 6.651, + 
"args": { + "External id": 87008,"Sequence number": 1771079, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 991 + } + }, + { + "ph": "f", "id": 123, "pid": 5714, "tid": 6744, "ts": 6300866032167.280, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866032168.271, "dur": 5.400, + "args": { + "External id": 87009,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866032169.500, "dur": 3.251, + "args": { + "External id": 87010,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032171.400, "dur": 1.080, + "args": { + "External id": 87011,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866032182.471, "dur": 9.089, + "args": { + "External id": 87012,"Record function id": 0, "Ev Idx": 995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866032184.580, "dur": 5.780, + "args": { + "External id": 87013,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866032186.411, "dur": 3.480, + "args": { + "External id": 87014,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866032187.160, "dur": 2.520, + "args": { + "External id": 87015,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032195.491, "dur": 81.129, + "args": { + "External id": 87016,"Record function id": 0, "Sequence number": 1771078, "Fwd thread id": 1, "Ev Idx": 999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032196.891, "dur": 36.919, + "args": { + "External id": 87017,"Sequence number": 1771078, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1000 + } + }, + { + "ph": "f", "id": 124, "pid": 5714, "tid": 6744, "ts": 6300866032196.891, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032199.591, "dur": 19.020, + "args": { + "External id": 87018,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", 
"pid": 5714, "tid": 6744, + "ts": 6300866032220.280, "dur": 13.060, + "args": { + "External id": 87019,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866032237.220, "dur": 28.510, + "args": { + "External id": 87020,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866032268.770, "dur": 2.470, + "args": { + "External id": 87021,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866032286.060, "dur": 8.560, + "args": { + "External id": 87022,"Record function id": 0, "Ev Idx": 1005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866032288.440, "dur": 4.890, + "args": { + "External id": 87023,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866032290.130, "dur": 2.660, + "args": { + "External id": 87024,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1007 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866032291.040, "dur": 1.560, + "args": { + "External id": 87025,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032307.630, "dur": 38.880, + "args": { + "External id": 87026,"Record function id": 0, "Sequence number": 1771077, "Fwd thread id": 1, "Ev Idx": 1009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032309.020, "dur": 33.490, + "args": { + "External id": 87027,"Sequence number": 1771077, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1010 + } + }, + { + "ph": "f", "id": 125, "pid": 5714, "tid": 6744, "ts": 6300866032309.020, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866032311.790, "dur": 30.280, + "args": { + "External id": 87028,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866032312.940, "dur": 28.840, + "args": { + "External id": 87029,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": 
[[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032316.290, "dur": 6.360, + "args": { + "External id": 87030,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866032323.790, "dur": 17.310, + "args": { + "External id": 87031,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032354.360, "dur": 60.620, + "args": { + "External id": 87032,"Record function id": 0, "Sequence number": 1771076, "Fwd thread id": 1, "Ev Idx": 1015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032355.980, "dur": 33.230, + "args": { + "External id": 87033,"Sequence number": 1771076, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1016 + } + }, + { + "ph": "f", "id": 126, "pid": 5714, "tid": 6744, "ts": 6300866032355.980, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032358.320, "dur": 16.810, + "args": { + "External id": 87034,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input 
Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032376.660, "dur": 12.120, + "args": { + "External id": 87035,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866032392.570, "dur": 17.000, + "args": { + "External id": 87036,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032422.830, "dur": 72.860, + "args": { + "External id": 87037,"Record function id": 0, "Sequence number": 1771075, "Fwd thread id": 1, "Ev Idx": 1020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032424.710, "dur": 65.760, + "args": { + "External id": 87038,"Sequence number": 1771075, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1021 + } + }, + { + "ph": "f", "id": 127, "pid": 5714, "tid": 6744, "ts": 6300866032424.710, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866032428.180, "dur": 29.080, + "args": { + "External id": 87039,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 
1], []], "Ev Idx": 1022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866032430.100, "dur": 0.490, + "args": { + "External id": 87040,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866032439.880, "dur": 0.280, + "args": { + "External id": 87041,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032458.790, "dur": 18.330, + "args": { + "External id": 87042,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032462.820, "dur": 13.250, + "args": { + "External id": 87043,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032478.000, "dur": 10.410, + "args": { + "External id": 87044,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866032503.850, "dur": 
4.830, + "args": { + "External id": 87045,"Record function id": 0, "Sequence number": 1771074, "Fwd thread id": 1, "Ev Idx": 1028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866032505.590, "dur": 0.490, + "args": { + "External id": 87046,"Sequence number": 1771074, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1029 + } + }, + { + "ph": "f", "id": 128, "pid": 5714, "tid": 6744, "ts": 6300866032505.590, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866032512.540, "dur": 39.410, + "args": { + "External id": 87047,"Record function id": 0, "Sequence number": 1771073, "Fwd thread id": 1, "Ev Idx": 1030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866032514.760, "dur": 32.910, + "args": { + "External id": 87048,"Sequence number": 1771073, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1031 + } + }, + { + "ph": "f", "id": 129, "pid": 5714, "tid": 6744, "ts": 6300866032514.760, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866032517.410, "dur": 6.360, + "args": { + "External id": 87049,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032520.630, "dur": 1.500, + "args": { + "External id": 87050,"Record 
function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866032524.760, "dur": 22.230, + "args": { + "External id": 87051,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866032527.440, "dur": 18.480, + "args": { + "External id": 87052,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032558.940, "dur": 88.399, + "args": { + "External id": 87053,"Record function id": 0, "Sequence number": 1771072, "Fwd thread id": 1, "Ev Idx": 1036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032560.620, "dur": 66.510, + "args": { + "External id": 87054,"Sequence number": 1771072, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1037 + } + }, + { + "ph": "f", "id": 130, "pid": 5714, "tid": 6744, "ts": 6300866032560.620, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866032562.400, "dur": 30.690, + "args": { + "External id": 87055,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], 
"Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866032563.850, "dur": 0.350, + "args": { + "External id": 87056,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866032565.110, "dur": 0.240, + "args": { + "External id": 87057,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866032571.350, "dur": 20.249, + "args": { + "External id": 87058,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032594.219, "dur": 19.351, + "args": { + "External id": 87059,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032598.279, "dur": 14.180, + "args": { + "External id": 87060,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1043 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866032614.570, "dur": 10.509, + "args": { + "External id": 87061,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866032632.850, "dur": 10.869, + "args": { + "External id": 87062,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032656.679, "dur": 45.020, + "args": { + "External id": 87063,"Record function id": 0, "Sequence number": 1771071, "Fwd thread id": 1, "Ev Idx": 1046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032658.710, "dur": 26.809, + "args": { + "External id": 87064,"Sequence number": 1771071, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1047 + } + }, + { + "ph": "f", "id": 131, "pid": 5714, "tid": 6744, "ts": 6300866032658.710, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866032660.550, "dur": 24.609, + "args": { + "External id": 87065,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], 
[], [], [], [], []], "Ev Idx": 1048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866032661.619, "dur": 23.240, + "args": { + "External id": 87066,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032665.559, "dur": 5.611, + "args": { + "External id": 87067,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866032672.130, "dur": 12.209, + "args": { + "External id": 87068,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866032690.219, "dur": 9.170, + "args": { + "External id": 87069,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032708.929, "dur": 6.160, + "args": { + "External id": 87070,"Record function 
id": 0, "Sequence number": 1771070, "Fwd thread id": 1, "Ev Idx": 1053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866032710.699, "dur": 1.040, + "args": { + "External id": 87071,"Sequence number": 1771070, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1054 + } + }, + { + "ph": "f", "id": 132, "pid": 5714, "tid": 6744, "ts": 6300866032710.699, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866032719.929, "dur": 322.069, + "args": { + "External id": 87072,"Record function id": 0, "Sequence number": 1771069, "Fwd thread id": 1, "Ev Idx": 1055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866032721.629, "dur": 309.220, + "args": { + "External id": 87073,"Sequence number": 1771069, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1056 + } + }, + { + "ph": "f", "id": 133, "pid": 5714, "tid": 6744, "ts": 6300866032721.629, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866032750.569, "dur": 8.370, + "args": { + "External id": 87074,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866032754.649, "dur": 3.830, + "args": { + "External id": 87075,"Record 
function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 1058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866032761.719, "dur": 5.510, + "args": { + "External id": 87076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866032763.059, "dur": 3.270, + "args": { + "External id": 87077,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032765.129, "dur": 0.880, + "args": { + "External id": 87078,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6300866032770.679, "dur": 43.690, + "args": { + "External id": 87079,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 1062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866032771.269, "dur": 3.580, + "args": { + "External id": 87080,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 1063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866032771.849, "dur": 2.600, + "args": { + "External id": 87081,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032772.929, "dur": 1.340, + "args": { + "External id": 87082,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6300866032775.739, "dur": 37.800, + "args": { + "External id": 87083,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866032776.729, "dur": 36.000, + "args": { + "External id": 87084,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6300866032819.089, "dur": 4.940, + "args": { + "External id": 87085,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 1068 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866032821.489, "dur": 2.330, + "args": { + "External id": 87086,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866032846.949, "dur": 6.350, + "args": { + "External id": 87087,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866032854.709, "dur": 4.150, + "args": { + "External id": 87088,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866032859.819, "dur": 2.990, + "args": { + "External id": 87089,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866032891.969, "dur": 4.870, + "args": { + "External id": 87090,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1073 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866032894.039, "dur": 2.390, + "args": { + "External id": 87091,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6300866032910.639, "dur": 102.240, + "args": { + "External id": 87092,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 1075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866032914.089, "dur": 5.030, + "args": { + "External id": 87093,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032916.889, "dur": 1.240, + "args": { + "External id": 87094,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866032920.409, "dur": 4.160, + "args": { + "External id": 87095,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 1078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866032922.829, "dur": 1.120, + "args": { + "External id": 87096,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 1079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866032925.929, "dur": 1.870, + "args": { + "External id": 87097,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032927.079, "dur": 0.330, + "args": { + "External id": 87098,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866032928.789, "dur": 2.210, + "args": { + "External id": 87099,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032930.279, "dur": 0.320, + "args": { + "External id": 87100,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 1083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, 
"tid": 6744, + "ts": 6300866032934.039, "dur": 1.900, + "args": { + "External id": 87101,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 1084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866032935.249, "dur": 0.320, + "args": { + "External id": 87102,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 1085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866032936.859, "dur": 4.670, + "args": { + "External id": 87103,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 1086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866032939.789, "dur": 1.490, + "args": { + "External id": 87104,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 1087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866032946.459, "dur": 2.020, + "args": { + "External id": 87105,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 1088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866032947.829, "dur": 0.270, + "args": { + "External id": 87106,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 1089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866032949.299, "dur": 2.370, + "args": { + "External id": 87107,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866032950.069, "dur": 1.420, + "args": { + "External id": 87108,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866032952.629, "dur": 45.100, + "args": { + "External id": 87109,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 1092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033000.629, "dur": 2.980, + "args": { + "External id": 87110,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 1093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866033004.629, "dur": 4.300, + "args": { + 
"External id": 87111,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 1094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033007.799, "dur": 0.520, + "args": { + "External id": 87112,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 1095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033011.229, "dur": 0.780, + "args": { + "External id": 87113,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 1096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033055.598, "dur": 11.071, + "args": { + "External id": 87114,"Record function id": 0, "Ev Idx": 1097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033058.489, "dur": 6.769, + "args": { + "External id": 87115,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866033060.889, "dur": 3.480, + "args": { + "External id": 87116,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1099 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866033061.789, "dur": 2.360, + "args": { + "External id": 87117,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033071.278, "dur": 8.640, + "args": { + "External id": 87118,"Record function id": 0, "Sequence number": 1771068, "Fwd thread id": 1, "Ev Idx": 1101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033072.758, "dur": 5.120, + "args": { + "External id": 87119,"Sequence number": 1771068, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1102 + } + }, + { + "ph": "f", "id": 134, "pid": 5714, "tid": 6744, "ts": 6300866033072.758, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866033074.829, "dur": 2.789, + "args": { + "External id": 87120,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033075.798, "dur": 1.580, + "args": { + "External id": 87121,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + 
"ts": 6300866033084.118, "dur": 100.290, + "args": { + "External id": 87122,"Record function id": 0, "Sequence number": 1771067, "Fwd thread id": 1, "Ev Idx": 1105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033085.398, "dur": 90.540, + "args": { + "External id": 87123,"Sequence number": 1771067, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1106 + } + }, + { + "ph": "f", "id": 135, "pid": 5714, "tid": 6744, "ts": 6300866033085.398, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033088.898, "dur": 5.331, + "args": { + "External id": 87124,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033090.318, "dur": 3.191, + "args": { + "External id": 87125,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033092.109, "dur": 1.029, + "args": { + "External id": 87126,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866033095.469, "dur": 41.929, + "args": { + "External id": 87127,"Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033139.648, "dur": 5.640, + "args": { + "External id": 87128,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033140.698, "dur": 3.630, + "args": { + "External id": 87129,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033143.188, "dur": 0.850, + "args": { + "External id": 87130,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033147.228, "dur": 2.980, + "args": { + "External id": 87131,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033148.198, "dur": 1.410, + "args": { + "External id": 87132,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1115 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033149.118, "dur": 0.320, + "args": { + "External id": 87133,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866033150.878, "dur": 24.050, + "args": { + "External id": 87134,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033193.958, "dur": 11.380, + "args": { + "External id": 87135,"Record function id": 0, "Sequence number": 1771066, "Fwd thread id": 1, "Ev Idx": 1118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033196.038, "dur": 7.070, + "args": { + "External id": 87136,"Sequence number": 1771066, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1119 + } + }, + { + "ph": "f", "id": 136, "pid": 5714, "tid": 6744, "ts": 6300866033196.038, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866033198.208, "dur": 4.680, + "args": { + "External id": 87137,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 
5714, "tid": 6744, + "ts": 6300866033199.328, "dur": 3.300, + "args": { + "External id": 87138,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033210.308, "dur": 9.220, + "args": { + "External id": 87139,"Record function id": 0, "Sequence number": 1771065, "Fwd thread id": 1, "Ev Idx": 1122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033211.598, "dur": 5.550, + "args": { + "External id": 87140,"Sequence number": 1771065, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1123 + } + }, + { + "ph": "f", "id": 137, "pid": 5714, "tid": 6744, "ts": 6300866033211.598, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033212.618, "dur": 4.300, + "args": { + "External id": 87141,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033213.788, "dur": 2.500, + "args": { + "External id": 87142,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033215.188, "dur": 0.820, + "args": { + "External id": 87143,"Record function id": 0, "Concrete Inputs": 
["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033224.968, "dur": 7.960, + "args": { + "External id": 87144,"Record function id": 0, "Ev Idx": 1127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033226.868, "dur": 4.870, + "args": { + "External id": 87145,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866033228.688, "dur": 2.660, + "args": { + "External id": 87146,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866033229.478, "dur": 1.670, + "args": { + "External id": 87147,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033237.038, "dur": 7.920, + "args": { + "External id": 87148,"Record function id": 0, "Sequence number": 1771064, "Fwd thread id": 1, "Ev Idx": 1131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033238.448, "dur": 4.130, + "args": { + "External id": 87149,"Sequence number": 1771064, "Fwd thread id": 1, "Record function id": 
0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1132 + } + }, + { + "ph": "f", "id": 138, "pid": 5714, "tid": 6744, "ts": 6300866033238.448, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866033240.578, "dur": 1.840, + "args": { + "External id": 87150,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033241.278, "dur": 0.950, + "args": { + "External id": 87151,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033248.908, "dur": 105.640, + "args": { + "External id": 87152,"Record function id": 0, "Sequence number": 1771063, "Fwd thread id": 1, "Ev Idx": 1135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033250.188, "dur": 94.080, + "args": { + "External id": 87153,"Sequence number": 1771063, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1136 + } + }, + { + "ph": "f", "id": 139, "pid": 5714, "tid": 6744, "ts": 6300866033250.188, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033252.338, "dur": 4.050, + "args": { + "External id": 
87154,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033252.978, "dur": 2.930, + "args": { + "External id": 87155,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033254.998, "dur": 0.650, + "args": { + "External id": 87156,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866033257.368, "dur": 37.280, + "args": { + "External id": 87157,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033304.738, "dur": 5.350, + "args": { + "External id": 87158,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033305.978, "dur": 3.130, + "args": { + "External id": 87159,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 
768], [], []], "Ev Idx": 1142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033307.818, "dur": 0.940, + "args": { + "External id": 87160,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033312.018, "dur": 4.020, + "args": { + "External id": 87161,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033312.938, "dur": 2.540, + "args": { + "External id": 87162,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033315.048, "dur": 0.270, + "args": { + "External id": 87163,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866033316.838, "dur": 26.430, + "args": { + "External id": 87164,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033364.828, "dur": 36.060, + "args": { + "External id": 87165,"Record function id": 0, "Sequence number": 1771062, "Fwd thread id": 1, "Ev Idx": 1148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033366.968, "dur": 6.570, + "args": { + "External id": 87166,"Sequence number": 1771062, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1149 + } + }, + { + "ph": "f", "id": 140, "pid": 5714, "tid": 6744, "ts": 6300866033366.968, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866033369.248, "dur": 4.090, + "args": { + "External id": 87167,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033370.438, "dur": 2.680, + "args": { + "External id": 87168,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866033376.488, "dur": 20.250, + "args": { + "External id": 87169,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", 
"pid": 5714, "tid": 6744, + "ts": 6300866033407.628, "dur": 12.720, + "args": { + "External id": 87170,"Record function id": 0, "Sequence number": 1771061, "Fwd thread id": 1, "Ev Idx": 1153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033409.768, "dur": 8.060, + "args": { + "External id": 87171,"Sequence number": 1771061, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1154 + } + }, + { + "ph": "f", "id": 141, "pid": 5714, "tid": 6744, "ts": 6300866033409.768, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033411.658, "dur": 5.880, + "args": { + "External id": 87172,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033412.978, "dur": 3.580, + "args": { + "External id": 87173,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033415.248, "dur": 0.970, + "args": { + "External id": 87174,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033425.998, "dur": 8.220, + 
"args": { + "External id": 87175,"Record function id": 0, "Ev Idx": 1158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033428.018, "dur": 5.010, + "args": { + "External id": 87176,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866033429.918, "dur": 2.670, + "args": { + "External id": 87177,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866033430.668, "dur": 1.730, + "args": { + "External id": 87178,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033438.348, "dur": 80.620, + "args": { + "External id": 87179,"Record function id": 0, "Sequence number": 1771060, "Fwd thread id": 1, "Ev Idx": 1162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033439.668, "dur": 35.240, + "args": { + "External id": 87180,"Sequence number": 1771060, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1163 + } + }, + { + "ph": "f", "id": 142, "pid": 5714, "tid": 6744, "ts": 6300866033439.668, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033441.628, "dur": 
18.789, + "args": { + "External id": 87181,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033462.088, "dur": 12.349, + "args": { + "External id": 87182,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866033478.088, "dur": 27.840, + "args": { + "External id": 87183,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033509.957, "dur": 3.271, + "args": { + "External id": 87184,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033528.657, "dur": 8.860, + "args": { + "External id": 87185,"Record function id": 0, "Ev Idx": 1168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866033531.308, "dur": 4.869, + "args": { + "External id": 87186,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 
1169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866033533.077, "dur": 2.611, + "args": { + "External id": 87187,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866033533.848, "dur": 1.609, + "args": { + "External id": 87188,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033541.817, "dur": 37.630, + "args": { + "External id": 87189,"Record function id": 0, "Sequence number": 1771059, "Fwd thread id": 1, "Ev Idx": 1172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033543.168, "dur": 31.839, + "args": { + "External id": 87190,"Sequence number": 1771059, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1173 + } + }, + { + "ph": "f", "id": 143, "pid": 5714, "tid": 6744, "ts": 6300866033543.168, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866033544.737, "dur": 29.880, + "args": { + "External id": 87191,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", 
"pid": 5714, "tid": 6744, + "ts": 6300866033546.848, "dur": 27.489, + "args": { + "External id": 87192,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033550.468, "dur": 5.939, + "args": { + "External id": 87193,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866033557.527, "dur": 16.180, + "args": { + "External id": 87194,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033586.117, "dur": 58.210, + "args": { + "External id": 87195,"Record function id": 0, "Sequence number": 1771058, "Fwd thread id": 1, "Ev Idx": 1178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033587.757, "dur": 32.810, + "args": { + "External id": 87196,"Sequence number": 1771058, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1179 + } + }, + { + "ph": "f", "id": 144, "pid": 5714, "tid": 6744, 
"ts": 6300866033587.757, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033590.347, "dur": 15.720, + "args": { + "External id": 87197,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033607.567, "dur": 12.540, + "args": { + "External id": 87198,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866033623.867, "dur": 15.450, + "args": { + "External id": 87199,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033652.077, "dur": 63.800, + "args": { + "External id": 87200,"Record function id": 0, "Sequence number": 1771057, "Fwd thread id": 1, "Ev Idx": 1183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033653.987, "dur": 56.380, + "args": { + "External id": 87201,"Sequence number": 1771057, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1184 + } + }, + { + "ph": "f", "id": 145, "pid": 5714, "tid": 6744, "ts": 6300866033653.987, + "cat": "fwdbwd", "name": 
"fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866033657.567, "dur": 19.810, + "args": { + "External id": 87202,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866033659.507, "dur": 0.490, + "args": { + "External id": 87203,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866033661.077, "dur": 0.290, + "args": { + "External id": 87204,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033678.687, "dur": 18.700, + "args": { + "External id": 87205,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033681.967, "dur": 14.410, + "args": { + "External id": 87206,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033698.257, "dur": 10.090, + "args": { + "External id": 87207,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866033723.867, "dur": 5.530, + "args": { + "External id": 87208,"Record function id": 0, "Sequence number": 1771056, "Fwd thread id": 1, "Ev Idx": 1191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866033726.707, "dur": 0.510, + "args": { + "External id": 87209,"Sequence number": 1771056, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1192 + } + }, + { + "ph": "f", "id": 146, "pid": 5714, "tid": 6744, "ts": 6300866033726.707, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866033733.357, "dur": 37.290, + "args": { + "External id": 87210,"Record function id": 0, "Sequence number": 1771055, "Fwd thread id": 1, "Ev Idx": 1193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866033734.617, "dur": 31.730, + "args": { + "External id": 87211,"Sequence number": 1771055, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1194 + } + }, + { + "ph": "f", "id": 147, "pid": 5714, "tid": 6744, "ts": 6300866033734.617, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866033737.167, "dur": 5.640, + "args": { + "External id": 87212,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", 
"False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033739.747, "dur": 1.470, + "args": { + "External id": 87213,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866033743.757, "dur": 21.920, + "args": { + "External id": 87214,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866033746.237, "dur": 18.550, + "args": { + "External id": 87215,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033777.867, "dur": 90.980, + "args": { + "External id": 87216,"Record function id": 0, "Sequence number": 1771054, "Fwd thread id": 1, "Ev Idx": 1199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033779.647, "dur": 69.150, + "args": { + "External id": 87217,"Sequence number": 1771054, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1200 + } + }, + { + "ph": "f", "id": 148, "pid": 5714, "tid": 
6744, "ts": 6300866033779.647, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866033781.417, "dur": 34.300, + "args": { + "External id": 87218,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866033782.757, "dur": 0.360, + "args": { + "External id": 87219,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866033784.197, "dur": 0.220, + "args": { + "External id": 87220,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866033790.367, "dur": 23.840, + "args": { + "External id": 87221,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033817.727, "dur": 17.790, + "args": { + "External id": 87222,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1205 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033820.327, "dur": 14.200, + "args": { + "External id": 87223,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866033836.367, "dur": 10.290, + "args": { + "External id": 87224,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866033854.907, "dur": 10.410, + "args": { + "External id": 87225,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033877.787, "dur": 53.660, + "args": { + "External id": 87226,"Record function id": 0, "Sequence number": 1771053, "Fwd thread id": 1, "Ev Idx": 1209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033879.707, "dur": 26.120, + "args": { + "External id": 87227,"Sequence number": 1771053, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1210 + } + }, + { + "ph": "f", "id": 149, "pid": 5714, "tid": 6744, "ts": 6300866033879.707, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 
6300866033881.367, "dur": 24.029, + "args": { + "External id": 87228,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866033882.387, "dur": 22.709, + "args": { + "External id": 87229,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033886.117, "dur": 5.350, + "args": { + "External id": 87230,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866033892.507, "dur": 12.049, + "args": { + "External id": 87231,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866033910.676, "dur": 14.811, + "args": { + "External id": 87232,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input 
Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033939.287, "dur": 20.480, + "args": { + "External id": 87233,"Record function id": 0, "Sequence number": 1771052, "Fwd thread id": 1, "Ev Idx": 1216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033941.396, "dur": 1.091, + "args": { + "External id": 87234,"Sequence number": 1771052, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1217 + } + }, + { + "ph": "f", "id": 150, "pid": 5714, "tid": 6744, "ts": 6300866033941.396, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866033945.247, "dur": 11.480, + "args": { + "External id": 87235,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033965.107, "dur": 11.480, + "args": { + "External id": 87236,"Record function id": 0, "Sequence number": 1771051, "Fwd thread id": 1, "Ev Idx": 1219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033966.867, "dur": 7.269, + "args": { + "External id": 87237,"Sequence number": 1771051, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], 
"Input Dims": [[8, 2048, 768]], "Ev Idx": 1220 + } + }, + { + "ph": "f", "id": 151, "pid": 5714, "tid": 6744, "ts": 6300866033966.867, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866033968.756, "dur": 5.100, + "args": { + "External id": 87238,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866033970.596, "dur": 3.040, + "args": { + "External id": 87239,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033980.887, "dur": 97.899, + "args": { + "External id": 87240,"Record function id": 0, "Sequence number": 1771050, "Fwd thread id": 1, "Ev Idx": 1223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866033982.176, "dur": 90.530, + "args": { + "External id": 87241,"Sequence number": 1771050, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1224 + } + }, + { + "ph": "f", "id": 152, "pid": 5714, "tid": 6744, "ts": 6300866033982.176, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866033984.776, "dur": 5.760, + "args": { + "External id": 87242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 
1]], "Input Dims": [[16384, 768]], "Ev Idx": 1225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866033986.247, "dur": 3.360, + "args": { + "External id": 87243,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866033987.887, "dur": 1.349, + "args": { + "External id": 87244,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866033992.527, "dur": 43.799, + "args": { + "External id": 87245,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034038.226, "dur": 4.740, + "args": { + "External id": 87246,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034039.326, "dur": 2.670, + "args": { + "External id": 87247,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, 
+ "ts": 6300866034040.896, "dur": 0.860, + "args": { + "External id": 87248,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034044.866, "dur": 3.970, + "args": { + "External id": 87249,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034045.886, "dur": 2.440, + "args": { + "External id": 87250,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034047.906, "dur": 0.250, + "args": { + "External id": 87251,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866034049.526, "dur": 22.270, + "args": { + "External id": 87252,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034088.206, "dur": 11.030, + "args": { + "External id": 
87253,"Record function id": 0, "Sequence number": 1771049, "Fwd thread id": 1, "Ev Idx": 1236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034090.446, "dur": 6.560, + "args": { + "External id": 87254,"Sequence number": 1771049, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1237 + } + }, + { + "ph": "f", "id": 153, "pid": 5714, "tid": 6744, "ts": 6300866034090.446, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034092.326, "dur": 4.500, + "args": { + "External id": 87255,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034094.246, "dur": 2.390, + "args": { + "External id": 87256,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034103.286, "dur": 9.920, + "args": { + "External id": 87257,"Record function id": 0, "Sequence number": 1771048, "Fwd thread id": 1, "Ev Idx": 1240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034104.606, "dur": 6.220, + "args": { + "External id": 87258,"Sequence number": 1771048, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1241 + } + 
}, + { + "ph": "f", "id": 154, "pid": 5714, "tid": 6744, "ts": 6300866034104.606, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034105.416, "dur": 5.160, + "args": { + "External id": 87259,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034107.456, "dur": 2.440, + "args": { + "External id": 87260,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034108.896, "dur": 0.720, + "args": { + "External id": 87261,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866034118.996, "dur": 8.630, + "args": { + "External id": 87262,"Record function id": 0, "Ev Idx": 1245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866034120.986, "dur": 5.360, + "args": { + "External id": 87263,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866034122.956, "dur": 2.970, + "args": { + "External 
id": 87264,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866034123.726, "dur": 2.000, + "args": { + "External id": 87265,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034131.486, "dur": 7.380, + "args": { + "External id": 87266,"Record function id": 0, "Sequence number": 1771047, "Fwd thread id": 1, "Ev Idx": 1249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034132.806, "dur": 3.600, + "args": { + "External id": 87267,"Sequence number": 1771047, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1250 + } + }, + { + "ph": "f", "id": 155, "pid": 5714, "tid": 6744, "ts": 6300866034132.806, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034133.906, "dur": 2.350, + "args": { + "External id": 87268,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034134.696, "dur": 1.350, + "args": { + "External id": 87269,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], 
[]], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866034143.686, "dur": 244.709, + "args": { + "External id": 87270,"Record function id": 0, "Sequence number": 1771046, "Fwd thread id": 1, "Ev Idx": 1253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866034145.256, "dur": 225.259, + "args": { + "External id": 87271,"Sequence number": 1771046, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1254 + } + }, + { + "ph": "f", "id": 156, "pid": 5714, "tid": 6744, "ts": 6300866034145.256, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866034158.036, "dur": 10.280, + "args": { + "External id": 87272,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034160.286, "dur": 7.460, + "args": { + "External id": 87273,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866034170.656, "dur": 4.820, + "args": { + "External id": 87274,"Record function id": 
0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034171.776, "dur": 3.460, + "args": { + "External id": 87275,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866034177.126, "dur": 5.220, + "args": { + "External id": 87276,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034178.196, "dur": 3.890, + "args": { + "External id": 87277,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866034198.286, "dur": 143.089, + "args": { + "External id": 87278,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866034253.206, "dur": 6.520, + "args": { + "External id": 87279,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866034261.446, "dur": 3.320, + "args": { + "External id": 87280,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866034354.666, "dur": 3.700, + "args": { + "External id": 87281,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866034363.406, "dur": 0.660, + 
"args": { + "External id": 87282,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866034366.975, "dur": 0.540, + "args": { + "External id": 87283,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866034399.826, "dur": 187.139, + "args": { + "External id": 87284,"Record function id": 0, "Sequence number": 1771045, "Fwd thread id": 1, "Ev Idx": 1267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866034402.175, "dur": 175.650, + "args": { + "External id": 87285,"Sequence number": 1771045, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1268 + } + }, + { + "ph": "f", "id": 157, "pid": 5714, "tid": 6744, "ts": 6300866034402.175, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866034419.086, "dur": 36.539, + "args": { + "External id": 87286,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034422.715, "dur": 6.671, + "args": { + "External id": 87287,"Record function id": 0, "Concrete Inputs": ["[8, 
2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866034430.626, "dur": 24.299, + "args": { + "External id": 87288,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866034464.695, "dur": 7.190, + "args": { + "External id": 87289,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034466.715, "dur": 4.680, + "args": { + "External id": 87290,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866034599.445, "dur": 184.340, + "args": { + "External id": 87291,"Record function id": 0, "Sequence number": 1771044, "Fwd thread id": 1, "Ev Idx": 1274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, 
+ "ts": 6300866034602.175, "dur": 172.850, + "args": { + "External id": 87292,"Sequence number": 1771044, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1275 + } + }, + { + "ph": "f", "id": 158, "pid": 5714, "tid": 6744, "ts": 6300866034602.175, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866034624.255, "dur": 37.250, + "args": { + "External id": 87293,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034628.085, "dur": 7.110, + "args": { + "External id": 87294,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866034636.495, "dur": 24.290, + "args": { + "External id": 87295,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866034670.605, "dur": 7.450, + "args": { + "External id": 87296,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input 
Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034672.545, "dur": 5.080, + "args": { + "External id": 87297,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034796.214, "dur": 14.171, + "args": { + "External id": 87298,"Record function id": 0, "Sequence number": 1771043, "Fwd thread id": 1, "Ev Idx": 1281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034798.794, "dur": 8.531, + "args": { + "External id": 87299,"Sequence number": 1771043, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1282 + } + }, + { + "ph": "f", "id": 159, "pid": 5714, "tid": 6744, "ts": 6300866034798.794, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034801.474, "dur": 5.540, + "args": { + "External id": 87300,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034802.645, "dur": 4.100, + "args": { + "External id": 87301,"Record function id": 0, "Concrete 
Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034814.954, "dur": 7.340, + "args": { + "External id": 87302,"Record function id": 0, "Sequence number": 1771042, "Fwd thread id": 1, "Ev Idx": 1285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034816.294, "dur": 3.291, + "args": { + "External id": 87303,"Sequence number": 1771042, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1286 + } + }, + { + "ph": "f", "id": 160, "pid": 5714, "tid": 6744, "ts": 6300866034816.294, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034817.445, "dur": 1.969, + "args": { + "External id": 87304,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034818.265, "dur": 0.969, + "args": { + "External id": 87305,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034826.114, "dur": 6.991, + "args": { + "External id": 87306,"Record function id": 0, 
"Sequence number": 1771041, "Fwd thread id": 1, "Ev Idx": 1289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034827.394, "dur": 3.831, + "args": { + "External id": 87307,"Sequence number": 1771041, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1290 + } + }, + { + "ph": "f", "id": 161, "pid": 5714, "tid": 6744, "ts": 6300866034827.394, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034829.474, "dur": 1.620, + "args": { + "External id": 87308,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034830.105, "dur": 0.820, + "args": { + "External id": 87309,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034837.025, "dur": 6.320, + "args": { + "External id": 87310,"Record function id": 0, "Sequence number": 1771040, "Fwd thread id": 1, "Ev Idx": 1293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034838.345, "dur": 2.720, + "args": { + "External id": 87311,"Sequence number": 1771040, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], 
"Input Dims": [[8, 2048, 768]], "Ev Idx": 1294 + } + }, + { + "ph": "f", "id": 162, "pid": 5714, "tid": 6744, "ts": 6300866034838.345, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034839.205, "dur": 1.669, + "args": { + "External id": 87312,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034839.774, "dur": 0.920, + "args": { + "External id": 87313,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034847.154, "dur": 104.940, + "args": { + "External id": 87314,"Record function id": 0, "Sequence number": 1771039, "Fwd thread id": 1, "Ev Idx": 1297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034848.485, "dur": 94.769, + "args": { + "External id": 87315,"Sequence number": 1771039, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1298 + } + }, + { + "ph": "f", "id": 163, "pid": 5714, "tid": 6744, "ts": 6300866034848.485, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034851.814, "dur": 7.000, + "args": { + "External id": 87316,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 
1]], "Input Dims": [[16384, 768]], "Ev Idx": 1299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034854.045, "dur": 4.049, + "args": { + "External id": 87317,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034855.714, "dur": 2.000, + "args": { + "External id": 87318,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866034860.134, "dur": 46.470, + "args": { + "External id": 87319,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034908.844, "dur": 4.690, + "args": { + "External id": 87320,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034909.984, "dur": 2.670, + "args": { + "External id": 87321,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, 
+ "ts": 6300866034911.604, "dur": 0.810, + "args": { + "External id": 87322,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034916.214, "dur": 2.790, + "args": { + "External id": 87323,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034917.204, "dur": 1.300, + "args": { + "External id": 87324,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034918.004, "dur": 0.330, + "args": { + "External id": 87325,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866034919.684, "dur": 22.700, + "args": { + "External id": 87326,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034962.054, "dur": 10.110, + "args": { + "External id": 
87327,"Record function id": 0, "Sequence number": 1771038, "Fwd thread id": 1, "Ev Idx": 1310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034963.994, "dur": 5.940, + "args": { + "External id": 87328,"Sequence number": 1771038, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1311 + } + }, + { + "ph": "f", "id": 164, "pid": 5714, "tid": 6744, "ts": 6300866034963.994, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866034966.084, "dur": 3.660, + "args": { + "External id": 87329,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866034967.024, "dur": 2.510, + "args": { + "External id": 87330,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034976.314, "dur": 9.640, + "args": { + "External id": 87331,"Record function id": 0, "Sequence number": 1771037, "Fwd thread id": 1, "Ev Idx": 1314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866034978.514, "dur": 5.220, + "args": { + "External id": 87332,"Sequence number": 1771037, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1315 + } + 
}, + { + "ph": "f", "id": 165, "pid": 5714, "tid": 6744, "ts": 6300866034978.514, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866034979.364, "dur": 4.140, + "args": { + "External id": 87333,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866034980.424, "dur": 2.390, + "args": { + "External id": 87334,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866034981.754, "dur": 0.790, + "args": { + "External id": 87335,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866034991.714, "dur": 9.910, + "args": { + "External id": 87336,"Record function id": 0, "Ev Idx": 1319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866034993.794, "dur": 6.560, + "args": { + "External id": 87337,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866034995.804, "dur": 4.020, + "args": { + "External 
id": 87338,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866034996.614, "dur": 2.990, + "args": { + "External id": 87339,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035005.814, "dur": 8.200, + "args": { + "External id": 87340,"Record function id": 0, "Sequence number": 1771036, "Fwd thread id": 1, "Ev Idx": 1323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035007.164, "dur": 4.520, + "args": { + "External id": 87341,"Sequence number": 1771036, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1324 + } + }, + { + "ph": "f", "id": 166, "pid": 5714, "tid": 6744, "ts": 6300866035007.164, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866035008.454, "dur": 3.080, + "args": { + "External id": 87342,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866035010.134, "dur": 1.200, + "args": { + "External id": 87343,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 
1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035018.194, "dur": 88.810, + "args": { + "External id": 87344,"Record function id": 0, "Sequence number": 1771035, "Fwd thread id": 1, "Ev Idx": 1327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035019.494, "dur": 79.320, + "args": { + "External id": 87345,"Sequence number": 1771035, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1328 + } + }, + { + "ph": "f", "id": 167, "pid": 5714, "tid": 6744, "ts": 6300866035019.494, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035021.634, "dur": 3.390, + "args": { + "External id": 87346,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035022.244, "dur": 2.310, + "args": { + "External id": 87347,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035023.404, "dur": 0.900, + "args": { + "External id": 87348,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1331 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866035026.694, "dur": 37.810, + "args": { + "External id": 87349,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035066.514, "dur": 4.530, + "args": { + "External id": 87350,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035067.484, "dur": 2.510, + "args": { + "External id": 87351,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035068.954, "dur": 0.820, + "args": { + "External id": 87352,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035072.764, "dur": 3.160, + "args": { + "External id": 87353,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035073.514, "dur": 2.020, + "args": { + "External id": 87354,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], 
"Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035075.104, "dur": 0.260, + "args": { + "External id": 87355,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866035076.594, "dur": 21.330, + "args": { + "External id": 87356,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035116.654, "dur": 34.900, + "args": { + "External id": 87357,"Record function id": 0, "Sequence number": 1771034, "Fwd thread id": 1, "Ev Idx": 1340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035118.734, "dur": 5.900, + "args": { + "External id": 87358,"Sequence number": 1771034, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1341 + } + }, + { + "ph": "f", "id": 168, "pid": 5714, "tid": 6744, "ts": 6300866035118.734, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866035120.634, "dur": 3.830, + "args": { + "External id": 87359,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866035121.564, "dur": 2.700, + "args": { + "External id": 87360,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866035127.464, "dur": 20.470, + "args": { + "External id": 87361,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035159.104, "dur": 11.900, + "args": { + "External id": 87362,"Record function id": 0, "Sequence number": 1771033, "Fwd thread id": 1, "Ev Idx": 1345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035161.164, "dur": 7.390, + "args": { + "External id": 87363,"Sequence number": 1771033, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1346 + } + }, + { + "ph": "f", "id": 169, "pid": 5714, "tid": 6744, "ts": 6300866035161.164, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035162.124, "dur": 6.160, + "args": { + "External id": 87364,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1347 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035163.374, "dur": 3.790, + "args": { + "External id": 87365,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035165.244, "dur": 1.610, + "args": { + "External id": 87366,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866035176.594, "dur": 8.470, + "args": { + "External id": 87367,"Record function id": 0, "Ev Idx": 1350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866035178.654, "dur": 5.190, + "args": { + "External id": 87368,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866035180.564, "dur": 2.790, + "args": { + "External id": 87369,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866035181.374, "dur": 1.780, + "args": { + "External id": 87370,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": 
[[768, 768]], "Ev Idx": 1353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035189.074, "dur": 8.730, + "args": { + "External id": 87371,"Record function id": 0, "Sequence number": 1771032, "Fwd thread id": 1, "Ev Idx": 1354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035190.434, "dur": 4.820, + "args": { + "External id": 87372,"Sequence number": 1771032, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1355 + } + }, + { + "ph": "f", "id": 170, "pid": 5714, "tid": 6744, "ts": 6300866035190.434, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866035192.704, "dur": 2.390, + "args": { + "External id": 87373,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866035193.414, "dur": 1.460, + "args": { + "External id": 87374,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035201.684, "dur": 91.629, + "args": { + "External id": 87375,"Record function id": 0, "Sequence number": 1771031, "Fwd thread id": 1, "Ev Idx": 1358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 
6300866035202.994, "dur": 80.079, + "args": { + "External id": 87376,"Sequence number": 1771031, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1359 + } + }, + { + "ph": "f", "id": 171, "pid": 5714, "tid": 6744, "ts": 6300866035202.994, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035205.054, "dur": 4.000, + "args": { + "External id": 87377,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035205.654, "dur": 2.960, + "args": { + "External id": 87378,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035207.604, "dur": 0.750, + "args": { + "External id": 87379,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866035209.944, "dur": 38.329, + "args": { + "External id": 87380,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 
6300866035250.213, "dur": 4.431, + "args": { + "External id": 87381,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035251.133, "dur": 2.640, + "args": { + "External id": 87382,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035252.724, "dur": 0.789, + "args": { + "External id": 87383,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035256.413, "dur": 3.720, + "args": { + "External id": 87384,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035257.284, "dur": 2.480, + "args": { + "External id": 87385,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035259.304, "dur": 0.289, + "args": { + "External id": 87386,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866035260.853, "dur": 21.360, + "args": { + "External id": 87387,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035311.664, "dur": 32.399, + "args": { + "External id": 87388,"Record function id": 0, "Sequence number": 1771030, "Fwd thread id": 1, "Ev Idx": 1371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035313.963, "dur": 6.260, + "args": { + "External id": 87389,"Sequence number": 1771030, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1372 + } + }, + { + "ph": "f", "id": 172, "pid": 5714, "tid": 6744, "ts": 6300866035313.963, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866035316.173, "dur": 3.830, + "args": { + "External id": 87390,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866035317.143, "dur": 2.580, + "args": { + "External id": 87391,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": 
[[16384, 768], []], "Ev Idx": 1374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866035323.143, "dur": 17.080, + "args": { + "External id": 87392,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035350.403, "dur": 12.650, + "args": { + "External id": 87393,"Record function id": 0, "Sequence number": 1771029, "Fwd thread id": 1, "Ev Idx": 1376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035352.103, "dur": 8.140, + "args": { + "External id": 87394,"Sequence number": 1771029, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1377 + } + }, + { + "ph": "f", "id": 173, "pid": 5714, "tid": 6744, "ts": 6300866035352.103, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035353.173, "dur": 6.770, + "args": { + "External id": 87395,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035355.553, "dur": 3.310, + "args": { + "External id": 87396,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035357.593, "dur": 0.990, + "args": { + "External id": 87397,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866035368.883, "dur": 9.480, + "args": { + "External id": 87398,"Record function id": 0, "Ev Idx": 1381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866035370.853, "dur": 6.280, + "args": { + "External id": 87399,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866035372.723, "dur": 3.930, + "args": { + "External id": 87400,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866035373.523, "dur": 2.940, + "args": { + "External id": 87401,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035382.433, "dur": 79.660, + "args": { + "External id": 87402,"Record function id": 0, "Sequence number": 1771028, "Fwd thread id": 1, "Ev Idx": 1385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 
6744, + "ts": 6300866035383.763, "dur": 35.840, + "args": { + "External id": 87403,"Sequence number": 1771028, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1386 + } + }, + { + "ph": "f", "id": 174, "pid": 5714, "tid": 6744, "ts": 6300866035383.763, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035385.573, "dur": 19.480, + "args": { + "External id": 87404,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035406.793, "dur": 12.300, + "args": { + "External id": 87405,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866035423.423, "dur": 26.930, + "args": { + "External id": 87406,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866035453.223, "dur": 2.420, + "args": { + "External id": 87407,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1390 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866035471.573, "dur": 8.150, + "args": { + "External id": 87408,"Record function id": 0, "Ev Idx": 1391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866035473.863, "dur": 4.530, + "args": { + "External id": 87409,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866035475.483, "dur": 2.450, + "args": { + "External id": 87410,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866035476.223, "dur": 1.490, + "args": { + "External id": 87411,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035484.173, "dur": 35.230, + "args": { + "External id": 87412,"Record function id": 0, "Sequence number": 1771027, "Fwd thread id": 1, "Ev Idx": 1395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035485.443, "dur": 30.200, + "args": { + "External id": 87413,"Sequence number": 1771027, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1396 + } + }, + { + "ph": "f", "id": 175, "pid": 5714, "tid": 6744, "ts": 6300866035485.443, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866035487.003, "dur": 28.190, + "args": { + "External id": 87414,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866035488.133, "dur": 26.770, + "args": { + "External id": 87415,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035491.293, "dur": 5.960, + "args": { + "External id": 87416,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866035498.443, "dur": 15.790, + "args": { + "External id": 87417,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035527.173, "dur": 58.540, + "args": { + 
"External id": 87418,"Record function id": 0, "Sequence number": 1771026, "Fwd thread id": 1, "Ev Idx": 1401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035528.873, "dur": 32.550, + "args": { + "External id": 87419,"Sequence number": 1771026, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1402 + } + }, + { + "ph": "f", "id": 176, "pid": 5714, "tid": 6744, "ts": 6300866035528.873, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035531.223, "dur": 15.690, + "args": { + "External id": 87420,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035548.313, "dur": 12.620, + "args": { + "External id": 87421,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866035564.823, "dur": 15.390, + "args": { + "External id": 87422,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035593.353, "dur": 62.539, + "args": { + "External id": 87423,"Record function id": 0, 
"Sequence number": 1771025, "Fwd thread id": 1, "Ev Idx": 1406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035595.083, "dur": 55.500, + "args": { + "External id": 87424,"Sequence number": 1771025, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1407 + } + }, + { + "ph": "f", "id": 177, "pid": 5714, "tid": 6744, "ts": 6300866035595.083, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866035597.643, "dur": 20.460, + "args": { + "External id": 87425,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866035599.443, "dur": 0.500, + "args": { + "External id": 87426,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866035601.023, "dur": 0.290, + "args": { + "External id": 87427,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035619.373, "dur": 17.290, + "args": { + "External id": 87428,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], 
[]], "Ev Idx": 1411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035622.673, "dur": 12.890, + "args": { + "External id": 87429,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035638.603, "dur": 10.250, + "args": { + "External id": 87430,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866035663.872, "dur": 4.271, + "args": { + "External id": 87431,"Record function id": 0, "Sequence number": 1771024, "Fwd thread id": 1, "Ev Idx": 1414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866035665.672, "dur": 0.451, + "args": { + "External id": 87432,"Sequence number": 1771024, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1415 + } + }, + { + "ph": "f", "id": 178, "pid": 5714, "tid": 6744, "ts": 6300866035665.672, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866035671.972, "dur": 37.820, + "args": { + "External id": 87433,"Record function id": 0, "Sequence number": 1771023, "Fwd thread id": 1, "Ev Idx": 1416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866035673.232, "dur": 32.540, + "args": { + "External id": 87434,"Sequence 
number": 1771023, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1417 + } + }, + { + "ph": "f", "id": 179, "pid": 5714, "tid": 6744, "ts": 6300866035673.232, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866035675.852, "dur": 5.660, + "args": { + "External id": 87435,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035678.432, "dur": 1.551, + "args": { + "External id": 87436,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866035682.552, "dur": 22.600, + "args": { + "External id": 87437,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866035686.072, "dur": 18.051, + "args": { + "External id": 87438,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035716.752, "dur": 88.270, + 
"args": { + "External id": 87439,"Record function id": 0, "Sequence number": 1771022, "Fwd thread id": 1, "Ev Idx": 1422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035718.432, "dur": 66.460, + "args": { + "External id": 87440,"Sequence number": 1771022, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1423 + } + }, + { + "ph": "f", "id": 180, "pid": 5714, "tid": 6744, "ts": 6300866035718.432, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866035720.252, "dur": 29.620, + "args": { + "External id": 87441,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866035721.403, "dur": 0.360, + "args": { + "External id": 87442,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866035722.672, "dur": 0.240, + "args": { + "External id": 87443,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866035728.883, "dur": 19.589, + "args": { + "External id": 87444,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input 
type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035750.952, "dur": 18.530, + "args": { + "External id": 87445,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035753.622, "dur": 14.600, + "args": { + "External id": 87446,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866035770.512, "dur": 12.300, + "args": { + "External id": 87447,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866035791.002, "dur": 10.570, + "args": { + "External id": 87448,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035814.132, "dur": 46.400, + "args": { + "External id": 87449,"Record function id": 0, "Sequence number": 1771021, "Fwd thread id": 1, "Ev Idx": 1432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035816.072, "dur": 27.480, + "args": { + "External id": 87450,"Sequence number": 1771021, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1433 + } + }, + { + "ph": "f", "id": 181, "pid": 5714, "tid": 6744, "ts": 6300866035816.072, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866035817.682, "dur": 25.450, + "args": { + "External id": 87451,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866035819.802, "dur": 23.060, + "args": { + "External id": 87452,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035823.452, "dur": 5.490, + "args": { + "External id": 87453,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866035830.012, "dur": 12.260, + "args": { + 
"External id": 87454,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866035848.782, "dur": 9.260, + "args": { + "External id": 87455,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035867.822, "dur": 5.600, + "args": { + "External id": 87456,"Record function id": 0, "Sequence number": 1771020, "Fwd thread id": 1, "Ev Idx": 1439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866035869.542, "dur": 1.040, + "args": { + "External id": 87457,"Sequence number": 1771020, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1440 + } + }, + { + "ph": "f", "id": 182, "pid": 5714, "tid": 6744, "ts": 6300866035869.542, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866035878.402, "dur": 312.080, + "args": { + "External id": 87458,"Record function id": 0, "Sequence number": 1771019, "Fwd thread id": 1, "Ev Idx": 1441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866035880.012, "dur": 299.690, + "args": { + "External id": 87459,"Sequence 
number": 1771019, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1442 + } + }, + { + "ph": "f", "id": 183, "pid": 5714, "tid": 6744, "ts": 6300866035880.012, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866035907.602, "dur": 8.010, + "args": { + "External id": 87460,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866035911.432, "dur": 3.730, + "args": { + "External id": 87461,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 1444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035918.242, "dur": 6.440, + "args": { + "External id": 87462,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035921.052, "dur": 2.860, + "args": { + "External id": 87463,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035922.742, "dur": 0.850, + "args": { + "External id": 
87464,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6300866035927.022, "dur": 43.200, + "args": { + "External id": 87465,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 1448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866035927.632, "dur": 2.500, + "args": { + "External id": 87466,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 1449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866035928.212, "dur": 1.530, + "args": { + "External id": 87467,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866035929.122, "dur": 0.420, + "args": { + "External id": 87468,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6300866035932.082, "dur": 37.320, + "args": { + "External id": 87469,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866035933.052, "dur": 35.500, + "args": { + "External id": 87470,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6300866035975.052, "dur": 4.020, + "args": { + "External id": 87471,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 1454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866035976.462, "dur": 2.410, + "args": { + "External id": 87472,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866036002.622, "dur": 6.380, + "args": { + "External id": 87473,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866036010.242, "dur": 4.280, + "args": { + "External id": 87474,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866036015.522, "dur": 3.050, + "args": { + "External id": 87475,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036048.962, "dur": 4.050, + "args": { + "External id": 87476,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036050.002, "dur": 2.680, + "args": { + "External id": 87477,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6300866036065.162, "dur": 97.540, + "args": { + "External id": 87478,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 1461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866036068.592, "dur": 4.990, + "args": { + "External id": 87479,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1462 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036071.382, "dur": 1.250, + "args": { + "External id": 87480,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866036074.832, "dur": 2.930, + "args": { + "External id": 87481,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 1464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036076.842, "dur": 0.330, + "args": { + "External id": 87482,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 1465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6300866036079.132, "dur": 1.890, + "args": { + "External id": 87483,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036080.372, "dur": 0.250, + "args": { + "External id": 87484,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 
1467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866036081.892, "dur": 1.800, + "args": { + "External id": 87485,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036083.042, "dur": 0.290, + "args": { + "External id": 87486,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 1469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866036087.422, "dur": 2.220, + "args": { + "External id": 87487,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 1470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036088.952, "dur": 0.320, + "args": { + "External id": 87488,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 1471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036090.572, "dur": 5.519, + "args": { + "External id": 87489,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 1472 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6300866036093.532, "dur": 2.310, + "args": { + "External id": 87490,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 1473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866036096.931, "dur": 1.931, + "args": { + "External id": 87491,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 1474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036098.231, "dur": 0.300, + "args": { + "External id": 87492,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 1475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036099.671, "dur": 2.471, + "args": { + "External id": 87493,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036100.571, "dur": 1.400, + "args": { + "External id": 87494,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1477 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866036103.111, "dur": 45.340, + "args": { + "External id": 87495,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 1478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036151.242, "dur": 2.109, + "args": { + "External id": 87496,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 1479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6300866036155.582, "dur": 3.109, + "args": { + "External id": 87497,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 1480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036157.571, "dur": 0.471, + "args": { + "External id": 87498,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 1481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036160.902, "dur": 0.900, + "args": { + "External id": 87499,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 1482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036203.871, "dur": 10.950, + "args": { + "External id": 87500,"Record function id": 0, "Ev Idx": 1483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036206.671, "dur": 6.790, + "args": { + "External id": 87501,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866036209.021, "dur": 3.460, + "args": { + "External id": 87502,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866036209.911, "dur": 2.350, + "args": { + "External id": 87503,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036219.451, "dur": 10.840, + "args": { + "External id": 87504,"Record function id": 0, "Sequence number": 1771018, "Fwd thread id": 1, "Ev Idx": 1487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036221.131, "dur": 6.850, + "args": { + "External id": 87505,"Sequence number": 1771018, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1488 + } + }, + { + "ph": "f", "id": 184, "pid": 5714, "tid": 6744, "ts": 6300866036221.131, + "cat": 
"fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036223.991, "dur": 3.710, + "args": { + "External id": 87506,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036225.861, "dur": 1.580, + "args": { + "External id": 87507,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036234.371, "dur": 113.950, + "args": { + "External id": 87508,"Record function id": 0, "Sequence number": 1771017, "Fwd thread id": 1, "Ev Idx": 1491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036235.761, "dur": 103.390, + "args": { + "External id": 87509,"Sequence number": 1771017, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1492 + } + }, + { + "ph": "f", "id": 185, "pid": 5714, "tid": 6744, "ts": 6300866036235.761, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036238.781, "dur": 4.890, + "args": { + "External id": 87510,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, 
"tid": 6744, + "ts": 6300866036240.181, "dur": 2.780, + "args": { + "External id": 87511,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036241.741, "dur": 0.920, + "args": { + "External id": 87512,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866036244.731, "dur": 42.000, + "args": { + "External id": 87513,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036290.081, "dur": 5.410, + "args": { + "External id": 87514,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866036291.561, "dur": 2.700, + "args": { + "External id": 87515,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036293.241, "dur": 0.800, + "args": { + "External id": 87516,"Record function id": 0, "Concrete Inputs": 
["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036305.891, "dur": 3.670, + "args": { + "External id": 87517,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866036306.691, "dur": 2.410, + "args": { + "External id": 87518,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036307.651, "dur": 1.260, + "args": { + "External id": 87519,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866036311.571, "dur": 26.500, + "args": { + "External id": 87520,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036358.771, "dur": 10.340, + "args": { + "External id": 87521,"Record function id": 0, "Sequence number": 1771016, "Fwd thread id": 1, "Ev Idx": 1504 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036360.971, "dur": 6.320, + "args": { + "External id": 87522,"Sequence number": 1771016, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1505 + } + }, + { + "ph": "f", "id": 186, "pid": 5714, "tid": 6744, "ts": 6300866036360.971, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036363.131, "dur": 3.940, + "args": { + "External id": 87523,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036364.141, "dur": 2.710, + "args": { + "External id": 87524,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036373.201, "dur": 9.110, + "args": { + "External id": 87525,"Record function id": 0, "Sequence number": 1771015, "Fwd thread id": 1, "Ev Idx": 1508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036374.481, "dur": 5.450, + "args": { + "External id": 87526,"Sequence number": 1771015, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1509 + } + }, + { + "ph": "f", "id": 187, "pid": 5714, "tid": 6744, "ts": 6300866036374.481, + "cat": "fwdbwd", "name": "fwdbwd", 
"bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036375.471, "dur": 4.260, + "args": { + "External id": 87527,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866036376.601, "dur": 2.460, + "args": { + "External id": 87528,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036377.881, "dur": 0.890, + "args": { + "External id": 87529,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036387.921, "dur": 9.250, + "args": { + "External id": 87530,"Record function id": 0, "Ev Idx": 1513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036389.741, "dur": 6.250, + "args": { + "External id": 87531,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866036391.691, "dur": 3.860, + "args": { + "External id": 87532,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866036393.491, "dur": 1.870, + "args": { + "External id": 87533,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036401.271, "dur": 7.040, + "args": { + "External id": 87534,"Record function id": 0, "Sequence number": 1771014, "Fwd thread id": 1, "Ev Idx": 1517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036402.741, "dur": 3.450, + "args": { + "External id": 87535,"Sequence number": 1771014, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1518 + } + }, + { + "ph": "f", "id": 188, "pid": 5714, "tid": 6744, "ts": 6300866036402.741, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036403.821, "dur": 2.220, + "args": { + "External id": 87536,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036404.821, "dur": 1.040, + "args": { + "External id": 87537,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1520 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036412.471, "dur": 92.920, + "args": { + "External id": 87538,"Record function id": 0, "Sequence number": 1771013, "Fwd thread id": 1, "Ev Idx": 1521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036413.781, "dur": 82.360, + "args": { + "External id": 87539,"Sequence number": 1771013, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1522 + } + }, + { + "ph": "f", "id": 189, "pid": 5714, "tid": 6744, "ts": 6300866036413.781, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036415.901, "dur": 4.090, + "args": { + "External id": 87540,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866036417.521, "dur": 2.000, + "args": { + "External id": 87541,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036418.621, "dur": 0.670, + "args": { + "External id": 87542,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866036420.891, 
"dur": 37.430, + "args": { + "External id": 87543,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036460.401, "dur": 4.550, + "args": { + "External id": 87544,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866036461.381, "dur": 2.640, + "args": { + "External id": 87545,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036463.021, "dur": 0.790, + "args": { + "External id": 87546,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036468.221, "dur": 3.440, + "args": { + "External id": 87547,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866036469.151, "dur": 2.130, + "args": { + "External id": 87548,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], 
[], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036470.021, "dur": 1.100, + "args": { + "External id": 87549,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866036472.391, "dur": 22.780, + "args": { + "External id": 87550,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036514.791, "dur": 55.279, + "args": { + "External id": 87551,"Record function id": 0, "Sequence number": 1771012, "Fwd thread id": 1, "Ev Idx": 1534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036516.771, "dur": 22.599, + "args": { + "External id": 87552,"Sequence number": 1771012, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1535 + } + }, + { + "ph": "f", "id": 190, "pid": 5714, "tid": 6744, "ts": 6300866036516.771, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866036534.541, "dur": 4.569, + "args": { + "External id": 87553,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], 
[]], "Ev Idx": 1536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036535.921, "dur": 2.929, + "args": { + "External id": 87554,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866036542.850, "dur": 21.620, + "args": { + "External id": 87555,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036577.890, "dur": 11.711, + "args": { + "External id": 87556,"Record function id": 0, "Sequence number": 1771011, "Fwd thread id": 1, "Ev Idx": 1539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036580.030, "dur": 7.120, + "args": { + "External id": 87557,"Sequence number": 1771011, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1540 + } + }, + { + "ph": "f", "id": 191, "pid": 5714, "tid": 6744, "ts": 6300866036580.030, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866036581.161, "dur": 5.720, + "args": { + "External id": 87558,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, 
"tid": 6744, + "ts": 6300866036582.441, "dur": 3.460, + "args": { + "External id": 87559,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036584.690, "dur": 0.911, + "args": { + "External id": 87560,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036595.370, "dur": 8.720, + "args": { + "External id": 87561,"Record function id": 0, "Ev Idx": 1544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036597.330, "dur": 5.551, + "args": { + "External id": 87562,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866036599.241, "dur": 3.089, + "args": { + "External id": 87563,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866036600.090, "dur": 2.031, + "args": { + "External id": 87564,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1547 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036608.130, "dur": 80.910, + "args": { + "External id": 87565,"Record function id": 0, "Sequence number": 1771010, "Fwd thread id": 1, "Ev Idx": 1548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036610.530, "dur": 35.260, + "args": { + "External id": 87566,"Sequence number": 1771010, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1549 + } + }, + { + "ph": "f", "id": 192, "pid": 5714, "tid": 6744, "ts": 6300866036610.530, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036612.301, "dur": 18.980, + "args": { + "External id": 87567,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036632.870, "dur": 12.420, + "args": { + "External id": 87568,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866036649.270, "dur": 28.150, + "args": { + "External id": 87569,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866036680.420, "dur": 3.160, + "args": { + "External id": 87570,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036698.560, "dur": 9.460, + "args": { + "External id": 87571,"Record function id": 0, "Ev Idx": 1554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866036701.310, "dur": 5.500, + "args": { + "External id": 87572,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866036702.990, "dur": 3.350, + "args": { + "External id": 87573,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866036704.550, "dur": 1.560, + "args": { + "External id": 87574,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036712.390, "dur": 36.360, + "args": { + "External id": 87575,"Record function id": 0, "Sequence number": 1771009, "Fwd thread id": 1, "Ev Idx": 1558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036713.740, "dur": 30.660, + "args": { + "External id": 
87576,"Sequence number": 1771009, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1559 + } + }, + { + "ph": "f", "id": 193, "pid": 5714, "tid": 6744, "ts": 6300866036713.740, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866036715.320, "dur": 28.620, + "args": { + "External id": 87577,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866036716.440, "dur": 27.200, + "args": { + "External id": 87578,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036719.580, "dur": 5.790, + "args": { + "External id": 87579,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866036727.120, "dur": 15.850, + "args": { + "External id": 87580,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input 
type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036755.990, "dur": 58.310, + "args": { + "External id": 87581,"Record function id": 0, "Sequence number": 1771008, "Fwd thread id": 1, "Ev Idx": 1564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036757.750, "dur": 32.780, + "args": { + "External id": 87582,"Sequence number": 1771008, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1565 + } + }, + { + "ph": "f", "id": 194, "pid": 5714, "tid": 6744, "ts": 6300866036757.750, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036760.790, "dur": 15.760, + "args": { + "External id": 87583,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036777.990, "dur": 12.000, + "args": { + "External id": 87584,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866036793.990, "dur": 15.600, + "args": { + "External id": 87585,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", 
"Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036821.830, "dur": 64.450, + "args": { + "External id": 87586,"Record function id": 0, "Sequence number": 1771007, "Fwd thread id": 1, "Ev Idx": 1569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036823.750, "dur": 57.250, + "args": { + "External id": 87587,"Sequence number": 1771007, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1570 + } + }, + { + "ph": "f", "id": 195, "pid": 5714, "tid": 6744, "ts": 6300866036823.750, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866036826.410, "dur": 19.680, + "args": { + "External id": 87588,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866036828.360, "dur": 0.490, + "args": { + "External id": 87589,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866036829.930, "dur": 0.270, + "args": { + "External id": 87590,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], 
[], [], [], []], "Ev Idx": 1573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036847.520, "dur": 19.060, + "args": { + "External id": 87591,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036850.950, "dur": 14.530, + "args": { + "External id": 87592,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036867.460, "dur": 10.450, + "args": { + "External id": 87593,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866036894.040, "dur": 4.200, + "args": { + "External id": 87594,"Record function id": 0, "Sequence number": 1771006, "Fwd thread id": 1, "Ev Idx": 1577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866036895.860, "dur": 0.460, + "args": { + "External id": 87595,"Sequence number": 1771006, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1578 + } + }, + { + "ph": "f", "id": 196, "pid": 5714, "tid": 6744, "ts": 6300866036895.860, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, 
"tid": 6744, + "ts": 6300866036902.130, "dur": 36.750, + "args": { + "External id": 87596,"Record function id": 0, "Sequence number": 1771005, "Fwd thread id": 1, "Ev Idx": 1579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866036903.400, "dur": 31.790, + "args": { + "External id": 87597,"Sequence number": 1771005, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1580 + } + }, + { + "ph": "f", "id": 197, "pid": 5714, "tid": 6744, "ts": 6300866036903.400, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866036905.840, "dur": 5.700, + "args": { + "External id": 87598,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866036908.420, "dur": 1.530, + "args": { + "External id": 87599,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866036912.510, "dur": 22.110, + "args": { + "External id": 87600,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866036915.160, "dur": 18.460, + "args": { + "External id": 
87601,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036945.890, "dur": 90.759, + "args": { + "External id": 87602,"Record function id": 0, "Sequence number": 1771004, "Fwd thread id": 1, "Ev Idx": 1585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866036947.890, "dur": 68.779, + "args": { + "External id": 87603,"Sequence number": 1771004, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1586 + } + }, + { + "ph": "f", "id": 198, "pid": 5714, "tid": 6744, "ts": 6300866036947.890, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866036950.990, "dur": 31.939, + "args": { + "External id": 87604,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866036952.050, "dur": 0.360, + "args": { + "External id": 87605,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866036953.400, "dur": 0.220, + "args": { + "External id": 87606,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], 
"Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866036959.460, "dur": 22.000, + "args": { + "External id": 87607,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036984.129, "dur": 18.520, + "args": { + "External id": 87608,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866036986.680, "dur": 14.880, + "args": { + "External id": 87609,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866037003.529, "dur": 11.240, + "args": { + "External id": 87610,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866037022.809, "dur": 10.571, + "args": { + "External id": 87611,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1594 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037045.829, "dur": 55.330, + "args": { + "External id": 87612,"Record function id": 0, "Sequence number": 1771003, "Fwd thread id": 1, "Ev Idx": 1595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037047.800, "dur": 26.839, + "args": { + "External id": 87613,"Sequence number": 1771003, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1596 + } + }, + { + "ph": "f", "id": 199, "pid": 5714, "tid": 6744, "ts": 6300866037047.800, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866037049.549, "dur": 24.670, + "args": { + "External id": 87614,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866037051.460, "dur": 22.449, + "args": { + "External id": 87615,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037055.000, "dur": 5.409, + "args": { + "External id": 87616,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", 
"15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866037061.440, "dur": 11.889, + "args": { + "External id": 87617,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866037079.589, "dur": 14.590, + "args": { + "External id": 87618,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037109.579, "dur": 20.300, + "args": { + "External id": 87619,"Record function id": 0, "Sequence number": 1771002, "Fwd thread id": 1, "Ev Idx": 1602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037111.629, "dur": 1.080, + "args": { + "External id": 87620,"Sequence number": 1771002, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1603 + } + }, + { + "ph": "f", "id": 200, "pid": 5714, "tid": 6744, "ts": 6300866037111.629, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866037115.779, "dur": 11.210, + "args": { + "External 
id": 87621,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037135.399, "dur": 10.580, + "args": { + "External id": 87622,"Record function id": 0, "Sequence number": 1771001, "Fwd thread id": 1, "Ev Idx": 1605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037137.319, "dur": 6.360, + "args": { + "External id": 87623,"Sequence number": 1771001, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1606 + } + }, + { + "ph": "f", "id": 201, "pid": 5714, "tid": 6744, "ts": 6300866037137.319, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037139.079, "dur": 4.350, + "args": { + "External id": 87624,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866037140.099, "dur": 3.070, + "args": { + "External id": 87625,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037150.139, "dur": 
97.880, + "args": { + "External id": 87626,"Record function id": 0, "Sequence number": 1771000, "Fwd thread id": 1, "Ev Idx": 1609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037151.559, "dur": 89.860, + "args": { + "External id": 87627,"Sequence number": 1771000, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1610 + } + }, + { + "ph": "f", "id": 202, "pid": 5714, "tid": 6744, "ts": 6300866037151.559, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866037154.289, "dur": 6.630, + "args": { + "External id": 87628,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866037156.669, "dur": 3.360, + "args": { + "External id": 87629,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037158.319, "dur": 1.340, + "args": { + "External id": 87630,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866037161.889, "dur": 43.250, + "args": { + "External id": 87631,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866037207.099, "dur": 4.990, + "args": { + "External id": 87632,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866037208.139, "dur": 2.910, + "args": { + "External id": 87633,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037210.029, "dur": 0.790, + "args": { + "External id": 87634,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866037214.909, "dur": 2.600, + "args": { + "External id": 87635,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866037215.629, "dur": 1.390, + "args": { + "External id": 87636,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037216.479, "dur": 0.380, + "args": { + "External id": 87637,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866037218.209, "dur": 22.290, + "args": { + "External id": 87638,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037257.539, "dur": 10.200, + "args": { + "External id": 87639,"Record function id": 0, "Sequence number": 1770999, "Fwd thread id": 1, "Ev Idx": 1622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037259.479, "dur": 6.110, + "args": { + "External id": 87640,"Sequence number": 1770999, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1623 + } + }, + { + "ph": "f", "id": 203, "pid": 5714, "tid": 6744, "ts": 6300866037259.479, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037261.639, "dur": 3.730, + "args": { + "External id": 87641,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 
6300866037262.919, "dur": 2.230, + "args": { + "External id": 87642,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037271.759, "dur": 9.310, + "args": { + "External id": 87643,"Record function id": 0, "Sequence number": 1770998, "Fwd thread id": 1, "Ev Idx": 1626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037272.989, "dur": 5.890, + "args": { + "External id": 87644,"Sequence number": 1770998, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1627 + } + }, + { + "ph": "f", "id": 204, "pid": 5714, "tid": 6744, "ts": 6300866037272.989, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866037274.729, "dur": 3.900, + "args": { + "External id": 87645,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866037275.749, "dur": 2.180, + "args": { + "External id": 87646,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037277.009, "dur": 0.660, + "args": { + "External id": 87647,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", 
""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866037287.039, "dur": 8.750, + "args": { + "External id": 87648,"Record function id": 0, "Ev Idx": 1631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866037288.979, "dur": 5.450, + "args": { + "External id": 87649,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866037291.019, "dur": 2.970, + "args": { + "External id": 87650,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866037291.829, "dur": 1.950, + "args": { + "External id": 87651,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037308.129, "dur": 9.230, + "args": { + "External id": 87652,"Record function id": 0, "Sequence number": 1770997, "Fwd thread id": 1, "Ev Idx": 1635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037309.619, "dur": 5.300, + "args": { + "External id": 87653,"Sequence number": 1770997, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1636 + } + }, + { + "ph": "f", "id": 205, "pid": 5714, "tid": 6744, "ts": 6300866037309.619, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037310.949, "dur": 3.750, + "args": { + "External id": 87654,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866037312.749, "dur": 1.660, + "args": { + "External id": 87655,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866037322.829, "dur": 233.559, + "args": { + "External id": 87656,"Record function id": 0, "Sequence number": 1770996, "Fwd thread id": 1, "Ev Idx": 1639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6300866037324.579, "dur": 213.889, + "args": { + "External id": 87657,"Sequence number": 1770996, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1640 + } + }, + { + "ph": "f", "id": 206, "pid": 5714, "tid": 6744, "ts": 6300866037324.579, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866037337.149, "dur": 10.570, + "args": { + "External 
id": 87658,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037339.339, "dur": 7.780, + "args": { + "External id": 87659,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866037350.229, "dur": 5.160, + "args": { + "External id": 87660,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037351.599, "dur": 3.480, + "args": { + "External id": 87661,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866037356.859, "dur": 4.880, + "args": { + "External id": 87662,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], 
[], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037358.139, "dur": 3.350, + "args": { + "External id": 87663,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866037377.879, "dur": 132.509, + "args": { + "External id": 87664,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866037431.328, "dur": 6.560, + "args": { + "External id": 87665,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 1648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866037439.828, "dur": 4.060, + "args": { + "External id": 87666,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866037523.058, "dur": 3.610, + "args": { + "External id": 87667,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866037531.428, "dur": 0.710, + "args": { + "External id": 87668,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6300866037535.158, "dur": 0.510, + "args": { + "External id": 87669,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866037567.668, "dur": 190.320, + "args": { + "External id": 87670,"Record function id": 0, "Sequence number": 1770995, "Fwd thread id": 1, "Ev Idx": 1653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866037570.038, "dur": 178.700, + "args": { + "External id": 
87671,"Sequence number": 1770995, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1654 + } + }, + { + "ph": "f", "id": 207, "pid": 5714, "tid": 6744, "ts": 6300866037570.038, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866037586.268, "dur": 39.620, + "args": { + "External id": 87672,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037589.768, "dur": 7.120, + "args": { + "External id": 87673,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866037599.208, "dur": 26.030, + "args": { + "External id": 87674,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866037635.068, "dur": 7.650, + "args": { + "External id": 87675,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 
2048, 12, 64], [], [], [], [], []], "Ev Idx": 1658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037636.998, "dur": 5.200, + "args": { + "External id": 87676,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866037770.938, "dur": 172.080, + "args": { + "External id": 87677,"Record function id": 0, "Sequence number": 1770994, "Fwd thread id": 1, "Ev Idx": 1660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866037773.818, "dur": 160.689, + "args": { + "External id": 87678,"Sequence number": 1770994, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1661 + } + }, + { + "ph": "f", "id": 208, "pid": 5714, "tid": 6744, "ts": 6300866037773.818, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866037789.458, "dur": 33.130, + "args": { + "External id": 87679,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037793.168, "dur": 6.790, + "args": { + "External id": 87680,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 
1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866037801.188, "dur": 20.760, + "args": { + "External id": 87681,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866037831.848, "dur": 7.710, + "args": { + "External id": 87682,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866037833.768, "dur": 5.340, + "args": { + "External id": 87683,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037955.487, "dur": 14.220, + "args": { + "External id": 87684,"Record function id": 0, "Sequence number": 1770993, "Fwd thread id": 1, "Ev Idx": 1667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037958.037, "dur": 8.490, + "args": { + "External id": 
87685,"Sequence number": 1770993, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1668 + } + }, + { + "ph": "f", "id": 209, "pid": 5714, "tid": 6744, "ts": 6300866037958.037, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037960.837, "dur": 5.390, + "args": { + "External id": 87686,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866037962.017, "dur": 3.930, + "args": { + "External id": 87687,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037974.287, "dur": 7.590, + "args": { + "External id": 87688,"Record function id": 0, "Sequence number": 1770992, "Fwd thread id": 1, "Ev Idx": 1671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037976.747, "dur": 3.050, + "args": { + "External id": 87689,"Sequence number": 1770992, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1672 + } + }, + { + "ph": "f", "id": 210, "pid": 5714, "tid": 6744, "ts": 6300866037976.747, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037977.927, "dur": 1.730, + "args": { + "External id": 87690,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866037978.597, "dur": 0.870, + "args": { + "External id": 87691,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037985.767, "dur": 6.340, + "args": { + "External id": 87692,"Record function id": 0, "Sequence number": 1770991, "Fwd thread id": 1, "Ev Idx": 1675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037987.067, "dur": 3.040, + "args": { + "External id": 87693,"Sequence number": 1770991, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1676 + } + }, + { + "ph": "f", "id": 211, "pid": 5714, "tid": 6744, "ts": 6300866037987.067, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037988.127, "dur": 1.800, + "args": { + "External id": 87694,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, 
"tid": 6744, + "ts": 6300866037988.847, "dur": 0.890, + "args": { + "External id": 87695,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037996.067, "dur": 8.460, + "args": { + "External id": 87696,"Record function id": 0, "Sequence number": 1770990, "Fwd thread id": 1, "Ev Idx": 1679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866037997.367, "dur": 4.600, + "args": { + "External id": 87697,"Sequence number": 1770990, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1680 + } + }, + { + "ph": "f", "id": 212, "pid": 5714, "tid": 6744, "ts": 6300866037997.367, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866037998.357, "dur": 3.420, + "args": { + "External id": 87698,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866037999.937, "dur": 1.640, + "args": { + "External id": 87699,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 
6744, + "ts": 6300866038008.527, "dur": 117.040, + "args": { + "External id": 87700,"Record function id": 0, "Sequence number": 1770989, "Fwd thread id": 1, "Ev Idx": 1683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038009.897, "dur": 106.090, + "args": { + "External id": 87701,"Sequence number": 1770989, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1684 + } + }, + { + "ph": "f", "id": 213, "pid": 5714, "tid": 6744, "ts": 6300866038009.897, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038013.247, "dur": 5.410, + "args": { + "External id": 87702,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038014.567, "dur": 3.400, + "args": { + "External id": 87703,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038016.227, "dur": 1.330, + "args": { + "External id": 87704,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866038019.857, "dur": 46.660, + "args": { + "External id": 87705,"Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038069.827, "dur": 4.810, + "args": { + "External id": 87706,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038070.897, "dur": 2.750, + "args": { + "External id": 87707,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038072.517, "dur": 0.860, + "args": { + "External id": 87708,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038085.197, "dur": 3.250, + "args": { + "External id": 87709,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038086.287, "dur": 1.640, + "args": { + "External id": 87710,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1693 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038087.417, "dur": 0.320, + "args": { + "External id": 87711,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866038090.497, "dur": 24.440, + "args": { + "External id": 87712,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038136.007, "dur": 10.710, + "args": { + "External id": 87713,"Record function id": 0, "Sequence number": 1770988, "Fwd thread id": 1, "Ev Idx": 1696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038138.217, "dur": 6.180, + "args": { + "External id": 87714,"Sequence number": 1770988, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1697 + } + }, + { + "ph": "f", "id": 214, "pid": 5714, "tid": 6744, "ts": 6300866038138.217, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866038140.427, "dur": 3.790, + "args": { + "External id": 87715,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, 
"tid": 6744, + "ts": 6300866038141.477, "dur": 2.510, + "args": { + "External id": 87716,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038150.927, "dur": 9.040, + "args": { + "External id": 87717,"Record function id": 0, "Sequence number": 1770987, "Fwd thread id": 1, "Ev Idx": 1700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038152.327, "dur": 5.570, + "args": { + "External id": 87718,"Sequence number": 1770987, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1701 + } + }, + { + "ph": "f", "id": 215, "pid": 5714, "tid": 6744, "ts": 6300866038152.327, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038153.357, "dur": 4.300, + "args": { + "External id": 87719,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038154.437, "dur": 2.480, + "args": { + "External id": 87720,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038155.987, "dur": 0.670, + "args": { + "External id": 87721,"Record function id": 0, "Concrete Inputs": ["", "[768, 
768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038165.747, "dur": 10.720, + "args": { + "External id": 87722,"Record function id": 0, "Ev Idx": 1705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038167.807, "dur": 7.370, + "args": { + "External id": 87723,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866038169.987, "dur": 4.720, + "args": { + "External id": 87724,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866038171.627, "dur": 2.860, + "args": { + "External id": 87725,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038180.717, "dur": 7.110, + "args": { + "External id": 87726,"Record function id": 0, "Sequence number": 1770986, "Fwd thread id": 1, "Ev Idx": 1709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038182.087, "dur": 3.610, + "args": { + "External id": 87727,"Sequence number": 1770986, "Fwd thread id": 1, "Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1710 + } + }, + { + "ph": "f", "id": 216, "pid": 5714, "tid": 6744, "ts": 6300866038182.087, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866038183.507, "dur": 2.040, + "args": { + "External id": 87728,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866038184.217, "dur": 1.160, + "args": { + "External id": 87729,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038191.817, "dur": 90.250, + "args": { + "External id": 87730,"Record function id": 0, "Sequence number": 1770985, "Fwd thread id": 1, "Ev Idx": 1713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038193.097, "dur": 81.310, + "args": { + "External id": 87731,"Sequence number": 1770985, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1714 + } + }, + { + "ph": "f", "id": 217, "pid": 5714, "tid": 6744, "ts": 6300866038193.097, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038194.937, "dur": 4.000, + "args": { + "External id": 87732,"Record function id": 
0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038196.517, "dur": 2.000, + "args": { + "External id": 87733,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038197.617, "dur": 0.640, + "args": { + "External id": 87734,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866038199.877, "dur": 38.860, + "args": { + "External id": 87735,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038240.677, "dur": 4.720, + "args": { + "External id": 87736,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038241.667, "dur": 2.720, + "args": { + "External id": 87737,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1720 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038243.197, "dur": 0.920, + "args": { + "External id": 87738,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038247.947, "dur": 2.870, + "args": { + "External id": 87739,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038248.847, "dur": 1.450, + "args": { + "External id": 87740,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038249.847, "dur": 0.290, + "args": { + "External id": 87741,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866038251.517, "dur": 22.060, + "args": { + "External id": 87742,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 
6744, + "ts": 6300866038291.317, "dur": 46.889, + "args": { + "External id": 87743,"Record function id": 0, "Sequence number": 1770984, "Fwd thread id": 1, "Ev Idx": 1726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038293.337, "dur": 14.980, + "args": { + "External id": 87744,"Sequence number": 1770984, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1727 + } + }, + { + "ph": "f", "id": 218, "pid": 5714, "tid": 6744, "ts": 6300866038293.337, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866038295.437, "dur": 12.660, + "args": { + "External id": 87745,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866038305.066, "dur": 2.720, + "args": { + "External id": 87746,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6300866038311.506, "dur": 22.451, + "args": { + "External id": 87747,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038345.826, "dur": 11.960, + 
"args": { + "External id": 87748,"Record function id": 0, "Sequence number": 1770983, "Fwd thread id": 1, "Ev Idx": 1731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038348.077, "dur": 7.069, + "args": { + "External id": 87749,"Sequence number": 1770983, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1732 + } + }, + { + "ph": "f", "id": 219, "pid": 5714, "tid": 6744, "ts": 6300866038348.077, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038349.057, "dur": 5.780, + "args": { + "External id": 87750,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038350.557, "dur": 3.249, + "args": { + "External id": 87751,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038352.577, "dur": 0.949, + "args": { + "External id": 87752,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038363.657, "dur": 9.609, + "args": { + "External id": 87753,"Record function id": 0, "Ev Idx": 1736 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038365.477, "dur": 6.509, + "args": { + "External id": 87754,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866038367.417, "dur": 4.120, + "args": { + "External id": 87755,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866038369.317, "dur": 2.009, + "args": { + "External id": 87756,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038377.466, "dur": 7.831, + "args": { + "External id": 87757,"Record function id": 0, "Sequence number": 1770982, "Fwd thread id": 1, "Ev Idx": 1740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038378.966, "dur": 3.940, + "args": { + "External id": 87758,"Sequence number": 1770982, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1741 + } + }, + { + "ph": "f", "id": 220, "pid": 5714, "tid": 6744, "ts": 6300866038378.966, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866038380.166, "dur": 2.580, + "args": { + "External id": 87759,"Record function id": 0, 
"Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866038381.037, "dur": 1.500, + "args": { + "External id": 87760,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038389.206, "dur": 94.010, + "args": { + "External id": 87761,"Record function id": 0, "Sequence number": 1770981, "Fwd thread id": 1, "Ev Idx": 1744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038390.517, "dur": 82.349, + "args": { + "External id": 87762,"Sequence number": 1770981, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1745 + } + }, + { + "ph": "f", "id": 221, "pid": 5714, "tid": 6744, "ts": 6300866038390.517, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038392.737, "dur": 4.209, + "args": { + "External id": 87763,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038393.416, "dur": 3.050, + "args": { + "External id": 87764,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], 
"Input Dims": [[16384, 768], [], []], "Ev Idx": 1747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038395.446, "dur": 0.800, + "args": { + "External id": 87765,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866038397.906, "dur": 39.360, + "args": { + "External id": 87766,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038439.176, "dur": 4.520, + "args": { + "External id": 87767,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038440.126, "dur": 2.600, + "args": { + "External id": 87768,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038441.716, "dur": 0.800, + "args": { + "External id": 87769,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038445.486, "dur": 3.470, + "args": { + "External id": 87770,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038447.146, "dur": 1.340, + "args": { + "External id": 87771,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038447.966, "dur": 0.350, + "args": { + "External id": 87772,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866038449.666, "dur": 22.250, + "args": { + "External id": 87773,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038492.486, "dur": 30.810, + "args": { + "External id": 87774,"Record function id": 0, "Sequence number": 1770980, "Fwd thread id": 1, "Ev Idx": 1757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038494.366, "dur": 5.770, + "args": { + "External id": 87775,"Sequence number": 1770980, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1758 + } + }, + { + "ph": "f", "id": 222, "pid": 5714, "tid": 6744, "ts": 6300866038494.366, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6300866038496.276, "dur": 3.670, + "args": { + "External id": 87776,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866038497.286, "dur": 2.440, + "args": { + "External id": 87777,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866038502.956, "dur": 16.390, + "args": { + "External id": 87778,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038529.516, "dur": 12.120, + "args": { + "External id": 87779,"Record function id": 0, "Sequence number": 1770979, "Fwd thread id": 1, "Ev Idx": 1762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038531.366, "dur": 7.810, + "args": { + "External id": 87780,"Sequence number": 1770979, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input 
Dims": [[768, 768]], "Ev Idx": 1763 + } + }, + { + "ph": "f", "id": 223, "pid": 5714, "tid": 6744, "ts": 6300866038531.366, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6300866038532.436, "dur": 6.490, + "args": { + "External id": 87781,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6300866038534.876, "dur": 3.110, + "args": { + "External id": 87782,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038536.686, "dur": 1.030, + "args": { + "External id": 87783,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038547.276, "dur": 8.330, + "args": { + "External id": 87784,"Record function id": 0, "Ev Idx": 1767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038549.216, "dur": 5.140, + "args": { + "External id": 87785,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 
6300866038551.126, "dur": 2.760, + "args": { + "External id": 87786,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866038551.896, "dur": 1.810, + "args": { + "External id": 87787,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038559.846, "dur": 81.810, + "args": { + "External id": 87788,"Record function id": 0, "Sequence number": 1770978, "Fwd thread id": 1, "Ev Idx": 1771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038561.116, "dur": 36.870, + "args": { + "External id": 87789,"Sequence number": 1770978, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1772 + } + }, + { + "ph": "f", "id": 224, "pid": 5714, "tid": 6744, "ts": 6300866038561.116, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038562.916, "dur": 19.490, + "args": { + "External id": 87790,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038584.276, "dur": 13.190, + "args": { + "External id": 87791,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866038602.306, "dur": 27.940, + "args": { + "External id": 87792,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866038633.396, "dur": 2.570, + "args": { + "External id": 87793,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038650.986, "dur": 9.090, + "args": { + "External id": 87794,"Record function id": 0, "Ev Idx": 1777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866038653.416, "dur": 5.330, + "args": { + "External id": 87795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866038655.056, "dur": 3.180, + "args": { + "External id": 87796,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866038655.786, "dur": 2.260, + "args": { + "External id": 87797,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038664.236, "dur": 36.470, + "args": { + "External id": 87798,"Record function id": 0, "Sequence number": 1770977, "Fwd thread id": 1, "Ev Idx": 1781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038665.556, "dur": 31.040, + "args": { + "External id": 87799,"Sequence number": 1770977, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1782 + } + }, + { + "ph": "f", "id": 225, "pid": 5714, "tid": 6744, "ts": 6300866038665.556, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866038667.096, "dur": 29.060, + "args": { + "External id": 87800,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866038668.286, "dur": 27.580, + "args": { + "External id": 87801,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038671.496, 
"dur": 5.830, + "args": { + "External id": 87802,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866038679.546, "dur": 15.650, + "args": { + "External id": 87803,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038707.406, "dur": 57.439, + "args": { + "External id": 87804,"Record function id": 0, "Sequence number": 1770976, "Fwd thread id": 1, "Ev Idx": 1787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038709.016, "dur": 32.220, + "args": { + "External id": 87805,"Sequence number": 1770976, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1788 + } + }, + { + "ph": "f", "id": 226, "pid": 5714, "tid": 6744, "ts": 6300866038709.016, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038711.556, "dur": 15.540, + "args": { + "External id": 87806,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 
5714, "tid": 6744, + "ts": 6300866038728.456, "dur": 12.280, + "args": { + "External id": 87807,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6300866038744.685, "dur": 15.651, + "args": { + "External id": 87808,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038772.436, "dur": 65.019, + "args": { + "External id": 87809,"Record function id": 0, "Sequence number": 1770975, "Fwd thread id": 1, "Ev Idx": 1792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038774.376, "dur": 57.849, + "args": { + "External id": 87810,"Sequence number": 1770975, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1793 + } + }, + { + "ph": "f", "id": 227, "pid": 5714, "tid": 6744, "ts": 6300866038774.376, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866038777.185, "dur": 19.960, + "args": { + "External id": 87811,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6300866038779.196, "dur": 0.520, + "args": { + 
"External id": 87812,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866038780.836, "dur": 0.260, + "args": { + "External id": 87813,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038799.705, "dur": 19.211, + "args": { + "External id": 87814,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038802.896, "dur": 14.889, + "args": { + "External id": 87815,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038819.825, "dur": 10.320, + "args": { + "External id": 87816,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866038845.305, "dur": 4.490, + "args": { + "External id": 87817,"Record function id": 0, "Sequence number": 1770974, "Fwd thread id": 1, "Ev Idx": 1800 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866038847.265, "dur": 0.550, + "args": { + "External id": 87818,"Sequence number": 1770974, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1801 + } + }, + { + "ph": "f", "id": 228, "pid": 5714, "tid": 6744, "ts": 6300866038847.265, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866038854.075, "dur": 38.010, + "args": { + "External id": 87819,"Record function id": 0, "Sequence number": 1770973, "Fwd thread id": 1, "Ev Idx": 1802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6300866038855.395, "dur": 32.980, + "args": { + "External id": 87820,"Sequence number": 1770973, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1803 + } + }, + { + "ph": "f", "id": 229, "pid": 5714, "tid": 6744, "ts": 6300866038855.395, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6300866038858.055, "dur": 5.870, + "args": { + "External id": 87821,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866038860.725, "dur": 1.570, + "args": { + "External id": 87822,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], 
[], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866038865.015, "dur": 22.670, + "args": { + "External id": 87823,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6300866038867.635, "dur": 18.960, + "args": { + "External id": 87824,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038899.105, "dur": 88.240, + "args": { + "External id": 87825,"Record function id": 0, "Sequence number": 1770972, "Fwd thread id": 1, "Ev Idx": 1808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038902.025, "dur": 65.080, + "args": { + "External id": 87826,"Sequence number": 1770972, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1809 + } + }, + { + "ph": "f", "id": 230, "pid": 5714, "tid": 6744, "ts": 6300866038902.025, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6300866038903.795, "dur": 30.360, + "args": { + "External id": 87827,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, 
"tid": 6744, + "ts": 6300866038905.015, "dur": 0.350, + "args": { + "External id": 87828,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866038906.335, "dur": 0.210, + "args": { + "External id": 87829,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866038912.375, "dur": 20.300, + "args": { + "External id": 87830,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038935.325, "dur": 18.500, + "args": { + "External id": 87831,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038937.965, "dur": 14.590, + "args": { + "External id": 87832,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6300866038954.765, "dur": 10.580, + "args": { + "External id": 87833,"Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866038973.265, "dur": 10.680, + "args": { + "External id": 87834,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038996.195, "dur": 48.000, + "args": { + "External id": 87835,"Record function id": 0, "Sequence number": 1770971, "Fwd thread id": 1, "Ev Idx": 1818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6300866038998.505, "dur": 28.180, + "args": { + "External id": 87836,"Sequence number": 1770971, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1819 + } + }, + { + "ph": "f", "id": 231, "pid": 5714, "tid": 6744, "ts": 6300866038998.505, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866039000.055, "dur": 26.210, + "args": { + "External id": 87837,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866039002.305, "dur": 23.690, + 
"args": { + "External id": 87838,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039005.815, "dur": 6.460, + "args": { + "External id": 87839,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866039013.305, "dur": 12.160, + "args": { + "External id": 87840,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866039031.505, "dur": 9.450, + "args": { + "External id": 87841,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866039053.345, "dur": 1307.607, + "args": { + "External id": 87842,"Record function id": 0, "Ev Idx": 1825 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 
6300866039076.705, "dur": 685.948, + "args": { + "External id": 87843,"Record function id": 0, "Ev Idx": 1826 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.8", "pid": 5714, "tid": 6744, + "ts": 6300866039096.515, "dur": 657.668, + "args": { + "External id": 87844,"Record function id": 0, "Ev Idx": 1827 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6300866039109.955, "dur": 629.688, + "args": { + "External id": 87845,"Record function id": 0, "Ev Idx": 1828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866039207.295, "dur": 8.420, + "args": { + "External id": 87846,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866039230.335, "dur": 21.620, + "args": { + "External id": 87847,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039234.484, "dur": 1.160, + "args": { + "External id": 87848,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039237.735, "dur": 
0.300, + "args": { + "External id": 87849,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039239.375, "dur": 0.280, + "args": { + "External id": 87850,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039240.695, "dur": 0.249, + "args": { + "External id": 87851,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039241.904, "dur": 0.460, + "args": { + "External id": 87852,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039243.435, "dur": 0.200, + "args": { + "External id": 87853,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866039244.555, "dur": 1.120, + "args": { + "External id": 87854,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039246.975, "dur": 0.180, + "args": { + "External id": 87855,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039248.055, "dur": 0.249, + "args": { + "External id": 87856,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866039262.775, "dur": 30.169, + "args": { + "External id": 87857,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 1840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866039344.674, "dur": 105.310, + "args": { + "External id": 87858,"Record function id": 0, 
"Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 1841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866039357.424, "dur": 8.040, + "args": { + "External id": 87859,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866039371.504, "dur": 11.220, + "args": { + "External id": 87860,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 1843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866039374.984, "dur": 7.330, + "args": { + "External id": 87861,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 1844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039379.344, "dur": 0.810, + "args": { + "External id": 87862,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 1845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866039390.644, "dur": 17.750, + "args": { + "External id": 87863,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039392.714, "dur": 0.490, + "args": { + "External id": 87864,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039394.614, "dur": 0.240, + "args": { + "External id": 87865,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039395.924, "dur": 0.280, + "args": { + "External id": 87866,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039397.114, "dur": 1.080, + "args": { + "External id": 87867,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", 
"295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039399.494, "dur": 0.520, + "args": { + "External id": 87868,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039400.924, "dur": 0.220, + "args": { + "External id": 87869,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039402.184, "dur": 0.380, + "args": { + "External id": 87870,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039403.484, "dur": 0.230, + "args": { + "External id": 87871,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866039404.694, "dur": 0.180, + "args": { + "External id": 
87872,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866039421.424, "dur": 19.930, + "args": { + "External id": 87873,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 1856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866039523.714, "dur": 132.940, + "args": { + "External id": 87874,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 1857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866039555.454, "dur": 96.700, + "args": { + "External id": 87875,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group 
Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 1858, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866039572.294, "dur": 73.929, + "args": { + "External id": 87876,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 1859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866039674.534, "dur": 4.080, + "args": { + "External id": 87877,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 1860, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866039768.263, "dur": 579.849, + "args": { + "External id": 87878,"Sequence number": 1770970, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1861 + } + }, + { + "ph": "f", "id": 232, "pid": 5714, "tid": 6744, "ts": 6300866039768.263, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866039854.973, "dur": 37.180, + "args": { + "External id": 87879,"Record function id": 0, "Concrete Inputs": ["", "", 
""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 1862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866039927.743, "dur": 29.260, + "args": { + "External id": 87880,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 1863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866039975.543, "dur": 39.300, + "args": { + "External id": 87881,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 1864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866040029.403, "dur": 27.419, + "args": { + "External id": 87882,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 1865 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866040069.173, "dur": 21.140, + "args": { + "External id": 87883,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 1866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866040100.793, "dur": 24.129, + "args": { + "External id": 87884,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 1867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866040136.402, "dur": 19.880, + "args": { + "External id": 87885,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 1868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866040183.392, "dur": 25.060, + "args": { + "External id": 87886,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], 
[1, 1, 768, 128], [], []], "Ev Idx": 1869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866040228.822, "dur": 16.940, + "args": { + "External id": 87887,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 1870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866040264.942, "dur": 22.730, + "args": { + "External id": 87888,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 1871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040376.412, "dur": 13.070, + "args": { + "External id": 87889,"Record function id": 0, "Ev Idx": 1872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, 
"tid": 6744, + "ts": 6300866040379.932, "dur": 8.080, + "args": { + "External id": 87890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866040382.702, "dur": 4.400, + "args": { + "External id": 87891,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866040383.912, "dur": 2.900, + "args": { + "External id": 87892,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040394.442, "dur": 6.660, + "args": { + "External id": 87893,"Record function id": 0, "Ev Idx": 1876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040396.382, "dur": 3.650, + "args": { + "External id": 87894,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866040397.212, "dur": 2.240, + "args": { + "External id": 87895,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866040397.922, "dur": 1.330, + "args": { + "External id": 87896,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040405.372, "dur": 5.640, + "args": { + "External id": 87897,"Record function id": 0, "Ev Idx": 1880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040407.032, "dur": 2.930, + "args": { + "External id": 87898,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866040407.702, "dur": 1.720, + "args": { + "External id": 87899,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866040408.252, "dur": 0.980, + "args": { + "External id": 87900,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040415.142, "dur": 5.660, + "args": { + "External id": 87901,"Record function id": 0, "Ev Idx": 1884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040416.932, "dur": 2.810, + "args": { + "External id": 87902,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1885 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866040417.602, "dur": 1.580, + "args": { + "External id": 87903,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866040417.992, "dur": 0.980, + "args": { + "External id": 87904,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866040425.242, "dur": 374.609, + "args": { + "External id": 87905,"Record function id": 0, "Sequence number": 1770969, "Fwd thread id": 1, "Ev Idx": 1888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866040426.772, "dur": 364.199, + "args": { + "External id": 87906,"Sequence number": 1770969, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1889 + } + }, + { + "ph": "f", "id": 233, "pid": 5714, "tid": 6744, "ts": 6300866040426.772, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866040504.821, "dur": 43.711, + "args": { + "External id": 87907,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866040562.492, "dur": 
20.220, + "args": { + "External id": 87908,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866040622.091, "dur": 141.340, + "args": { + "External id": 87909,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866040681.911, "dur": 8.110, + "args": { + "External id": 87910,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866040691.911, "dur": 5.190, + "args": { + "External id": 87911,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 
64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040814.971, "dur": 12.210, + "args": { + "External id": 87912,"Record function id": 0, "Ev Idx": 1895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866040818.171, "dur": 7.530, + "args": { + "External id": 87913,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866040820.711, "dur": 4.000, + "args": { + "External id": 87914,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866040821.851, "dur": 2.620, + "args": { + "External id": 87915,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866040831.941, "dur": 188.739, + "args": { + "External id": 87916,"Record function id": 0, "Sequence number": 1770968, "Fwd thread id": 1, "Ev Idx": 1899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866040833.781, "dur": 179.679, + "args": { + "External id": 87917,"Sequence number": 1770968, "Fwd thread id": 1, 
"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1900 + } + }, + { + "ph": "f", "id": 234, "pid": 5714, "tid": 6744, "ts": 6300866040833.781, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866040850.221, "dur": 38.140, + "args": { + "External id": 87918,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866040853.751, "dur": 7.060, + "args": { + "External id": 87919,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866040862.061, "dur": 25.660, + "args": { + "External id": 87920,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866040898.011, "dur": 7.960, + "args": { + "External id": 87921,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1904 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866040900.051, "dur": 5.450, + "args": { + "External id": 87922,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041033.540, "dur": 172.200, + "args": { + "External id": 87923,"Record function id": 0, "Sequence number": 1770967, "Fwd thread id": 1, "Ev Idx": 1906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041036.700, "dur": 161.160, + "args": { + "External id": 87924,"Sequence number": 1770967, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1907 + } + }, + { + "ph": "f", "id": 235, "pid": 5714, "tid": 6744, "ts": 6300866041036.700, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866041051.970, "dur": 33.940, + "args": { + "External id": 87925,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866041055.470, "dur": 7.200, + "args": { + "External id": 87926,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", 
"ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866041064.010, "dur": 21.210, + "args": { + "External id": 87927,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866041094.940, "dur": 7.700, + "args": { + "External id": 87928,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866041097.030, "dur": 5.160, + "args": { + "External id": 87929,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041218.680, "dur": 373.829, + "args": { + "External id": 87930,"Record function id": 0, "Sequence number": 1770966, "Fwd thread id": 1, "Ev Idx": 1913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041221.790, "dur": 359.279, + "args": { + "External id": 87931,"Sequence number": 
1770966, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 1914 + } + }, + { + "ph": "f", "id": 236, "pid": 5714, "tid": 6744, "ts": 6300866041221.790, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866041311.800, "dur": 44.340, + "args": { + "External id": 87932,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866041371.730, "dur": 26.440, + "args": { + "External id": 87933,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866041409.390, "dur": 24.069, + "args": { + "External id": 87934,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866041446.610, "dur": 19.929, + "args": { + "External id": 87935,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866041475.799, "dur": 15.740, + "args": { + "External id": 87936,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866041500.799, "dur": 15.240, + "args": { + "External id": 87937,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866041537.719, "dur": 24.600, + "args": { + "External id": 87938,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 1921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041606.749, "dur": 12.860, + "args": { + "External id": 87939,"Record function id": 0, "Ev Idx": 1922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", 
"pid": 5714, "tid": 6744, + "ts": 6300866041609.919, "dur": 8.150, + "args": { + "External id": 87940,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866041612.669, "dur": 4.610, + "args": { + "External id": 87941,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866041613.879, "dur": 3.140, + "args": { + "External id": 87942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041624.429, "dur": 6.090, + "args": { + "External id": 87943,"Record function id": 0, "Ev Idx": 1926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041626.349, "dur": 3.030, + "args": { + "External id": 87944,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866041627.179, "dur": 1.700, + "args": { + "External id": 87945,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866041627.829, "dur": 0.850, + "args": { + "External id": 
87946,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041634.619, "dur": 5.410, + "args": { + "External id": 87947,"Record function id": 0, "Ev Idx": 1930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041636.309, "dur": 2.680, + "args": { + "External id": 87948,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866041637.059, "dur": 1.420, + "args": { + "External id": 87949,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866041637.459, "dur": 0.840, + "args": { + "External id": 87950,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041644.629, "dur": 236.089, + "args": { + "External id": 87951,"Record function id": 0, "Sequence number": 1770965, "Fwd thread id": 1, "Ev Idx": 1934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041646.299, "dur": 202.430, + "args": { + "External id": 87952,"Sequence number": 1770965, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1935 + } + }, + { + "ph": "f", "id": 237, "pid": 5714, "tid": 6744, "ts": 6300866041646.299, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866041725.479, "dur": 26.500, + "args": { + "External id": 87953,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 1936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866041772.519, "dur": 16.400, + "args": { + "External id": 87954,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 1937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866041809.999, "dur": 18.239, + "args": { + "External id": 87955,"kernel_hash": 
"cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 1938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866041857.078, "dur": 18.351, + "args": { + "External id": 87956,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041893.578, "dur": 11.140, + "args": { + "External id": 87957,"Record function id": 0, "Ev Idx": 1940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866041896.649, "dur": 6.600, + "args": { + "External id": 87958,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866041898.838, "dur": 3.700, + "args": { + "External id": 87959,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1942 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866041899.878, "dur": 2.411, + "args": { + "External id": 87960,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041909.549, "dur": 909.547, + "args": { + "External id": 87961,"Record function id": 0, "Sequence number": 1770964, "Fwd thread id": 1, "Ev Idx": 1944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866041911.338, "dur": 901.308, + "args": { + "External id": 87962,"Sequence number": 1770964, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1945 + } + }, + { + "ph": "f", "id": 238, "pid": 5714, "tid": 6744, "ts": 6300866041911.338, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6300866041934.158, "dur": 31.180, + "args": { + "External id": 87963,"Record function id": 0, "Ev Idx": 1946 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6300866041974.718, "dur": 88.850, + "args": { + "External id": 87964,"Record function id": 0, "Ev Idx": 1947 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6300866042071.258, "dur": 735.258, + "args": { + "External id": 87965,"Record function id": 0, "Ev Idx": 1948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 
6300866042120.818, "dur": 10.370, + "args": { + "External id": 87966,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042140.898, "dur": 3.720, + "args": { + "External id": 87967,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 1950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866042158.578, "dur": 149.850, + "args": { + "External id": 87968,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 1951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866042171.598, "dur": 124.019, + "args": { + "External id": 87969,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 1952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866042228.798, "dur": 2.650, + "args": { + "External id": 87970,"Record function id": 
0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866042236.128, "dur": 35.120, + "args": { + "External id": 87971,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 1954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866042237.948, "dur": 32.880, + "args": { + "External id": 87972,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 1955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042240.588, "dur": 7.130, + "args": { + "External id": 87973,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866042249.048, "dur": 21.200, + "args": { + "External id": 87974,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 1957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 
6300866042383.007, "dur": 10.320, + "args": { + "External id": 87975,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 1958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866042385.767, "dur": 7.060, + "args": { + "External id": 87976,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866042420.867, "dur": 92.400, + "args": { + "External id": 87977,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 1960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866042438.537, "dur": 71.170, + "args": { + "External id": 87978,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, 
"Ev Idx": 1961, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866042451.297, "dur": 53.760, + "args": { + "External id": 87979,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 1962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866042527.047, "dur": 3.710, + "args": { + "External id": 87980,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 1963, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042586.467, "dur": 5.880, + "args": { + "External id": 87981,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042633.817, "dur": 1.270, + "args": { + "External id": 87982,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866042655.887, "dur": 0.880, + "args": { + "External id": 87983,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042672.377, "dur": 0.840, + "args": { + "External id": 87984,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042687.627, "dur": 1.920, + "args": { + "External id": 87985,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042703.556, "dur": 0.831, + "args": { + "External id": 87986,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042718.916, "dur": 0.940, + "args": { + "External id": 87987,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1970 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042734.827, "dur": 1.189, + "args": { + "External id": 87988,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866042749.607, "dur": 1.749, + "args": { + "External id": 87989,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866042832.046, "dur": 1458.437, + "args": { + "External id": 87990,"Record function id": 0, "Ev Idx": 1973 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6300866042846.836, "dur": 914.988, + "args": { + "External id": 87991,"Record function id": 0, "Ev Idx": 1974 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6300866042859.696, "dur": 268.540, + "args": { + "External id": 87992,"Record function id": 0, "Ev Idx": 1975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042952.046, "dur": 4.120, + "args": { + "External id": 87993,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 1976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, 
+ "ts": 6300866042960.766, "dur": 1.230, + "args": { + "External id": 87994,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 1977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042963.796, "dur": 0.870, + "args": { + "External id": 87995,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042966.306, "dur": 0.900, + "args": { + "External id": 87996,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042968.526, "dur": 0.760, + "args": { + "External id": 87997,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042970.896, "dur": 0.800, + "args": { + "External id": 87998,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042973.176, "dur": 2.260, + "args": { + "External id": 87999,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 1982 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042976.876, "dur": 0.830, + "args": { + "External id": 88000,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042979.256, "dur": 0.700, + "args": { + "External id": 88001,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866042981.496, "dur": 0.860, + "args": { + "External id": 88002,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866042998.796, "dur": 99.620, + "args": { + "External id": 88003,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 1986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866043013.256, "dur": 81.310, + "args": { + "External id": 88004,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 
147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 1987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866043026.576, "dur": 7.390, + "args": { + "External id": 88005,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866043036.816, "dur": 33.950, + "args": { + "External id": 88006,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 1989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866043038.416, "dur": 31.970, + "args": { + "External id": 88007,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 1990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043041.106, "dur": 7.390, + "args": { + "External id": 88008,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866043049.806, "dur": 19.970, + "args": { + "External id": 88009,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 1992 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.7", "pid": 5714, "tid": 6744, + "ts": 6300866043219.406, "dur": 533.898, + "args": { + "External id": 88010,"Record function id": 0, "Ev Idx": 1993 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6300866043235.625, "dur": 503.799, + "args": { + "External id": 88011,"Record function id": 0, "Ev Idx": 1994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866043312.105, "dur": 8.600, + "args": { + "External id": 88012,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866043332.715, "dur": 24.400, + "args": { + "External id": 88013,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043336.085, 
"dur": 1.230, + "args": { + "External id": 88014,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043339.835, "dur": 1.520, + "args": { + "External id": 88015,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043342.445, "dur": 0.310, + "args": { + "External id": 88016,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043344.115, "dur": 0.330, + "args": { + "External id": 88017,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043345.725, "dur": 0.280, + "args": { + "External id": 88018,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043347.505, "dur": 0.270, + "args": { + "External id": 88019,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043349.135, "dur": 0.270, + "args": { + "External id": 88020,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043350.765, "dur": 0.290, + "args": { + "External id": 88021,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043352.385, "dur": 0.300, + "args": { + "External id": 88022,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866043365.855, "dur": 23.800, + "args": { + "External id": 88023,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], 
[1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866043424.715, "dur": 102.090, + "args": { + "External id": 88024,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866043435.085, "dur": 8.310, + "args": { + "External id": 88025,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866043447.795, "dur": 10.990, + "args": { + "External id": 88026,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866043450.385, "dur": 7.950, + "args": { + "External id": 88027,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], 
"Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043455.495, "dur": 0.940, + "args": { + "External id": 88028,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866043467.005, "dur": 20.210, + "args": { + "External id": 88029,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043469.295, "dur": 0.380, + "args": { + "External id": 88030,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043471.075, "dur": 0.250, + "args": { + "External id": 88031,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043472.895, "dur": 0.270, + "args": { + "External id": 88032,"Record function id": 0, "Concrete Inputs": ["", "[147456]", 
"[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043474.465, "dur": 0.300, + "args": { + "External id": 88033,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043476.035, "dur": 0.290, + "args": { + "External id": 88034,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043477.575, "dur": 0.290, + "args": { + "External id": 88035,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043479.115, "dur": 1.540, + "args": { + "External id": 88036,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043481.965, "dur": 0.290, + "args": { + "External id": 
88037,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866043483.485, "dur": 0.310, + "args": { + "External id": 88038,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866043499.365, "dur": 18.940, + "args": { + "External id": 88039,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866043583.134, "dur": 86.271, + "args": { + "External id": 88040,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866043599.185, "dur": 67.009, + "args": { + "External id": 88041,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", 
"dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2024, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866043611.465, "dur": 50.329, + "args": { + "External id": 88042,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866043683.334, "dur": 3.460, + "args": { + "External id": 88043,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2026, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866043767.784, "dur": 510.329, + "args": { + "External id": 88044,"Sequence number": 1770963, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2027 + } + }, + { + "ph": "f", "id": 239, "pid": 5714, "tid": 6744, "ts": 6300866043767.784, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866043838.704, "dur": 34.780, + "args": { + "External id": 88045,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866043905.534, "dur": 25.560, + "args": { + "External id": 88046,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866043950.514, "dur": 36.940, + "args": { + "External id": 88047,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2030 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866044002.754, "dur": 28.690, + "args": { + "External id": 88048,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866044043.004, "dur": 22.309, + "args": { + "External id": 88049,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866044077.633, "dur": 25.411, + "args": { + "External id": 88050,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866044115.463, "dur": 21.030, + "args": { + "External id": 88051,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866044162.683, "dur": 23.090, + "args": { + "External id": 88052,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], 
"kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866044203.603, "dur": 14.880, + "args": { + "External id": 88053,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866044234.333, "dur": 18.250, + "args": { + "External id": 88054,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 
768], [8, 2048, 1], [], []], "Ev Idx": 2037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044316.003, "dur": 15.020, + "args": { + "External id": 88055,"Record function id": 0, "Ev Idx": 2038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044319.973, "dur": 9.300, + "args": { + "External id": 88056,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866044322.893, "dur": 5.280, + "args": { + "External id": 88057,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866044324.043, "dur": 3.840, + "args": { + "External id": 88058,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044336.473, "dur": 6.780, + "args": { + "External id": 88059,"Record function id": 0, "Ev Idx": 2042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044338.653, "dur": 3.410, + "args": { + "External id": 88060,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866044339.553, 
"dur": 1.930, + "args": { + "External id": 88061,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866044340.193, "dur": 1.090, + "args": { + "External id": 88062,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044347.623, "dur": 75.700, + "args": { + "External id": 88063,"Record function id": 0, "Ev Idx": 2046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044349.543, "dur": 72.120, + "args": { + "External id": 88064,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866044418.413, "dur": 2.450, + "args": { + "External id": 88065,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866044419.283, "dur": 1.310, + "args": { + "External id": 88066,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044428.833, "dur": 6.410, + "args": { + "External 
id": 88067,"Record function id": 0, "Ev Idx": 2050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044430.923, "dur": 3.140, + "args": { + "External id": 88068,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866044431.993, "dur": 1.520, + "args": { + "External id": 88069,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866044432.443, "dur": 0.880, + "args": { + "External id": 88070,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866044439.833, "dur": 380.309, + "args": { + "External id": 88071,"Record function id": 0, "Sequence number": 1770962, "Fwd thread id": 1, "Ev Idx": 2054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866044441.663, "dur": 368.719, + "args": { + "External id": 88072,"Sequence number": 1770962, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2055 + } + }, + { + "ph": "f", "id": 240, "pid": 5714, "tid": 6744, "ts": 6300866044441.663, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866044515.003, "dur": 
46.059, + "args": { + "External id": 88073,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866044576.292, "dur": 21.580, + "args": { + "External id": 88074,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866044634.022, "dur": 147.040, + "args": { + "External id": 88075,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866044696.632, "dur": 7.890, + "args": { + "External id": 88076,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", 
"6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866044706.762, "dur": 4.810, + "args": { + "External id": 88077,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044835.582, "dur": 12.970, + "args": { + "External id": 88078,"Record function id": 0, "Ev Idx": 2061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866044839.052, "dur": 7.990, + "args": { + "External id": 88079,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866044841.882, "dur": 4.170, + "args": { + "External id": 88080,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866044843.052, "dur": 2.760, + "args": { + "External id": 88081,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, 
+ "ts": 6300866044853.212, "dur": 202.979, + "args": { + "External id": 88082,"Record function id": 0, "Sequence number": 1770961, "Fwd thread id": 1, "Ev Idx": 2065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866044855.082, "dur": 193.529, + "args": { + "External id": 88083,"Sequence number": 1770961, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2066 + } + }, + { + "ph": "f", "id": 241, "pid": 5714, "tid": 6744, "ts": 6300866044855.082, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866044873.072, "dur": 40.450, + "args": { + "External id": 88084,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866044876.692, "dur": 7.540, + "args": { + "External id": 88085,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866044885.622, "dur": 27.300, + "args": { + "External id": 88086,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2069 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866044923.102, "dur": 8.449, + "args": { + "External id": 88087,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866044925.322, "dur": 5.760, + "args": { + "External id": 88088,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866045069.941, "dur": 187.310, + "args": { + "External id": 88089,"Record function id": 0, "Sequence number": 1770960, "Fwd thread id": 1, "Ev Idx": 2072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866045073.381, "dur": 175.300, + "args": { + "External id": 88090,"Sequence number": 1770960, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2073 + } + }, + { + "ph": "f", "id": 242, "pid": 5714, "tid": 6744, "ts": 6300866045073.381, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866045089.851, "dur": 37.560, + "args": { + "External id": 88091,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input 
Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866045093.481, "dur": 8.450, + "args": { + "External id": 88092,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866045103.391, "dur": 23.250, + "args": { + "External id": 88093,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866045137.081, "dur": 7.950, + "args": { + "External id": 88094,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866045139.241, "dur": 5.340, + "args": { + "External id": 88095,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866045270.811, "dur": 396.259, + "args": { + "External id": 88096,"Record function id": 0, "Sequence number": 1770959, "Fwd thread id": 1, "Ev Idx": 2079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866045274.751, "dur": 378.499, + "args": { + "External id": 88097,"Sequence number": 1770959, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2080 + } + }, + { + "ph": "f", "id": 243, "pid": 5714, "tid": 6744, "ts": 6300866045274.751, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866045363.430, "dur": 47.931, + "args": { + "External id": 88098,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866045426.290, "dur": 29.140, + "args": { + "External id": 88099,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866045469.970, "dur": 26.890, + "args": { + "External id": 88100,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866045511.690, "dur": 21.940, + "args": { + "External id": 88101,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866045543.730, "dur": 16.880, + "args": { + "External id": 88102,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866045569.650, "dur": 16.840, + "args": { + "External id": 88103,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866045610.180, "dur": 22.810, + "args": { + "External id": 88104,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": 
[[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045682.700, "dur": 14.600, + "args": { + "External id": 88105,"Record function id": 0, "Ev Idx": 2088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045686.250, "dur": 9.620, + "args": { + "External id": 88106,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866045689.140, "dur": 5.850, + "args": { + "External id": 88107,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866045690.520, "dur": 4.210, + "args": { + "External id": 88108,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045702.650, "dur": 6.390, + "args": { + "External id": 88109,"Record function id": 0, "Ev Idx": 2092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045704.660, "dur": 3.170, + "args": { + "External id": 88110,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 
5714, "tid": 6744, + "ts": 6300866045705.520, "dur": 1.790, + "args": { + "External id": 88111,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866045706.040, "dur": 1.050, + "args": { + "External id": 88112,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045713.550, "dur": 5.870, + "args": { + "External id": 88113,"Record function id": 0, "Ev Idx": 2096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045715.370, "dur": 2.890, + "args": { + "External id": 88114,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866045716.160, "dur": 1.570, + "args": { + "External id": 88115,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866045716.590, "dur": 0.920, + "args": { + "External id": 88116,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866045723.910, "dur": 
243.409, + "args": { + "External id": 88117,"Record function id": 0, "Sequence number": 1770958, "Fwd thread id": 1, "Ev Idx": 2100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866045725.750, "dur": 205.289, + "args": { + "External id": 88118,"Sequence number": 1770958, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2101 + } + }, + { + "ph": "f", "id": 244, "pid": 5714, "tid": 6744, "ts": 6300866045725.750, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866045804.300, "dur": 26.520, + "args": { + "External id": 88119,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866045852.269, "dur": 16.171, + "args": { + "External id": 88120,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", 
"c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866045889.609, "dur": 18.790, + "args": { + "External id": 88121,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866045941.089, "dur": 20.210, + "args": { + "External id": 88122,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045981.539, "dur": 13.830, + "args": { + "External id": 88123,"Record function id": 0, "Ev Idx": 2106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866045985.019, "dur": 8.710, + "args": { + "External id": 88124,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], 
"Input Dims": [[768]], "Ev Idx": 2107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866045987.559, "dur": 5.350, + "args": { + "External id": 88125,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866045988.639, "dur": 4.020, + "args": { + "External id": 88126,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866046000.629, "dur": 868.518, + "args": { + "External id": 88127,"Record function id": 0, "Sequence number": 1770957, "Fwd thread id": 1, "Ev Idx": 2110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866046002.579, "dur": 860.008, + "args": { + "External id": 88128,"Sequence number": 1770957, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2111 + } + }, + { + "ph": "f", "id": 245, "pid": 5714, "tid": 6744, "ts": 6300866046002.579, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6300866046023.689, "dur": 28.650, + "args": { + "External id": 88129,"Record function id": 0, "Ev Idx": 2112 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6300866046062.249, "dur": 58.850, + "args": { + "External id": 88130,"Record 
function id": 0, "Ev Idx": 2113 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6300866046128.949, "dur": 727.638, + "args": { + "External id": 88131,"Record function id": 0, "Ev Idx": 2114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866046202.759, "dur": 8.450, + "args": { + "External id": 88132,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866046220.568, "dur": 3.720, + "args": { + "External id": 88133,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866046237.348, "dur": 147.080, + "args": { + "External id": 88134,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866046248.139, "dur": 132.079, + "args": { + "External id": 88135,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 
1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866046310.599, "dur": 7.349, + "args": { + "External id": 88136,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866046321.898, "dur": 35.010, + "args": { + "External id": 88137,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866046323.588, "dur": 32.930, + "args": { + "External id": 88138,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046326.308, "dur": 7.240, + "args": { + "External id": 88139,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + 
"ts": 6300866046334.968, "dur": 20.990, + "args": { + "External id": 88140,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866046456.858, "dur": 9.500, + "args": { + "External id": 88141,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866046458.888, "dur": 6.900, + "args": { + "External id": 88142,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866046485.548, "dur": 91.800, + "args": { + "External id": 88143,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866046502.718, "dur": 71.130, + "args": { + "External id": 88144,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], 
"Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2127, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866046516.268, "dur": 52.910, + "args": { + "External id": 88145,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866046591.118, "dur": 3.390, + "args": { + "External id": 88146,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2129, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046650.688, "dur": 4.270, + "args": { + "External id": 88147,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866046687.638, "dur": 2.289, + "args": { + "External id": 88148,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046708.667, "dur": 0.920, + "args": { + "External id": 88149,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046725.518, "dur": 0.900, + "args": { + "External id": 88150,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046741.078, "dur": 0.880, + "args": { + "External id": 88151,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046756.047, "dur": 1.850, + "args": { + "External id": 88152,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046772.317, "dur": 0.990, + "args": { + "External id": 88153,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046787.047, "dur": 1.150, + "args": { + "External id": 88154,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866046801.597, "dur": 0.820, + "args": { + "External id": 88155,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866046882.897, "dur": 1409.107, + "args": { + "External id": 88156,"Record function id": 0, "Ev Idx": 2139 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6300866046898.127, "dur": 883.028, + "args": { + "External id": 88157,"Record function id": 0, "Ev Idx": 2140 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6300866046910.837, "dur": 260.640, + "args": { + "External id": 88158,"Record function id": 0, "Ev Idx": 2141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047002.537, "dur": 5.280, + "args": { + "External id": 88159,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047012.557, "dur": 1.300, + "args": { + "External id": 88160,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047015.817, "dur": 0.590, + "args": { + "External id": 88161,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047018.177, "dur": 0.780, + "args": { + "External id": 88162,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047020.477, "dur": 0.840, + "args": { + "External id": 88163,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047023.237, "dur": 0.840, + "args": { + "External id": 88164,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input 
Dims": [[589824], []], "Ev Idx": 2147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047025.557, "dur": 1.020, + "args": { + "External id": 88165,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047028.197, "dur": 0.770, + "args": { + "External id": 88166,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047030.327, "dur": 1.520, + "args": { + "External id": 88167,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866047034.037, "dur": 0.720, + "args": { + "External id": 88168,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866047049.177, "dur": 92.760, + "args": { + "External id": 88169,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], 
[], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866047061.607, "dur": 76.379, + "args": { + "External id": 88170,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866047073.437, "dur": 6.470, + "args": { + "External id": 88171,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866047082.107, "dur": 33.219, + "args": { + "External id": 88172,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866047083.507, "dur": 31.390, + "args": { + "External id": 88173,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], 
"Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047086.077, "dur": 7.440, + "args": { + "External id": 88174,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866047094.737, "dur": 19.600, + "args": { + "External id": 88175,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2158 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.6", "pid": 5714, "tid": 6744, + "ts": 6300866047261.066, "dur": 511.439, + "args": { + "External id": 88176,"Record function id": 0, "Ev Idx": 2159 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6300866047276.306, "dur": 482.099, + "args": { + "External id": 88177,"Record function id": 0, "Ev Idx": 2160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866047352.496, "dur": 8.300, + "args": { + "External id": 88178,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866047372.536, "dur": 20.140, + "args": { + "External id": 
88179,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047375.616, "dur": 1.220, + "args": { + "External id": 88180,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047378.826, "dur": 0.310, + "args": { + "External id": 88181,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047380.406, "dur": 0.250, + "args": { + "External id": 88182,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047381.786, "dur": 1.020, + "args": { + "External id": 88183,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 6744, + "ts": 6300866047383.826, "dur": 0.330, + "args": { + "External id": 88184,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047385.316, "dur": 0.200, + "args": { + "External id": 88185,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047386.666, "dur": 0.260, + "args": { + "External id": 88186,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047388.016, "dur": 0.250, + "args": { + "External id": 88187,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047389.256, "dur": 0.240, + "args": { + "External id": 88188,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2171 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866047400.886, "dur": 23.670, + "args": { + "External id": 88189,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866047456.296, "dur": 92.240, + "args": { + "External id": 88190,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866047466.226, "dur": 7.200, + "args": { + "External id": 88191,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866047477.846, "dur": 8.640, + "args": { + "External id": 88192,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866047480.366, "dur": 5.710, + "args": { + "External id": 88193,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047483.266, "dur": 0.810, + "args": { + "External id": 88194,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866047494.536, "dur": 16.870, + "args": { + "External id": 88195,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047496.556, "dur": 1.280, + "args": { + "External id": 88196,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047499.006, "dur": 0.250, + "args": { + "External id": 88197,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047500.306, "dur": 0.270, + "args": { + "External id": 88198,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047501.676, "dur": 0.230, + "args": { + "External id": 88199,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047503.116, "dur": 0.190, + "args": { + "External id": 88200,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047504.266, "dur": 0.250, + "args": { + "External id": 88201,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047505.516, "dur": 0.260, + "args": { + "External id": 88202,"Record function id": 0, 
"Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047506.836, "dur": 0.240, + "args": { + "External id": 88203,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866047508.176, "dur": 1.040, + "args": { + "External id": 88204,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866047522.216, "dur": 18.280, + "args": { + "External id": 88205,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866047604.936, "dur": 84.259, + "args": { + "External id": 88206,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input 
Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866047620.265, "dur": 65.790, + "args": { + "External id": 88207,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2190, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866047632.345, "dur": 49.180, + "args": { + "External id": 88208,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866047701.625, "dur": 3.460, + "args": { + "External id": 88209,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2192, "In msg nelems": 0, "Rank": 0, 
"In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866047786.735, "dur": 492.959, + "args": { + "External id": 88210,"Sequence number": 1770956, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2193 + } + }, + { + "ph": "f", "id": 246, "pid": 5714, "tid": 6744, "ts": 6300866047786.735, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866047856.985, "dur": 34.850, + "args": { + "External id": 88211,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866047923.445, "dur": 24.860, + "args": { + "External id": 88212,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 
6300866047966.725, "dur": 37.170, + "args": { + "External id": 88213,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866048017.975, "dur": 27.429, + "args": { + "External id": 88214,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866048056.284, "dur": 21.620, + "args": { + "External id": 88215,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866048088.644, "dur": 24.840, + "args": { + "External id": 88216,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866048125.634, "dur": 20.310, + "args": { + "External id": 88217,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2200 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866048170.214, "dur": 22.180, + "args": { + "External id": 88218,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866048209.214, "dur": 14.630, + "args": { + "External id": 88219,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866048239.354, "dur": 17.260, + "args": { + "External id": 88220,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": 
"/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048316.534, "dur": 13.130, + "args": { + "External id": 88221,"Record function id": 0, "Ev Idx": 2204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048320.344, "dur": 7.800, + "args": { + "External id": 88222,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866048323.074, "dur": 4.170, + "args": { + "External id": 88223,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866048324.104, "dur": 2.880, + "args": { + "External id": 88224,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048334.654, "dur": 7.140, + "args": { + "External id": 88225,"Record function id": 0, "Ev Idx": 2208 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048336.474, "dur": 4.170, + "args": { + "External id": 88226,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866048337.274, "dur": 2.860, + "args": { + "External id": 88227,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866048337.814, "dur": 2.100, + "args": { + "External id": 88228,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048346.174, "dur": 5.610, + "args": { + "External id": 88229,"Record function id": 0, "Ev Idx": 2212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048348.044, "dur": 2.700, + "args": { + "External id": 88230,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866048348.734, "dur": 1.440, + "args": { + "External id": 88231,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 
6300866048349.214, "dur": 0.750, + "args": { + "External id": 88232,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048355.804, "dur": 5.240, + "args": { + "External id": 88233,"Record function id": 0, "Ev Idx": 2216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048357.494, "dur": 2.460, + "args": { + "External id": 88234,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866048358.224, "dur": 1.270, + "args": { + "External id": 88235,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866048358.624, "dur": 0.680, + "args": { + "External id": 88236,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866048365.094, "dur": 347.919, + "args": { + "External id": 88237,"Record function id": 0, "Sequence number": 1770955, "Fwd thread id": 1, "Ev Idx": 2220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866048366.774, "dur": 337.059, + "args": { + "External id": 88238,"Sequence number": 1770955, 
"Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2221 + } + }, + { + "ph": "f", "id": 247, "pid": 5714, "tid": 6744, "ts": 6300866048366.774, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866048433.694, "dur": 43.049, + "args": { + "External id": 88239,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866048493.114, "dur": 20.560, + "args": { + "External id": 88240,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866048540.303, "dur": 137.130, + "args": { + "External id": 88241,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 
2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866048597.573, "dur": 7.080, + "args": { + "External id": 88242,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866048606.483, "dur": 4.800, + "args": { + "External id": 88243,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048727.803, "dur": 12.250, + "args": { + "External id": 88244,"Record function id": 0, "Ev Idx": 2227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866048731.113, "dur": 7.530, + "args": { + "External id": 88245,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866048733.723, "dur": 3.980, + "args": { + "External id": 88246,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2229 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866048734.833, "dur": 2.620, + "args": { + "External id": 88247,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866048744.673, "dur": 188.009, + "args": { + "External id": 88248,"Record function id": 0, "Sequence number": 1770954, "Fwd thread id": 1, "Ev Idx": 2231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866048746.413, "dur": 179.449, + "args": { + "External id": 88249,"Sequence number": 1770954, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2232 + } + }, + { + "ph": "f", "id": 248, "pid": 5714, "tid": 6744, "ts": 6300866048746.413, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866048762.733, "dur": 38.440, + "args": { + "External id": 88250,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866048766.193, "dur": 7.110, + "args": { + "External id": 88251,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2234 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866048774.593, "dur": 25.880, + "args": { + "External id": 88252,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866048810.583, "dur": 8.090, + "args": { + "External id": 88253,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866048812.933, "dur": 5.240, + "args": { + "External id": 88254,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866048945.382, "dur": 173.240, + "args": { + "External id": 88255,"Record function id": 0, "Sequence number": 1770953, "Fwd thread id": 1, "Ev Idx": 2238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866048948.382, "dur": 162.500, + "args": { + "External id": 88256,"Sequence number": 1770953, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input 
Dims": [[8, 2048, 12, 64]], "Ev Idx": 2239 + } + }, + { + "ph": "f", "id": 249, "pid": 5714, "tid": 6744, "ts": 6300866048948.382, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866048963.032, "dur": 33.940, + "args": { + "External id": 88257,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866048966.472, "dur": 6.910, + "args": { + "External id": 88258,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866048974.632, "dur": 21.760, + "args": { + "External id": 88259,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866049006.472, "dur": 8.460, + "args": { + "External id": 88260,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866049008.382, "dur": 
6.100, + "args": { + "External id": 88261,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866049131.022, "dur": 366.289, + "args": { + "External id": 88262,"Record function id": 0, "Sequence number": 1770952, "Fwd thread id": 1, "Ev Idx": 2245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866049134.402, "dur": 351.139, + "args": { + "External id": 88263,"Sequence number": 1770952, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2246 + } + }, + { + "ph": "f", "id": 250, "pid": 5714, "tid": 6744, "ts": 6300866049134.402, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866049203.842, "dur": 43.300, + "args": { + "External id": 88264,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866049260.542, "dur": 25.980, + "args": { + "External id": 88265,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866049307.161, "dur": 27.371, + "args": { + "External id": 88266,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866049349.421, "dur": 19.931, + "args": { + "External id": 88267,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866049377.981, "dur": 15.780, + "args": { + "External id": 88268,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866049401.691, "dur": 15.190, + "args": { + "External id": 88269,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866049446.061, "dur": 20.790, + "args": { + "External id": 88270,"kernel_hash": 
"czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049511.611, "dur": 12.370, + "args": { + "External id": 88271,"Record function id": 0, "Ev Idx": 2254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049514.721, "dur": 7.800, + "args": { + "External id": 88272,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866049517.371, "dur": 4.310, + "args": { + "External id": 88273,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866049518.591, "dur": 2.830, + "args": { + "External id": 88274,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049528.891, "dur": 7.070, + "args": { + "External id": 88275,"Record 
function id": 0, "Ev Idx": 2258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049530.741, "dur": 4.130, + "args": { + "External id": 88276,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866049531.521, "dur": 2.830, + "args": { + "External id": 88277,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866049532.051, "dur": 2.090, + "args": { + "External id": 88278,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049540.061, "dur": 5.310, + "args": { + "External id": 88279,"Record function id": 0, "Ev Idx": 2262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049541.701, "dur": 2.590, + "args": { + "External id": 88280,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866049542.411, "dur": 1.350, + "args": { + "External id": 88281,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"detach", "pid": 5714, "tid": 6744, + "ts": 6300866049542.801, "dur": 0.770, + "args": { + "External id": 88282,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866049549.571, "dur": 214.820, + "args": { + "External id": 88283,"Record function id": 0, "Sequence number": 1770951, "Fwd thread id": 1, "Ev Idx": 2266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866049551.261, "dur": 180.600, + "args": { + "External id": 88284,"Sequence number": 1770951, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2267 + } + }, + { + "ph": "f", "id": 251, "pid": 5714, "tid": 6744, "ts": 6300866049551.261, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866049621.441, "dur": 23.640, + "args": { + "External id": 88285,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", 
"pid": 5714, "tid": 6744, + "ts": 6300866049662.851, "dur": 13.970, + "args": { + "External id": 88286,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866049695.171, "dur": 16.970, + "args": { + "External id": 88287,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866049740.720, "dur": 18.251, + "args": { + "External id": 88288,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 
5714, "tid": 6744, + "ts": 6300866049776.960, "dur": 10.931, + "args": { + "External id": 88289,"Record function id": 0, "Ev Idx": 2272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866049780.000, "dur": 6.451, + "args": { + "External id": 88290,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866049782.260, "dur": 3.500, + "args": { + "External id": 88291,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866049783.111, "dur": 2.420, + "args": { + "External id": 88292,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866049792.440, "dur": 803.029, + "args": { + "External id": 88293,"Record function id": 0, "Sequence number": 1770950, "Fwd thread id": 1, "Ev Idx": 2276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866049794.111, "dur": 794.818, + "args": { + "External id": 88294,"Sequence number": 1770950, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2277 + } + }, + { + "ph": "f", "id": 252, "pid": 5714, "tid": 6744, "ts": 6300866049794.111, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6300866049813.440, "dur": 24.870, + "args": { + "External id": 88295,"Record function id": 0, "Ev Idx": 2278 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6300866049846.920, "dur": 57.200, + "args": { + "External id": 88296,"Record function id": 0, "Ev Idx": 2279 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6300866049911.820, "dur": 671.379, + "args": { + "External id": 88297,"Record function id": 0, "Ev Idx": 2280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866049984.530, "dur": 8.300, + "args": { + "External id": 88298,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050002.070, "dur": 4.380, + "args": { + "External id": 88299,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866050019.070, "dur": 104.780, + "args": { + "External id": 88300,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 
768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866050029.360, "dur": 90.600, + "args": { + "External id": 88301,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866050054.490, "dur": 6.510, + "args": { + "External id": 88302,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866050065.250, "dur": 32.790, + "args": { + "External id": 88303,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866050066.750, "dur": 30.940, + "args": { + "External id": 88304,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 
6744, + "ts": 6300866050069.090, "dur": 6.830, + "args": { + "External id": 88305,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866050077.220, "dur": 19.980, + "args": { + "External id": 88306,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866050193.190, "dur": 8.880, + "args": { + "External id": 88307,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866050194.970, "dur": 6.540, + "args": { + "External id": 88308,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866050219.050, "dur": 94.749, + "args": { + "External id": 88309,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2292 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866050234.150, "dur": 76.099, + "args": { + "External id": 88310,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2293, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866050246.379, "dur": 58.700, + "args": { + "External id": 88311,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866050326.779, "dur": 3.750, + "args": { + "External id": 88312,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2295, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866050385.439, "dur": 4.180, + "args": { + "External id": 88313,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050421.549, "dur": 1.190, + "args": { + "External id": 88314,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050440.439, "dur": 2.010, + "args": { + "External id": 88315,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050457.259, "dur": 0.840, + "args": { + "External id": 88316,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050471.769, "dur": 0.830, + "args": { + "External id": 88317,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2300 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050485.589, "dur": 0.800, + "args": { + "External id": 88318,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050499.529, "dur": 2.030, + "args": { + "External id": 88319,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050514.479, "dur": 1.110, + "args": { + "External id": 88320,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050529.519, "dur": 0.870, + "args": { + "External id": 88321,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866050608.659, "dur": 1423.436, + "args": { + "External id": 88322,"Record function id": 0, "Ev Idx": 2305 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.6)", "pid": 5714, "tid": 
6744, + "ts": 6300866050624.009, "dur": 894.747, + "args": { + "External id": 88323,"Record function id": 0, "Ev Idx": 2306 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6300866050635.929, "dur": 253.089, + "args": { + "External id": 88324,"Record function id": 0, "Ev Idx": 2307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050723.128, "dur": 4.000, + "args": { + "External id": 88325,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050731.798, "dur": 1.050, + "args": { + "External id": 88326,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050734.718, "dur": 1.390, + "args": { + "External id": 88327,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050737.828, "dur": 0.630, + "args": { + "External id": 88328,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050740.008, "dur": 0.660, + "args": { + "External id": 88329,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050742.558, "dur": 0.730, + "args": { + "External id": 88330,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050744.888, "dur": 0.980, + "args": { + "External id": 88331,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050747.498, "dur": 0.750, + "args": { + "External id": 88332,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050749.628, "dur": 0.600, + "args": { + "External id": 88333,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866050751.838, "dur": 0.600, + "args": { + "External id": 88334,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866050766.248, "dur": 
93.550, + "args": { + "External id": 88335,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866050778.138, "dur": 77.770, + "args": { + "External id": 88336,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866050789.618, "dur": 7.020, + "args": { + "External id": 88337,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866050799.198, "dur": 34.230, + "args": { + "External id": 88338,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], 
"Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866050800.668, "dur": 32.330, + "args": { + "External id": 88339,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866050803.088, "dur": 7.540, + "args": { + "External id": 88340,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866050811.868, "dur": 20.500, + "args": { + "External id": 88341,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2324 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.5", "pid": 5714, "tid": 6744, + "ts": 6300866050976.898, "dur": 533.909, + "args": { + "External id": 88342,"Record function id": 0, "Ev Idx": 2325 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6300866050992.328, "dur": 504.579, + "args": { + "External id": 88343,"Record function id": 0, "Ev Idx": 2326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866051056.358, "dur": 8.159, + "args": { + "External id": 
88344,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866051075.928, "dur": 19.929, + "args": { + "External id": 88345,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051078.937, "dur": 1.260, + "args": { + "External id": 88346,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051082.157, "dur": 0.231, + "args": { + "External id": 88347,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051083.448, "dur": 0.249, + "args": { + "External id": 88348,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866051084.877, "dur": 0.411, + "args": { + "External id": 88349,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051086.228, "dur": 0.240, + "args": { + "External id": 88350,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051087.628, "dur": 0.909, + "args": { + "External id": 88351,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051089.608, "dur": 0.300, + "args": { + "External id": 88352,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051091.037, "dur": 0.280, + "args": { + "External id": 88353,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2336 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051092.457, "dur": 0.311, + "args": { + "External id": 88354,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866051103.968, "dur": 22.840, + "args": { + "External id": 88355,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866051157.707, "dur": 92.560, + "args": { + "External id": 88356,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866051168.127, "dur": 7.200, + "args": { + "External id": 88357,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866051179.627, "dur": 8.620, + "args": { + "External id": 88358,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866051182.157, "dur": 5.660, + "args": { + "External id": 88359,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051184.997, "dur": 0.900, + "args": { + "External id": 88360,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866051195.707, "dur": 16.780, + "args": { + "External id": 88361,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051197.487, "dur": 0.290, + "args": { + "External id": 88362,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051199.067, "dur": 0.290, + "args": { + "External id": 88363,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051200.447, "dur": 1.180, + "args": { + "External id": 88364,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051202.927, "dur": 0.260, + "args": { + "External id": 88365,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051204.317, "dur": 0.240, + "args": { + "External id": 88366,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051205.587, "dur": 0.340, + "args": { + "External id": 88367,"Record function 
id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051206.907, "dur": 0.290, + "args": { + "External id": 88368,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051208.257, "dur": 0.360, + "args": { + "External id": 88369,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866051210.007, "dur": 0.240, + "args": { + "External id": 88370,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866051223.917, "dur": 18.240, + "args": { + "External id": 88371,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], 
[147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866051339.297, "dur": 87.640, + "args": { + "External id": 88372,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866051355.797, "dur": 67.840, + "args": { + "External id": 88373,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2356, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866051368.237, "dur": 51.030, + "args": { + "External id": 88374,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866051440.437, "dur": 3.490, + "args": { + "External id": 88375,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", 
"dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2358, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866051524.447, "dur": 496.019, + "args": { + "External id": 88376,"Sequence number": 1770949, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2359 + } + }, + { + "ph": "f", "id": 253, "pid": 5714, "tid": 6744, "ts": 6300866051524.447, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866051595.406, "dur": 34.730, + "args": { + "External id": 88377,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866051662.186, "dur": 25.210, + "args": { + "External id": 88378,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866051704.726, "dur": 37.460, + "args": { + "External id": 88379,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866051756.926, "dur": 27.990, + "args": { + "External id": 88380,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866051795.936, "dur": 21.490, + "args": { + "External id": 88381,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866051828.746, "dur": 24.660, + "args": { + "External id": 88382,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev 
Idx": 2365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866051864.106, "dur": 20.270, + "args": { + "External id": 88383,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866051909.656, "dur": 21.690, + "args": { + "External id": 88384,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866051950.386, "dur": 14.329, + "args": { + "External id": 88385,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2368 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866051980.126, "dur": 17.429, + "args": { + "External id": 88386,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052047.425, "dur": 12.230, + "args": { + "External id": 88387,"Record function id": 0, "Ev Idx": 2370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052050.705, "dur": 7.510, + "args": { + "External id": 88388,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866052053.435, "dur": 3.930, + "args": { + "External id": 88389,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866052054.445, "dur": 2.670, + "args": { + "External id": 88390,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052064.775, "dur": 5.810, + "args": { + "External id": 88391,"Record function id": 0, "Ev Idx": 2374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052066.615, "dur": 2.790, + "args": { + "External id": 88392,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866052067.475, "dur": 1.440, + "args": { + "External id": 88393,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866052067.895, "dur": 0.810, + "args": { + "External id": 88394,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052074.845, "dur": 5.510, + "args": { + "External id": 88395,"Record function id": 0, "Ev Idx": 2378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052076.675, "dur": 2.620, + "args": { + "External id": 88396,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866052077.525, "dur": 1.300, + "args": { + "External id": 88397,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866052078.015, "dur": 0.620, + "args": { + "External id": 88398,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052084.555, "dur": 6.370, + "args": { + "External id": 88399,"Record function id": 0, "Ev Idx": 2382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052086.245, "dur": 3.620, + "args": { + "External id": 88400,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866052087.055, "dur": 2.330, + "args": { + "External id": 88401,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866052087.645, "dur": 1.520, + "args": { + "External id": 88402,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 
6300866052095.255, "dur": 353.770, + "args": { + "External id": 88403,"Record function id": 0, "Sequence number": 1770948, "Fwd thread id": 1, "Ev Idx": 2386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052096.785, "dur": 342.680, + "args": { + "External id": 88404,"Sequence number": 1770948, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2387 + } + }, + { + "ph": "f", "id": 254, "pid": 5714, "tid": 6744, "ts": 6300866052096.785, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866052163.665, "dur": 42.470, + "args": { + "External id": 88405,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866052220.985, "dur": 20.120, + "args": { + "External id": 88406,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866052267.875, "dur": 144.650, + "args": { + "External id": 88407,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866052334.325, "dur": 7.220, + "args": { + "External id": 88408,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866052343.455, "dur": 3.720, + "args": { + "External id": 88409,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052463.994, "dur": 12.460, + "args": { + "External id": 88410,"Record function id": 0, "Ev Idx": 2393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866052467.645, "dur": 7.389, + "args": { + "External id": 88411,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[768, 768]], "Ev Idx": 2394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866052470.294, "dur": 3.791, + "args": { + "External id": 88412,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866052471.234, "dur": 2.620, + "args": { + "External id": 88413,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052480.984, "dur": 186.160, + "args": { + "External id": 88414,"Record function id": 0, "Sequence number": 1770947, "Fwd thread id": 1, "Ev Idx": 2397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052482.804, "dur": 177.580, + "args": { + "External id": 88415,"Sequence number": 1770947, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2398 + } + }, + { + "ph": "f", "id": 255, "pid": 5714, "tid": 6744, "ts": 6300866052482.804, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866052498.794, "dur": 38.290, + "args": { + "External id": 88416,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + 
"ts": 6300866052502.124, "dur": 6.960, + "args": { + "External id": 88417,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866052510.374, "dur": 26.000, + "args": { + "External id": 88418,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866052546.884, "dur": 7.460, + "args": { + "External id": 88419,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866052548.684, "dur": 5.200, + "args": { + "External id": 88420,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052680.064, "dur": 169.689, + "args": { + "External id": 88421,"Record function id": 0, "Sequence number": 1770946, "Fwd thread id": 1, "Ev 
Idx": 2404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052682.784, "dur": 159.240, + "args": { + "External id": 88422,"Sequence number": 1770946, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2405 + } + }, + { + "ph": "f", "id": 256, "pid": 5714, "tid": 6744, "ts": 6300866052682.784, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866052697.954, "dur": 33.460, + "args": { + "External id": 88423,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866052701.244, "dur": 6.810, + "args": { + "External id": 88424,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866052709.264, "dur": 21.490, + "args": { + "External id": 88425,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866052740.414, "dur": 6.930, + "args": { + "External id": 88426,"Record function 
id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866052742.164, "dur": 4.750, + "args": { + "External id": 88427,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052862.484, "dur": 344.619, + "args": { + "External id": 88428,"Record function id": 0, "Sequence number": 1770945, "Fwd thread id": 1, "Ev Idx": 2411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866052865.673, "dur": 330.100, + "args": { + "External id": 88429,"Sequence number": 1770945, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2412 + } + }, + { + "ph": "f", "id": 257, "pid": 5714, "tid": 6744, "ts": 6300866052865.673, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866052936.643, "dur": 43.500, + "args": { + "External id": 88430,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866052993.073, "dur": 25.810, + "args": { + "External id": 88431,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866053029.903, "dur": 24.440, + "args": { + "External id": 88432,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866053066.683, "dur": 20.190, + "args": { + "External id": 88433,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866053096.433, "dur": 16.220, + "args": { + "External id": 88434,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866053120.833, "dur": 15.190, + "args": { + "External id": 88435,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866053156.513, "dur": 21.160, + "args": { + "External id": 88436,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053221.303, "dur": 12.330, + "args": { + "External id": 88437,"Record function id": 0, "Ev Idx": 2420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053224.623, "dur": 7.510, + "args": { + "External id": 88438,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866053227.283, "dur": 4.030, + "args": { + "External id": 88439,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866053228.413, "dur": 2.660, + "args": { + "External id": 88440,"Record function id": 
0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053238.503, "dur": 5.550, + "args": { + "External id": 88441,"Record function id": 0, "Ev Idx": 2424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053240.233, "dur": 2.700, + "args": { + "External id": 88442,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866053241.013, "dur": 1.300, + "args": { + "External id": 88443,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866053241.393, "dur": 0.720, + "args": { + "External id": 88444,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053248.263, "dur": 5.380, + "args": { + "External id": 88445,"Record function id": 0, "Ev Idx": 2428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053249.953, "dur": 2.620, + "args": { + "External id": 88446,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2429 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866053250.633, "dur": 1.390, + "args": { + "External id": 88447,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866053251.173, "dur": 0.680, + "args": { + "External id": 88448,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866053257.652, "dur": 239.180, + "args": { + "External id": 88449,"Record function id": 0, "Sequence number": 1770944, "Fwd thread id": 1, "Ev Idx": 2432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866053259.123, "dur": 204.609, + "args": { + "External id": 88450,"Sequence number": 1770944, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2433 + } + }, + { + "ph": "f", "id": 258, "pid": 5714, "tid": 6744, "ts": 6300866053259.123, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866053350.463, "dur": 24.999, + "args": { + "External id": 88451,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": 
["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866053393.732, "dur": 14.510, + "args": { + "External id": 88452,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866053426.372, "dur": 16.800, + "args": { + "External id": 88453,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866053472.962, "dur": 18.500, + "args": { + "External id": 88454,"Record 
function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053509.862, "dur": 10.810, + "args": { + "External id": 88455,"Record function id": 0, "Ev Idx": 2438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866053512.812, "dur": 6.450, + "args": { + "External id": 88456,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866053515.042, "dur": 3.460, + "args": { + "External id": 88457,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866053515.892, "dur": 2.400, + "args": { + "External id": 88458,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866053525.292, "dur": 792.818, + "args": { + "External id": 88459,"Record function id": 0, "Sequence number": 1770943, "Fwd thread id": 1, "Ev Idx": 2442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866053527.032, "dur": 784.278, + "args": { + "External id": 88460,"Sequence number": 
1770943, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2443 + } + }, + { + "ph": "f", "id": 259, "pid": 5714, "tid": 6744, "ts": 6300866053527.032, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6300866053546.282, "dur": 24.920, + "args": { + "External id": 88461,"Record function id": 0, "Ev Idx": 2444 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6300866053579.322, "dur": 56.670, + "args": { + "External id": 88462,"Record function id": 0, "Ev Idx": 2445 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6300866053643.372, "dur": 661.818, + "args": { + "External id": 88463,"Record function id": 0, "Ev Idx": 2446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866053714.162, "dur": 8.360, + "args": { + "External id": 88464,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866053731.131, "dur": 3.640, + "args": { + "External id": 88465,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866053746.922, "dur": 102.869, + "args": { + 
"External id": 88466,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866053757.071, "dur": 88.810, + "args": { + "External id": 88467,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866053780.991, "dur": 6.400, + "args": { + "External id": 88468,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866053791.351, "dur": 32.370, + "args": { + "External id": 88469,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866053792.762, "dur": 30.589, + "args": { 
+ "External id": 88470,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866053795.081, "dur": 6.660, + "args": { + "External id": 88471,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866053803.011, "dur": 19.850, + "args": { + "External id": 88472,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866053918.801, "dur": 9.670, + "args": { + "External id": 88473,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866053920.511, "dur": 7.420, + "args": { + "External id": 88474,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", 
"pid": 5714, "tid": 6744, + "ts": 6300866053946.491, "dur": 86.810, + "args": { + "External id": 88475,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866053961.601, "dur": 68.320, + "args": { + "External id": 88476,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2459, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866053973.771, "dur": 51.250, + "args": { + "External id": 88477,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866054045.701, "dur": 3.300, + "args": { + "External id": 88478,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", 
"ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2461, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054102.441, "dur": 4.070, + "args": { + "External id": 88479,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054137.410, "dur": 1.220, + "args": { + "External id": 88480,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054156.390, "dur": 1.100, + "args": { + "External id": 88481,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054172.221, "dur": 0.889, + "args": { + "External id": 88482,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866054187.850, "dur": 0.831, + "args": { + "External id": 88483,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054202.110, "dur": 0.780, + "args": { + "External id": 88484,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054216.241, "dur": 0.869, + "args": { + "External id": 88485,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054230.070, "dur": 1.011, + "args": { + "External id": 88486,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054244.690, "dur": 0.760, + "args": { + "External id": 88487,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2470 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866054331.340, "dur": 1394.607, + "args": { + "External id": 88488,"Record function id": 0, "Ev Idx": 2471 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6300866054346.670, "dur": 857.048, + "args": { + "External id": 88489,"Record function id": 0, "Ev Idx": 2472 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6300866054358.440, "dur": 251.920, + "args": { + "External id": 88490,"Record function id": 0, "Ev Idx": 2473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054447.780, "dur": 3.880, + "args": { + "External id": 88491,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054456.320, "dur": 0.960, + "args": { + "External id": 88492,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054459.380, "dur": 0.680, + "args": { + "External id": 88493,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054461.870, "dur": 0.660, + "args": { + "External id": 88494,"Record function id": 0, "Concrete Inputs": ["", 
"[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054464.080, "dur": 0.640, + "args": { + "External id": 88495,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054466.190, "dur": 1.450, + "args": { + "External id": 88496,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054469.310, "dur": 1.030, + "args": { + "External id": 88497,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054471.870, "dur": 0.660, + "args": { + "External id": 88498,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054473.950, "dur": 0.620, + "args": { + "External id": 88499,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866054476.000, "dur": 
0.580, + "args": { + "External id": 88500,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866054490.510, "dur": 90.979, + "args": { + "External id": 88501,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866054502.540, "dur": 75.289, + "args": { + "External id": 88502,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866054513.430, "dur": 6.420, + "args": { + "External id": 88503,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 2486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866054522.290, "dur": 32.150, + "args": { + "External id": 88504,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866054523.760, "dur": 30.270, + "args": { + "External id": 88505,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054526.040, "dur": 6.360, + "args": { + "External id": 88506,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866054533.610, "dur": 19.880, + "args": { + "External id": 88507,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2490 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.4", "pid": 5714, "tid": 6744, + "ts": 6300866054698.949, "dur": 496.559, + "args": { + "External id": 88508,"Record function id": 0, "Ev Idx": 2491 + } + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6300866054714.479, "dur": 467.179, + "args": { + "External id": 88509,"Record function id": 0, "Ev Idx": 2492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866054779.289, "dur": 8.560, + "args": { + "External id": 88510,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866054798.939, "dur": 18.880, + "args": { + "External id": 88511,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054801.719, "dur": 1.900, + "args": { + "External id": 88512,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054805.239, "dur": 0.360, + "args": { + "External id": 88513,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866054806.609, "dur": 0.360, + "args": { + "External id": 88514,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054807.919, "dur": 0.220, + "args": { + "External id": 88515,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054809.119, "dur": 0.210, + "args": { + "External id": 88516,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054810.269, "dur": 0.280, + "args": { + "External id": 88517,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054811.459, "dur": 0.320, + "args": { + "External id": 88518,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2501 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054812.629, "dur": 0.300, + "args": { + "External id": 88519,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054813.789, "dur": 0.940, + "args": { + "External id": 88520,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866054826.029, "dur": 22.400, + "args": { + "External id": 88521,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866054879.269, "dur": 92.090, + "args": { + "External id": 88522,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], 
[393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866054889.299, "dur": 7.510, + "args": { + "External id": 88523,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866054901.059, "dur": 8.400, + "args": { + "External id": 88524,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866054903.679, "dur": 5.380, + "args": { + "External id": 88525,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054906.379, "dur": 0.810, + "args": { + "External id": 88526,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866054916.909, "dur": 17.060, + "args": { + "External id": 88527,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], 
"Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054918.559, "dur": 0.380, + "args": { + "External id": 88528,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054920.039, "dur": 0.340, + "args": { + "External id": 88529,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054921.309, "dur": 0.270, + "args": { + "External id": 88530,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054923.759, "dur": 0.290, + "args": { + "External id": 88531,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054925.019, "dur": 0.220, + "args": { + "External id": 88532,"Record function id": 0, "Concrete Inputs": 
["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054926.239, "dur": 1.220, + "args": { + "External id": 88533,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054928.599, "dur": 0.340, + "args": { + "External id": 88534,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054930.199, "dur": 0.340, + "args": { + "External id": 88535,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866054931.459, "dur": 0.330, + "args": { + "External id": 88536,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866054945.339, "dur": 17.820, + "args": 
{ + "External id": 88537,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866055026.119, "dur": 85.660, + "args": { + "External id": 88538,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866055042.599, "dur": 66.009, + "args": { + "External id": 88539,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2522, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866055054.379, "dur": 49.840, + "args": { + "External id": 88540,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866055124.898, "dur": 3.480, + "args": { + "External id": 88541,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2524, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866055209.118, "dur": 505.119, + "args": { + "External id": 88542,"Sequence number": 1770942, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2525 + } + }, + { + "ph": "f", "id": 260, "pid": 5714, "tid": 6744, "ts": 6300866055209.118, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055278.848, "dur": 43.480, + "args": { + "External id": 88543,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866055357.218, "dur": 26.260, + "args": { + "External id": 
88544,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866055400.898, "dur": 37.690, + "args": { + "External id": 88545,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055452.888, "dur": 27.759, + "args": { + "External id": 88546,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055491.278, "dur": 21.349, + "args": { + "External id": 88547,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2530 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055523.587, "dur": 24.420, + "args": { + "External id": 88548,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055558.907, "dur": 20.040, + "args": { + "External id": 88549,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866055603.257, "dur": 22.430, + "args": { + "External id": 88550,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866055643.267, "dur": 14.110, + "args": { + "External id": 88551,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", 
"768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866055672.917, "dur": 16.910, + "args": { + "External id": 88552,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055741.277, "dur": 12.300, + "args": { + "External id": 88553,"Record function id": 0, "Ev Idx": 2536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055744.597, "dur": 7.550, + "args": { + "External id": 88554,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866055747.357, "dur": 3.900, + "args": { + "External id": 
88555,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866055748.337, "dur": 2.670, + "args": { + "External id": 88556,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055758.607, "dur": 5.590, + "args": { + "External id": 88557,"Record function id": 0, "Ev Idx": 2540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055760.337, "dur": 2.790, + "args": { + "External id": 88558,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866055761.157, "dur": 1.400, + "args": { + "External id": 88559,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866055761.607, "dur": 0.770, + "args": { + "External id": 88560,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055768.527, "dur": 5.310, + "args": { + "External id": 88561,"Record function id": 0, "Ev Idx": 2544 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055770.247, "dur": 2.520, + "args": { + "External id": 88562,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866055770.997, "dur": 1.300, + "args": { + "External id": 88563,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866055771.467, "dur": 0.650, + "args": { + "External id": 88564,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055777.957, "dur": 5.280, + "args": { + "External id": 88565,"Record function id": 0, "Ev Idx": 2548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866055779.687, "dur": 2.490, + "args": { + "External id": 88566,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866055780.427, "dur": 1.260, + "args": { + "External id": 88567,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 
6300866055780.927, "dur": 0.580, + "args": { + "External id": 88568,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866055787.427, "dur": 342.969, + "args": { + "External id": 88569,"Record function id": 0, "Sequence number": 1770941, "Fwd thread id": 1, "Ev Idx": 2552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866055789.007, "dur": 332.489, + "args": { + "External id": 88570,"Sequence number": 1770941, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2553 + } + }, + { + "ph": "f", "id": 261, "pid": 5714, "tid": 6744, "ts": 6300866055789.007, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055856.147, "dur": 42.739, + "args": { + "External id": 88571,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866055914.106, "dur": 20.471, + "args": { + "External id": 88572,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + 
"ts": 6300866055961.357, "dur": 133.889, + "args": { + "External id": 88573,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866056016.396, "dur": 7.380, + "args": { + "External id": 88574,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866056025.696, "dur": 3.960, + "args": { + "External id": 88575,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056145.366, "dur": 54.010, + "args": { + 
"External id": 88576,"Record function id": 0, "Ev Idx": 2559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056189.386, "dur": 8.190, + "args": { + "External id": 88577,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866056192.446, "dur": 3.960, + "args": { + "External id": 88578,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866056193.426, "dur": 2.750, + "args": { + "External id": 88579,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056205.036, "dur": 199.149, + "args": { + "External id": 88580,"Record function id": 0, "Sequence number": 1770940, "Fwd thread id": 1, "Ev Idx": 2563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056207.066, "dur": 189.859, + "args": { + "External id": 88581,"Sequence number": 1770940, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2564 + } + }, + { + "ph": "f", "id": 262, "pid": 5714, "tid": 6744, "ts": 6300866056207.066, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + 
"ts": 6300866056223.806, "dur": 39.300, + "args": { + "External id": 88582,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866056227.196, "dur": 7.190, + "args": { + "External id": 88583,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866056235.736, "dur": 26.680, + "args": { + "External id": 88584,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866056272.656, "dur": 8.930, + "args": { + "External id": 88585,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866056274.936, "dur": 6.180, + "args": { + "External id": 88586,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], 
[], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056416.925, "dur": 171.150, + "args": { + "External id": 88587,"Record function id": 0, "Sequence number": 1770939, "Fwd thread id": 1, "Ev Idx": 2570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056420.116, "dur": 159.929, + "args": { + "External id": 88588,"Sequence number": 1770939, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2571 + } + }, + { + "ph": "f", "id": 263, "pid": 5714, "tid": 6744, "ts": 6300866056420.116, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866056434.916, "dur": 34.039, + "args": { + "External id": 88589,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866056438.365, "dur": 7.070, + "args": { + "External id": 88590,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866056446.665, "dur": 21.600, + "args": { + "External id": 88591,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866056478.085, "dur": 7.300, + "args": { + "External id": 88592,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866056480.075, "dur": 4.870, + "args": { + "External id": 88593,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056600.445, "dur": 343.349, + "args": { + "External id": 88594,"Record function id": 0, "Sequence number": 1770938, "Fwd thread id": 1, "Ev Idx": 2577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056603.515, "dur": 328.629, + "args": { + "External id": 88595,"Sequence number": 1770938, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2578 + } + }, + { + "ph": "f", "id": 264, "pid": 5714, "tid": 6744, 
"ts": 6300866056603.515, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866056674.225, "dur": 43.500, + "args": { + "External id": 88596,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866056731.285, "dur": 25.570, + "args": { + "External id": 88597,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866056768.645, "dur": 24.319, + "args": { + "External id": 88598,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866056804.995, "dur": 20.120, + "args": { + "External id": 88599,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866056833.915, "dur": 15.689, + "args": { + "External id": 88600,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866056857.635, "dur": 15.409, + "args": { + "External id": 88601,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866056893.814, "dur": 20.530, + "args": { + "External id": 88602,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056958.054, "dur": 13.030, + "args": { + "External id": 88603,"Record function id": 0, "Ev Idx": 2586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056961.234, "dur": 8.390, + "args": { + "External id": 88604,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866056963.904, "dur": 4.870, 
+ "args": { + "External id": 88605,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866056965.054, "dur": 3.480, + "args": { + "External id": 88606,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056975.914, "dur": 5.610, + "args": { + "External id": 88607,"Record function id": 0, "Ev Idx": 2590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056977.814, "dur": 2.630, + "args": { + "External id": 88608,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866056978.634, "dur": 1.230, + "args": { + "External id": 88609,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866056979.024, "dur": 0.650, + "args": { + "External id": 88610,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056985.604, "dur": 5.300, + "args": { + "External id": 88611,"Record function 
id": 0, "Ev Idx": 2594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866056987.274, "dur": 2.560, + "args": { + "External id": 88612,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866056987.974, "dur": 1.400, + "args": { + "External id": 88613,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866056988.514, "dur": 0.680, + "args": { + "External id": 88614,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056995.164, "dur": 216.730, + "args": { + "External id": 88615,"Record function id": 0, "Sequence number": 1770937, "Fwd thread id": 1, "Ev Idx": 2598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866056996.854, "dur": 182.510, + "args": { + "External id": 88616,"Sequence number": 1770937, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2599 + } + }, + { + "ph": "f", "id": 265, "pid": 5714, "tid": 6744, "ts": 6300866056996.854, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866057068.224, "dur": 
23.880, + "args": { + "External id": 88617,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866057110.274, "dur": 14.170, + "args": { + "External id": 88618,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866057142.474, "dur": 16.980, + "args": { + "External id": 88619,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": 
[[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866057188.024, "dur": 18.610, + "args": { + "External id": 88620,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866057224.774, "dur": 11.960, + "args": { + "External id": 88621,"Record function id": 0, "Ev Idx": 2604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866057227.774, "dur": 7.460, + "args": { + "External id": 88622,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866057230.103, "dur": 4.340, + "args": { + "External id": 88623,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866057230.934, "dur": 3.309, + "args": { + "External id": 88624,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 
6300866057241.563, "dur": 831.019, + "args": { + "External id": 88625,"Record function id": 0, "Sequence number": 1770936, "Fwd thread id": 1, "Ev Idx": 2608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866057243.434, "dur": 822.848, + "args": { + "External id": 88626,"Sequence number": 1770936, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2609 + } + }, + { + "ph": "f", "id": 266, "pid": 5714, "tid": 6744, "ts": 6300866057243.434, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6300866057262.243, "dur": 25.540, + "args": { + "External id": 88627,"Record function id": 0, "Ev Idx": 2610 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6300866057296.074, "dur": 68.369, + "args": { + "External id": 88628,"Record function id": 0, "Ev Idx": 2611 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6300866057372.383, "dur": 688.269, + "args": { + "External id": 88629,"Record function id": 0, "Ev Idx": 2612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866057443.853, "dur": 8.220, + "args": { + "External id": 88630,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866057461.183, "dur": 
3.750, + "args": { + "External id": 88631,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866057477.863, "dur": 130.400, + "args": { + "External id": 88632,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866057488.103, "dur": 116.350, + "args": { + "External id": 88633,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866057537.823, "dur": 6.470, + "args": { + "External id": 88634,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866057548.393, "dur": 33.660, + "args": { + "External id": 88635,"Record function id": 0, "Concrete Inputs": ["", "", 
"", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866057549.983, "dur": 31.680, + "args": { + "External id": 88636,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057552.823, "dur": 6.690, + "args": { + "External id": 88637,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866057560.953, "dur": 20.140, + "args": { + "External id": 88638,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866057677.033, "dur": 8.880, + "args": { + "External id": 88639,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866057678.842, "dur": 6.520, 
+ "args": { + "External id": 88640,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866057703.673, "dur": 85.929, + "args": { + "External id": 88641,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866057719.282, "dur": 67.050, + "args": { + "External id": 88642,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2625, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866057731.133, "dur": 50.879, + "args": { + "External id": 88643,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 
5714, "tid": 6744, + "ts": 6300866057803.132, "dur": 3.620, + "args": { + "External id": 88644,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2627, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057859.782, "dur": 4.060, + "args": { + "External id": 88645,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057894.982, "dur": 2.190, + "args": { + "External id": 88646,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057918.282, "dur": 1.050, + "args": { + "External id": 88647,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866057935.332, "dur": 0.850, + "args": { + "External id": 88648,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057950.072, "dur": 0.840, + "args": { + "External id": 88649,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057964.312, "dur": 1.780, + "args": { + "External id": 88650,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057979.712, "dur": 0.750, + "args": { + "External id": 88651,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866057993.832, "dur": 1.020, + "args": { + "External id": 88652,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058008.032, "dur": 0.880, + "args": { + "External id": 88653,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866058085.562, "dur": 1398.167, + "args": { + "External id": 88654,"Record function id": 0, "Ev Idx": 2637 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6300866058100.521, "dur": 860.389, + "args": { + "External id": 88655,"Record function id": 0, "Ev Idx": 2638 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6300866058112.092, "dur": 262.239, + "args": { + "External id": 88656,"Record function id": 0, "Ev Idx": 2639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058201.211, "dur": 4.650, + "args": { + "External id": 88657,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058210.111, "dur": 0.840, + "args": { + "External id": 88658,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058212.751, "dur": 0.740, + "args": { + "External id": 88659,"Record function 
id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058215.011, "dur": 0.630, + "args": { + "External id": 88660,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058217.031, "dur": 0.620, + "args": { + "External id": 88661,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058219.191, "dur": 0.640, + "args": { + "External id": 88662,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058221.601, "dur": 1.010, + "args": { + "External id": 88663,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058224.101, "dur": 0.740, + "args": { + "External id": 88664,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + 
"ts": 6300866058226.391, "dur": 1.400, + "args": { + "External id": 88665,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866058229.271, "dur": 0.680, + "args": { + "External id": 88666,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866058243.881, "dur": 100.990, + "args": { + "External id": 88667,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866058255.841, "dur": 85.160, + "args": { + "External id": 88668,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 
393216]]], "Ev Idx": 2651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866058266.381, "dur": 6.470, + "args": { + "External id": 88669,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866058275.291, "dur": 42.600, + "args": { + "External id": 88670,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866058276.881, "dur": 40.600, + "args": { + "External id": 88671,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058279.211, "dur": 6.410, + "args": { + "External id": 88672,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866058286.871, "dur": 29.890, + "args": { + "External id": 88673,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", 
"Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2656 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.3", "pid": 5714, "tid": 6744, + "ts": 6300866058461.871, "dur": 490.599, + "args": { + "External id": 88674,"Record function id": 0, "Ev Idx": 2657 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6300866058477.541, "dur": 461.329, + "args": { + "External id": 88675,"Record function id": 0, "Ev Idx": 2658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866058542.000, "dur": 7.880, + "args": { + "External id": 88676,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866058561.720, "dur": 17.951, + "args": { + "External id": 88677,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058564.680, "dur": 1.331, + "args": { + "External id": 88678,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058567.680, "dur": 0.331, + "args": { + 
"External id": 88679,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058568.911, "dur": 0.229, + "args": { + "External id": 88680,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058569.991, "dur": 1.049, + "args": { + "External id": 88681,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058571.891, "dur": 0.329, + "args": { + "External id": 88682,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058573.040, "dur": 0.271, + "args": { + "External id": 88683,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
6744, + "ts": 6300866058574.160, "dur": 0.191, + "args": { + "External id": 88684,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058575.191, "dur": 0.180, + "args": { + "External id": 88685,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058576.211, "dur": 0.269, + "args": { + "External id": 88686,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866058587.891, "dur": 22.709, + "args": { + "External id": 88687,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866058641.750, "dur": 89.880, + "args": { + "External id": 88688,"Record function id": 0, "Concrete Inputs": ["", 
"[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866058651.730, "dur": 7.500, + "args": { + "External id": 88689,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866058663.560, "dur": 8.340, + "args": { + "External id": 88690,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866058666.270, "dur": 5.230, + "args": { + "External id": 88691,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058668.940, "dur": 0.750, + "args": { + "External id": 88692,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[7079424], [], [], []], "Ev Idx": 2675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866058679.470, "dur": 14.880, + "args": { + "External id": 88693,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058681.130, "dur": 1.050, + "args": { + "External id": 88694,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058683.210, "dur": 0.340, + "args": { + "External id": 88695,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058684.450, "dur": 0.300, + "args": { + "External id": 88696,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058685.620, "dur": 0.290, + "args": { + "External id": 88697,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058686.740, "dur": 0.270, + "args": { + "External id": 88698,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058687.860, "dur": 0.300, + "args": { + "External id": 88699,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058688.990, "dur": 0.230, + "args": { + "External id": 88700,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058690.060, "dur": 0.270, + "args": { + "External id": 88701,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866058691.150, "dur": 0.980, + "args": { + "External id": 88702,"Record function id": 0, 
"Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866058705.080, "dur": 18.410, + "args": { + "External id": 88703,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866058785.960, "dur": 84.580, + "args": { + "External id": 88704,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866058801.300, "dur": 66.020, + "args": { + "External id": 88705,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group 
size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2688, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866058813.270, "dur": 49.720, + "args": { + "External id": 88706,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866058883.280, "dur": 3.490, + "args": { + "External id": 88707,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2690, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866058966.680, "dur": 505.149, + "args": { + "External id": 88708,"Sequence number": 1770935, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2691 + } + }, + { + "ph": "f", "id": 267, "pid": 5714, "tid": 6744, "ts": 6300866058966.680, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866059037.859, "dur": 35.140, + "args": { + "External id": 88709,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866059104.539, "dur": 25.420, + "args": { + "External id": 88710,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866059147.539, "dur": 36.930, + "args": { + "External id": 88711,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866059198.219, "dur": 26.840, + "args": { + "External id": 88712,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", 
"pid": 5714, "tid": 6744, + "ts": 6300866059235.609, "dur": 21.140, + "args": { + "External id": 88713,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866059267.639, "dur": 24.600, + "args": { + "External id": 88714,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866059314.029, "dur": 22.850, + "args": { + "External id": 88715,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866059362.379, "dur": 22.310, + "args": { + "External id": 88716,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], 
[]], "Ev Idx": 2699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866059402.109, "dur": 13.620, + "args": { + "External id": 88717,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866059431.258, "dur": 17.940, + "args": { + "External id": 88718,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059498.658, "dur": 12.331, + "args": { + "External id": 88719,"Record function id": 0, "Ev Idx": 2702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 
6300866059502.038, "dur": 7.511, + "args": { + "External id": 88720,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866059504.758, "dur": 3.931, + "args": { + "External id": 88721,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866059505.738, "dur": 2.711, + "args": { + "External id": 88722,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059516.028, "dur": 6.920, + "args": { + "External id": 88723,"Record function id": 0, "Ev Idx": 2706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059518.008, "dur": 3.820, + "args": { + "External id": 88724,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866059518.858, "dur": 2.480, + "args": { + "External id": 88725,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866059519.288, "dur": 1.860, + "args": { + "External id": 88726,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059527.128, "dur": 5.320, + "args": { + "External id": 88727,"Record function id": 0, "Ev Idx": 2710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059528.868, "dur": 2.540, + "args": { + "External id": 88728,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866059529.628, "dur": 1.300, + "args": { + "External id": 88729,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866059530.088, "dur": 0.650, + "args": { + "External id": 88730,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059536.638, "dur": 5.220, + "args": { + "External id": 88731,"Record function id": 0, "Ev Idx": 2714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059538.288, "dur": 2.510, + "args": { + "External id": 88732,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2715 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866059539.068, "dur": 1.270, + "args": { + "External id": 88733,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866059539.498, "dur": 0.660, + "args": { + "External id": 88734,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866059546.088, "dur": 355.429, + "args": { + "External id": 88735,"Record function id": 0, "Sequence number": 1770934, "Fwd thread id": 1, "Ev Idx": 2718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866059547.788, "dur": 344.660, + "args": { + "External id": 88736,"Sequence number": 1770934, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2719 + } + }, + { + "ph": "f", "id": 268, "pid": 5714, "tid": 6744, "ts": 6300866059547.788, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866059615.708, "dur": 42.330, + "args": { + "External id": 88737,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866059671.808, "dur": 20.280, + "args": { + 
"External id": 88738,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866059729.668, "dur": 136.129, + "args": { + "External id": 88739,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866059787.668, "dur": 7.190, + "args": { + "External id": 88740,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866059796.808, "dur": 3.720, + "args": { + "External id": 88741,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", 
"", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059916.388, "dur": 11.980, + "args": { + "External id": 88742,"Record function id": 0, "Ev Idx": 2725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866059919.617, "dur": 7.360, + "args": { + "External id": 88743,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866059922.208, "dur": 3.769, + "args": { + "External id": 88744,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866059923.197, "dur": 2.531, + "args": { + "External id": 88745,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866059933.097, "dur": 187.810, + "args": { + "External id": 88746,"Record function id": 0, "Sequence number": 1770933, "Fwd thread id": 1, "Ev Idx": 2729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866059934.868, "dur": 178.999, + "args": { + "External id": 88747,"Sequence number": 1770933, "Fwd thread id": 1, "Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2730 + } + }, + { + "ph": "f", "id": 269, "pid": 5714, "tid": 6744, "ts": 6300866059934.868, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866059951.197, "dur": 38.370, + "args": { + "External id": 88748,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866059954.517, "dur": 7.030, + "args": { + "External id": 88749,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866059962.877, "dur": 25.980, + "args": { + "External id": 88750,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866059999.077, "dur": 7.870, + "args": { + "External id": 88751,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2734 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866060001.137, "dur": 5.350, + "args": { + "External id": 88752,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060134.317, "dur": 181.379, + "args": { + "External id": 88753,"Record function id": 0, "Sequence number": 1770932, "Fwd thread id": 1, "Ev Idx": 2736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060137.197, "dur": 169.930, + "args": { + "External id": 88754,"Sequence number": 1770932, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2737 + } + }, + { + "ph": "f", "id": 270, "pid": 5714, "tid": 6744, "ts": 6300866060137.197, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866060152.727, "dur": 33.710, + "args": { + "External id": 88755,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866060156.037, "dur": 7.000, + "args": { + "External id": 88756,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", 
"Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866060164.277, "dur": 21.500, + "args": { + "External id": 88757,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866060195.397, "dur": 8.600, + "args": { + "External id": 88758,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866060197.637, "dur": 5.870, + "args": { + "External id": 88759,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060328.456, "dur": 341.230, + "args": { + "External id": 88760,"Record function id": 0, "Sequence number": 1770931, "Fwd thread id": 1, "Ev Idx": 2743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060331.867, "dur": 326.119, + "args": { + "External id": 88761,"Sequence number": 1770931, "Fwd thread 
id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2744 + } + }, + { + "ph": "f", "id": 271, "pid": 5714, "tid": 6744, "ts": 6300866060331.867, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866060401.466, "dur": 43.820, + "args": { + "External id": 88762,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866060458.496, "dur": 26.070, + "args": { + "External id": 88763,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866060495.386, "dur": 23.810, + "args": { + "External id": 88764,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866060531.336, "dur": 20.320, + "args": { + "External id": 88765,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 
1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866060560.236, "dur": 15.430, + "args": { + "External id": 88766,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866060583.506, "dur": 14.790, + "args": { + "External id": 88767,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866060619.386, "dur": 20.780, + "args": { + "External id": 88768,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060683.836, "dur": 12.160, + "args": { + "External id": 88769,"Record function id": 0, "Ev Idx": 2752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + 
"ts": 6300866060686.946, "dur": 7.510, + "args": { + "External id": 88770,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866060689.596, "dur": 4.030, + "args": { + "External id": 88771,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866060690.596, "dur": 2.820, + "args": { + "External id": 88772,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060700.876, "dur": 6.580, + "args": { + "External id": 88773,"Record function id": 0, "Ev Idx": 2756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060702.706, "dur": 3.640, + "args": { + "External id": 88774,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866060703.526, "dur": 2.310, + "args": { + "External id": 88775,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866060703.916, "dur": 1.730, + "args": { + "External id": 88776,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060711.626, "dur": 5.490, + "args": { + "External id": 88777,"Record function id": 0, "Ev Idx": 2760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060713.416, "dur": 2.600, + "args": { + "External id": 88778,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866060714.136, "dur": 1.380, + "args": { + "External id": 88779,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866060714.656, "dur": 0.680, + "args": { + "External id": 88780,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060721.386, "dur": 212.429, + "args": { + "External id": 88781,"Record function id": 0, "Sequence number": 1770930, "Fwd thread id": 1, "Ev Idx": 2764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060723.086, "dur": 178.919, + "args": { + "External id": 88782,"Sequence number": 1770930, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2765 + } + }, + { + "ph": "f", "id": 272, "pid": 5714, "tid": 6744, "ts": 6300866060723.086, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866060793.686, "dur": 23.689, + "args": { + "External id": 88783,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866060835.095, "dur": 13.960, + "args": { + "External id": 88784,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866060866.415, "dur": 16.300, + "args": { + "External id": 88785,"kernel_hash": 
"cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866060910.575, "dur": 18.460, + "args": { + "External id": 88786,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060946.875, "dur": 10.840, + "args": { + "External id": 88787,"Record function id": 0, "Ev Idx": 2770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866060949.905, "dur": 6.410, + "args": { + "External id": 88788,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866060952.155, "dur": 3.480, + "args": { + "External id": 88789,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2772 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866060953.045, "dur": 2.380, + "args": { + "External id": 88790,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060962.415, "dur": 823.638, + "args": { + "External id": 88791,"Record function id": 0, "Sequence number": 1770929, "Fwd thread id": 1, "Ev Idx": 2774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866060964.205, "dur": 815.388, + "args": { + "External id": 88792,"Sequence number": 1770929, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2775 + } + }, + { + "ph": "f", "id": 273, "pid": 5714, "tid": 6744, "ts": 6300866060964.205, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6300866060983.215, "dur": 25.790, + "args": { + "External id": 88793,"Record function id": 0, "Ev Idx": 2776 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6300866061016.825, "dur": 58.260, + "args": { + "External id": 88794,"Record function id": 0, "Ev Idx": 2777 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6300866061082.455, "dur": 691.438, + "args": { + "External id": 88795,"Record function id": 0, "Ev Idx": 2778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 
6300866061153.375, "dur": 8.210, + "args": { + "External id": 88796,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061170.415, "dur": 4.120, + "args": { + "External id": 88797,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866061186.985, "dur": 138.519, + "args": { + "External id": 88798,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866061197.514, "dur": 123.930, + "args": { + "External id": 88799,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866061247.674, "dur": 6.511, + "args": { + "External id": 88800,"Record function id": 
0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866061258.394, "dur": 32.250, + "args": { + "External id": 88801,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866061259.914, "dur": 30.370, + "args": { + "External id": 88802,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061262.294, "dur": 6.600, + "args": { + "External id": 88803,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866061270.154, "dur": 19.580, + "args": { + "External id": 88804,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 
6300866061393.794, "dur": 9.390, + "args": { + "External id": 88805,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866061395.594, "dur": 7.070, + "args": { + "External id": 88806,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866061420.624, "dur": 86.540, + "args": { + "External id": 88807,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866061436.764, "dur": 67.020, + "args": { + "External id": 88808,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev 
Idx": 2791, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866061449.234, "dur": 50.200, + "args": { + "External id": 88809,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866061520.614, "dur": 3.470, + "args": { + "External id": 88810,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2793, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061577.204, "dur": 4.030, + "args": { + "External id": 88811,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061612.104, "dur": 1.210, + "args": { + "External id": 88812,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5714, "tid": 6744, + "ts": 6300866061631.333, "dur": 1.731, + "args": { + "External id": 88813,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061648.053, "dur": 0.791, + "args": { + "External id": 88814,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061663.344, "dur": 0.760, + "args": { + "External id": 88815,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061677.393, "dur": 0.731, + "args": { + "External id": 88816,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061692.184, "dur": 1.440, + "args": { + "External id": 88817,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2800 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061707.213, "dur": 1.011, + "args": { + "External id": 88818,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061721.253, "dur": 0.790, + "args": { + "External id": 88819,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866061798.913, "dur": 1411.707, + "args": { + "External id": 88820,"Record function id": 0, "Ev Idx": 2803 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6300866061813.943, "dur": 889.218, + "args": { + "External id": 88821,"Record function id": 0, "Ev Idx": 2804 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6300866061825.963, "dur": 252.920, + "args": { + "External id": 88822,"Record function id": 0, "Ev Idx": 2805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061915.133, "dur": 3.970, + "args": { + "External id": 88823,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + 
"ts": 6300866061923.513, "dur": 0.950, + "args": { + "External id": 88824,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061926.573, "dur": 1.620, + "args": { + "External id": 88825,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061930.123, "dur": 0.540, + "args": { + "External id": 88826,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061932.193, "dur": 0.690, + "args": { + "External id": 88827,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061934.463, "dur": 0.600, + "args": { + "External id": 88828,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061936.913, "dur": 0.910, + "args": { + "External id": 88829,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2812 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061939.433, "dur": 0.710, + "args": { + "External id": 88830,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061941.713, "dur": 0.650, + "args": { + "External id": 88831,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866061944.233, "dur": 0.690, + "args": { + "External id": 88832,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866061958.613, "dur": 91.350, + "args": { + "External id": 88833,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866061970.953, "dur": 75.100, + "args": { + "External id": 88834,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 
147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866061981.343, "dur": 6.990, + "args": { + "External id": 88835,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866061990.813, "dur": 32.850, + "args": { + "External id": 88836,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866061992.223, "dur": 31.040, + "args": { + "External id": 88837,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866061994.613, "dur": 6.770, + "args": { + "External id": 88838,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866062002.563, "dur": 20.140, + "args": { + "External id": 88839,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2822 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.2", "pid": 5714, "tid": 6744, + "ts": 6300866062164.902, "dur": 529.679, + "args": { + "External id": 88840,"Record function id": 0, "Ev Idx": 2823 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6300866062179.662, "dur": 501.569, + "args": { + "External id": 88841,"Record function id": 0, "Ev Idx": 2824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866062243.522, "dur": 7.610, + "args": { + "External id": 88842,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866062262.772, "dur": 19.820, + "args": { + "External id": 88843,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062265.742, 
"dur": 1.300, + "args": { + "External id": 88844,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062268.662, "dur": 0.320, + "args": { + "External id": 88845,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062270.222, "dur": 0.360, + "args": { + "External id": 88846,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062271.952, "dur": 0.270, + "args": { + "External id": 88847,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062273.182, "dur": 0.320, + "args": { + "External id": 88848,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062274.532, "dur": 0.890, + "args": { + "External id": 88849,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062276.432, "dur": 0.210, + "args": { + "External id": 88850,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062277.762, "dur": 0.280, + "args": { + "External id": 88851,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062279.402, "dur": 0.290, + "args": { + "External id": 88852,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866062291.212, "dur": 52.670, + "args": { + "External id": 88853,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], 
[1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866062377.132, "dur": 93.240, + "args": { + "External id": 88854,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866062387.702, "dur": 7.320, + "args": { + "External id": 88855,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866062399.452, "dur": 8.560, + "args": { + "External id": 88856,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866062401.992, "dur": 5.610, + "args": { + "External id": 88857,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], 
"Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062404.712, "dur": 0.950, + "args": { + "External id": 88858,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866062415.762, "dur": 16.040, + "args": { + "External id": 88859,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062417.702, "dur": 0.340, + "args": { + "External id": 88860,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062419.252, "dur": 0.380, + "args": { + "External id": 88861,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062420.652, "dur": 0.820, + "args": { + "External id": 88862,"Record function id": 0, "Concrete Inputs": ["", "[147456]", 
"[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062422.492, "dur": 0.390, + "args": { + "External id": 88863,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062423.982, "dur": 0.300, + "args": { + "External id": 88864,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062425.252, "dur": 0.330, + "args": { + "External id": 88865,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062426.612, "dur": 0.290, + "args": { + "External id": 88866,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062427.962, "dur": 0.300, + "args": { + "External id": 
88867,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866062429.302, "dur": 0.250, + "args": { + "External id": 88868,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866062443.362, "dur": 18.540, + "args": { + "External id": 88869,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866062525.702, "dur": 85.689, + "args": { + "External id": 88870,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866062542.582, "dur": 65.709, + "args": { + "External id": 88871,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", 
"dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2854, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866062554.531, "dur": 49.260, + "args": { + "External id": 88872,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866062625.221, "dur": 3.560, + "args": { + "External id": 88873,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2856, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866062708.791, "dur": 489.779, + "args": { + "External id": 88874,"Sequence number": 1770928, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2857 + } + }, + { + "ph": "f", "id": 274, "pid": 5714, "tid": 6744, "ts": 6300866062708.791, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866062779.031, "dur": 34.960, + "args": { + "External id": 88875,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866062847.521, "dur": 24.610, + "args": { + "External id": 88876,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866062888.221, "dur": 37.420, + "args": { + "External id": 88877,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2860 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866062938.921, "dur": 26.949, + "args": { + "External id": 88878,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866062977.290, "dur": 21.620, + "args": { + "External id": 88879,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866063009.341, "dur": 24.320, + "args": { + "External id": 88880,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866063044.710, "dur": 20.020, + "args": { + "External id": 88881,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866063088.860, "dur": 22.150, + "args": { + "External id": 88882,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], 
"kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866063128.640, "dur": 14.070, + "args": { + "External id": 88883,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866063157.470, "dur": 17.160, + "args": { + "External id": 88884,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 
768], [8, 2048, 1], [], []], "Ev Idx": 2867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063225.670, "dur": 12.390, + "args": { + "External id": 88885,"Record function id": 0, "Ev Idx": 2868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063229.010, "dur": 7.550, + "args": { + "External id": 88886,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866063231.750, "dur": 3.940, + "args": { + "External id": 88887,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866063232.720, "dur": 2.700, + "args": { + "External id": 88888,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063243.080, "dur": 5.790, + "args": { + "External id": 88889,"Record function id": 0, "Ev Idx": 2872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063244.940, "dur": 2.850, + "args": { + "External id": 88890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866063245.820, 
"dur": 1.400, + "args": { + "External id": 88891,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866063246.250, "dur": 0.770, + "args": { + "External id": 88892,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063253.270, "dur": 5.360, + "args": { + "External id": 88893,"Record function id": 0, "Ev Idx": 2876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063254.990, "dur": 2.600, + "args": { + "External id": 88894,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866063255.760, "dur": 1.330, + "args": { + "External id": 88895,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866063256.250, "dur": 0.660, + "args": { + "External id": 88896,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063262.690, "dur": 5.250, + "args": { + "External id": 
88897,"Record function id": 0, "Ev Idx": 2880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063264.460, "dur": 2.440, + "args": { + "External id": 88898,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866063265.240, "dur": 1.190, + "args": { + "External id": 88899,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866063265.620, "dur": 0.640, + "args": { + "External id": 88900,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866063272.290, "dur": 353.079, + "args": { + "External id": 88901,"Record function id": 0, "Sequence number": 1770927, "Fwd thread id": 1, "Ev Idx": 2884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866063273.970, "dur": 342.519, + "args": { + "External id": 88902,"Sequence number": 1770927, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2885 + } + }, + { + "ph": "f", "id": 275, "pid": 5714, "tid": 6744, "ts": 6300866063273.970, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866063349.940, "dur": 
42.740, + "args": { + "External id": 88903,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866063408.069, "dur": 20.791, + "args": { + "External id": 88904,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866063454.600, "dur": 135.199, + "args": { + "External id": 88905,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866063511.459, "dur": 7.120, + "args": { + "External id": 88906,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", 
"6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866063520.649, "dur": 3.830, + "args": { + "External id": 88907,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063640.329, "dur": 11.930, + "args": { + "External id": 88908,"Record function id": 0, "Ev Idx": 2891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866063643.439, "dur": 7.410, + "args": { + "External id": 88909,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866063646.169, "dur": 3.670, + "args": { + "External id": 88910,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866063647.109, "dur": 2.500, + "args": { + "External id": 88911,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, 
+ "ts": 6300866063657.019, "dur": 186.020, + "args": { + "External id": 88912,"Record function id": 0, "Sequence number": 1770926, "Fwd thread id": 1, "Ev Idx": 2895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866063658.879, "dur": 177.220, + "args": { + "External id": 88913,"Sequence number": 1770926, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2896 + } + }, + { + "ph": "f", "id": 276, "pid": 5714, "tid": 6744, "ts": 6300866063658.879, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866063674.959, "dur": 37.650, + "args": { + "External id": 88914,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866063678.379, "dur": 6.940, + "args": { + "External id": 88915,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866063686.599, "dur": 25.350, + "args": { + "External id": 88916,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2899 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866063722.319, "dur": 8.630, + "args": { + "External id": 88917,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866063724.509, "dur": 5.920, + "args": { + "External id": 88918,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866063855.648, "dur": 172.070, + "args": { + "External id": 88919,"Record function id": 0, "Sequence number": 1770925, "Fwd thread id": 1, "Ev Idx": 2902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866063858.788, "dur": 160.810, + "args": { + "External id": 88920,"Sequence number": 1770925, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2903 + } + }, + { + "ph": "f", "id": 277, "pid": 5714, "tid": 6744, "ts": 6300866063858.788, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866063874.028, "dur": 32.911, + "args": { + "External id": 88921,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input 
Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866063877.148, "dur": 6.800, + "args": { + "External id": 88922,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866063885.228, "dur": 21.100, + "args": { + "External id": 88923,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866063915.988, "dur": 7.060, + "args": { + "External id": 88924,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866063918.068, "dur": 4.500, + "args": { + "External id": 88925,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866064040.078, "dur": 353.519, + "args": { + "External id": 88926,"Record function id": 0, "Sequence number": 1770924, "Fwd thread id": 1, "Ev Idx": 2909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866064043.118, "dur": 339.269, + "args": { + "External id": 88927,"Sequence number": 1770924, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2910 + } + }, + { + "ph": "f", "id": 278, "pid": 5714, "tid": 6744, "ts": 6300866064043.118, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866064112.398, "dur": 43.140, + "args": { + "External id": 88928,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866064169.078, "dur": 25.930, + "args": { + "External id": 88929,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866064206.328, "dur": 24.150, + "args": { + "External id": 88930,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866064242.028, "dur": 19.759, + "args": { + "External id": 88931,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866064270.167, "dur": 15.611, + "args": { + "External id": 88932,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866064293.878, "dur": 26.100, + "args": { + "External id": 88933,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866064343.187, "dur": 20.770, + "args": { + "External id": 88934,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": 
[[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064407.937, "dur": 12.620, + "args": { + "External id": 88935,"Record function id": 0, "Ev Idx": 2918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064411.087, "dur": 8.040, + "args": { + "External id": 88936,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866064413.777, "dur": 4.560, + "args": { + "External id": 88937,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866064414.717, "dur": 3.390, + "args": { + "External id": 88938,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064425.387, "dur": 5.430, + "args": { + "External id": 88939,"Record function id": 0, "Ev Idx": 2922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064427.177, "dur": 2.570, + "args": { + "External id": 88940,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 
5714, "tid": 6744, + "ts": 6300866064428.017, "dur": 1.260, + "args": { + "External id": 88941,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866064428.427, "dur": 0.660, + "args": { + "External id": 88942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064435.187, "dur": 5.300, + "args": { + "External id": 88943,"Record function id": 0, "Ev Idx": 2926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064436.987, "dur": 2.480, + "args": { + "External id": 88944,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866064437.707, "dur": 1.280, + "args": { + "External id": 88945,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866064438.227, "dur": 0.590, + "args": { + "External id": 88946,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866064444.617, "dur": 
218.720, + "args": { + "External id": 88947,"Record function id": 0, "Sequence number": 1770923, "Fwd thread id": 1, "Ev Idx": 2930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866064446.287, "dur": 182.660, + "args": { + "External id": 88948,"Sequence number": 1770923, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2931 + } + }, + { + "ph": "f", "id": 279, "pid": 5714, "tid": 6744, "ts": 6300866064446.287, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866064517.847, "dur": 23.600, + "args": { + "External id": 88949,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866064560.157, "dur": 14.450, + "args": { + "External id": 88950,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", 
"c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866064592.557, "dur": 16.550, + "args": { + "External id": 88951,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866064637.557, "dur": 20.670, + "args": { + "External id": 88952,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064676.097, "dur": 11.410, + "args": { + "External id": 88953,"Record function id": 0, "Ev Idx": 2936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866064679.107, "dur": 7.020, + "args": { + "External id": 88954,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], 
"Input Dims": [[768]], "Ev Idx": 2937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866064681.357, "dur": 3.960, + "args": { + "External id": 88955,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866064682.177, "dur": 2.910, + "args": { + "External id": 88956,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866064692.007, "dur": 804.048, + "args": { + "External id": 88957,"Record function id": 0, "Sequence number": 1770922, "Fwd thread id": 1, "Ev Idx": 2940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866064693.767, "dur": 795.908, + "args": { + "External id": 88958,"Sequence number": 1770922, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2941 + } + }, + { + "ph": "f", "id": 280, "pid": 5714, "tid": 6744, "ts": 6300866064693.767, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6300866064712.726, "dur": 24.771, + "args": { + "External id": 88959,"Record function id": 0, "Ev Idx": 2942 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6300866064745.857, "dur": 58.649, + "args": { + "External id": 88960,"Record 
function id": 0, "Ev Idx": 2943 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6300866064811.736, "dur": 671.989, + "args": { + "External id": 88961,"Record function id": 0, "Ev Idx": 2944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866064883.426, "dur": 8.210, + "args": { + "External id": 88962,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866064900.526, "dur": 3.590, + "args": { + "External id": 88963,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866064916.636, "dur": 105.850, + "args": { + "External id": 88964,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866064926.756, "dur": 91.810, + "args": { + "External id": 88965,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], 
[2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866064951.686, "dur": 9.200, + "args": { + "External id": 88966,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866064964.496, "dur": 32.600, + "args": { + "External id": 88967,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866064965.906, "dur": 30.800, + "args": { + "External id": 88968,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866064968.226, "dur": 6.860, + "args": { + "External id": 88969,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 
6300866064976.396, "dur": 19.800, + "args": { + "External id": 88970,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866065090.086, "dur": 8.960, + "args": { + "External id": 88971,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866065091.826, "dur": 6.690, + "args": { + "External id": 88972,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866065116.006, "dur": 86.759, + "args": { + "External id": 88973,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866065131.376, "dur": 68.140, + "args": { + "External id": 88974,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg 
nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2957, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866065143.585, "dur": 51.600, + "args": { + "External id": 88975,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866065216.505, "dur": 3.431, + "args": { + "External id": 88976,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2959, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065274.045, "dur": 3.960, + "args": { + "External id": 88977,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6300866065318.765, "dur": 1.530, + "args": { + "External id": 88978,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065338.755, "dur": 1.120, + "args": { + "External id": 88979,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065355.575, "dur": 0.920, + "args": { + "External id": 88980,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065371.585, "dur": 0.830, + "args": { + "External id": 88981,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065385.765, "dur": 0.940, + "args": { + "External id": 88982,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065400.125, "dur": 0.930, + "args": { + "External id": 88983,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065414.635, "dur": 1.090, + "args": { + "External id": 88984,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065429.455, "dur": 0.920, + "args": { + "External id": 88985,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866065509.245, "dur": 1389.847, + "args": { + "External id": 88986,"Record function id": 0, "Ev Idx": 2969 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6300866065524.475, "dur": 866.328, + "args": { + "External id": 88987,"Record function id": 0, "Ev Idx": 2970 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6300866065536.285, "dur": 250.309, + "args": { + "External id": 88988,"Record function id": 0, "Ev Idx": 2971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065624.895, "dur": 3.940, + "args": { + "External id": 88989,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065633.264, "dur": 1.100, + "args": { + "External id": 88990,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065636.284, "dur": 0.931, + "args": { + "External id": 88991,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065638.875, "dur": 0.589, + "args": { + "External id": 88992,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065641.084, "dur": 0.891, + "args": { + "External id": 88993,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065643.664, "dur": 0.731, + "args": { + "External id": 88994,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input 
Dims": [[589824], []], "Ev Idx": 2977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065645.975, "dur": 1.029, + "args": { + "External id": 88995,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065648.535, "dur": 0.929, + "args": { + "External id": 88996,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065651.344, "dur": 0.571, + "args": { + "External id": 88997,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866065653.575, "dur": 0.680, + "args": { + "External id": 88998,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866065668.084, "dur": 90.230, + "args": { + "External id": 88999,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], 
[], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866065679.734, "dur": 74.930, + "args": { + "External id": 89000,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866065690.324, "dur": 6.690, + "args": { + "External id": 89001,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866065699.494, "dur": 33.270, + "args": { + "External id": 89002,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866065700.984, "dur": 31.370, + "args": { + "External id": 89003,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], 
"Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065703.354, "dur": 7.210, + "args": { + "External id": 89004,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866065711.754, "dur": 20.070, + "args": { + "External id": 89005,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2988 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.1", "pid": 5714, "tid": 6744, + "ts": 6300866065875.244, "dur": 507.409, + "args": { + "External id": 89006,"Record function id": 0, "Ev Idx": 2989 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6300866065890.504, "dur": 478.249, + "args": { + "External id": 89007,"Record function id": 0, "Ev Idx": 2990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866065954.634, "dur": 8.340, + "args": { + "External id": 89008,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866065974.034, "dur": 19.800, + "args": { + "External id": 
89009,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065977.154, "dur": 1.180, + "args": { + "External id": 89010,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065980.224, "dur": 0.390, + "args": { + "External id": 89011,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065981.894, "dur": 0.300, + "args": { + "External id": 89012,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065983.214, "dur": 0.280, + "args": { + "External id": 89013,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 6744, + "ts": 6300866065984.794, "dur": 0.270, + "args": { + "External id": 89014,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065986.004, "dur": 0.490, + "args": { + "External id": 89015,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065987.534, "dur": 0.330, + "args": { + "External id": 89016,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065988.984, "dur": 0.330, + "args": { + "External id": 89017,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866065990.434, "dur": 0.390, + "args": { + "External id": 89018,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3001 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866066002.214, "dur": 22.880, + "args": { + "External id": 89019,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866066056.463, "dur": 91.450, + "args": { + "External id": 89020,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866066066.774, "dur": 7.380, + "args": { + "External id": 89021,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866066078.383, "dur": 8.520, + "args": { + "External id": 89022,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866066080.883, "dur": 5.631, + "args": { + "External id": 89023,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066083.783, "dur": 0.871, + "args": { + "External id": 89024,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866066094.603, "dur": 15.671, + "args": { + "External id": 89025,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066096.543, "dur": 0.451, + "args": { + "External id": 89026,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066098.034, "dur": 0.300, + "args": { + "External id": 89027,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066099.274, "dur": 0.300, + "args": { + "External id": 89028,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066100.483, "dur": 0.360, + "args": { + "External id": 89029,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066101.994, "dur": 0.329, + "args": { + "External id": 89030,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066103.414, "dur": 0.300, + "args": { + "External id": 89031,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066104.694, "dur": 0.360, + "args": { + "External id": 89032,"Record function id": 0, 
"Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066106.163, "dur": 0.451, + "args": { + "External id": 89033,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866066107.703, "dur": 0.340, + "args": { + "External id": 89034,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866066120.673, "dur": 18.790, + "args": { + "External id": 89035,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866066202.103, "dur": 86.080, + "args": { + "External id": 89036,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input 
Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866066219.343, "dur": 65.750, + "args": { + "External id": 89037,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3020, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866066230.993, "dur": 49.750, + "args": { + "External id": 89038,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866066311.343, "dur": 3.910, + "args": { + "External id": 89039,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3022, "In msg nelems": 0, "Rank": 0, 
"In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866066396.593, "dur": 491.179, + "args": { + "External id": 89040,"Sequence number": 1770921, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3023 + } + }, + { + "ph": "f", "id": 281, "pid": 5714, "tid": 6744, "ts": 6300866066396.593, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866066466.262, "dur": 35.071, + "args": { + "External id": 89041,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866066533.213, "dur": 25.749, + "args": { + "External id": 89042,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 
6300866066576.742, "dur": 37.870, + "args": { + "External id": 89043,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866066628.102, "dur": 27.690, + "args": { + "External id": 89044,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866066666.592, "dur": 21.160, + "args": { + "External id": 89045,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866066698.092, "dur": 24.390, + "args": { + "External id": 89046,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866066733.762, "dur": 19.920, + "args": { + "External id": 89047,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3030 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866066778.122, "dur": 21.730, + "args": { + "External id": 89048,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866066817.282, "dur": 13.820, + "args": { + "External id": 89049,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866066846.702, "dur": 17.310, + "args": { + "External id": 89050,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": 
"/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066914.112, "dur": 12.360, + "args": { + "External id": 89051,"Record function id": 0, "Ev Idx": 3034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066917.341, "dur": 7.640, + "args": { + "External id": 89052,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866066920.081, "dur": 4.091, + "args": { + "External id": 89053,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866066921.081, "dur": 2.860, + "args": { + "External id": 89054,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066931.472, "dur": 5.860, + "args": { + "External id": 89055,"Record function id": 0, "Ev Idx": 3038 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066933.352, "dur": 2.889, + "args": { + "External id": 89056,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866066934.261, "dur": 1.480, + "args": { + "External id": 89057,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866066934.692, "dur": 0.869, + "args": { + "External id": 89058,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066941.732, "dur": 5.889, + "args": { + "External id": 89059,"Record function id": 0, "Ev Idx": 3042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066943.641, "dur": 2.860, + "args": { + "External id": 89060,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866066944.472, "dur": 1.500, + "args": { + "External id": 89061,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 
6300866066945.021, "dur": 0.760, + "args": { + "External id": 89062,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066951.712, "dur": 5.400, + "args": { + "External id": 89063,"Record function id": 0, "Ev Idx": 3046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866066953.481, "dur": 2.560, + "args": { + "External id": 89064,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866066954.261, "dur": 1.311, + "args": { + "External id": 89065,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866066954.672, "dur": 0.729, + "args": { + "External id": 89066,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866066961.161, "dur": 352.250, + "args": { + "External id": 89067,"Record function id": 0, "Sequence number": 1770920, "Fwd thread id": 1, "Ev Idx": 3050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866066962.852, "dur": 328.639, + "args": { + "External id": 89068,"Sequence number": 1770920, 
"Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3051 + } + }, + { + "ph": "f", "id": 282, "pid": 5714, "tid": 6744, "ts": 6300866066962.852, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866067028.981, "dur": 41.780, + "args": { + "External id": 89069,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866067084.911, "dur": 20.380, + "args": { + "External id": 89070,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866067131.501, "dur": 133.360, + "args": { + "External id": 89071,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 
2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866067186.391, "dur": 7.420, + "args": { + "External id": 89072,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866067195.711, "dur": 3.920, + "args": { + "External id": 89073,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866067328.861, "dur": 12.519, + "args": { + "External id": 89074,"Record function id": 0, "Ev Idx": 3057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866067332.331, "dur": 7.569, + "args": { + "External id": 89075,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866067335.041, "dur": 3.750, + "args": { + "External id": 89076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3059 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866067336.081, "dur": 2.490, + "args": { + "External id": 89077,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866067346.000, "dur": 188.330, + "args": { + "External id": 89078,"Record function id": 0, "Sequence number": 1770919, "Fwd thread id": 1, "Ev Idx": 3061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866067347.791, "dur": 179.489, + "args": { + "External id": 89079,"Sequence number": 1770919, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3062 + } + }, + { + "ph": "f", "id": 283, "pid": 5714, "tid": 6744, "ts": 6300866067347.791, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866067364.731, "dur": 39.740, + "args": { + "External id": 89080,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866067368.291, "dur": 7.440, + "args": { + "External id": 89081,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3064 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866067377.100, "dur": 26.640, + "args": { + "External id": 89082,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866067414.000, "dur": 7.471, + "args": { + "External id": 89083,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866067415.960, "dur": 5.031, + "args": { + "External id": 89084,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866067546.940, "dur": 214.950, + "args": { + "External id": 89085,"Record function id": 0, "Sequence number": 1770918, "Fwd thread id": 1, "Ev Idx": 3068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866067549.940, "dur": 204.270, + "args": { + "External id": 89086,"Sequence number": 1770918, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input 
Dims": [[8, 2048, 12, 64]], "Ev Idx": 3069 + } + }, + { + "ph": "f", "id": 284, "pid": 5714, "tid": 6744, "ts": 6300866067549.940, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866067564.850, "dur": 74.090, + "args": { + "External id": 89087,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866067568.280, "dur": 6.820, + "args": { + "External id": 89088,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866067615.390, "dur": 22.750, + "args": { + "External id": 89089,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866067648.410, "dur": 7.990, + "args": { + "External id": 89090,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866067650.750, "dur": 
5.150, + "args": { + "External id": 89091,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866067774.730, "dur": 341.819, + "args": { + "External id": 89092,"Record function id": 0, "Sequence number": 1770917, "Fwd thread id": 1, "Ev Idx": 3075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866067778.320, "dur": 326.039, + "args": { + "External id": 89093,"Sequence number": 1770917, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3076 + } + }, + { + "ph": "f", "id": 285, "pid": 5714, "tid": 6744, "ts": 6300866067778.320, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866067847.750, "dur": 43.849, + "args": { + "External id": 89094,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866067905.349, "dur": 25.670, + "args": { + "External id": 89095,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866067941.539, "dur": 24.050, + "args": { + "External id": 89096,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866067977.829, "dur": 20.280, + "args": { + "External id": 89097,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866068006.709, "dur": 15.750, + "args": { + "External id": 89098,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866068030.089, "dur": 14.950, + "args": { + "External id": 89099,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866068066.579, "dur": 20.030, + "args": { + "External id": 89100,"kernel_hash": 
"czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068130.679, "dur": 12.170, + "args": { + "External id": 89101,"Record function id": 0, "Ev Idx": 3084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068133.769, "dur": 7.600, + "args": { + "External id": 89102,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866068136.559, "dur": 4.030, + "args": { + "External id": 89103,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866068137.529, "dur": 2.840, + "args": { + "External id": 89104,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068147.719, "dur": 5.900, + "args": { + "External id": 89105,"Record 
function id": 0, "Ev Idx": 3088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068149.459, "dur": 2.850, + "args": { + "External id": 89106,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866068150.329, "dur": 1.460, + "args": { + "External id": 89107,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866068150.729, "dur": 0.870, + "args": { + "External id": 89108,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068157.749, "dur": 5.660, + "args": { + "External id": 89109,"Record function id": 0, "Ev Idx": 3092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068159.549, "dur": 2.810, + "args": { + "External id": 89110,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866068160.389, "dur": 1.400, + "args": { + "External id": 89111,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"detach", "pid": 5714, "tid": 6744, + "ts": 6300866068160.869, "dur": 0.740, + "args": { + "External id": 89112,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866068167.519, "dur": 229.439, + "args": { + "External id": 89113,"Record function id": 0, "Sequence number": 1770916, "Fwd thread id": 1, "Ev Idx": 3096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866068169.089, "dur": 195.369, + "args": { + "External id": 89114,"Sequence number": 1770916, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3097 + } + }, + { + "ph": "f", "id": 286, "pid": 5714, "tid": 6744, "ts": 6300866068169.089, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866068238.378, "dur": 23.660, + "args": { + "External id": 89115,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", 
"pid": 5714, "tid": 6744, + "ts": 6300866068279.938, "dur": 14.551, + "args": { + "External id": 89116,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866068323.658, "dur": 18.850, + "args": { + "External id": 89117,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866068373.338, "dur": 18.500, + "args": { + "External id": 89118,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 
5714, "tid": 6744, + "ts": 6300866068410.128, "dur": 11.680, + "args": { + "External id": 89119,"Record function id": 0, "Ev Idx": 3102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866068413.518, "dur": 6.840, + "args": { + "External id": 89120,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866068415.858, "dur": 3.780, + "args": { + "External id": 89121,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866068416.718, "dur": 2.680, + "args": { + "External id": 89122,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866068426.428, "dur": 847.488, + "args": { + "External id": 89123,"Record function id": 0, "Sequence number": 1770915, "Fwd thread id": 1, "Ev Idx": 3106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866068428.018, "dur": 838.598, + "args": { + "External id": 89124,"Sequence number": 1770915, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3107 + } + }, + { + "ph": "f", "id": 287, "pid": 5714, "tid": 6744, "ts": 6300866068428.018, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6300866068447.098, "dur": 24.910, + "args": { + "External id": 89125,"Record function id": 0, "Ev Idx": 3108 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6300866068480.298, "dur": 56.670, + "args": { + "External id": 89126,"Record function id": 0, "Ev Idx": 3109 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6300866068544.358, "dur": 715.798, + "args": { + "External id": 89127,"Record function id": 0, "Ev Idx": 3110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866068615.708, "dur": 8.130, + "args": { + "External id": 89128,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866068632.588, "dur": 3.670, + "args": { + "External id": 89129,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866068648.718, "dur": 117.919, + "args": { + "External id": 89130,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 
768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866068659.057, "dur": 103.810, + "args": { + "External id": 89131,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866068697.168, "dur": 6.760, + "args": { + "External id": 89132,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866068708.008, "dur": 33.009, + "args": { + "External id": 89133,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866068709.548, "dur": 31.089, + "args": { + "External id": 89134,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 
6744, + "ts": 6300866068711.988, "dur": 6.800, + "args": { + "External id": 89135,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866068720.057, "dur": 20.031, + "args": { + "External id": 89136,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866068834.857, "dur": 9.090, + "args": { + "External id": 89137,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866068836.527, "dur": 6.820, + "args": { + "External id": 89138,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866068861.727, "dur": 87.090, + "args": { + "External id": 89139,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3122 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866068878.247, "dur": 67.370, + "args": { + "External id": 89140,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3123, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866068890.907, "dur": 50.360, + "args": { + "External id": 89141,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866068961.997, "dur": 3.500, + "args": { + "External id": 89142,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3125, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866069028.967, "dur": 6.050, + "args": { + "External id": 89143,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069075.927, "dur": 1.730, + "args": { + "External id": 89144,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069101.016, "dur": 1.451, + "args": { + "External id": 89145,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069122.016, "dur": 1.111, + "args": { + "External id": 89146,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069139.916, "dur": 0.900, + "args": { + "External id": 89147,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3130 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069155.396, "dur": 0.951, + "args": { + "External id": 89148,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069171.007, "dur": 1.009, + "args": { + "External id": 89149,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069186.576, "dur": 1.151, + "args": { + "External id": 89150,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069201.986, "dur": 0.980, + "args": { + "External id": 89151,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866069287.876, "dur": 1690.266, + "args": { + "External id": 89152,"Record function id": 0, "Ev Idx": 3135 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.1)", "pid": 5714, "tid": 
6744, + "ts": 6300866069313.816, "dur": 1030.788, + "args": { + "External id": 89153,"Record function id": 0, "Ev Idx": 3136 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6300866069326.756, "dur": 275.510, + "args": { + "External id": 89154,"Record function id": 0, "Ev Idx": 3137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069423.566, "dur": 4.580, + "args": { + "External id": 89155,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069432.866, "dur": 1.150, + "args": { + "External id": 89156,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069436.386, "dur": 0.850, + "args": { + "External id": 89157,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069439.026, "dur": 1.200, + "args": { + "External id": 89158,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069441.936, "dur": 0.810, + "args": { + "External id": 89159,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069444.356, "dur": 0.890, + "args": { + "External id": 89160,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069447.106, "dur": 1.130, + "args": { + "External id": 89161,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069449.956, "dur": 0.690, + "args": { + "External id": 89162,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069452.296, "dur": 0.830, + "args": { + "External id": 89163,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866069454.686, "dur": 0.790, + "args": { + "External id": 89164,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866069470.446, "dur": 
99.340, + "args": { + "External id": 89165,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866069483.556, "dur": 82.159, + "args": { + "External id": 89166,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866069495.156, "dur": 7.400, + "args": { + "External id": 89167,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866069505.136, "dur": 36.930, + "args": { + "External id": 89168,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], 
"Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866069506.766, "dur": 34.880, + "args": { + "External id": 89169,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069509.486, "dur": 8.020, + "args": { + "External id": 89170,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866069518.746, "dur": 22.240, + "args": { + "External id": 89171,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3154 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.0", "pid": 5714, "tid": 6744, + "ts": 6300866069700.275, "dur": 632.709, + "args": { + "External id": 89172,"Record function id": 0, "Ev Idx": 3155 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6300866069717.495, "dur": 596.749, + "args": { + "External id": 89173,"Record function id": 0, "Ev Idx": 3156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866069796.255, "dur": 9.380, + "args": { + "External id": 
89174,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866069818.825, "dur": 21.710, + "args": { + "External id": 89175,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069822.365, "dur": 1.560, + "args": { + "External id": 89176,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069825.725, "dur": 0.430, + "args": { + "External id": 89177,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069827.485, "dur": 0.360, + "args": { + "External id": 89178,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6300866069829.075, "dur": 0.380, + "args": { + "External id": 89179,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069830.905, "dur": 0.470, + "args": { + "External id": 89180,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069832.405, "dur": 0.360, + "args": { + "External id": 89181,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069833.885, "dur": 0.390, + "args": { + "External id": 89182,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069835.295, "dur": 0.320, + "args": { + "External id": 89183,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3166 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069836.775, "dur": 0.340, + "args": { + "External id": 89184,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866069849.965, "dur": 25.240, + "args": { + "External id": 89185,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6300866069909.225, "dur": 109.720, + "args": { + "External id": 89186,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866069921.205, "dur": 8.040, + "args": { + "External id": 89187,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6300866069933.845, "dur": 10.070, + "args": { + "External id": 89188,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6300866069936.625, "dur": 6.850, + "args": { + "External id": 89189,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069939.965, "dur": 1.420, + "args": { + "External id": 89190,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6300866069952.335, "dur": 17.690, + "args": { + "External id": 89191,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069954.535, "dur": 0.520, + "args": { + "External id": 89192,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069956.525, "dur": 0.440, + "args": { + "External id": 89193,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069958.215, "dur": 0.380, + "args": { + "External id": 89194,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069959.645, "dur": 0.400, + "args": { + "External id": 89195,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069961.195, "dur": 0.430, + "args": { + "External id": 89196,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069962.615, "dur": 0.400, + "args": { + "External id": 89197,"Record function 
id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069964.055, "dur": 0.380, + "args": { + "External id": 89198,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069965.555, "dur": 0.400, + "args": { + "External id": 89199,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866069967.205, "dur": 0.430, + "args": { + "External id": 89200,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6300866069984.285, "dur": 23.700, + "args": { + "External id": 89201,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], 
[147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6300866070098.534, "dur": 118.340, + "args": { + "External id": 89202,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866070121.694, "dur": 91.160, + "args": { + "External id": 89203,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3186, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6300866070139.204, "dur": 68.360, + "args": { + "External id": 89204,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866070232.954, "dur": 4.330, + "args": { + "External id": 89205,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", 
"dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3188, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866070351.954, "dur": 610.268, + "args": { + "External id": 89206,"Sequence number": 1770914, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3189 + } + }, + { + "ph": "f", "id": 288, "pid": 5714, "tid": 6744, "ts": 6300866070351.954, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866070440.164, "dur": 43.880, + "args": { + "External id": 89207,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866070523.273, "dur": 30.670, + "args": { + "External id": 89208,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866070575.293, "dur": 46.250, + "args": { + "External id": 89209,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866070638.273, "dur": 33.700, + "args": { + "External id": 89210,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866070685.143, "dur": 26.600, + "args": { + "External id": 89211,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866070725.013, "dur": 31.050, + "args": { + "External id": 89212,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev 
Idx": 3195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866070770.473, "dur": 24.930, + "args": { + "External id": 89213,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866070824.933, "dur": 26.260, + "args": { + "External id": 89214,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866070873.052, "dur": 18.000, + "args": { + "External id": 89215,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3198 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866070910.863, "dur": 21.529, + "args": { + "External id": 89216,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071000.242, "dur": 18.450, + "args": { + "External id": 89217,"Record function id": 0, "Ev Idx": 3200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071005.182, "dur": 11.330, + "args": { + "External id": 89218,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866071008.962, "dur": 6.290, + "args": { + "External id": 89219,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866071010.582, "dur": 4.240, + "args": { + "External id": 89220,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071025.802, "dur": 9.050, + "args": { + "External id": 89221,"Record function id": 0, "Ev Idx": 3204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071028.512, "dur": 4.530, + "args": { + "External id": 89222,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866071029.822, "dur": 2.410, + "args": { + "External id": 89223,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866071030.482, "dur": 1.410, + "args": { + "External id": 89224,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071042.942, "dur": 9.070, + "args": { + "External id": 89225,"Record function id": 0, "Ev Idx": 3208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071045.762, "dur": 4.530, + "args": { + "External id": 89226,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3209 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866071047.002, "dur": 2.390, + "args": { + "External id": 89227,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866071047.862, "dur": 1.220, + "args": { + "External id": 89228,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071058.892, "dur": 8.710, + "args": { + "External id": 89229,"Record function id": 0, "Ev Idx": 3212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071061.612, "dur": 4.250, + "args": { + "External id": 89230,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866071062.772, "dur": 2.220, + "args": { + "External id": 89231,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866071063.432, "dur": 1.260, + "args": { + "External id": 89232,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + 
"ts": 6300866071074.592, "dur": 503.569, + "args": { + "External id": 89233,"Record function id": 0, "Sequence number": 1770913, "Fwd thread id": 1, "Ev Idx": 3216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866071077.222, "dur": 489.299, + "args": { + "External id": 89234,"Sequence number": 1770913, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3217 + } + }, + { + "ph": "f", "id": 289, "pid": 5714, "tid": 6744, "ts": 6300866071077.222, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866071188.112, "dur": 58.880, + "args": { + "External id": 89235,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866071265.592, "dur": 28.570, + "args": { + "External id": 89236,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866071341.971, "dur": 188.850, + "args": { + "External id": 89237,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866071420.521, "dur": 10.280, + "args": { + "External id": 89238,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866071433.431, "dur": 6.110, + "args": { + "External id": 89239,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071598.881, "dur": 17.080, + "args": { + "External id": 89240,"Record function id": 0, "Ev Idx": 3223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866071603.531, "dur": 10.490, + "args": { + "External id": 89241,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], 
"Input Dims": [[768, 768]], "Ev Idx": 3224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866071607.391, "dur": 5.200, + "args": { + "External id": 89242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866071608.701, "dur": 3.590, + "args": { + "External id": 89243,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866071622.311, "dur": 261.729, + "args": { + "External id": 89244,"Record function id": 0, "Sequence number": 1770912, "Fwd thread id": 1, "Ev Idx": 3227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866071624.531, "dur": 249.779, + "args": { + "External id": 89245,"Sequence number": 1770912, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3228 + } + }, + { + "ph": "f", "id": 290, "pid": 5714, "tid": 6744, "ts": 6300866071624.531, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866071647.681, "dur": 52.600, + "args": { + "External id": 89246,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 
6744, + "ts": 6300866071652.801, "dur": 10.000, + "args": { + "External id": 89247,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866071664.521, "dur": 34.820, + "args": { + "External id": 89248,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866071713.771, "dur": 10.470, + "args": { + "External id": 89249,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866071716.701, "dur": 6.870, + "args": { + "External id": 89250,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866071901.860, "dur": 291.489, + "args": { + "External id": 89251,"Record function id": 0, "Sequence number": 1770911, "Fwd thread 
id": 1, "Ev Idx": 3234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866071906.010, "dur": 275.070, + "args": { + "External id": 89252,"Sequence number": 1770911, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3235 + } + }, + { + "ph": "f", "id": 291, "pid": 5714, "tid": 6744, "ts": 6300866071906.010, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866071926.600, "dur": 46.210, + "args": { + "External id": 89253,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866071931.200, "dur": 9.720, + "args": { + "External id": 89254,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866071942.710, "dur": 29.180, + "args": { + "External id": 89255,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866071985.040, "dur": 11.780, + "args": { + "External id": 89256,"Record 
function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866071988.030, "dur": 7.950, + "args": { + "External id": 89257,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866072214.240, "dur": 586.728, + "args": { + "External id": 89258,"Record function id": 0, "Sequence number": 1770910, "Fwd thread id": 1, "Ev Idx": 3241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866072219.389, "dur": 562.209, + "args": { + "External id": 89259,"Sequence number": 1770910, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3242 + } + }, + { + "ph": "f", "id": 292, "pid": 5714, "tid": 6744, "ts": 6300866072219.389, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866072355.889, "dur": 72.960, + "args": { + "External id": 89260,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866072451.479, "dur": 41.870, + "args": { + "External id": 89261,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866072511.779, "dur": 39.760, + "args": { + "External id": 89262,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866072570.369, "dur": 32.220, + "args": { + "External id": 89263,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866072617.389, "dur": 26.179, + "args": { + "External id": 89264,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866072657.548, "dur": 25.060, + "args": { + "External id": 89265,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866072717.608, "dur": 33.990, + "args": { + "External id": 89266,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866072825.068, "dur": 20.750, + "args": { + "External id": 89267,"Record function id": 0, "Ev Idx": 3250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866072830.238, "dur": 13.090, + "args": { + "External id": 89268,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866072834.868, "dur": 7.130, + "args": { + "External id": 89269,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866072836.668, "dur": 4.940, + "args": { + "External id": 89270,"Record function id": 
0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866072854.038, "dur": 9.620, + "args": { + "External id": 89271,"Record function id": 0, "Ev Idx": 3254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866072857.098, "dur": 4.700, + "args": { + "External id": 89272,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866072858.548, "dur": 2.390, + "args": { + "External id": 89273,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866072859.278, "dur": 1.340, + "args": { + "External id": 89274,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866072870.578, "dur": 9.250, + "args": { + "External id": 89275,"Record function id": 0, "Ev Idx": 3258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866072873.618, "dur": 4.510, + "args": { + "External id": 89276,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3259 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866072874.908, "dur": 2.420, + "args": { + "External id": 89277,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866072875.808, "dur": 1.200, + "args": { + "External id": 89278,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866072886.908, "dur": 497.109, + "args": { + "External id": 89279,"Record function id": 0, "Sequence number": 1770909, "Fwd thread id": 1, "Ev Idx": 3262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866072889.408, "dur": 366.259, + "args": { + "External id": 89280,"Sequence number": 1770909, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3263 + } + }, + { + "ph": "f", "id": 293, "pid": 5714, "tid": 6744, "ts": 6300866072889.408, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866073010.448, "dur": 48.910, + "args": { + "External id": 89281,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": 
["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866073103.958, "dur": 33.980, + "args": { + "External id": 89282,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866073180.637, "dur": 33.860, + "args": { + "External id": 89283,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866073273.507, "dur": 97.340, + "args": { + "External id": 89284,"Record 
function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866073411.737, "dur": 23.970, + "args": { + "External id": 89285,"Record function id": 0, "Ev Idx": 3268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866073418.127, "dur": 14.570, + "args": { + "External id": 89286,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866073423.077, "dur": 8.110, + "args": { + "External id": 89287,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866073424.857, "dur": 5.840, + "args": { + "External id": 89288,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866073444.997, "dur": 1908.285, + "args": { + "External id": 89289,"Record function id": 0, "Sequence number": 1770908, "Fwd thread id": 1, "Ev Idx": 3272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866073448.517, "dur": 1883.896, + "args": { + "External id": 89290,"Sequence number": 
1770908, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3273 + } + }, + { + "ph": "f", "id": 294, "pid": 5714, "tid": 6744, "ts": 6300866073448.517, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6300866073488.197, "dur": 51.840, + "args": { + "External id": 89291,"Record function id": 0, "Ev Idx": 3274 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6300866073557.477, "dur": 116.379, + "args": { + "External id": 89292,"Record function id": 0, "Ev Idx": 3275 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6300866073688.716, "dur": 1599.486, + "args": { + "External id": 89293,"Record function id": 0, "Ev Idx": 3276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866073833.246, "dur": 16.780, + "args": { + "External id": 89294,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866073868.346, "dur": 7.790, + "args": { + "External id": 89295,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866073901.566, "dur": 224.989, + "args": { + 
"External id": 89296,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866073923.146, "dur": 193.769, + "args": { + "External id": 89297,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866073972.545, "dur": 16.720, + "args": { + "External id": 89298,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866073995.405, "dur": 68.510, + "args": { + "External id": 89299,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866073998.856, "dur": 64.149, + "args": 
{ + "External id": 89300,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866074003.816, "dur": 13.640, + "args": { + "External id": 89301,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866074020.245, "dur": 41.450, + "args": { + "External id": 89302,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866074314.945, "dur": 24.860, + "args": { + "External id": 89303,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866074320.145, "dur": 18.200, + "args": { + "External id": 89304,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866074381.524, "dur": 213.060, + "args": { + "External id": 89305,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866074420.324, "dur": 166.050, + "args": { + "External id": 89306,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3289, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866074451.395, "dur": 123.649, + "args": { + "External id": 89307,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866074625.484, "dur": 8.340, + "args": { + "External id": 89308,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", 
"Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3291, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866074766.374, "dur": 10.230, + "args": { + "External id": 89309,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866074852.323, "dur": 3.420, + "args": { + "External id": 89310,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866074899.834, "dur": 2.569, + "args": { + "External id": 89311,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866074938.023, "dur": 2.260, + "args": { + "External id": 89312,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3295 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866074973.443, "dur": 2.160, + "args": { + "External id": 89313,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866075006.393, "dur": 1.870, + "args": { + "External id": 89314,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866075041.473, "dur": 2.430, + "args": { + "External id": 89315,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866075083.153, "dur": 3.260, + "args": { + "External id": 89316,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866075124.003, "dur": 2.440, + "args": { + "External id": 89317,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], 
[]], "Ev Idx": 3300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866075394.512, "dur": 4426.400, + "args": { + "External id": 89318,"Record function id": 0, "Ev Idx": 3301 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6300866075441.822, "dur": 1283.497, + "args": { + "External id": 89319,"Record function id": 0, "Ev Idx": 3302 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6300866075478.522, "dur": 853.938, + "args": { + "External id": 89320,"Record function id": 0, "Ev Idx": 3303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075735.561, "dur": 12.600, + "args": { + "External id": 89321,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075763.441, "dur": 2.680, + "args": { + "External id": 89322,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075772.121, "dur": 2.280, + "args": { + "External id": 89323,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075779.652, "dur": 2.000, + "args": { + "External id": 89324,"Record 
function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075786.141, "dur": 2.020, + "args": { + "External id": 89325,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075792.941, "dur": 1.780, + "args": { + "External id": 89326,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075799.671, "dur": 2.360, + "args": { + "External id": 89327,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075806.821, "dur": 1.890, + "args": { + "External id": 89328,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866075813.391, "dur": 1.840, + "args": { + "External id": 89329,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 
6744, + "ts": 6300866075819.641, "dur": 1.780, + "args": { + "External id": 89330,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866075863.491, "dur": 306.429, + "args": { + "External id": 89331,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6300866075899.971, "dur": 252.360, + "args": { + "External id": 89332,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866075932.731, "dur": 24.640, + "args": { + "External id": 89333,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866075963.691, "dur": 101.390, + "args": { + "External id": 89334,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866075968.221, "dur": 95.760, + "args": { + "External id": 89335,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866075975.231, "dur": 21.010, + "args": { + "External id": 89336,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866076000.251, "dur": 62.090, + "args": { + "External id": 89337,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866076761.539, "dur": 2978.493, + "args": { + "External id": 89338,"Sequence number": 1770907, "Fwd thread id": 1, "Record function 
id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3321 + } + }, + { + "ph": "f", "id": 295, "pid": 5714, "tid": 6744, "ts": 6300866076761.539, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866077074.429, "dur": 220.249, + "args": { + "External id": 89339,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6300866077573.097, "dur": 179.430, + "args": { + "External id": 89340,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6300866077874.347, "dur": 267.809, + "args": { + "External id": 89341,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 
16384, 2048], [1, 768, 2048]], "Ev Idx": 3324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866078231.626, "dur": 185.120, + "args": { + "External id": 89342,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866078481.815, "dur": 121.870, + "args": { + "External id": 89343,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866078663.025, "dur": 132.069, + "args": { + "External id": 89344,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866078854.474, "dur": 106.610, + "args": { + "External id": 89345,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866079096.414, "dur": 118.140, + "args": { + "External id": 89346,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", 
"Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866079338.753, "dur": 81.930, + "args": { + "External id": 89347,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6300866079511.523, "dur": 94.140, + "args": { + "External id": 89348,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], 
[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866079937.312, "dur": 94.060, + "args": { + "External id": 89349,"Record function id": 0, "Ev Idx": 3332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866079962.392, "dur": 58.270, + "args": { + "External id": 89350,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866079983.042, "dur": 31.440, + "args": { + "External id": 89351,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866079991.202, "dur": 21.330, + "args": { + "External id": 89352,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866080069.042, "dur": 43.600, + "args": { + "External id": 89353,"Record function id": 0, "Ev Idx": 3336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866080082.732, "dur": 21.750, + "args": { + "External id": 89354,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3337 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866080089.092, "dur": 11.659, + "args": { + "External id": 89355,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866080092.252, "dur": 7.000, + "args": { + "External id": 89356,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866080145.451, "dur": 41.300, + "args": { + "External id": 89357,"Record function id": 0, "Ev Idx": 3340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866080158.591, "dur": 19.960, + "args": { + "External id": 89358,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866080163.902, "dur": 11.289, + "args": { + "External id": 89359,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866080168.202, "dur": 5.509, + "args": { + "External id": 89360,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866080218.281, "dur": 41.320, + "args": { + "External id": 89361,"Record function id": 0, "Ev Idx": 3344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866080231.161, "dur": 20.440, + "args": { + "External id": 89362,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866080236.691, "dur": 9.740, + "args": { + "External id": 89363,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866080239.811, "dur": 5.220, + "args": { + "External id": 89364,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866080289.891, "dur": 2821.264, + "args": { + "External id": 89365,"Record function id": 0, "Sequence number": 1770906, "Fwd thread id": 1, "Ev Idx": 3348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866080344.581, "dur": 2703.714, + "args": { + "External id": 89366,"Sequence number": 1770906, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3349 + } + }, + { + "ph": "f", "id": 296, "pid": 5714, "tid": 6744, "ts": 6300866080344.581, + "cat": "fwdbwd", "name": "fwdbwd", 
"bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866080938.260, "dur": 323.769, + "args": { + "External id": 89367,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866081421.868, "dur": 169.040, + "args": { + "External id": 89368,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6300866081791.748, "dur": 1071.327, + "args": { + "External id": 89369,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + 
"ts": 6300866082215.257, "dur": 54.320, + "args": { + "External id": 89370,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866082283.947, "dur": 79.470, + "args": { + "External id": 89371,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866083227.015, "dur": 144.989, + "args": { + "External id": 89372,"Record function id": 0, "Ev Idx": 3355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866083252.024, "dur": 107.090, + "args": { + "External id": 89373,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866083272.495, "dur": 77.959, + "args": { + "External id": 89374,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866083281.654, "dur": 66.670, + "args": { + "External id": 89375,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3358 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866083413.214, "dur": 1596.137, + "args": { + "External id": 89376,"Record function id": 0, "Sequence number": 1770905, "Fwd thread id": 1, "Ev Idx": 3359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866083426.254, "dur": 1528.326, + "args": { + "External id": 89377,"Sequence number": 1770905, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3360 + } + }, + { + "ph": "f", "id": 297, "pid": 5714, "tid": 6744, "ts": 6300866083426.254, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866083552.714, "dur": 287.369, + "args": { + "External id": 89378,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866083580.204, "dur": 56.070, + "args": { + "External id": 89379,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866083646.163, "dur": 188.330, + "args": { + "External id": 89380,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 
64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866083910.023, "dur": 56.620, + "args": { + "External id": 89381,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866083924.203, "dur": 38.830, + "args": { + "External id": 89382,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866085112.610, "dur": 1470.457, + "args": { + "External id": 89383,"Record function id": 0, "Sequence number": 1770904, "Fwd thread id": 1, "Ev Idx": 3366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866085142.440, "dur": 1377.987, + "args": { + "External id": 89384,"Sequence number": 1770904, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3367 + } + }, + { + "ph": "f", "id": 298, "pid": 5714, "tid": 6744, "ts": 6300866085142.440, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6300866085259.720, 
"dur": 350.469, + "args": { + "External id": 89385,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866085285.690, "dur": 97.789, + "args": { + "External id": 89386,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866085395.119, "dur": 209.470, + "args": { + "External id": 89387,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6300866085679.399, "dur": 55.530, + "args": { + "External id": 89388,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866085694.159, "dur": 37.380, + "args": { + "External id": 89389,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 3372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866086681.577, "dur": 2566.424, + "args": { + "External id": 89390,"Record function id": 0, "Sequence number": 1770903, "Fwd thread id": 1, "Ev Idx": 3373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866086705.116, "dur": 2471.445, + "args": { + "External id": 89391,"Sequence number": 1770903, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3374 + } + }, + { + "ph": "f", "id": 299, "pid": 5714, "tid": 6744, "ts": 6300866086705.116, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866087235.926, "dur": 380.348, + "args": { + "External id": 89392,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866087723.084, "dur": 191.800, + "args": { + "External id": 89393,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866087998.264, "dur": 177.199, + "args": { + 
"External id": 89394,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866088257.543, "dur": 184.020, + "args": { + "External id": 89395,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866088509.983, "dur": 113.239, + "args": { + "External id": 89396,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6300866088679.602, "dur": 100.470, + "args": { + "External id": 89397,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6300866088919.002, "dur": 131.039, + "args": { + "External id": 89398,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input 
type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866089386.981, "dur": 85.639, + "args": { + "External id": 89399,"Record function id": 0, "Ev Idx": 3382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866089409.361, "dur": 53.199, + "args": { + "External id": 89400,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866089428.330, "dur": 28.150, + "args": { + "External id": 89401,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866089434.901, "dur": 19.969, + "args": { + "External id": 89402,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866089507.930, "dur": 37.330, + "args": { + "External id": 89403,"Record function id": 0, "Ev Idx": 3386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866089520.210, "dur": 17.830, + "args": { + "External id": 89404,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866089525.310, "dur": 9.250, + "args": { + "External id": 89405,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866089527.990, "dur": 5.170, + "args": { + "External id": 89406,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866089573.310, "dur": 36.650, + "args": { + "External id": 89407,"Record function id": 0, "Ev Idx": 3390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866089585.080, "dur": 17.620, + "args": { + "External id": 89408,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866089589.950, "dur": 9.550, + "args": { + "External id": 89409,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866089593.620, "dur": 4.590, + "args": { + "External id": 89410,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3393 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866089637.000, "dur": 1712.156, + "args": { + "External id": 89411,"Record function id": 0, "Sequence number": 1770902, "Fwd thread id": 1, "Ev Idx": 3394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866089647.190, "dur": 1416.857, + "args": { + "External id": 89412,"Sequence number": 1770902, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3395 + } + }, + { + "ph": "f", "id": 300, "pid": 5714, "tid": 6744, "ts": 6300866089647.190, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6300866090177.229, "dur": 223.749, + "args": { + "External id": 89413,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6300866090552.628, "dur": 106.060, + "args": { + "External id": 89414,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], 
"kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6300866090793.067, "dur": 126.360, + "args": { + "External id": 89415,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866091127.637, "dur": 135.699, + "args": { + "External id": 89416,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866091451.726, "dur": 87.030, + "args": { + "External id": 89417,"Record function id": 0, "Ev Idx": 3400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866091475.036, 
"dur": 52.600, + "args": { + "External id": 89418,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866091493.306, "dur": 28.280, + "args": { + "External id": 89419,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866091500.056, "dur": 19.820, + "args": { + "External id": 89420,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866091574.125, "dur": 5328.428, + "args": { + "External id": 89421,"Record function id": 0, "Sequence number": 1770901, "Fwd thread id": 1, "Ev Idx": 3404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866091586.885, "dur": 5268.379, + "args": { + "External id": 89422,"Sequence number": 1770901, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3405 + } + }, + { + "ph": "f", "id": 301, "pid": 5714, "tid": 6744, "ts": 6300866091586.885, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6300866091735.115, "dur": 206.300, + "args": { + "External id": 89423,"Record function id": 0, "Ev Idx": 3406 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::post_backward_reshard (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6300866092013.695, "dur": 419.108, + "args": { + "External id": 89424,"Record function id": 0, "Ev Idx": 3407 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6300866092477.914, "dur": 4333.220, + "args": { + "External id": 89425,"Record function id": 0, "Ev Idx": 3408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866092868.493, "dur": 45.629, + "args": { + "External id": 89426,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866092961.542, "dur": 20.360, + "args": { + "External id": 89427,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866093050.442, "dur": 635.329, + "args": { + "External id": 89428,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866093135.092, "dur": 527.129, + "args": { + "External id": 89429,"Record function id": 0, "Concrete Inputs": 
["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866093220.702, "dur": 52.970, + "args": { + "External id": 89430,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866093292.042, "dur": 239.949, + "args": { + "External id": 89431,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866093335.692, "dur": 193.979, + "args": { + "External id": 89432,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866093351.521, "dur": 43.660, + "args": { + "External id": 89433,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], 
[], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866093403.181, "dur": 122.870, + "args": { + "External id": 89434,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866094117.580, "dur": 55.980, + "args": { + "External id": 89435,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866094128.350, "dur": 41.820, + "args": { + "External id": 89436,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866094276.299, "dur": 586.309, + "args": { + "External id": 89437,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866094409.299, "dur": 431.649, + "args": { + "External id": 89438,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": 
"default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3421, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866094493.379, "dur": 319.279, + "args": { + "External id": 89439,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866094941.478, "dur": 21.240, + "args": { + "External id": 89440,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3423, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866095366.467, "dur": 29.000, + "args": { + "External id": 89441,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866095607.996, "dur": 9.490, + "args": { + "External id": 89442,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866095738.146, "dur": 7.110, + "args": { + "External id": 89443,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866095842.826, "dur": 5.990, + "args": { + "External id": 89444,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866095943.135, "dur": 5.211, + "args": { + "External id": 89445,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866096037.715, "dur": 5.210, + "args": { + "External id": 89446,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866096133.895, "dur": 5.430, + "args": { + "External id": 89447,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866096238.115, "dur": 6.240, + "args": { + "External id": 89448,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866096387.905, "dur": 7.549, + "args": { + "External id": 89449,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866097002.783, "dur": 1662.457, + "args": { + "External id": 89450,"Record function id": 0, "Sequence number": 1770900, "Fwd thread id": 1, "Ev Idx": 3433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6300866097026.093, "dur": 1390.547, + "args": { + "External id": 89451,"Sequence number": 1770900, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3434 + } + }, + { + "ph": "f", "id": 302, "pid": 5714, "tid": 6744, "ts": 6300866097026.093, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_0", "pid": 5714, "tid": 6744, + "ts": 6300866097625.002, "dur": 181.789, + "args": { + "External id": 89452,"kernel_hash": "c25lgxev5g5pgqmgeas3rsbfpey3d2wvz72yqf537xeysxmqtd4y", "grid": "grid(24576000,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "24576000"], "kernel_file": "/tmp/torchinductor_root/25/c25lgxev5g5pgqmgeas3rsbfpey3d2wvz72yqf537xeysxmqtd4y.py", "kernel_backend": "triton", "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 3435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_1", "pid": 5714, "tid": 6744, + "ts": 6300866097897.911, "dur": 118.320, + "args": { + "External id": 89453,"kernel_hash": "c272mj7qj3kjbzyvvqn5kn2ut5n2c42t7wgsqj2sturgngae2y3a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/27/c272mj7qj3kjbzyvvqn5kn2ut5n2c42t7wgsqj2sturgngae2y3a.py", "kernel_backend": "triton", "Input type": ["long int", "c10::BFloat16", "float", "Scalar"], "Input Strides": [[2048, 1], [1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048], [8, 2048, 768], [32000, 768], []], "Ev Idx": 3436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_2", "pid": 5714, "tid": 6744, + "ts": 6300866098141.291, "dur": 110.549, + "args": { + "External id": 89454,"kernel_hash": "cg2ylt27tmwmnxcgudqpetr6cqsy6lzmizyy2xuskasljjabrsvm", "grid": "grid(24576000,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "24576000"], "kernel_file": 
"/tmp/torchinductor_root/g2/cg2ylt27tmwmnxcgudqpetr6cqsy6lzmizyy2xuskasljjabrsvm.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 3437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6300866098489.660, "dur": 138.029, + "args": { + "External id": 89455,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 3438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866098766.689, "dur": 93.680, + "args": { + "External id": 89456,"Record function id": 0, "Ev Idx": 3439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6300866098793.639, "dur": 55.670, + "args": { + "External id": 89457,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6300866098812.959, "dur": 30.160, + "args": { + "External id": 89458,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6300866098819.419, "dur": 21.860, + "args": { + "External id": 89459,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3442 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::root_post_backward_callback", "pid": 5714, "tid": 6744, + "ts": 6300866098992.579, "dur": 10926.735, + "args": { + "External id": 89460,"Record function id": 0, "Ev Idx": 3443 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate", "pid": 5714, "tid": 6744, + "ts": 6300866099104.108, "dur": 181.790, + "args": { + "External id": 89461,"Record function id": 0, "Ev Idx": 3444 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard", "pid": 5714, "tid": 6744, + "ts": 6300866099383.558, "dur": 1233.717, + "args": { + "External id": 89462,"Record function id": 0, "Ev Idx": 3445 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce", "pid": 5714, "tid": 6744, + "ts": 6300866100676.965, "dur": 8096.412, + "args": { + "External id": 89463,"Record function id": 0, "Ev Idx": 3446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866101597.583, "dur": 66.759, + "args": { + "External id": 89464,"Record function id": 0, "Concrete Inputs": ["[52894464]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6300866101735.173, "dur": 29.379, + "args": { + "External id": 89465,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[52894464], []], "Ev Idx": 3448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866101880.002, "dur": 1052.768, + "args": { + "External id": 89466,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[], [], [], [13223616, 1]], 
"Input Dims": [[], [], [], [4, 13223616]], "Ev Idx": 3449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6300866101983.032, "dur": 917.758, + "args": { + "External id": 89467,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[], [], [], [13223616, 1]], "Input Dims": [[], [], [], [4, 13223616]], "Ev Idx": 3450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866102354.301, "dur": 61.010, + "args": { + "External id": 89468,"Record function id": 0, "Concrete Inputs": ["[26063]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6300866102489.661, "dur": 248.959, + "args": { + "External id": 89469,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[26063], [], [], [], [], [], [], []], "Ev Idx": 3452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6300866102501.411, "dur": 234.329, + "args": { + "External id": 89470,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[26063], [], [], [], [], [], []], "Ev Idx": 3453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6300866102520.211, "dur": 51.780, + "args": { + "External id": 89471,"Record function id": 0, "Concrete Inputs": ["[26063]", "[1]", "4", "0", "", "False"], 
"Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6300866102581.531, "dur": 150.029, + "args": { + "External id": 89472,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[26063], [26063], []], "Ev Idx": 3455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6300866103619.708, "dur": 70.460, + "args": { + "External id": 89473,"Record function id": 0, "Concrete Inputs": ["", "[13223616]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[52894464], [], [], [], [], []], "Ev Idx": 3456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6300866103633.838, "dur": 52.120, + "args": { + "External id": 89474,"Record function id": 0, "Concrete Inputs": ["[13223616]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6300866103810.988, "dur": 685.738, + "args": { + "External id": 89475,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[13223616], [52894464], [], [], [], []], "Ev Idx": 3458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866103922.037, "dur": 549.619, + "args": { + "External 
id": 89476,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 13223616, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[52894464], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3459, "In msg nelems": 52894464 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6300866104015.917, "dur": 421.279, + "args": { + "External id": 89477,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[52894464]], "Ev Idx": 3460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6300866104594.026, "dur": 26.660, + "args": { + "External id": 89478,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3461, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105027.525, "dur": 33.060, + "args": { + "External id": 89479,"Record function id": 0, "Concrete Inputs": 
["", "[8000, 768]", "[768, 1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105284.664, "dur": 9.680, + "args": { + "External id": 89480,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105476.224, "dur": 9.240, + "args": { + "External id": 89481,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6144192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105597.344, "dur": 6.500, + "args": { + "External id": 89482,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6291648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105705.853, "dur": 7.060, + "args": { + "External id": 89483,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6439104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105805.583, "dur": 6.040, + "args": { + "External 
id": 89484,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6586560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866105926.443, "dur": 6.650, + "args": { + "External id": 89485,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106030.763, "dur": 5.890, + "args": { + "External id": 89486,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "6734208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106130.392, "dur": 6.120, + "args": { + "External id": 89487,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "7127424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106228.822, "dur": 6.800, + "args": { + "External id": 89488,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "7520640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + 
"ts": 6300866106359.622, "dur": 7.900, + "args": { + "External id": 89489,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106461.192, "dur": 6.120, + "args": { + "External id": 89490,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "7914048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106547.731, "dur": 5.091, + "args": { + "External id": 89491,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8061504"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106619.841, "dur": 4.320, + "args": { + "External id": 89492,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8208960"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106689.551, "dur": 5.120, + "args": { + "External id": 89493,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8356416"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3476 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106760.131, "dur": 4.760, + "args": { + "External id": 89494,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106829.441, "dur": 5.410, + "args": { + "External id": 89495,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "8504064"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106901.361, "dur": 4.510, + "args": { + "External id": 89496,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "8897280"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866106970.361, "dur": 4.740, + "args": { + "External id": 89497,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "9290496"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107041.050, "dur": 4.850, + "args": { + "External id": 89498,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], 
[]], "Ev Idx": 3481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107110.330, "dur": 4.400, + "args": { + "External id": 89499,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9683904"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107180.720, "dur": 4.370, + "args": { + "External id": 89500,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9831360"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107251.990, "dur": 4.530, + "args": { + "External id": 89501,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9978816"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107414.140, "dur": 5.680, + "args": { + "External id": 89502,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "10126272"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107491.169, "dur": 4.950, + "args": { + "External id": 89503,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107561.469, "dur": 4.440, + "args": { + "External id": 89504,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "10273920"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107634.019, "dur": 4.410, + "args": { + "External id": 89505,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "10667136"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107706.339, "dur": 4.370, + "args": { + "External id": 89506,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "11060352"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107777.019, "dur": 4.410, + "args": { + "External id": 89507,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107846.639, "dur": 4.900, + "args": { + "External id": 89508,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11453760"], 
"Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107917.839, "dur": 5.019, + "args": { + "External id": 89509,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11601216"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866107992.658, "dur": 4.930, + "args": { + "External id": 89510,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11748672"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866108066.428, "dur": 5.030, + "args": { + "External id": 89511,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11896128"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866108136.758, "dur": 4.860, + "args": { + "External id": 89512,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866108207.398, "dur": 4.930, + "args": { + "External id": 89513,"Record function id": 
0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "12043776"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866108278.707, "dur": 4.740, + "args": { + "External id": 89514,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "12436992"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866108380.607, "dur": 5.790, + "args": { + "External id": 89515,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "12830208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6300866108459.107, "dur": 4.500, + "args": { + "External id": 89516,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3499 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "ProfilerStep#5631", "pid": 5714, "tid": 5714, + "ts": 6300865683533.683, "dur": 587081.266, + "args": { + "External id": 81921,"Record function id": 0, "Ev Idx": 3500 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.zero_grad#AdamW.zero_grad", "pid": 5714, "tid": 5714, + "ts": 6300865683569.213, "dur": 320.049, + "args": { + "External id": 81922,"Record function id": 0, "Ev Idx": 3501 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"enumerate(DataLoader)#_StatefulMultiProcessingDataLoaderIter.__next__", "pid": 5714, "tid": 5714, + "ts": 6300865683929.832, "dur": 1621.266, + "args": { + "External id": 81923,"Record function id": 0, "Ev Idx": 3502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865684647.450, "dur": 5.300, + "args": { + "External id": 81924,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5714, "tid": 5714, + "ts": 6300865684670.900, "dur": 5.570, + "args": { + "External id": 81925,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865684996.660, "dur": 3.920, + "args": { + "External id": 81926,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5714, "tid": 5714, + "ts": 6300865685011.160, "dur": 3.509, + "args": { + "External id": 81927,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3506 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865685461.579, "dur": 3.500, + "args": { + "External id": 81928,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5714, "tid": 5714, + "ts": 6300865685472.839, "dur": 3.320, + "args": { + "External id": 81929,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865685945.087, "dur": 14.080, + "args": { + "External id": 81930,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], []], "Ev Idx": 3509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865685953.397, "dur": 2.410, + "args": { + "External id": 81931,"Record function id": 0, "Concrete Inputs": ["", "[8, 4096]", "[4096, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 3510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865685961.387, "dur": 5.290, + "args": { + "External id": 81932,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", 
"2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], []], "Ev Idx": 3511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865685963.627, "dur": 1.430, + "args": { + "External id": 81933,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 3512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865685989.587, "dur": 223.590, + "args": { + "External id": 81934,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], [], []], "Ev Idx": 3513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865685998.937, "dur": 213.730, + "args": { + "External id": 81935,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], []], "Ev Idx": 3514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686008.807, "dur": 14.850, + "args": { + "External id": 81936,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "[2048, 1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], 
"Input Dims": [[], [], [], [], [], []], "Ev Idx": 3515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865686026.307, "dur": 185.400, + "args": { + "External id": 81937,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865686038.347, "dur": 0.560, + "args": { + "External id": 81938,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 3517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand_as", "pid": 5714, "tid": 5714, + "ts": 6300865686042.667, "dur": 11.900, + "args": { + "External id": 81939,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["long int", "long int"], "Input Strides": [[4096, 1], [2048, 1]], "Input Dims": [[8, 2048], [8, 2048]], "Ev Idx": 3518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 5714, + "ts": 6300865686046.837, "dur": 7.430, + "args": { + "External id": 81940,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], []], "Input Dims": [[8, 2048], [], []], "Ev Idx": 3519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686052.607, "dur": 1.010, + "args": { + "External id": 81941,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3520 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865686057.237, "dur": 87.620, + "args": { + "External id": 81942,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 3521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865686060.087, "dur": 84.300, + "args": { + "External id": 81943,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 3522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865686062.837, "dur": 14.290, + "args": { + "External id": 81944,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 3523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865686065.947, "dur": 10.420, + "args": { + "External id": 81945,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865686078.357, "dur": 65.400, + "args": { + "External id": 81946,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, 
"tid": 5714, + "ts": 6300865686147.517, "dur": 62.040, + "args": { + "External id": 81947,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865686225.137, "dur": 226.189, + "args": { + "External id": 81948,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], [], []], "Ev Idx": 3527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865686226.947, "dur": 223.899, + "args": { + "External id": 81949,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], []], "Ev Idx": 3528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686230.237, "dur": 7.400, + "args": { + "External id": 81950,"Record function id": 0, "Concrete Inputs": ["[8, 4096]", "[4096, 1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865686238.677, "dur": 210.609, + "args": { + "External id": 81951,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input 
type": ["long int", "long int", "Scalar"], "Input Strides": [[4096, 1], [4096, 1], []], "Input Dims": [[8, 4096], [8, 4096], []], "Ev Idx": 3530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::arange", "pid": 5714, "tid": 5714, + "ts": 6300865686469.986, "dur": 48.810, + "args": { + "External id": 81952,"Record function id": 0, "Concrete Inputs": ["0", "2048", "", "", "", "False"], "Input type": ["Scalar", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865686474.376, "dur": 6.300, + "args": { + "External id": 81953,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::arange", "pid": 5714, "tid": 5714, + "ts": 6300865686482.956, "dur": 35.320, + "args": { + "External id": 81954,"Record function id": 0, "Concrete Inputs": ["0", "2048", "1", ""], "Input type": ["Scalar", "Scalar", "Scalar", "long int"], "Input Strides": [[], [], [], [1]], "Input Dims": [[], [], [], [0]], "Ev Idx": 3533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865686489.126, "dur": 5.580, + "args": { + "External id": 81955,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["long int", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 3534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::repeat", "pid": 5714, "tid": 5714, + "ts": 6300865686528.596, "dur": 66.980, + "args": { + "External id": 81956,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 1]"], "Input type": ["long int", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 3535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 5714, + "ts": 6300865686533.626, "dur": 4.730, + "args": { + "External id": 81957,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[2048], [], []], "Ev Idx": 3536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686536.206, "dur": 1.710, + "args": { + "External id": 81958,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048]", "[2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 3537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865686539.706, "dur": 7.790, + "args": { + "External id": 81959,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6300865686549.956, "dur": 3.480, + "args": { + "External id": 81960,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[2048, 1]], "Input Dims": [[8, 2048]], "Ev Idx": 3539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5714, "tid": 5714, + "ts": 6300865686555.436, "dur": 5.460, + "args": { + "External id": 81961,"Record function id": 0, "Concrete Inputs": ["", "0", "1", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3540 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686559.956, "dur": 0.460, + "args": { + "External id": 81962,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5714, "tid": 5714, + "ts": 6300865686561.876, "dur": 2.160, + "args": { + "External id": 81963,"Record function id": 0, "Concrete Inputs": ["", "1", "2048", "2048"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 2048], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 3542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686562.876, "dur": 0.940, + "args": { + "External id": 81964,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "[2048, 2048, 2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 2048], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 3543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand_as", "pid": 5714, "tid": 5714, + "ts": 6300865686565.266, "dur": 4.760, + "args": { + "External id": 81965,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["long int", "long int"], "Input Strides": [[2048, 1], [2048, 2048, 2048, 1]], "Input Dims": [[1, 2048], [8, 1, 1, 2048]], "Ev Idx": 3544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 5714, + "ts": 6300865686566.166, "dur": 3.650, + "args": { + "External id": 81966,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[1, 2048], [], []], "Ev Idx": 3545 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686568.646, "dur": 0.950, + "args": { + "External id": 81967,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "[0, 2048, 2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[1, 2048], [], [], []], "Ev Idx": 3546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865686571.086, "dur": 23.630, + "args": { + "External id": 81968,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 2048, 2048, 1], [0, 2048, 2048, 1], []], "Input Dims": [[8, 1, 1, 2048], [8, 1, 1, 2048], []], "Ev Idx": 3547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865686604.716, "dur": 28.880, + "args": { + "External id": 81969,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "3", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 3548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865686606.106, "dur": 27.200, + "args": { + "External id": 81970,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "3", "", "", "", "False", ""], "Input type": ["long int", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[2048, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], []], "Ev Idx": 3549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865686608.856, "dur": 6.200, + "args": { + "External id": 81971,"Record function id": 0, "Concrete Inputs": 
["[8, 2048]", "[2048, 1]", "3", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865686616.246, "dur": 16.460, + "args": { + "External id": 81972,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["int", "long int", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3551 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::root_pre_forward", "pid": 5714, "tid": 5714, + "ts": 6300865686727.465, "dur": 178.530, + "args": { + "External id": 81973,"Record function id": 0, "Ev Idx": 3552 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::inputs_to_device", "pid": 5714, "tid": 5714, + "ts": 6300865686822.745, "dur": 66.200, + "args": { + "External id": 81974,"Record function id": 0, "Ev Idx": 3553 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865686915.865, "dur": 53.500, + "args": { + "External id": 81975,"Record function id": 0, "Ev Idx": 3554 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward", "pid": 5714, "tid": 5714, + "ts": 6300865686981.865, "dur": 1793.436, + "args": { + "External id": 81976,"Record function id": 0, "Ev Idx": 3555 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather", "pid": 5714, "tid": 5714, + "ts": 6300865686993.715, "dur": 1023.228, + "args": { + "External id": 81977,"Record function id": 0, "Ev Idx": 3556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865687129.595, "dur": 12.149, + "args": { + "External id": 81978,"Record function id": 0, "Concrete Inputs": ["[13223616]", "15", "", "", "False", 
""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865687275.534, "dur": 94.970, + "args": { + "External id": 81979,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["c10::BFloat16", "", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[13223616], [], []], "Ev Idx": 3558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687280.774, "dur": 1.340, + "args": { + "External id": 81980,"Record function id": 0, "Concrete Inputs": ["", "[6144000]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687283.724, "dur": 0.310, + "args": { + "External id": 81981,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687286.044, "dur": 0.450, + "args": { + "External id": 81982,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6144192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687287.544, "dur": 1.350, + "args": { + "External id": 81983,"Record function id": 0, "Concrete Inputs": ["", 
"[147456]", "[1]", "6291648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687289.634, "dur": 0.590, + "args": { + "External id": 81984,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6439104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687292.354, "dur": 0.200, + "args": { + "External id": 81985,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6586560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687293.614, "dur": 0.240, + "args": { + "External id": 81986,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687294.694, "dur": 1.110, + "args": { + "External id": 81987,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6734208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687305.694, "dur": 0.320, + "args": { 
+ "External id": 81988,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7127424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687306.924, "dur": 0.170, + "args": { + "External id": 81989,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7520640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687308.834, "dur": 0.280, + "args": { + "External id": 81990,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687310.134, "dur": 1.060, + "args": { + "External id": 81991,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "7914048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687311.934, "dur": 0.240, + "args": { + "External id": 81992,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8061504"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5714, "tid": 5714, + "ts": 6300865687314.274, "dur": 0.180, + "args": { + "External id": 81993,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8208960"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687315.554, "dur": 0.170, + "args": { + "External id": 81994,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8356416"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687316.564, "dur": 1.420, + "args": { + "External id": 81995,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687319.104, "dur": 0.170, + "args": { + "External id": 81996,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8504064"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687319.984, "dur": 0.180, + "args": { + "External id": 81997,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8897280"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3576 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687322.254, "dur": 0.340, + "args": { + "External id": 81998,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "9290496"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687323.624, "dur": 1.380, + "args": { + "External id": 81999,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687325.724, "dur": 0.190, + "args": { + "External id": 82000,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9683904"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687327.994, "dur": 0.170, + "args": { + "External id": 82001,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9831360"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687329.184, "dur": 0.250, + "args": { + "External id": 82002,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9978816"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687330.194, "dur": 1.480, + "args": { + "External id": 82003,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "10126272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687332.734, "dur": 0.310, + "args": { + "External id": 82004,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687333.754, "dur": 0.250, + "args": { + "External id": 82005,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10273920"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687335.954, "dur": 0.380, + "args": { + "External id": 82006,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10667136"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687337.394, "dur": 1.360, + "args": { + "External id": 82007,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "11060352"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687339.554, "dur": 0.180, + "args": { + "External id": 82008,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687341.514, "dur": 0.180, + "args": { + "External id": 82009,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11453760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687342.904, "dur": 0.160, + "args": { + "External id": 82010,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11601216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687343.834, "dur": 1.440, + "args": { + "External id": 82011,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11748672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687346.384, "dur": 0.170, + "args": { + "External id": 82012,"Record function 
id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11896128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687347.364, "dur": 0.260, + "args": { + "External id": 82013,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687349.344, "dur": 0.410, + "args": { + "External id": 82014,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12043776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687350.744, "dur": 1.760, + "args": { + "External id": 82015,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12436992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687353.204, "dur": 0.180, + "args": { + "External id": 82016,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12830208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6300865687355.184, "dur": 0.170, + "args": { + "External id": 82017,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865687389.224, "dur": 42.560, + "args": { + "External id": 82018,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 3597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865687501.304, "dur": 214.719, + "args": { + "External id": 82019,"Record function id": 0, "Concrete Inputs": ["", "", "13223616", "4", "0", "15", ""], "Input type": ["TensorList", "", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 3598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865687518.284, "dur": 7.750, + "args": { + "External id": 82020,"Record function id": 0, "Concrete Inputs": ["[52894464]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865687532.534, "dur": 12.310, + "args": { + "External id": 82021,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "13223616"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[52894464], [], [], []], "Ev Idx": 3600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865687537.204, "dur": 7.280, + "args": { + "External id": 82022,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "13223616", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[52894464], [], [], [], []], "Ev Idx": 3601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687540.964, "dur": 1.270, + "args": { + "External id": 82023,"Record function id": 0, "Concrete Inputs": ["", "[13223616]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[52894464], [], [], []], "Ev Idx": 3602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865687556.364, "dur": 79.690, + "args": { + "External id": 82024,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["c10::BFloat16", "", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[13223616], [], []], "Ev Idx": 3603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687559.154, "dur": 0.560, + "args": { + "External id": 82025,"Record function id": 0, "Concrete Inputs": ["", "[6144000]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687561.274, "dur": 0.400, + "args": { + "External id": 82026,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3605 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687562.444, "dur": 0.270, + "args": { + "External id": 82027,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6144192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687563.904, "dur": 1.520, + "args": { + "External id": 82028,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6291648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687566.524, "dur": 0.270, + "args": { + "External id": 82029,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6439104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687567.814, "dur": 0.150, + "args": { + "External id": 82030,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6586560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687568.674, "dur": 0.190, + "args": { + "External id": 82031,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input 
Dims": [[13223616], [], [], []], "Ev Idx": 3610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687570.094, "dur": 0.260, + "args": { + "External id": 82032,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6734208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687571.344, "dur": 0.190, + "args": { + "External id": 82033,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7127424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687572.644, "dur": 0.260, + "args": { + "External id": 82034,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7520640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687573.744, "dur": 0.180, + "args": { + "External id": 82035,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687574.634, "dur": 2.700, + "args": { + "External id": 82036,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "7914048"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687578.724, "dur": 0.190, + "args": { + "External id": 82037,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8061504"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687579.643, "dur": 0.180, + "args": { + "External id": 82038,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8208960"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687581.754, "dur": 0.329, + "args": { + "External id": 82039,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8356416"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687583.283, "dur": 0.171, + "args": { + "External id": 82040,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687584.174, "dur": 0.169, + "args": { + "External id": 82041,"Record function id": 0, "Concrete 
Inputs": ["", "[393216]", "[1]", "8504064"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687587.054, "dur": 0.169, + "args": { + "External id": 82042,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8897280"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687587.923, "dur": 0.171, + "args": { + "External id": 82043,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "9290496"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687588.843, "dur": 2.051, + "args": { + "External id": 82044,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687592.023, "dur": 0.171, + "args": { + "External id": 82045,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9683904"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687592.923, "dur": 
0.180, + "args": { + "External id": 82046,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9831360"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687595.274, "dur": 0.269, + "args": { + "External id": 82047,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9978816"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687596.623, "dur": 0.171, + "args": { + "External id": 82048,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "10126272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687597.583, "dur": 0.180, + "args": { + "External id": 82049,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687600.494, "dur": 0.180, + "args": { + "External id": 82050,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10273920"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687601.394, "dur": 0.169, + "args": { + "External id": 82051,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10667136"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687602.723, "dur": 2.211, + "args": { + "External id": 82052,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "11060352"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687605.963, "dur": 0.240, + "args": { + "External id": 82053,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687606.923, "dur": 0.180, + "args": { + "External id": 82054,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11453760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687608.663, "dur": 0.300, + "args": { + "External id": 82055,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11601216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], 
[], [], []], "Ev Idx": 3634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687610.403, "dur": 0.171, + "args": { + "External id": 82056,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11748672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687611.283, "dur": 0.180, + "args": { + "External id": 82057,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11896128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687614.003, "dur": 0.240, + "args": { + "External id": 82058,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687614.963, "dur": 0.180, + "args": { + "External id": 82059,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12043776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687615.874, "dur": 2.729, + "args": { + "External id": 82060,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12436992"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687619.863, "dur": 0.180, + "args": { + "External id": 82061,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12830208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865687620.774, "dur": 0.169, + "args": { + "External id": 82062,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865687659.954, "dur": 36.549, + "args": { + "External id": 82063,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 3642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865687800.173, "dur": 133.320, + "args": { + "External id": 82064,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[52894464], [13223616], [], [], []], "Ev Idx": 3643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865687834.653, "dur": 94.950, + "args": { + "External id": 82065,"Record function id": 0, "Collective name": "_allgather_base", "Process 
Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 52894464, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[13223616], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3644, "In msg nelems": 13223616 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865687849.823, "dur": 74.770, + "args": { + "External id": 82066,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[13223616]], "Ev Idx": 3645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865687951.963, "dur": 3.920, + "args": { + "External id": 82067,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3646, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out", "pid": 5714, "tid": 5714, + "ts": 6300865688030.873, "dur": 565.118, + "args": { + "External id": 82068,"Record function id": 0, "Ev Idx": 3647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688286.272, "dur": 4.430, + "args": { + "External id": 82069,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[52894464], []], "Ev Idx": 3648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688295.762, "dur": 10.890, + "args": { + "External id": 82070,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[24576000], []], "Ev Idx": 3649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688309.122, "dur": 0.930, + "args": { + "External id": 82071,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688311.832, "dur": 0.680, + "args": { + "External id": 82072,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688314.022, "dur": 0.690, + "args": { + "External id": 82073,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688317.412, "dur": 1.800, + "args": { + "External id": 82074,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], 
"Input Dims": [[589824], []], "Ev Idx": 3653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688320.912, "dur": 0.880, + "args": { + "External id": 82075,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688323.272, "dur": 1.790, + "args": { + "External id": 82076,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688326.482, "dur": 0.860, + "args": { + "External id": 82077,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688330.212, "dur": 0.810, + "args": { + "External id": 82078,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688332.502, "dur": 0.810, + "args": { + "External id": 82079,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688334.842, "dur": 0.940, + "args": { + "External id": 82080,"Record function id": 0, "Concrete Inputs": 
["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688337.262, "dur": 0.700, + "args": { + "External id": 82081,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688340.442, "dur": 1.920, + "args": { + "External id": 82082,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688343.832, "dur": 0.990, + "args": { + "External id": 82083,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688346.322, "dur": 1.890, + "args": { + "External id": 82084,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688349.712, "dur": 0.630, + "args": { + "External id": 82085,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688353.212, "dur": 
0.870, + "args": { + "External id": 82086,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688355.552, "dur": 0.840, + "args": { + "External id": 82087,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688357.912, "dur": 0.720, + "args": { + "External id": 82088,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688360.092, "dur": 0.880, + "args": { + "External id": 82089,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688363.542, "dur": 1.800, + "args": { + "External id": 82090,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688366.822, "dur": 0.550, + "args": { + "External id": 82091,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3670 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688368.782, "dur": 1.980, + "args": { + "External id": 82092,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688372.152, "dur": 0.710, + "args": { + "External id": 82093,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688376.312, "dur": 0.610, + "args": { + "External id": 82094,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688378.252, "dur": 0.580, + "args": { + "External id": 82095,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688380.322, "dur": 0.900, + "args": { + "External id": 82096,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688382.862, "dur": 0.710, + "args": { + "External id": 82097,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688391.522, "dur": 1.850, + "args": { + "External id": 82098,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688394.822, "dur": 0.770, + "args": { + "External id": 82099,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688396.992, "dur": 2.020, + "args": { + "External id": 82100,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688400.422, "dur": 0.760, + "args": { + "External id": 82101,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688403.912, "dur": 0.820, + "args": { + "External id": 82102,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688406.282, "dur": 0.750, + "args": { + "External id": 82103,"Record function id": 0, "Concrete 
Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688408.462, "dur": 0.710, + "args": { + "External id": 82104,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688410.562, "dur": 0.600, + "args": { + "External id": 82105,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688413.852, "dur": 1.850, + "args": { + "External id": 82106,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865688417.122, "dur": 0.850, + "args": { + "External id": 82107,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865688439.872, "dur": 120.549, + "args": { + "External id": 82108,"Record function id": 0, "Concrete Inputs": ["", "", "1", ""], "Input type": ["c10::BFloat16", "", "Scalar", "TensorList"], "Input Strides": [[13223616, 1], [], [], []], "Input Dims": [[4, 13223616], [], [], []], "Ev Idx": 3687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865688459.661, "dur": 95.840, + "args": { + "External id": 82109,"Record function id": 0, "Concrete Inputs": ["", "", "1", ""], "Input type": ["c10::BFloat16", "", "Scalar", "TensorList"], "Input Strides": [[13223616, 1], [], [], []], "Input Dims": [[4, 13223616], [], [], []], "Ev Idx": 3688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865688478.281, "dur": 3.140, + "args": { + "External id": 82110,"Record function id": 0, "Concrete Inputs": ["[2750]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865688486.701, "dur": 39.680, + "args": { + "External id": 82111,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[2750], [], [], [], [], [], [], []], "Ev Idx": 3690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865688488.432, "dur": 37.600, + "args": { + "External id": 82112,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[2750], [], [], [], [], [], []], "Ev Idx": 3691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865688492.372, "dur": 7.169, + "args": { + "External id": 82113,"Record function id": 0, "Concrete Inputs": ["[2750]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], 
[], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865688501.952, "dur": 23.480, + "args": { + "External id": 82114,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2750], [2750], []], "Ev Idx": 3693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865688855.331, "dur": 28.930, + "args": { + "External id": 82115,"Record function id": 0, "Ev Idx": 3694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 5714, "tid": 5714, + "ts": 6300865688885.421, "dur": 200.479, + "args": { + "External id": 82116,"Record function id": 0, "Ev Idx": 3695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865688925.931, "dur": 147.139, + "args": { + "External id": 82117,"Sequence number": 1770900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "long int"], "Input Strides": [[768, 1], [2048, 1]], "Input Dims": [[32000, 768], [8, 2048]], "Ev Idx": 3696 + } + }, + { + "ph": "s", "id": 302, "pid": 5714, "tid": 5714, "ts": 6300865688925.931, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_0", "pid": 5714, "tid": 5714, + "ts": 6300865688991.900, "dur": 39.780, + "args": { + "External id": 82118,"kernel_hash": "chx7cxfd4w3vbh4d6l24hldpnxluepxuj4zcshyicrtcgke24jvt", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/hx/chx7cxfd4w3vbh4d6l24hldpnxluepxuj4zcshyicrtcgke24jvt.py", "kernel_backend": "triton", "Input type": ["long int", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input 
Strides": [[2048, 1], [768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048], [32000, 768], [8, 2048, 768], []], "Ev Idx": 3697 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865689152.940, "dur": 56.960, + "args": { + "External id": 82119,"Record function id": 0, "Ev Idx": 3698 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6300865689221.290, "dur": 1023.947, + "args": { + "External id": 82120,"Record function id": 0, "Ev Idx": 3699 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6300865689229.150, "dur": 539.829, + "args": { + "External id": 82121,"Record function id": 0, "Ev Idx": 3700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865689309.540, "dur": 9.710, + "args": { + "External id": 82122,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865689330.760, "dur": 25.070, + "args": { + "External id": 82123,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689334.670, "dur": 1.320, + "args": { + "External id": 82124,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689337.530, "dur": 0.220, + "args": { + "External id": 82125,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689339.750, "dur": 1.420, + "args": { + "External id": 82126,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689342.490, "dur": 0.180, + "args": { + "External id": 82127,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689343.470, "dur": 1.369, + "args": { + "External id": 82128,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689345.639, "dur": 0.180, + "args": { + "External id": 82129,"Record function id": 0, "Concrete Inputs": ["", "[192]", 
"[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689347.319, "dur": 0.331, + "args": { + "External id": 82130,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689349.619, "dur": 0.420, + "args": { + "External id": 82131,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689351.050, "dur": 0.169, + "args": { + "External id": 82132,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865689364.850, "dur": 25.280, + "args": { + "External id": 82133,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], 
[393216], [393216]], []], "Ev Idx": 3712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865689423.290, "dur": 114.549, + "args": { + "External id": 82134,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865689435.039, "dur": 10.900, + "args": { + "External id": 82135,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865689451.159, "dur": 11.360, + "args": { + "External id": 82136,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865689454.919, "dur": 7.220, + "args": { + "External id": 82137,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5714, "tid": 5714, + "ts": 6300865689458.069, "dur": 2.020, + "args": { + "External id": 82138,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865689470.409, "dur": 21.040, + "args": { + "External id": 82139,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689472.159, "dur": 0.400, + "args": { + "External id": 82140,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689475.279, "dur": 0.300, + "args": { + "External id": 82141,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689476.439, "dur": 0.190, + "args": { + "External id": 82142,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 
3721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689477.739, "dur": 0.270, + "args": { + "External id": 82143,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689480.089, "dur": 0.200, + "args": { + "External id": 82144,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689481.139, "dur": 0.400, + "args": { + "External id": 82145,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689482.629, "dur": 1.180, + "args": { + "External id": 82146,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689484.689, "dur": 1.500, + "args": { + "External id": 82147,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865689486.909, "dur": 0.260, + "args": { + "External id": 82148,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865689504.149, "dur": 22.400, + "args": { + "External id": 82149,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865689599.899, "dur": 96.110, + "args": { + "External id": 82150,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865689619.299, "dur": 73.370, + "args": { + "External id": 82151,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 
7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3730, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865689631.029, "dur": 57.120, + "args": { + "External id": 82152,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865689711.819, "dur": 3.410, + "args": { + "External id": 82153,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3732, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6300865689790.798, "dur": 287.020, + "args": { + "External id": 82154,"Record function id": 0, "Ev Idx": 3733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689890.298, "dur": 6.000, + "args": { + "External id": 82155,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689901.208, "dur": 0.880, + "args": { + "External id": 82156,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689903.958, "dur": 0.780, + "args": { + "External id": 82157,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689906.788, "dur": 0.900, + "args": { + "External id": 82158,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689909.198, "dur": 0.850, + "args": { + "External id": 82159,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689911.858, "dur": 1.020, + "args": { + "External id": 82160,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689916.218, "dur": 2.300, + "args": { + "External id": 
82161,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689920.008, "dur": 1.850, + "args": { + "External id": 82162,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689923.388, "dur": 0.800, + "args": { + "External id": 82163,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865689925.608, "dur": 0.850, + "args": { + "External id": 82164,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865689941.538, "dur": 107.150, + "args": { + "External id": 82165,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865689954.408, "dur": 90.280, + "args": { + "External id": 82166,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865689976.408, "dur": 7.240, + "args": { + "External id": 82167,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865689986.638, "dur": 36.010, + "args": { + "External id": 82168,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865689988.288, "dur": 34.010, + "args": { + "External id": 82169,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 
5714, "tid": 5714, + "ts": 6300865689991.728, "dur": 7.800, + "args": { + "External id": 82170,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865690000.638, "dur": 21.130, + "args": { + "External id": 82171,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865690194.988, "dur": 24.110, + "args": { + "External id": 82172,"Sequence number": 1770901, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3751 + } + }, + { + "ph": "s", "id": 301, "pid": 5714, "tid": 5714, "ts": 6300865690194.988, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865690208.528, "dur": 6.700, + "args": { + "External id": 82173,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865690211.078, "dur": 3.650, + "args": { + "External id": 82174,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev 
Idx": 3753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690281.337, "dur": 32.691, + "args": { + "External id": 82175,"Record function id": 0, "Ev Idx": 3754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865690315.317, "dur": 1925.196, + "args": { + "External id": 82176,"Record function id": 0, "Ev Idx": 3755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865690344.267, "dur": 137.920, + "args": { + "External id": 82177,"Sequence number": 1770902, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 3756 + } + }, + { + "ph": "s", "id": 300, "pid": 5714, "tid": 5714, "ts": 6300865690344.267, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865690404.347, "dur": 35.210, + "args": { + "External id": 82178,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 3757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865690455.247, "dur": 5.870, + "args": { + "External id": 82179,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865690456.797, "dur": 4.060, + "args": { + "External id": 82180,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690507.107, "dur": 23.780, + "args": { + "External id": 82181,"Record function id": 0, "Ev Idx": 3760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865690531.747, "dur": 1246.207, + "args": { + "External id": 82182,"Record function id": 0, "Ev Idx": 3761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865690559.837, "dur": 252.969, + "args": { + "External id": 82183,"Sequence number": 1770903, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 3762 + } + }, + { + "ph": "s", "id": 299, "pid": 5714, "tid": 5714, "ts": 6300865690559.837, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865690596.767, "dur": 67.120, + "args": { + "External id": 82184,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3763 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865690678.796, "dur": 19.580, + "args": { + "External id": 82185,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865690710.487, "dur": 17.420, + "args": { + "External id": 82186,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865690768.666, "dur": 7.410, + "args": { + "External id": 82187,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865690784.506, "dur": 1.020, + "args": { + "External id": 82188,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865690790.396, "dur": 0.750, + "args": { + "External id": 82189,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3768 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690836.876, "dur": 17.680, + "args": { + "External id": 82190,"Record function id": 0, "Ev Idx": 3769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865690855.816, "dur": 569.269, + "args": { + "External id": 82191,"Record function id": 0, "Ev Idx": 3770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690881.496, "dur": 7.640, + "args": { + "External id": 82192,"Record function id": 0, "Ev Idx": 3771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865690889.946, "dur": 297.489, + "args": { + "External id": 82193,"Record function id": 0, "Ev Idx": 3772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865690904.756, "dur": 281.450, + "args": { + "External id": 82194,"Sequence number": 1770904, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3773 + } + }, + { + "ph": "s", "id": 298, "pid": 5714, "tid": 5714, "ts": 6300865690904.756, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690912.926, "dur": 14.040, + "args": { + "External id": 82195,"Record function id": 0, "Ev Idx": 3774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865690928.546, "dur": 248.600, + "args": { + "External 
id": 82196,"Record function id": 0, "Ev Idx": 3775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690961.636, "dur": 7.260, + "args": { + "External id": 82197,"Record function id": 0, "Ev Idx": 3776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865690969.726, "dur": 174.720, + "args": { + "External id": 82198,"Record function id": 0, "Ev Idx": 3777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865690975.186, "dur": 11.920, + "args": { + "External id": 82199,"Record function id": 0, "Ev Idx": 3778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865690988.116, "dur": 152.830, + "args": { + "External id": 82200,"Record function id": 0, "Ev Idx": 3779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691036.296, "dur": 13.280, + "args": { + "External id": 82201,"Record function id": 0, "Ev Idx": 3780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865691050.916, "dur": 88.890, + "args": { + "External id": 82202,"Record function id": 0, "Ev Idx": 3781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865691093.456, "dur": 33.130, + "args": { + "External id": 82203,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691152.955, "dur": 4.540, + "args": { + "External id": 82204,"Record function id": 0, "Ev Idx": 3783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865691158.266, "dur": 18.149, + "args": { + "External id": 82205,"Record function id": 0, "Ev Idx": 3784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691192.975, "dur": 9.130, + "args": { + "External id": 82206,"Record function id": 0, "Ev Idx": 3785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865691202.895, "dur": 221.790, + "args": { + "External id": 82207,"Record function id": 0, "Ev Idx": 3786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691207.575, "dur": 2.180, + "args": { + "External id": 82208,"Record function id": 0, "Ev Idx": 3787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865691210.325, "dur": 212.730, + "args": { + "External id": 82209,"Record function id": 0, "Ev Idx": 3788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865691224.505, "dur": 197.370, + "args": { + "External id": 82210,"Sequence number": 1770905, "Fwd 
thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3789 + } + }, + { + "ph": "s", "id": 297, "pid": 5714, "tid": 5714, "ts": 6300865691224.505, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691230.695, "dur": 4.500, + "args": { + "External id": 82211,"Record function id": 0, "Ev Idx": 3790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865691236.045, "dur": 178.240, + "args": { + "External id": 82212,"Record function id": 0, "Ev Idx": 3791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691258.875, "dur": 2.700, + "args": { + "External id": 82213,"Record function id": 0, "Ev Idx": 3792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865691262.395, "dur": 129.360, + "args": { + "External id": 82214,"Record function id": 0, "Ev Idx": 3793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691266.505, "dur": 3.480, + "args": { + "External id": 82215,"Record function id": 0, "Ev Idx": 3794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865691270.685, "dur": 119.000, + "args": { + "External id": 82216,"Record function id": 0, "Ev Idx": 3795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691314.675, "dur": 5.330, + 
"args": { + "External id": 82217,"Record function id": 0, "Ev Idx": 3796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865691321.045, "dur": 67.670, + "args": { + "External id": 82218,"Record function id": 0, "Ev Idx": 3797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865691352.605, "dur": 24.440, + "args": { + "External id": 82219,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691397.765, "dur": 2.430, + "args": { + "External id": 82220,"Record function id": 0, "Ev Idx": 3799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865691400.945, "dur": 12.650, + "args": { + "External id": 82221,"Record function id": 0, "Ev Idx": 3800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691436.995, "dur": 21.580, + "args": { + "External id": 82222,"Record function id": 0, "Ev 
Idx": 3801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865691459.595, "dur": 317.169, + "args": { + "External id": 82223,"Record function id": 0, "Ev Idx": 3802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865691486.865, "dur": 279.059, + "args": { + "External id": 82224,"Sequence number": 1770906, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 3803 + } + }, + { + "ph": "s", "id": 296, "pid": 5714, "tid": 5714, "ts": 6300865691486.865, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865691519.875, "dur": 160.339, + "args": { + "External id": 82225,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 3804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865691584.494, "dur": 12.851, + "args": { + "External id": 82226,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], 
[], [], [], [], []], "Ev Idx": 3805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865691587.614, "dur": 8.740, + "args": { + "External id": 82227,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865691600.194, "dur": 6.340, + "args": { + "External id": 82228,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865691607.714, "dur": 2.731, + "args": { + "External id": 82229,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865691615.054, "dur": 4.811, + "args": { + "External id": 82230,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865691698.164, "dur": 31.570, + "args": { + "External id": 82231,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865691788.094, "dur": 48.650, + "args": { + "External id": 82232,"Record function id": 0, "Ev Idx": 3811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865691837.834, "dur": 400.129, + "args": { + "External id": 82233,"Record function id": 0, "Ev Idx": 3812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865691869.524, "dur": 357.429, + "args": { + "External id": 82234,"Sequence number": 1770907, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 3813 + } + }, + { + "ph": "s", "id": 295, "pid": 5714, "tid": 5714, "ts": 6300865691869.524, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865691931.464, "dur": 34.710, + "args": { + "External id": 82235,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], 
[1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 3814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865691981.953, "dur": 32.671, + "args": { + "External id": 82236,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865692026.704, "dur": 18.269, + "args": { + "External id": 82237,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865692071.584, "dur": 24.109, + "args": { + "External id": 82238,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 3817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865692109.983, "dur": 30.440, + "args": { + "External id": 82239,"Record function id": 0, "Concrete Inputs": ["", "", ""], 
"Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865692165.453, "dur": 21.650, + "args": { + "External id": 82240,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3819 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6300865692285.373, "dur": 75.130, + "args": { + "External id": 82241,"Record function id": 0, "Ev Idx": 3820 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865692433.972, "dur": 56.400, + "args": { + "External id": 82242,"Record function id": 0, "Ev Idx": 3821 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6300865692500.463, "dur": 999.807, + "args": { + "External id": 82243,"Record function id": 0, "Ev Idx": 3822 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6300865692508.252, "dur": 513.789, + "args": { + "External id": 82244,"Record function id": 0, "Ev Idx": 3823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865692579.832, "dur": 9.360, + "args": { + "External id": 82245,"Record 
function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865692599.892, "dur": 28.110, + "args": { + "External id": 82246,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692605.182, "dur": 1.580, + "args": { + "External id": 82247,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692608.312, "dur": 0.420, + "args": { + "External id": 82248,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692609.592, "dur": 0.290, + "args": { + "External id": 82249,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 5714, + "ts": 6300865692612.672, "dur": 1.390, + "args": { + "External id": 82250,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692615.242, "dur": 0.190, + "args": { + "External id": 82251,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692616.302, "dur": 1.380, + "args": { + "External id": 82252,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692619.022, "dur": 0.190, + "args": { + "External id": 82253,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692620.052, "dur": 0.230, + "args": { + "External id": 82254,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3833 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692622.312, "dur": 0.380, + "args": { + "External id": 82255,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865692636.482, "dur": 24.590, + "args": { + "External id": 82256,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865692694.192, "dur": 102.000, + "args": { + "External id": 82257,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865692704.892, "dur": 7.370, + "args": { + "External id": 82258,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": 
[[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865692716.672, "dur": 8.880, + "args": { + "External id": 82259,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865692719.302, "dur": 5.860, + "args": { + "External id": 82260,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692722.542, "dur": 0.740, + "args": { + "External id": 82261,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865692733.352, "dur": 21.950, + "args": { + "External id": 82262,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692735.012, "dur": 1.650, + "args": { + "External id": 82263,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692737.892, "dur": 0.250, + "args": { + "External id": 82264,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692739.972, "dur": 0.270, + "args": { + "External id": 82265,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692741.442, "dur": 0.250, + "args": { + "External id": 82266,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692742.442, "dur": 0.300, + "args": { + "External id": 82267,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692744.782, "dur": 0.270, + "args": { + "External id": 82268,"Record function id": 0, 
"Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692746.342, "dur": 0.170, + "args": { + "External id": 82269,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692747.352, "dur": 1.340, + "args": { + "External id": 82270,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865692749.572, "dur": 1.510, + "args": { + "External id": 82271,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865692768.632, "dur": 19.030, + "args": { + "External id": 82272,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], 
[147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865692853.352, "dur": 94.819, + "args": { + "External id": 82273,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865692872.822, "dur": 71.809, + "args": { + "External id": 82274,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3853, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865692884.791, "dur": 55.271, + "args": { + "External id": 82275,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865692964.971, "dur": 3.560, + "args": { + "External id": 82276,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": 
"Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3855, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6300865693043.001, "dur": 296.700, + "args": { + "External id": 82277,"Record function id": 0, "Ev Idx": 3856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693140.481, "dur": 4.250, + "args": { + "External id": 82278,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693149.191, "dur": 0.920, + "args": { + "External id": 82279,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693152.111, "dur": 0.690, + "args": { + "External id": 82280,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693154.591, "dur": 0.820, + "args": { + "External id": 82281,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693157.051, "dur": 0.770, + "args": { + "External id": 82282,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693160.631, "dur": 0.740, + "args": { + "External id": 82283,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693163.601, "dur": 0.970, + "args": { + "External id": 82284,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693166.091, "dur": 2.900, + "args": { + "External id": 82285,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693170.631, "dur": 0.730, + "args": { + "External id": 82286,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693173.931, "dur": 0.920, + "args": { + 
"External id": 82287,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865693188.411, "dur": 121.319, + "args": { + "External id": 82288,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865693215.211, "dur": 90.470, + "args": { + "External id": 82289,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865693227.971, "dur": 7.250, + "args": { + "External id": 82290,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], 
[], []], "Ev Idx": 3869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865693238.101, "dur": 36.340, + "args": { + "External id": 82291,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865693240.231, "dur": 33.840, + "args": { + "External id": 82292,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865693244.001, "dur": 7.770, + "args": { + "External id": 82293,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865693253.001, "dur": 20.580, + "args": { + "External id": 82294,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865693452.710, "dur": 21.860, + "args": { + "External id": 82295,"Sequence number": 1770908, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], 
"Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3874 + } + }, + { + "ph": "s", "id": 294, "pid": 5714, "tid": 5714, "ts": 6300865693452.710, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865693463.850, "dur": 6.880, + "args": { + "External id": 82296,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865693466.470, "dur": 3.810, + "args": { + "External id": 82297,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865693531.880, "dur": 12.260, + "args": { + "External id": 82298,"Record function id": 0, "Ev Idx": 3877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865693545.160, "dur": 1565.526, + "args": { + "External id": 82299,"Record function id": 0, "Ev Idx": 3878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865693568.030, "dur": 114.100, + "args": { + "External id": 82300,"Sequence number": 1770909, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 3879 + } + }, + { + "ph": "s", "id": 293, "pid": 5714, "tid": 5714, "ts": 6300865693568.030, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865693616.170, "dur": 28.870, + "args": { + "External id": 82301,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 3880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865693658.210, "dur": 5.760, + "args": { + "External id": 82302,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865693659.640, "dur": 4.080, + "args": { + "External id": 82303,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865693703.800, "dur": 12.190, + "args": { + "External id": 82304,"Record function id": 0, "Ev Idx": 3883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865693716.880, "dur": 1008.027, + "args": { + "External id": 82305,"Record function id": 0, "Ev Idx": 3884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 
6300865693739.350, "dur": 199.979, + "args": { + "External id": 82306,"Sequence number": 1770910, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 3885 + } + }, + { + "ph": "s", "id": 292, "pid": 5714, "tid": 5714, "ts": 6300865693739.350, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865693768.740, "dur": 34.249, + "args": { + "External id": 82307,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865693815.920, "dur": 18.360, + "args": { + "External id": 82308,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865693847.079, "dur": 18.360, + "args": { + "External id": 82309,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865693900.049, "dur": 4.390, + "args": { + "External id": 82310,"Record function id": 0, "Concrete Inputs": ["", "[8, 
2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865693912.289, "dur": 1.270, + "args": { + "External id": 82311,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865693918.409, "dur": 1.850, + "args": { + "External id": 82312,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865693959.929, "dur": 10.610, + "args": { + "External id": 82313,"Record function id": 0, "Ev Idx": 3892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865693971.529, "dur": 469.959, + "args": { + "External id": 82314,"Record function id": 0, "Ev Idx": 3893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865693989.679, "dur": 3.260, + "args": { + "External id": 82315,"Record function id": 0, "Ev Idx": 3894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865693993.739, "dur": 217.769, + "args": { + "External id": 82316,"Record function id": 0, "Ev Idx": 3895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865694007.389, "dur": 202.930, + "args": 
{ + "External id": 82317,"Sequence number": 1770911, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3896 + } + }, + { + "ph": "s", "id": 291, "pid": 5714, "tid": 5714, "ts": 6300865694007.389, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694014.029, "dur": 6.530, + "args": { + "External id": 82318,"Record function id": 0, "Ev Idx": 3897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865694021.779, "dur": 179.780, + "args": { + "External id": 82319,"Record function id": 0, "Ev Idx": 3898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694046.219, "dur": 3.430, + "args": { + "External id": 82320,"Record function id": 0, "Ev Idx": 3899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865694050.519, "dur": 125.810, + "args": { + "External id": 82321,"Record function id": 0, "Ev Idx": 3900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694054.149, "dur": 3.890, + "args": { + "External id": 82322,"Record function id": 0, "Ev Idx": 3901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865694058.719, "dur": 115.140, + "args": { + "External id": 82323,"Record function id": 0, "Ev Idx": 3902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 
5714, "tid": 5714, + "ts": 6300865694097.929, "dur": 5.780, + "args": { + "External id": 82324,"Record function id": 0, "Ev Idx": 3903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865694104.869, "dur": 68.010, + "args": { + "External id": 82325,"Record function id": 0, "Ev Idx": 3904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865694135.699, "dur": 24.670, + "args": { + "External id": 82326,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694182.808, "dur": 3.471, + "args": { + "External id": 82327,"Record function id": 0, "Ev Idx": 3906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865694187.079, "dur": 13.709, + "args": { + "External id": 82328,"Record function id": 0, "Ev Idx": 3907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694215.879, "dur": 4.529, + 
"args": { + "External id": 82329,"Record function id": 0, "Ev Idx": 3908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865694221.068, "dur": 220.000, + "args": { + "External id": 82330,"Record function id": 0, "Ev Idx": 3909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694225.039, "dur": 2.400, + "args": { + "External id": 82331,"Record function id": 0, "Ev Idx": 3910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865694228.108, "dur": 211.890, + "args": { + "External id": 82332,"Record function id": 0, "Ev Idx": 3911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865694241.728, "dur": 197.150, + "args": { + "External id": 82333,"Sequence number": 1770912, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3912 + } + }, + { + "ph": "s", "id": 290, "pid": 5714, "tid": 5714, "ts": 6300865694241.728, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694247.179, "dur": 4.789, + "args": { + "External id": 82334,"Record function id": 0, "Ev Idx": 3913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865694252.759, "dur": 178.169, + "args": { + "External id": 82335,"Record function id": 0, "Ev Idx": 3914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", 
"pid": 5714, "tid": 5714, + "ts": 6300865694274.819, "dur": 3.269, + "args": { + "External id": 82336,"Record function id": 0, "Ev Idx": 3915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865694278.988, "dur": 128.690, + "args": { + "External id": 82337,"Record function id": 0, "Ev Idx": 3916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694282.208, "dur": 3.790, + "args": { + "External id": 82338,"Record function id": 0, "Ev Idx": 3917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865694286.658, "dur": 118.940, + "args": { + "External id": 82339,"Record function id": 0, "Ev Idx": 3918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694328.778, "dur": 5.990, + "args": { + "External id": 82340,"Record function id": 0, "Ev Idx": 3919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865694335.808, "dur": 68.890, + "args": { + "External id": 82341,"Record function id": 0, "Ev Idx": 3920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865694370.348, "dur": 23.160, + "args": { + "External id": 82342,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", 
"Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694413.488, "dur": 3.160, + "args": { + "External id": 82343,"Record function id": 0, "Ev Idx": 3922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865694417.458, "dur": 12.850, + "args": { + "External id": 82344,"Record function id": 0, "Ev Idx": 3923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694451.958, "dur": 10.540, + "args": { + "External id": 82345,"Record function id": 0, "Ev Idx": 3924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865694463.448, "dur": 260.359, + "args": { + "External id": 82346,"Record function id": 0, "Ev Idx": 3925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865694485.828, "dur": 227.339, + "args": { + "External id": 82347,"Sequence number": 1770913, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 3926 + } + }, + { + "ph": "s", "id": 289, "pid": 5714, "tid": 5714, "ts": 6300865694485.828, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 
5714, + "ts": 6300865694511.258, "dur": 120.620, + "args": { + "External id": 82348,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 3927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865694548.508, "dur": 12.480, + "args": { + "External id": 82349,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865694551.588, "dur": 8.390, + "args": { + "External id": 82350,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865694562.908, "dur": 6.270, + "args": { + "External id": 82351,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865694570.538, 
"dur": 2.520, + "args": { + "External id": 82352,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865694576.698, "dur": 4.880, + "args": { + "External id": 82353,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865694649.707, "dur": 31.411, + "args": { + "External id": 82354,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865694734.227, "dur": 21.920, + "args": { + "External id": 82355,"Record function id": 0, "Ev Idx": 3934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865694757.157, "dur": 350.589, + "args": { + "External id": 82356,"Record function id": 0, "Ev Idx": 3935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865694783.347, "dur": 313.339, + "args": { + "External id": 82357,"Sequence number": 1770914, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 
768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 3936 + } + }, + { + "ph": "s", "id": 288, "pid": 5714, "tid": 5714, "ts": 6300865694783.347, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865694839.017, "dur": 29.130, + "args": { + "External id": 82358,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 3937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865694883.547, "dur": 28.210, + "args": { + "External id": 82359,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865694923.317, "dur": 17.570, + "args": { + "External id": 82360,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev 
Idx": 3939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865694963.717, "dur": 22.620, + "args": { + "External id": 82361,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 3940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865694998.457, "dur": 26.040, + "args": { + "External id": 82362,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865695046.497, "dur": 16.409, + "args": { + "External id": 82363,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3942 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6300865695152.337, "dur": 56.269, + 
"args": { + "External id": 82364,"Record function id": 0, "Ev Idx": 3943 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865695272.516, "dur": 65.790, + "args": { + "External id": 82365,"Record function id": 0, "Ev Idx": 3944 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6300865695348.616, "dur": 986.008, + "args": { + "External id": 82366,"Record function id": 0, "Ev Idx": 3945 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6300865695357.376, "dur": 509.329, + "args": { + "External id": 82367,"Record function id": 0, "Ev Idx": 3946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865695426.666, "dur": 9.140, + "args": { + "External id": 82368,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865695446.236, "dur": 26.620, + "args": { + "External id": 82369,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695449.636, "dur": 1.290, + "args": { + "External id": 82370,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[1769856], [], [], []], "Ev Idx": 3949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695453.566, "dur": 0.280, + "args": { + "External id": 82371,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695454.846, "dur": 0.350, + "args": { + "External id": 82372,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695456.746, "dur": 2.580, + "args": { + "External id": 82373,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695460.196, "dur": 0.360, + "args": { + "External id": 82374,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695461.326, "dur": 0.270, + "args": { + "External id": 82375,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695464.316, "dur": 0.330, + "args": { + "External id": 82376,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695465.806, "dur": 0.350, + "args": { + "External id": 82377,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695467.576, "dur": 0.260, + "args": { + "External id": 82378,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865695481.466, "dur": 25.290, + "args": { + "External id": 82379,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3958 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865695539.945, "dur": 101.660, + "args": { + "External id": 82380,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865695550.596, "dur": 7.549, + "args": { + "External id": 82381,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865695562.245, "dur": 10.200, + "args": { + "External id": 82382,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865695564.745, "dur": 7.260, + "args": { + "External id": 82383,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695568.065, "dur": 1.880, + "args": { 
+ "External id": 82384,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865695579.885, "dur": 22.950, + "args": { + "External id": 82385,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695581.765, "dur": 1.620, + "args": { + "External id": 82386,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695584.965, "dur": 0.260, + "args": { + "External id": 82387,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695586.125, "dur": 0.260, + "args": { + "External id": 82388,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695589.036, "dur": 0.249, + "args": { + "External id": 82389,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695590.176, "dur": 0.280, + "args": { + "External id": 82390,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695591.205, "dur": 1.260, + "args": { + "External id": 82391,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695593.665, "dur": 0.240, + "args": { + "External id": 82392,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695594.816, "dur": 0.240, + "args": { + "External id": 82393,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev 
Idx": 3972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865695596.685, "dur": 1.950, + "args": { + "External id": 82394,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865695613.835, "dur": 18.920, + "args": { + "External id": 82395,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865695697.835, "dur": 95.580, + "args": { + "External id": 82396,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865695717.735, "dur": 72.120, + "args": { + "External id": 82397,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": 
["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3976, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865695729.745, "dur": 55.350, + "args": { + "External id": 82398,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865695808.795, "dur": 3.550, + "args": { + "External id": 82399,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3978, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6300865695886.605, "dur": 280.799, + "args": { + "External id": 82400,"Record function id": 0, "Ev Idx": 3979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865695995.895, "dur": 4.069, + "args": { + "External id": 82401,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input 
Dims": [[7079424], []], "Ev Idx": 3980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696004.384, "dur": 0.871, + "args": { + "External id": 82402,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696007.355, "dur": 0.660, + "args": { + "External id": 82403,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696009.895, "dur": 0.620, + "args": { + "External id": 82404,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696011.924, "dur": 0.760, + "args": { + "External id": 82405,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696015.284, "dur": 0.780, + "args": { + "External id": 82406,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696018.235, "dur": 0.920, + "args": { + "External id": 82407,"Record function id": 0, "Concrete Inputs": ["", "[4, 
-1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696020.775, "dur": 3.129, + "args": { + "External id": 82408,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696025.424, "dur": 0.831, + "args": { + "External id": 82409,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696029.135, "dur": 0.789, + "args": { + "External id": 82410,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865696043.494, "dur": 95.160, + "args": { + "External id": 82411,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 
6300865696055.414, "dur": 79.590, + "args": { + "External id": 82412,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865696066.794, "dur": 6.740, + "args": { + "External id": 82413,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865696076.314, "dur": 35.240, + "args": { + "External id": 82414,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865696077.744, "dur": 33.410, + "args": { + "External id": 82415,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865696081.044, "dur": 7.730, + 
"args": { + "External id": 82416,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865696089.914, "dur": 20.710, + "args": { + "External id": 82417,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865696276.214, "dur": 31.290, + "args": { + "External id": 82418,"Sequence number": 1770915, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3997 + } + }, + { + "ph": "s", "id": 287, "pid": 5714, "tid": 5714, "ts": 6300865696276.214, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865696288.474, "dur": 6.790, + "args": { + "External id": 82419,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865696290.874, "dur": 3.780, + "args": { + "External id": 82420,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696368.854, "dur": 10.470, + "args": { + "External id": 82421,"Record function id": 0, "Ev Idx": 4000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865696380.414, "dur": 1573.616, + "args": { + "External id": 82422,"Record function id": 0, "Ev Idx": 4001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865696402.574, "dur": 115.309, + "args": { + "External id": 82423,"Sequence number": 1770916, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4002 + } + }, + { + "ph": "s", "id": 286, "pid": 5714, "tid": 5714, "ts": 6300865696402.574, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865696452.054, "dur": 28.559, + "args": { + "External id": 82424,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865696493.273, "dur": 5.350, + "args": { + "External id": 82425,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865696494.633, "dur": 3.760, + "args": { + "External id": 82426,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696539.823, "dur": 10.990, + "args": { + "External id": 82427,"Record function id": 0, "Ev Idx": 4006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865696551.813, "dur": 1016.818, + "args": { + "External id": 82428,"Record function id": 0, "Ev Idx": 4007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865696574.343, "dur": 216.670, + "args": { + "External id": 82429,"Sequence number": 1770917, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4008 + } + }, + { + "ph": "s", "id": 285, "pid": 5714, "tid": 5714, "ts": 6300865696574.343, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865696602.743, "dur": 33.950, + "args": { + "External id": 82430,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + 
"ts": 6300865696649.493, "dur": 18.180, + "args": { + "External id": 82431,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865696680.183, "dur": 17.790, + "args": { + "External id": 82432,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865696749.873, "dur": 4.440, + "args": { + "External id": 82433,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865696763.043, "dur": 1.210, + "args": { + "External id": 82434,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865696769.203, "dur": 1.860, + "args": { + "External id": 82435,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 
5714, "tid": 5714, + "ts": 6300865696811.713, "dur": 9.840, + "args": { + "External id": 82436,"Record function id": 0, "Ev Idx": 4015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865696822.553, "dur": 454.148, + "args": { + "External id": 82437,"Record function id": 0, "Ev Idx": 4016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696840.622, "dur": 3.171, + "args": { + "External id": 82438,"Record function id": 0, "Ev Idx": 4017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865696844.462, "dur": 216.330, + "args": { + "External id": 82439,"Record function id": 0, "Ev Idx": 4018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865696858.082, "dur": 201.540, + "args": { + "External id": 82440,"Sequence number": 1770918, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4019 + } + }, + { + "ph": "s", "id": 284, "pid": 5714, "tid": 5714, "ts": 6300865696858.082, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696864.342, "dur": 5.300, + "args": { + "External id": 82441,"Record function id": 0, "Ev Idx": 4020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865696870.693, "dur": 178.359, + "args": { + "External id": 82442,"Record function id": 0, "Ev Idx": 4021 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696894.953, "dur": 3.469, + "args": { + "External id": 82443,"Record function id": 0, "Ev Idx": 4022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865696899.133, "dur": 125.089, + "args": { + "External id": 82444,"Record function id": 0, "Ev Idx": 4023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696903.102, "dur": 3.860, + "args": { + "External id": 82445,"Record function id": 0, "Ev Idx": 4024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865696907.522, "dur": 114.220, + "args": { + "External id": 82446,"Record function id": 0, "Ev Idx": 4025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865696946.292, "dur": 5.800, + "args": { + "External id": 82447,"Record function id": 0, "Ev Idx": 4026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865696953.222, "dur": 67.520, + "args": { + "External id": 82448,"Record function id": 0, "Ev Idx": 4027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865696984.952, "dur": 23.820, + "args": { + "External id": 82449,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", 
"Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697030.932, "dur": 3.390, + "args": { + "External id": 82450,"Record function id": 0, "Ev Idx": 4029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865697035.032, "dur": 13.340, + "args": { + "External id": 82451,"Record function id": 0, "Ev Idx": 4030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697064.782, "dur": 4.090, + "args": { + "External id": 82452,"Record function id": 0, "Ev Idx": 4031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865697069.532, "dur": 206.760, + "args": { + "External id": 82453,"Record function id": 0, "Ev Idx": 4032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697073.262, "dur": 2.380, + "args": { + "External id": 82454,"Record function id": 0, "Ev Idx": 4033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865697076.212, "dur": 199.080, + "args": { + "External id": 82455,"Record function id": 0, "Ev Idx": 4034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865697090.162, "dur": 183.919, + "args": { + "External id": 82456,"Sequence number": 1770919, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", 
"False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4035 + } + }, + { + "ph": "s", "id": 283, "pid": 5714, "tid": 5714, "ts": 6300865697090.162, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697095.432, "dur": 5.090, + "args": { + "External id": 82457,"Record function id": 0, "Ev Idx": 4036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865697101.272, "dur": 165.109, + "args": { + "External id": 82458,"Record function id": 0, "Ev Idx": 4037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697122.822, "dur": 3.220, + "args": { + "External id": 82459,"Record function id": 0, "Ev Idx": 4038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865697126.892, "dur": 115.710, + "args": { + "External id": 82460,"Record function id": 0, "Ev Idx": 4039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697131.082, "dur": 3.820, + "args": { + "External id": 82461,"Record function id": 0, "Ev Idx": 4040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865697135.542, "dur": 104.890, + "args": { + "External id": 82462,"Record function id": 0, "Ev Idx": 4041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697170.292, "dur": 5.310, + "args": { + "External id": 82463,"Record function id": 0, "Ev Idx": 4042 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865697176.402, "dur": 63.160, + "args": { + "External id": 82464,"Record function id": 0, "Ev Idx": 4043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865697206.212, "dur": 22.220, + "args": { + "External id": 82465,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697248.972, "dur": 3.210, + "args": { + "External id": 82466,"Record function id": 0, "Ev Idx": 4045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865697252.912, "dur": 12.849, + "args": { + "External id": 82467,"Record function id": 0, "Ev Idx": 4046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697287.292, "dur": 17.689, + "args": { + "External id": 82468,"Record function id": 0, "Ev Idx": 4047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865697306.012, "dur": 261.399, + "args": { + "External id": 82469,"Record function id": 0, "Ev Idx": 4048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865697328.612, "dur": 228.319, + "args": { + "External id": 82470,"Sequence number": 1770920, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4049 + } + }, + { + "ph": "s", "id": 282, "pid": 5714, "tid": 5714, "ts": 6300865697328.612, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865697355.192, "dur": 119.069, + "args": { + "External id": 82471,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865697391.931, "dur": 13.220, + "args": { + "External id": 82472,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4051 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865697394.961, "dur": 9.210, + "args": { + "External id": 82473,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865697407.081, "dur": 6.480, + "args": { + "External id": 82474,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865697414.771, "dur": 2.390, + "args": { + "External id": 82475,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865697420.251, "dur": 3.770, + "args": { + "External id": 82476,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865697491.801, "dur": 31.300, + "args": { + "External id": 82477,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": 
[[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865697578.311, "dur": 21.060, + "args": { + "External id": 82478,"Record function id": 0, "Ev Idx": 4057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865697600.431, "dur": 350.579, + "args": { + "External id": 82479,"Record function id": 0, "Ev Idx": 4058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865697627.581, "dur": 312.589, + "args": { + "External id": 82480,"Sequence number": 1770921, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4059 + } + }, + { + "ph": "s", "id": 281, "pid": 5714, "tid": 5714, "ts": 6300865697627.581, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865697681.191, "dur": 29.449, + "args": { + "External id": 82481,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], 
[1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865697726.400, "dur": 27.611, + "args": { + "External id": 82482,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865697766.840, "dur": 18.120, + "args": { + "External id": 82483,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865697807.970, "dur": 20.470, + "args": { + "External id": 82484,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865697840.860, "dur": 25.500, + "args": { + "External id": 82485,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865697888.180, "dur": 17.270, + "args": { + "External id": 82486,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4065 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6300865697996.070, "dur": 55.250, + "args": { + "External id": 82487,"Record function id": 0, "Ev Idx": 4066 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865698115.240, "dur": 45.359, + "args": { + "External id": 82488,"Record function id": 0, "Ev Idx": 4067 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6300865698170.499, "dur": 999.428, + "args": { + "External id": 82489,"Record function id": 0, "Ev Idx": 4068 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6300865698177.919, "dur": 550.299, + "args": { + "External id": 82490,"Record function id": 0, "Ev Idx": 4069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865698248.259, "dur": 8.910, + "args": { + "External id": 82491,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", 
"False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865698267.419, "dur": 25.280, + "args": { + "External id": 82492,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698270.949, "dur": 1.360, + "args": { + "External id": 82493,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698275.249, "dur": 0.360, + "args": { + "External id": 82494,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698276.419, "dur": 0.310, + "args": { + "External id": 82495,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698278.099, "dur": 2.370, + "args": { + 
"External id": 82496,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698281.219, "dur": 0.300, + "args": { + "External id": 82497,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698282.829, "dur": 0.170, + "args": { + "External id": 82498,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698285.099, "dur": 0.330, + "args": { + "External id": 82499,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698286.149, "dur": 0.180, + "args": { + "External id": 82500,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 5714, + "ts": 6300865698287.469, "dur": 0.220, + "args": { + "External id": 82501,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865698330.809, "dur": 31.170, + "args": { + "External id": 82502,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865698396.689, "dur": 103.160, + "args": { + "External id": 82503,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865698408.289, "dur": 9.240, + "args": { + "External id": 82504,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], 
[]], "Ev Idx": 4083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865698422.039, "dur": 8.320, + "args": { + "External id": 82505,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865698424.559, "dur": 5.420, + "args": { + "External id": 82506,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698427.219, "dur": 0.870, + "args": { + "External id": 82507,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865698438.029, "dur": 22.960, + "args": { + "External id": 82508,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698440.339, "dur": 1.530, + "args": { + "External id": 82509,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], 
"Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698442.809, "dur": 1.520, + "args": { + "External id": 82510,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698445.199, "dur": 0.180, + "args": { + "External id": 82511,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698447.009, "dur": 0.250, + "args": { + "External id": 82512,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698449.079, "dur": 0.420, + "args": { + "External id": 82513,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698450.229, "dur": 0.280, + "args": { + "External id": 82514,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698451.599, "dur": 0.260, + "args": { + "External id": 82515,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698454.069, "dur": 0.190, + "args": { + "External id": 82516,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698455.449, "dur": 1.420, + "args": { + "External id": 82517,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865698472.779, "dur": 18.440, + "args": { + "External id": 82518,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], 
"Ev Idx": 4097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865698556.319, "dur": 96.770, + "args": { + "External id": 82519,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865698577.689, "dur": 72.169, + "args": { + "External id": 82520,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4099, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865698589.769, "dur": 55.160, + "args": { + "External id": 82521,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865698670.869, "dur": 3.400, + "args": { + "External id": 82522,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input 
Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4101, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6300865698747.878, "dur": 263.500, + "args": { + "External id": 82523,"Record function id": 0, "Ev Idx": 4102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698841.218, "dur": 4.120, + "args": { + "External id": 82524,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698849.658, "dur": 0.940, + "args": { + "External id": 82525,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698852.548, "dur": 0.700, + "args": { + "External id": 82526,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698855.188, "dur": 0.620, + "args": { + "External id": 82527,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input 
Dims": [[589824], []], "Ev Idx": 4106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698857.308, "dur": 0.750, + "args": { + "External id": 82528,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698859.408, "dur": 0.610, + "args": { + "External id": 82529,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698862.848, "dur": 0.850, + "args": { + "External id": 82530,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698865.318, "dur": 3.050, + "args": { + "External id": 82531,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698869.908, "dur": 0.840, + "args": { + "External id": 82532,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865698872.098, "dur": 0.660, + "args": { + "External id": 82533,"Record function id": 0, "Concrete Inputs": ["", "[4, 
-1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865698887.988, "dur": 94.690, + "args": { + "External id": 82534,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865698899.288, "dur": 79.450, + "args": { + "External id": 82535,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865698912.338, "dur": 6.630, + "args": { + "External id": 82536,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865698921.668, "dur": 35.040, + "args": { + "External id": 82537,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865698923.128, "dur": 33.190, + "args": { + "External id": 82538,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865698926.578, "dur": 7.770, + "args": { + "External id": 82539,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865698935.488, "dur": 20.380, + "args": { + "External id": 82540,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865699122.497, "dur": 21.520, + "args": { + "External id": 82541,"Sequence number": 1770922, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev 
Idx": 4120 + } + }, + { + "ph": "s", "id": 280, "pid": 5714, "tid": 5714, "ts": 6300865699122.497, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865699133.617, "dur": 6.560, + "args": { + "External id": 82542,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865699136.017, "dur": 3.660, + "args": { + "External id": 82543,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699201.427, "dur": 10.630, + "args": { + "External id": 82544,"Record function id": 0, "Ev Idx": 4123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865699213.077, "dur": 1565.767, + "args": { + "External id": 82545,"Record function id": 0, "Ev Idx": 4124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865699236.267, "dur": 124.530, + "args": { + "External id": 82546,"Sequence number": 1770923, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4125 + } + }, + { + "ph": "s", "id": 279, "pid": 5714, "tid": 5714, "ts": 6300865699236.267, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", 
"pid": 5714, "tid": 5714, + "ts": 6300865699285.397, "dur": 37.930, + "args": { + "External id": 82547,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865699336.947, "dur": 5.430, + "args": { + "External id": 82548,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865699338.307, "dur": 3.830, + "args": { + "External id": 82549,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699382.987, "dur": 10.870, + "args": { + "External id": 82550,"Record function id": 0, "Ev Idx": 4129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865699394.717, "dur": 1006.728, + "args": { + "External id": 82551,"Record function id": 0, "Ev Idx": 4130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865699417.727, "dur": 193.349, + "args": { + "External id": 82552,"Sequence number": 
1770924, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4131 + } + }, + { + "ph": "s", "id": 278, "pid": 5714, "tid": 5714, "ts": 6300865699417.727, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865699446.617, "dur": 33.779, + "args": { + "External id": 82553,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865699492.856, "dur": 18.531, + "args": { + "External id": 82554,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865699522.416, "dur": 16.680, + "args": { + "External id": 82555,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865699573.486, "dur": 4.310, + "args": { + "External id": 82556,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 
768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865699585.446, "dur": 1.000, + "args": { + "External id": 82557,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865699591.066, "dur": 2.170, + "args": { + "External id": 82558,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699630.846, "dur": 9.680, + "args": { + "External id": 82559,"Record function id": 0, "Ev Idx": 4138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865699641.466, "dur": 465.119, + "args": { + "External id": 82560,"Record function id": 0, "Ev Idx": 4139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699659.726, "dur": 2.940, + "args": { + "External id": 82561,"Record function id": 0, "Ev Idx": 4140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865699663.576, "dur": 214.090, + "args": { + "External id": 82562,"Record function id": 0, "Ev Idx": 4141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865699676.796, "dur": 199.670, + "args": { + "External id": 82563,"Sequence number": 1770925, "Fwd thread id": 0, "Record function 
id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4142 + } + }, + { + "ph": "s", "id": 277, "pid": 5714, "tid": 5714, "ts": 6300865699676.796, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699683.466, "dur": 6.400, + "args": { + "External id": 82564,"Record function id": 0, "Ev Idx": 4143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865699690.656, "dur": 176.970, + "args": { + "External id": 82565,"Record function id": 0, "Ev Idx": 4144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699714.696, "dur": 3.670, + "args": { + "External id": 82566,"Record function id": 0, "Ev Idx": 4145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865699719.136, "dur": 124.370, + "args": { + "External id": 82567,"Record function id": 0, "Ev Idx": 4146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699722.646, "dur": 4.030, + "args": { + "External id": 82568,"Record function id": 0, "Ev Idx": 4147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865699727.516, "dur": 113.500, + "args": { + "External id": 82569,"Record function id": 0, "Ev Idx": 4148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699766.846, "dur": 5.910, + "args": { + "External id": 
82570,"Record function id": 0, "Ev Idx": 4149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865699773.866, "dur": 66.130, + "args": { + "External id": 82571,"Record function id": 0, "Ev Idx": 4150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865699804.336, "dur": 23.600, + "args": { + "External id": 82572,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699849.736, "dur": 3.220, + "args": { + "External id": 82573,"Record function id": 0, "Ev Idx": 4152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865699853.636, "dur": 13.340, + "args": { + "External id": 82574,"Record function id": 0, "Ev Idx": 4153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699881.466, "dur": 4.240, + "args": { + "External id": 82575,"Record function id": 0, "Ev Idx": 4154 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865699886.376, "dur": 219.769, + "args": { + "External id": 82576,"Record function id": 0, "Ev Idx": 4155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699890.006, "dur": 2.510, + "args": { + "External id": 82577,"Record function id": 0, "Ev Idx": 4156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865699893.116, "dur": 212.179, + "args": { + "External id": 82578,"Record function id": 0, "Ev Idx": 4157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865699906.195, "dur": 198.070, + "args": { + "External id": 82579,"Sequence number": 1770926, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4158 + } + }, + { + "ph": "s", "id": 276, "pid": 5714, "tid": 5714, "ts": 6300865699906.195, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699911.446, "dur": 13.020, + "args": { + "External id": 82580,"Record function id": 0, "Ev Idx": 4159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865699926.686, "dur": 169.419, + "args": { + "External id": 82581,"Record function id": 0, "Ev Idx": 4160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699947.666, "dur": 3.329, + "args": { + 
"External id": 82582,"Record function id": 0, "Ev Idx": 4161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865699951.775, "dur": 121.410, + "args": { + "External id": 82583,"Record function id": 0, "Ev Idx": 4162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699956.526, "dur": 10.440, + "args": { + "External id": 82584,"Record function id": 0, "Ev Idx": 4163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865699967.575, "dur": 103.550, + "args": { + "External id": 82585,"Record function id": 0, "Ev Idx": 4164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865699999.655, "dur": 5.310, + "args": { + "External id": 82586,"Record function id": 0, "Ev Idx": 4165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865700005.785, "dur": 64.460, + "args": { + "External id": 82587,"Record function id": 0, "Ev Idx": 4166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865700036.795, "dur": 22.570, + "args": { + "External id": 82588,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 
1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865700079.115, "dur": 3.090, + "args": { + "External id": 82589,"Record function id": 0, "Ev Idx": 4168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865700083.035, "dur": 12.470, + "args": { + "External id": 82590,"Record function id": 0, "Ev Idx": 4169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865700117.065, "dur": 9.320, + "args": { + "External id": 82591,"Record function id": 0, "Ev Idx": 4170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865700127.245, "dur": 273.040, + "args": { + "External id": 82592,"Record function id": 0, "Ev Idx": 4171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865700148.835, "dur": 240.550, + "args": { + "External id": 82593,"Sequence number": 1770927, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4172 + } + }, + { + "ph": "s", "id": 275, "pid": 5714, "tid": 5714, "ts": 6300865700148.835, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865700173.005, "dur": 119.420, + "args": { + "External id": 
82594,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865700210.435, "dur": 12.630, + "args": { + "External id": 82595,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865700213.425, "dur": 8.630, + "args": { + "External id": 82596,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865700224.885, "dur": 6.510, + "args": { + "External id": 82597,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865700233.425, "dur": 2.530, + "args": { + "External id": 82598,"Record function id": 0, 
"Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865700238.565, "dur": 3.950, + "args": { + "External id": 82599,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865700318.905, "dur": 31.709, + "args": { + "External id": 82600,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865700410.874, "dur": 20.380, + "args": { + "External id": 82601,"Record function id": 0, "Ev Idx": 4180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865700432.234, "dur": 343.720, + "args": { + "External id": 82602,"Record function id": 0, "Ev Idx": 4181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865700458.474, "dur": 306.580, + "args": { + "External id": 82603,"Sequence number": 1770928, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 
768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4182 + } + }, + { + "ph": "s", "id": 274, "pid": 5714, "tid": 5714, "ts": 6300865700458.474, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865700512.134, "dur": 29.210, + "args": { + "External id": 82604,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865700557.674, "dur": 27.460, + "args": { + "External id": 82605,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865700596.354, "dur": 17.980, + "args": { + "External id": 82606,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865700636.504, "dur": 20.530, + "args": { + "External id": 82607,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865700668.754, "dur": 25.400, + "args": { + "External id": 82608,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865700714.974, "dur": 16.330, + "args": { + "External id": 82609,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4188 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6300865700820.373, "dur": 54.971, + "args": { + "External id": 82610,"Record function id": 0, "Ev 
Idx": 4189 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865700940.283, "dur": 45.690, + "args": { + "External id": 82611,"Record function id": 0, "Ev Idx": 4190 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6300865700995.533, "dur": 986.618, + "args": { + "External id": 82612,"Record function id": 0, "Ev Idx": 4191 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6300865701003.923, "dur": 520.209, + "args": { + "External id": 82613,"Record function id": 0, "Ev Idx": 4192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865701075.403, "dur": 9.340, + "args": { + "External id": 82614,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865701095.853, "dur": 24.670, + "args": { + "External id": 82615,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701099.323, "dur": 1.340, + "args": { + "External id": 82616,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4195 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701103.303, "dur": 0.230, + "args": { + "External id": 82617,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701104.353, "dur": 0.280, + "args": { + "External id": 82618,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701106.023, "dur": 2.350, + "args": { + "External id": 82619,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701109.223, "dur": 0.280, + "args": { + "External id": 82620,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701110.803, "dur": 0.200, + "args": { + "External id": 82621,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[1769856], [], [], []], "Ev Idx": 4200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701112.773, "dur": 0.280, + "args": { + "External id": 82622,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701113.793, "dur": 0.160, + "args": { + "External id": 82623,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701115.333, "dur": 0.230, + "args": { + "External id": 82624,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865701129.363, "dur": 25.560, + "args": { + "External id": 82625,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 
5714, "tid": 5714, + "ts": 6300865701187.333, "dur": 100.159, + "args": { + "External id": 82626,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865701198.273, "dur": 7.530, + "args": { + "External id": 82627,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865701210.083, "dur": 9.360, + "args": { + "External id": 82628,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865701212.593, "dur": 6.440, + "args": { + "External id": 82629,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701215.133, "dur": 1.840, + "args": { + "External id": 82630,"Record function id": 0, "Concrete 
Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865701227.003, "dur": 22.009, + "args": { + "External id": 82631,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701229.152, "dur": 1.191, + "args": { + "External id": 82632,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701231.403, "dur": 0.189, + "args": { + "External id": 82633,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701232.372, "dur": 0.180, + "args": { + "External id": 82634,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701234.903, 
"dur": 0.229, + "args": { + "External id": 82635,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701235.972, "dur": 0.240, + "args": { + "External id": 82636,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701237.112, "dur": 1.320, + "args": { + "External id": 82637,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701239.712, "dur": 0.171, + "args": { + "External id": 82638,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701240.612, "dur": 0.260, + "args": { + "External id": 82639,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701243.132, "dur": 1.680, + "args": { + "External id": 82640,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865701259.652, "dur": 19.080, + "args": { + "External id": 82641,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865701355.702, "dur": 95.400, + "args": { + "External id": 82642,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865701375.272, "dur": 72.540, + "args": { + "External id": 82643,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", 
"Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4222, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865701388.342, "dur": 54.660, + "args": { + "External id": 82644,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865701467.192, "dur": 3.590, + "args": { + "External id": 82645,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4224, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6300865701544.502, "dur": 274.489, + "args": { + "External id": 82646,"Record function id": 0, "Ev Idx": 4225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701650.592, "dur": 4.190, + "args": { + "External id": 82647,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4226 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701659.052, "dur": 0.900, + "args": { + "External id": 82648,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701661.851, "dur": 0.720, + "args": { + "External id": 82649,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701664.302, "dur": 0.880, + "args": { + "External id": 82650,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701667.611, "dur": 0.760, + "args": { + "External id": 82651,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701669.682, "dur": 0.729, + "args": { + "External id": 82652,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701671.922, "dur": 0.609, + "args": { + "External id": 82653,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1], []], "Input Dims": [[768], []], "Ev Idx": 4232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701674.091, "dur": 3.220, + "args": { + "External id": 82654,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701679.782, "dur": 0.720, + "args": { + "External id": 82655,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701681.882, "dur": 0.760, + "args": { + "External id": 82656,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865701696.751, "dur": 93.010, + "args": { + "External id": 82657,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865701707.731, "dur": 78.270, + "args": { + "External id": 82658,"Record 
function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865701719.051, "dur": 6.880, + "args": { + "External id": 82659,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865701728.871, "dur": 35.070, + "args": { + "External id": 82660,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865701730.322, "dur": 33.199, + "args": { + "External id": 82661,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865701733.571, "dur": 6.291, + "args": { + "External id": 82662,"Record function id": 0, "Concrete Inputs": 
["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865701742.111, "dur": 20.930, + "args": { + "External id": 82663,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865701928.691, "dur": 27.010, + "args": { + "External id": 82664,"Sequence number": 1770929, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4243 + } + }, + { + "ph": "s", "id": 273, "pid": 5714, "tid": 5714, "ts": 6300865701928.691, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865701939.751, "dur": 12.080, + "args": { + "External id": 82665,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865701947.501, "dur": 3.730, + "args": { + "External id": 82666,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 
6300865702014.591, "dur": 11.020, + "args": { + "External id": 82667,"Record function id": 0, "Ev Idx": 4246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865702026.701, "dur": 1556.436, + "args": { + "External id": 82668,"Record function id": 0, "Ev Idx": 4247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865702049.381, "dur": 112.480, + "args": { + "External id": 82669,"Sequence number": 1770930, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4248 + } + }, + { + "ph": "s", "id": 272, "pid": 5714, "tid": 5714, "ts": 6300865702049.381, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865702097.591, "dur": 27.699, + "args": { + "External id": 82670,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865702138.361, "dur": 5.449, + "args": { + "External id": 82671,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev 
Idx": 4250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865702139.690, "dur": 3.891, + "args": { + "External id": 82672,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702183.961, "dur": 10.560, + "args": { + "External id": 82673,"Record function id": 0, "Ev Idx": 4252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865702195.450, "dur": 996.158, + "args": { + "External id": 82674,"Record function id": 0, "Ev Idx": 4253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865702217.600, "dur": 213.960, + "args": { + "External id": 82675,"Sequence number": 1770931, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4254 + } + }, + { + "ph": "s", "id": 271, "pid": 5714, "tid": 5714, "ts": 6300865702217.600, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865702248.360, "dur": 33.880, + "args": { + "External id": 82676,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865702294.720, "dur": 29.290, + "args": { + "External id": 
82677,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865702340.920, "dur": 17.380, + "args": { + "External id": 82678,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865702392.980, "dur": 4.370, + "args": { + "External id": 82679,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865702404.820, "dur": 1.010, + "args": { + "External id": 82680,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865702410.780, "dur": 1.610, + "args": { + "External id": 82681,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702452.430, "dur": 9.480, + "args": { 
+ "External id": 82682,"Record function id": 0, "Ev Idx": 4261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865702462.840, "dur": 446.959, + "args": { + "External id": 82683,"Record function id": 0, "Ev Idx": 4262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702480.810, "dur": 3.120, + "args": { + "External id": 82684,"Record function id": 0, "Ev Idx": 4263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865702484.580, "dur": 214.759, + "args": { + "External id": 82685,"Record function id": 0, "Ev Idx": 4264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865702498.190, "dur": 199.859, + "args": { + "External id": 82686,"Sequence number": 1770932, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4265 + } + }, + { + "ph": "s", "id": 270, "pid": 5714, "tid": 5714, "ts": 6300865702498.190, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702503.990, "dur": 5.230, + "args": { + "External id": 82687,"Record function id": 0, "Ev Idx": 4266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865702510.010, "dur": 178.939, + "args": { + "External id": 82688,"Record function id": 0, "Ev Idx": 4267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, 
"tid": 5714, + "ts": 6300865702534.010, "dur": 3.630, + "args": { + "External id": 82689,"Record function id": 0, "Ev Idx": 4268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865702538.380, "dur": 125.789, + "args": { + "External id": 82690,"Record function id": 0, "Ev Idx": 4269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702541.160, "dur": 4.209, + "args": { + "External id": 82691,"Record function id": 0, "Ev Idx": 4270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865702546.109, "dur": 115.370, + "args": { + "External id": 82692,"Record function id": 0, "Ev Idx": 4271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702583.580, "dur": 5.800, + "args": { + "External id": 82693,"Record function id": 0, "Ev Idx": 4272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865702590.620, "dur": 69.889, + "args": { + "External id": 82694,"Record function id": 0, "Ev Idx": 4273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865702624.500, "dur": 24.169, + "args": { + "External id": 82695,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", 
"Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702670.349, "dur": 3.640, + "args": { + "External id": 82696,"Record function id": 0, "Ev Idx": 4275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865702674.649, "dur": 13.620, + "args": { + "External id": 82697,"Record function id": 0, "Ev Idx": 4276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702703.229, "dur": 4.570, + "args": { + "External id": 82698,"Record function id": 0, "Ev Idx": 4277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865702708.429, "dur": 200.970, + "args": { + "External id": 82699,"Record function id": 0, "Ev Idx": 4278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702711.469, "dur": 2.400, + "args": { + "External id": 82700,"Record function id": 0, "Ev Idx": 4279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865702714.439, "dur": 194.100, + "args": { + "External id": 82701,"Record function id": 0, "Ev Idx": 4280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865702728.189, "dur": 179.250, + "args": { + "External id": 82702,"Sequence number": 1770933, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4281 + } + }, + { + "ph": "s", "id": 269, "pid": 5714, "tid": 5714, "ts": 6300865702728.189, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702733.889, "dur": 4.630, + "args": { + "External id": 82703,"Record function id": 0, "Ev Idx": 4282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865702739.269, "dur": 160.400, + "args": { + "External id": 82704,"Record function id": 0, "Ev Idx": 4283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702761.119, "dur": 3.060, + "args": { + "External id": 82705,"Record function id": 0, "Ev Idx": 4284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865702764.999, "dur": 111.950, + "args": { + "External id": 82706,"Record function id": 0, "Ev Idx": 4285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702768.129, "dur": 3.880, + "args": { + "External id": 82707,"Record function id": 0, "Ev Idx": 4286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865702772.719, "dur": 102.050, + "args": { + "External id": 82708,"Record function id": 0, "Ev Idx": 4287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702805.009, "dur": 5.410, + "args": { + "External id": 82709,"Record function id": 0, "Ev Idx": 4288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled 
Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865702811.399, "dur": 62.390, + "args": { + "External id": 82710,"Record function id": 0, "Ev Idx": 4289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865702841.149, "dur": 21.280, + "args": { + "External id": 82711,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702882.739, "dur": 3.150, + "args": { + "External id": 82712,"Record function id": 0, "Ev Idx": 4291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865702886.569, "dur": 12.490, + "args": { + "External id": 82713,"Record function id": 0, "Ev Idx": 4292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865702919.539, "dur": 9.500, + "args": { + "External id": 82714,"Record function id": 0, "Ev Idx": 4293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 
6300865702929.959, "dur": 260.369, + "args": { + "External id": 82715,"Record function id": 0, "Ev Idx": 4294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865702951.939, "dur": 227.449, + "args": { + "External id": 82716,"Sequence number": 1770934, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4295 + } + }, + { + "ph": "s", "id": 268, "pid": 5714, "tid": 5714, "ts": 6300865702951.939, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865702976.749, "dur": 120.419, + "args": { + "External id": 82717,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865703014.059, "dur": 12.500, + "args": { + "External id": 82718,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, 
"tid": 5714, + "ts": 6300865703017.019, "dur": 8.589, + "args": { + "External id": 82719,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865703028.359, "dur": 6.520, + "args": { + "External id": 82720,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865703036.619, "dur": 2.660, + "args": { + "External id": 82721,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865703041.879, "dur": 4.840, + "args": { + "External id": 82722,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865703114.618, "dur": 31.380, + "args": { + "External id": 82723,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4302 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865703200.848, "dur": 21.190, + "args": { + "External id": 82724,"Record function id": 0, "Ev Idx": 4303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865703223.028, "dur": 356.949, + "args": { + "External id": 82725,"Record function id": 0, "Ev Idx": 4304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865703248.718, "dur": 320.269, + "args": { + "External id": 82726,"Sequence number": 1770935, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4305 + } + }, + { + "ph": "s", "id": 267, "pid": 5714, "tid": 5714, "ts": 6300865703248.718, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865703309.818, "dur": 29.270, + "args": { + "External id": 82727,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 
768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865703354.948, "dur": 27.840, + "args": { + "External id": 82728,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865703394.808, "dur": 18.360, + "args": { + "External id": 82729,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865703437.258, "dur": 20.129, + "args": { + "External id": 82730,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865703471.107, "dur": 24.951, + "args": { + "External id": 82731,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": 
[[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865703518.138, "dur": 16.739, + "args": { + "External id": 82732,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4311 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6300865703624.227, "dur": 54.570, + "args": { + "External id": 82733,"Record function id": 0, "Ev Idx": 4312 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865703742.947, "dur": 45.140, + "args": { + "External id": 82734,"Record function id": 0, "Ev Idx": 4313 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6300865703797.487, "dur": 982.017, + "args": { + "External id": 82735,"Record function id": 0, "Ev Idx": 4314 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6300865703805.107, "dur": 522.458, + "args": { + "External id": 82736,"Record function id": 0, "Ev Idx": 4315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865703874.777, "dur": 8.920, + "args": { + "External id": 82737,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", 
"Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865703894.326, "dur": 25.260, + "args": { + "External id": 82738,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703897.977, "dur": 1.309, + "args": { + "External id": 82739,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703902.026, "dur": 0.351, + "args": { + "External id": 82740,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703903.297, "dur": 0.320, + "args": { + "External id": 82741,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703904.786, "dur": 2.520, + "args": { + "External id": 82742,"Record function id": 0, "Concrete 
Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703908.197, "dur": 0.269, + "args": { + "External id": 82743,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703909.617, "dur": 0.240, + "args": { + "External id": 82744,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703911.737, "dur": 0.349, + "args": { + "External id": 82745,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703912.817, "dur": 0.289, + "args": { + "External id": 82746,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865703914.457, "dur": 0.180, + 
"args": { + "External id": 82747,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865703928.166, "dur": 26.400, + "args": { + "External id": 82748,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865703987.546, "dur": 106.240, + "args": { + "External id": 82749,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865703998.666, "dur": 7.880, + "args": { + "External id": 82750,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4329 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865704010.676, "dur": 9.200, + "args": { + "External id": 82751,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865704013.166, "dur": 6.280, + "args": { + "External id": 82752,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704015.746, "dur": 1.740, + "args": { + "External id": 82753,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865704027.576, "dur": 25.500, + "args": { + "External id": 82754,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704029.866, "dur": 1.360, + "args": { + "External id": 82755,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 4334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704032.346, "dur": 0.180, + "args": { + "External id": 82756,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704036.176, "dur": 0.280, + "args": { + "External id": 82757,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704038.776, "dur": 0.230, + "args": { + "External id": 82758,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704039.746, "dur": 0.180, + "args": { + "External id": 82759,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704040.776, "dur": 1.410, + "args": { + "External id": 82760,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704043.776, "dur": 0.260, + "args": { + "External id": 82761,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704044.776, "dur": 0.180, + "args": { + "External id": 82762,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704047.436, "dur": 1.260, + "args": { + "External id": 82763,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865704065.076, "dur": 20.130, + "args": { + "External id": 82764,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4343 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865704150.256, "dur": 94.980, + "args": { + "External id": 82765,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865704170.766, "dur": 71.180, + "args": { + "External id": 82766,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4345, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865704182.806, "dur": 54.660, + "args": { + "External id": 82767,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865704261.146, "dur": 3.530, + "args": { + "External id": 82768,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], 
"Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4347, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6300865704348.225, "dur": 271.000, + "args": { + "External id": 82769,"Record function id": 0, "Ev Idx": 4348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704443.705, "dur": 4.160, + "args": { + "External id": 82770,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704452.475, "dur": 0.870, + "args": { + "External id": 82771,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704455.285, "dur": 0.680, + "args": { + "External id": 82772,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704457.845, "dur": 0.740, + "args": { + "External id": 82773,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4352 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704461.535, "dur": 0.860, + "args": { + "External id": 82774,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704463.805, "dur": 0.680, + "args": { + "External id": 82775,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704466.095, "dur": 0.860, + "args": { + "External id": 82776,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704468.535, "dur": 2.790, + "args": { + "External id": 82777,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704473.765, "dur": 1.020, + "args": { + "External id": 82778,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704476.025, "dur": 0.780, + "args": { + "External id": 82779,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865704490.415, "dur": 99.770, + "args": { + "External id": 82780,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865704501.915, "dur": 84.530, + "args": { + "External id": 82781,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865704514.335, "dur": 7.150, + "args": { + "External id": 82782,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + 
"ts": 6300865704524.085, "dur": 40.380, + "args": { + "External id": 82783,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865704526.815, "dur": 37.270, + "args": { + "External id": 82784,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865704532.875, "dur": 8.970, + "args": { + "External id": 82785,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865704543.045, "dur": 20.540, + "args": { + "External id": 82786,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865704732.595, "dur": 21.249, + "args": { + "External id": 82787,"Sequence number": 1770936, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4366 + } + }, + { + "ph": "s", "id": 
266, "pid": 5714, "tid": 5714, "ts": 6300865704732.595, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865704743.515, "dur": 6.549, + "args": { + "External id": 82788,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865704745.915, "dur": 3.680, + "args": { + "External id": 82789,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865704810.064, "dur": 10.560, + "args": { + "External id": 82790,"Record function id": 0, "Ev Idx": 4369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865704821.615, "dur": 1547.706, + "args": { + "External id": 82791,"Record function id": 0, "Ev Idx": 4370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865704844.214, "dur": 113.780, + "args": { + "External id": 82792,"Sequence number": 1770937, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4371 + } + }, + { + "ph": "s", "id": 265, "pid": 5714, "tid": 5714, "ts": 6300865704844.214, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 
6300865704893.544, "dur": 26.960, + "args": { + "External id": 82793,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865704933.444, "dur": 5.420, + "args": { + "External id": 82794,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865704934.684, "dur": 3.930, + "args": { + "External id": 82795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865704979.644, "dur": 10.350, + "args": { + "External id": 82796,"Record function id": 0, "Ev Idx": 4375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865704990.894, "dur": 985.368, + "args": { + "External id": 82797,"Record function id": 0, "Ev Idx": 4376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865705013.514, "dur": 194.989, + "args": { + "External id": 82798,"Sequence number": 1770938, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4377 + } + }, + { + "ph": "s", "id": 264, "pid": 5714, "tid": 5714, "ts": 6300865705013.514, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865705042.414, "dur": 34.270, + "args": { + "External id": 82799,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865705088.724, "dur": 18.730, + "args": { + "External id": 82800,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865705119.384, "dur": 16.440, + "args": { + "External id": 82801,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865705170.364, "dur": 4.080, + "args": { + "External id": 82802,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input 
Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865705182.063, "dur": 1.040, + "args": { + "External id": 82803,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865705188.083, "dur": 1.671, + "args": { + "External id": 82804,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705228.903, "dur": 9.960, + "args": { + "External id": 82805,"Record function id": 0, "Ev Idx": 4384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865705239.854, "dur": 458.208, + "args": { + "External id": 82806,"Record function id": 0, "Ev Idx": 4385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705257.634, "dur": 2.980, + "args": { + "External id": 82807,"Record function id": 0, "Ev Idx": 4386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865705261.343, "dur": 223.750, + "args": { + "External id": 82808,"Record function id": 0, "Ev Idx": 4387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865705274.574, "dur": 209.319, + "args": { + "External id": 82809,"Sequence number": 1770939, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": 
["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4388 + } + }, + { + "ph": "s", "id": 263, "pid": 5714, "tid": 5714, "ts": 6300865705274.574, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705281.693, "dur": 5.180, + "args": { + "External id": 82810,"Record function id": 0, "Ev Idx": 4389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865705287.673, "dur": 187.500, + "args": { + "External id": 82811,"Record function id": 0, "Ev Idx": 4390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705320.613, "dur": 3.640, + "args": { + "External id": 82812,"Record function id": 0, "Ev Idx": 4391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865705325.093, "dur": 126.260, + "args": { + "External id": 82813,"Record function id": 0, "Ev Idx": 4392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705328.373, "dur": 4.120, + "args": { + "External id": 82814,"Record function id": 0, "Ev Idx": 4393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865705333.213, "dur": 115.500, + "args": { + "External id": 82815,"Record function id": 0, "Ev Idx": 4394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705371.513, "dur": 5.520, + "args": { + "External id": 82816,"Record function id": 0, 
"Ev Idx": 4395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865705378.243, "dur": 69.420, + "args": { + "External id": 82817,"Record function id": 0, "Ev Idx": 4396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865705409.283, "dur": 26.120, + "args": { + "External id": 82818,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705457.363, "dur": 3.390, + "args": { + "External id": 82819,"Record function id": 0, "Ev Idx": 4398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865705461.393, "dur": 13.080, + "args": { + "External id": 82820,"Record function id": 0, "Ev Idx": 4399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705488.933, "dur": 4.130, + "args": { + "External id": 82821,"Record function id": 0, "Ev Idx": 4400 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865705493.783, "dur": 203.779, + "args": { + "External id": 82822,"Record function id": 0, "Ev Idx": 4401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705496.813, "dur": 2.350, + "args": { + "External id": 82823,"Record function id": 0, "Ev Idx": 4402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865705499.883, "dur": 196.750, + "args": { + "External id": 82824,"Record function id": 0, "Ev Idx": 4403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865705513.983, "dur": 181.519, + "args": { + "External id": 82825,"Sequence number": 1770940, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4404 + } + }, + { + "ph": "s", "id": 262, "pid": 5714, "tid": 5714, "ts": 6300865705513.983, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705518.743, "dur": 4.660, + "args": { + "External id": 82826,"Record function id": 0, "Ev Idx": 4405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865705524.143, "dur": 163.419, + "args": { + "External id": 82827,"Record function id": 0, "Ev Idx": 4406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705545.783, "dur": 3.140, + "args": { + "External id": 82828,"Record 
function id": 0, "Ev Idx": 4407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865705549.753, "dur": 114.849, + "args": { + "External id": 82829,"Record function id": 0, "Ev Idx": 4408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705553.063, "dur": 3.830, + "args": { + "External id": 82830,"Record function id": 0, "Ev Idx": 4409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865705557.673, "dur": 104.680, + "args": { + "External id": 82831,"Record function id": 0, "Ev Idx": 4410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705590.773, "dur": 5.240, + "args": { + "External id": 82832,"Record function id": 0, "Ev Idx": 4411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865705596.943, "dur": 64.439, + "args": { + "External id": 82833,"Record function id": 0, "Ev Idx": 4412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865705627.502, "dur": 22.680, + "args": { + "External id": 82834,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], 
[], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705669.902, "dur": 3.091, + "args": { + "External id": 82835,"Record function id": 0, "Ev Idx": 4414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865705673.633, "dur": 13.300, + "args": { + "External id": 82836,"Record function id": 0, "Ev Idx": 4415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705708.333, "dur": 9.480, + "args": { + "External id": 82837,"Record function id": 0, "Ev Idx": 4416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865705718.733, "dur": 256.289, + "args": { + "External id": 82838,"Record function id": 0, "Ev Idx": 4417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865705740.592, "dur": 223.740, + "args": { + "External id": 82839,"Sequence number": 1770941, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4418 + } + }, + { + "ph": "s", "id": 261, "pid": 5714, "tid": 5714, "ts": 6300865705740.592, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865705765.402, "dur": 118.050, + "args": { + "External id": 82840,"Record function id": 0, "Concrete 
Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865705802.152, "dur": 12.580, + "args": { + "External id": 82841,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865705805.102, "dur": 8.550, + "args": { + "External id": 82842,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865705816.492, "dur": 6.370, + "args": { + "External id": 82843,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865705824.772, "dur": 2.880, + "args": { + "External id": 82844,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", 
""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865705830.352, "dur": 3.670, + "args": { + "External id": 82845,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865705900.742, "dur": 31.470, + "args": { + "External id": 82846,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865705985.462, "dur": 20.340, + "args": { + "External id": 82847,"Record function id": 0, "Ev Idx": 4426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865706006.802, "dur": 359.249, + "args": { + "External id": 82848,"Record function id": 0, "Ev Idx": 4427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865706033.302, "dur": 321.969, + "args": { + "External id": 82849,"Sequence number": 1770942, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 
768], [768, 2048]], "Ev Idx": 4428 + } + }, + { + "ph": "s", "id": 260, "pid": 5714, "tid": 5714, "ts": 6300865706033.302, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865706086.892, "dur": 28.629, + "args": { + "External id": 82850,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865706130.652, "dur": 27.500, + "args": { + "External id": 82851,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865706170.161, "dur": 18.020, + "args": { + "External id": 82852,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, 
"tid": 5714, + "ts": 6300865706214.971, "dur": 20.210, + "args": { + "External id": 82853,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865706247.591, "dur": 25.110, + "args": { + "External id": 82854,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865706292.721, "dur": 27.000, + "args": { + "External id": 82855,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4434 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6300865706410.521, "dur": 56.540, + "args": { + "External id": 82856,"Record function id": 0, "Ev Idx": 4435 + } + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865706530.660, "dur": 45.640, + "args": { + "External id": 82857,"Record function id": 0, "Ev Idx": 4436 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6300865706585.360, "dur": 963.348, + "args": { + "External id": 82858,"Record function id": 0, "Ev Idx": 4437 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6300865706593.431, "dur": 505.358, + "args": { + "External id": 82859,"Record function id": 0, "Ev Idx": 4438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865706661.940, "dur": 9.330, + "args": { + "External id": 82860,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865706681.540, "dur": 24.890, + "args": { + "External id": 82861,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706684.910, "dur": 1.400, + "args": { + "External id": 82862,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706688.980, "dur": 0.300, + "args": { + "External id": 82863,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706690.230, "dur": 0.280, + "args": { + "External id": 82864,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706691.820, "dur": 2.570, + "args": { + "External id": 82865,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706695.180, "dur": 0.200, + "args": { + "External id": 82866,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706696.650, "dur": 0.280, + "args": { + "External id": 82867,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev 
Idx": 4446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706698.840, "dur": 0.270, + "args": { + "External id": 82868,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706699.830, "dur": 0.310, + "args": { + "External id": 82869,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706701.270, "dur": 0.170, + "args": { + "External id": 82870,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865706715.120, "dur": 25.600, + "args": { + "External id": 82871,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 
6300865706773.660, "dur": 100.480, + "args": { + "External id": 82872,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865706784.400, "dur": 7.510, + "args": { + "External id": 82873,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865706796.120, "dur": 9.290, + "args": { + "External id": 82874,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865706798.650, "dur": 6.340, + "args": { + "External id": 82875,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706801.300, "dur": 1.830, + "args": { + "External id": 82876,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", 
"[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865706812.690, "dur": 22.440, + "args": { + "External id": 82877,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706814.870, "dur": 1.140, + "args": { + "External id": 82878,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706816.980, "dur": 0.240, + "args": { + "External id": 82879,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706817.960, "dur": 0.310, + "args": { + "External id": 82880,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706820.810, "dur": 0.190, + "args": { + 
"External id": 82881,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706821.850, "dur": 0.260, + "args": { + "External id": 82882,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706822.860, "dur": 1.750, + "args": { + "External id": 82883,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706825.840, "dur": 0.170, + "args": { + "External id": 82884,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865706826.730, "dur": 0.240, + "args": { + "External id": 82885,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 5714, + "ts": 6300865706829.620, "dur": 1.120, + "args": { + "External id": 82886,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865706846.550, "dur": 18.870, + "args": { + "External id": 82887,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865706931.420, "dur": 94.190, + "args": { + "External id": 82888,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865706951.270, "dur": 71.049, + "args": { + "External id": 82889,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global 
rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4468, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865706963.219, "dur": 54.391, + "args": { + "External id": 82890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865707042.609, "dur": 3.330, + "args": { + "External id": 82891,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4470, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6300865707118.269, "dur": 272.940, + "args": { + "External id": 82892,"Record function id": 0, "Ev Idx": 4471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707211.319, "dur": 4.080, + "args": { + "External id": 82893,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 
5714, "tid": 5714, + "ts": 6300865707219.939, "dur": 0.980, + "args": { + "External id": 82894,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707222.979, "dur": 0.660, + "args": { + "External id": 82895,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707225.409, "dur": 0.720, + "args": { + "External id": 82896,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707227.629, "dur": 0.720, + "args": { + "External id": 82897,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707229.769, "dur": 0.800, + "args": { + "External id": 82898,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707232.139, "dur": 0.770, + "args": { + "External id": 82899,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev 
Idx": 4478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707235.619, "dur": 3.050, + "args": { + "External id": 82900,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707240.169, "dur": 0.790, + "args": { + "External id": 82901,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707242.349, "dur": 0.720, + "args": { + "External id": 82902,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865707256.839, "dur": 104.840, + "args": { + "External id": 82903,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865707268.229, "dur": 89.560, + "args": { + "External id": 82904,"Record function id": 0, "Concrete Inputs": ["", 
"[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865707280.559, "dur": 7.190, + "args": { + "External id": 82905,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865707291.619, "dur": 43.540, + "args": { + "External id": 82906,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865707293.099, "dur": 41.690, + "args": { + "External id": 82907,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865707305.249, "dur": 6.970, + "args": { + "External id": 82908,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], 
"Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865707313.519, "dur": 20.710, + "args": { + "External id": 82909,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865707502.198, "dur": 21.320, + "args": { + "External id": 82910,"Sequence number": 1770943, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4489 + } + }, + { + "ph": "s", "id": 259, "pid": 5714, "tid": 5714, "ts": 6300865707502.198, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865707513.168, "dur": 6.610, + "args": { + "External id": 82911,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865707515.578, "dur": 3.670, + "args": { + "External id": 82912,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865707579.768, "dur": 10.320, + "args": { + 
"External id": 82913,"Record function id": 0, "Ev Idx": 4492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865707591.048, "dur": 1550.217, + "args": { + "External id": 82914,"Record function id": 0, "Ev Idx": 4493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865707613.228, "dur": 112.740, + "args": { + "External id": 82915,"Sequence number": 1770944, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4494 + } + }, + { + "ph": "s", "id": 258, "pid": 5714, "tid": 5714, "ts": 6300865707613.228, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865707661.808, "dur": 27.050, + "args": { + "External id": 82916,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865707702.258, "dur": 5.150, + "args": { + "External id": 82917,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4496 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865707703.528, "dur": 3.640, + "args": { + "External id": 82918,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865707747.708, "dur": 10.920, + "args": { + "External id": 82919,"Record function id": 0, "Ev Idx": 4498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865707759.468, "dur": 1000.658, + "args": { + "External id": 82920,"Record function id": 0, "Ev Idx": 4499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865707783.088, "dur": 195.499, + "args": { + "External id": 82921,"Sequence number": 1770945, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4500 + } + }, + { + "ph": "s", "id": 257, "pid": 5714, "tid": 5714, "ts": 6300865707783.088, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865707812.198, "dur": 33.819, + "args": { + "External id": 82922,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865707858.228, "dur": 18.520, + "args": { + "External id": 82923,"Record function id": 0, "Concrete 
Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865707889.657, "dur": 15.991, + "args": { + "External id": 82924,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865707940.437, "dur": 4.100, + "args": { + "External id": 82925,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865707952.097, "dur": 1.170, + "args": { + "External id": 82926,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865707957.957, "dur": 1.580, + "args": { + "External id": 82927,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865707998.507, "dur": 9.370, + "args": { + "External id": 82928,"Record function 
id": 0, "Ev Idx": 4507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865708008.797, "dur": 474.039, + "args": { + "External id": 82929,"Record function id": 0, "Ev Idx": 4508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708026.717, "dur": 3.070, + "args": { + "External id": 82930,"Record function id": 0, "Ev Idx": 4509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865708030.437, "dur": 211.210, + "args": { + "External id": 82931,"Record function id": 0, "Ev Idx": 4510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865708043.737, "dur": 196.620, + "args": { + "External id": 82932,"Sequence number": 1770946, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4511 + } + }, + { + "ph": "s", "id": 256, "pid": 5714, "tid": 5714, "ts": 6300865708043.737, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708049.627, "dur": 5.340, + "args": { + "External id": 82933,"Record function id": 0, "Ev Idx": 4512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865708055.687, "dur": 175.790, + "args": { + "External id": 82934,"Record function id": 0, "Ev Idx": 4513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708078.907, 
"dur": 3.370, + "args": { + "External id": 82935,"Record function id": 0, "Ev Idx": 4514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865708082.987, "dur": 124.200, + "args": { + "External id": 82936,"Record function id": 0, "Ev Idx": 4515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708086.007, "dur": 4.000, + "args": { + "External id": 82937,"Record function id": 0, "Ev Idx": 4516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865708090.677, "dur": 114.310, + "args": { + "External id": 82938,"Record function id": 0, "Ev Idx": 4517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708128.057, "dur": 5.690, + "args": { + "External id": 82939,"Record function id": 0, "Ev Idx": 4518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865708135.157, "dur": 68.750, + "args": { + "External id": 82940,"Record function id": 0, "Ev Idx": 4519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865708167.727, "dur": 24.200, + "args": { + "External id": 82941,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 
768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708213.477, "dur": 3.230, + "args": { + "External id": 82942,"Record function id": 0, "Ev Idx": 4521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865708217.367, "dur": 13.390, + "args": { + "External id": 82943,"Record function id": 0, "Ev Idx": 4522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708245.387, "dur": 4.200, + "args": { + "External id": 82944,"Record function id": 0, "Ev Idx": 4523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865708263.227, "dur": 219.129, + "args": { + "External id": 82945,"Record function id": 0, "Ev Idx": 4524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708266.596, "dur": 2.540, + "args": { + "External id": 82946,"Record function id": 0, "Ev Idx": 4525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865708269.776, "dur": 211.620, + "args": { + "External id": 82947,"Record function id": 0, "Ev Idx": 4526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865708283.356, "dur": 196.920, + "args": { + "External id": 82948,"Sequence number": 1770947, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", 
"Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4527 + } + }, + { + "ph": "s", "id": 255, "pid": 5714, "tid": 5714, "ts": 6300865708283.356, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708289.567, "dur": 4.500, + "args": { + "External id": 82949,"Record function id": 0, "Ev Idx": 4528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865708294.787, "dur": 176.689, + "args": { + "External id": 82950,"Record function id": 0, "Ev Idx": 4529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708325.416, "dur": 4.211, + "args": { + "External id": 82951,"Record function id": 0, "Ev Idx": 4530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865708330.596, "dur": 118.330, + "args": { + "External id": 82952,"Record function id": 0, "Ev Idx": 4531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708335.016, "dur": 4.420, + "args": { + "External id": 82953,"Record function id": 0, "Ev Idx": 4532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865708340.107, "dur": 106.599, + "args": { + "External id": 82954,"Record function id": 0, "Ev Idx": 4533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708374.396, "dur": 5.230, + "args": { + "External id": 82955,"Record function id": 0, "Ev Idx": 4534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 
6300865708380.486, "dur": 65.330, + "args": { + "External id": 82956,"Record function id": 0, "Ev Idx": 4535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865708411.506, "dur": 22.920, + "args": { + "External id": 82957,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708454.486, "dur": 3.010, + "args": { + "External id": 82958,"Record function id": 0, "Ev Idx": 4537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865708458.126, "dur": 12.750, + "args": { + "External id": 82959,"Record function id": 0, "Ev Idx": 4538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708492.966, "dur": 9.570, + "args": { + "External id": 82960,"Record function id": 0, "Ev Idx": 4539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865708503.456, "dur": 255.430, + "args": { + "External id": 
82961,"Record function id": 0, "Ev Idx": 4540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865708525.736, "dur": 222.519, + "args": { + "External id": 82962,"Sequence number": 1770948, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4541 + } + }, + { + "ph": "s", "id": 254, "pid": 5714, "tid": 5714, "ts": 6300865708525.736, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865708550.246, "dur": 117.750, + "args": { + "External id": 82963,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865708587.426, "dur": 12.810, + "args": { + "External id": 82964,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865708590.566, "dur": 8.660, + "args": { + 
"External id": 82965,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865708602.056, "dur": 6.130, + "args": { + "External id": 82966,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865708609.896, "dur": 2.470, + "args": { + "External id": 82967,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865708614.906, "dur": 3.720, + "args": { + "External id": 82968,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865708685.446, "dur": 30.820, + "args": { + "External id": 82969,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache 
Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865708769.295, "dur": 20.691, + "args": { + "External id": 82970,"Record function id": 0, "Ev Idx": 4549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865708791.006, "dur": 347.189, + "args": { + "External id": 82971,"Record function id": 0, "Ev Idx": 4550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865708817.835, "dur": 309.390, + "args": { + "External id": 82972,"Sequence number": 1770949, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4551 + } + }, + { + "ph": "s", "id": 253, "pid": 5714, "tid": 5714, "ts": 6300865708817.835, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865708870.675, "dur": 28.890, + "args": { + "External id": 82973,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], 
[]], "Ev Idx": 4552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865708915.485, "dur": 28.060, + "args": { + "External id": 82974,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865708955.545, "dur": 17.890, + "args": { + "External id": 82975,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865708997.155, "dur": 20.460, + "args": { + "External id": 82976,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865709030.045, "dur": 25.000, + "args": { + "External id": 82977,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4556 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865709076.375, "dur": 16.410, + "args": { + "External id": 82978,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4557 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6300865709182.394, "dur": 55.580, + "args": { + "External id": 82979,"Record function id": 0, "Ev Idx": 4558 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865709311.254, "dur": 46.650, + "args": { + "External id": 82980,"Record function id": 0, "Ev Idx": 4559 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6300865709367.734, "dur": 974.428, + "args": { + "External id": 82981,"Record function id": 0, "Ev Idx": 4560 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6300865709375.924, "dur": 510.519, + "args": { + "External id": 82982,"Record function id": 0, "Ev Idx": 4561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865709446.054, "dur": 8.980, + "args": { + "External id": 82983,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], []], "Ev Idx": 4562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865709465.534, "dur": 28.140, + "args": { + "External id": 82984,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709469.004, "dur": 1.330, + "args": { + "External id": 82985,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709473.024, "dur": 0.410, + "args": { + "External id": 82986,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709474.224, "dur": 0.180, + "args": { + "External id": 82987,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709475.584, "dur": 1.560, + "args": { + "External id": 82988,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709479.094, "dur": 0.310, + "args": { + "External id": 82989,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709480.744, "dur": 0.310, + "args": { + "External id": 82990,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709481.864, "dur": 1.250, + "args": { + "External id": 82991,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709483.874, "dur": 0.250, + "args": { + "External id": 82992,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709487.984, "dur": 0.240, + "args": { + "External id": 82993,"Record function id": 0, "Concrete Inputs": ["", 
"[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865709502.014, "dur": 25.450, + "args": { + "External id": 82994,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865709559.374, "dur": 102.390, + "args": { + "External id": 82995,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865709570.724, "dur": 7.420, + "args": { + "External id": 82996,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865709582.353, 
"dur": 9.231, + "args": { + "External id": 82997,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865709584.833, "dur": 6.351, + "args": { + "External id": 82998,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709587.364, "dur": 1.960, + "args": { + "External id": 82999,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865709599.173, "dur": 21.531, + "args": { + "External id": 83000,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709601.424, "dur": 1.380, + "args": { + "External id": 83001,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709603.773, "dur": 0.191, + "args": { + "External id": 83002,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709604.713, "dur": 0.291, + "args": { + "External id": 83003,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709607.364, "dur": 0.260, + "args": { + "External id": 83004,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709608.364, "dur": 0.280, + "args": { + "External id": 83005,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709609.413, "dur": 1.351, + "args": { + "External id": 83006,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev 
Idx": 4585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709611.913, "dur": 0.251, + "args": { + "External id": 83007,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709612.924, "dur": 0.249, + "args": { + "External id": 83008,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865709615.604, "dur": 1.129, + "args": { + "External id": 83009,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865709633.444, "dur": 19.249, + "args": { + "External id": 83010,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865709717.953, 
"dur": 93.950, + "args": { + "External id": 83011,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865709737.353, "dur": 71.210, + "args": { + "External id": 83012,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4591, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865709749.153, "dur": 54.960, + "args": { + "External id": 83013,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865709827.773, "dur": 3.700, + "args": { + "External id": 83014,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], 
"Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4593, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6300865709906.183, "dur": 266.589, + "args": { + "External id": 83015,"Record function id": 0, "Ev Idx": 4594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710000.783, "dur": 4.360, + "args": { + "External id": 83016,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710009.563, "dur": 1.000, + "args": { + "External id": 83017,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710012.503, "dur": 0.720, + "args": { + "External id": 83018,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710015.213, "dur": 0.760, + "args": { + "External id": 83019,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 
6300865710017.453, "dur": 0.750, + "args": { + "External id": 83020,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710021.083, "dur": 0.649, + "args": { + "External id": 83021,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710023.392, "dur": 0.851, + "args": { + "External id": 83022,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710026.063, "dur": 2.669, + "args": { + "External id": 83023,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710030.212, "dur": 0.640, + "args": { + "External id": 83024,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710033.332, "dur": 0.680, + "args": { + "External id": 83025,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4604 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865710047.923, "dur": 96.169, + "args": { + "External id": 83026,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865710059.352, "dur": 80.780, + "args": { + "External id": 83027,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865710071.703, "dur": 7.400, + "args": { + "External id": 83028,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865710081.592, "dur": 35.391, + "args": { + "External id": 83029,"Record function id": 0, 
"Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865710083.052, "dur": 33.571, + "args": { + "External id": 83030,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865710086.603, "dur": 7.709, + "args": { + "External id": 83031,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865710095.372, "dur": 20.760, + "args": { + "External id": 83032,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865710284.652, "dur": 30.750, + "args": { + "External id": 83033,"Sequence number": 1770950, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4612 + } + }, + { + "ph": "s", "id": 252, "pid": 5714, "tid": 5714, "ts": 6300865710284.652, + "cat": "fwdbwd", "name": "fwdbwd" + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865710295.822, "dur": 15.430, + "args": { + "External id": 83034,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865710306.872, "dur": 3.770, + "args": { + "External id": 83035,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710373.202, "dur": 11.310, + "args": { + "External id": 83036,"Record function id": 0, "Ev Idx": 4615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865710385.492, "dur": 1534.186, + "args": { + "External id": 83037,"Record function id": 0, "Ev Idx": 4616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865710408.382, "dur": 111.400, + "args": { + "External id": 83038,"Sequence number": 1770951, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4617 + } + }, + { + "ph": "s", "id": 251, "pid": 5714, "tid": 5714, "ts": 6300865710408.382, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865710456.102, "dur": 27.420, + "args": { + "External id": 83039,"kernel_hash": 
"cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865710496.162, "dur": 5.069, + "args": { + "External id": 83040,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865710497.451, "dur": 3.540, + "args": { + "External id": 83041,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710541.931, "dur": 11.151, + "args": { + "External id": 83042,"Record function id": 0, "Ev Idx": 4621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865710554.011, "dur": 989.268, + "args": { + "External id": 83043,"Record function id": 0, "Ev Idx": 4622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865710580.411, "dur": 199.910, + "args": { + "External id": 83044,"Sequence number": 1770952, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4623 + } + }, + { + "ph": "s", "id": 250, "pid": 5714, "tid": 5714, "ts": 6300865710580.411, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865710611.421, "dur": 33.810, + "args": { + "External id": 83045,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865710658.091, "dur": 18.400, + "args": { + "External id": 83046,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865710687.831, "dur": 17.000, + "args": { + "External id": 83047,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865710741.181, "dur": 3.200, + "args": { + "External id": 83048,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4627 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865710751.851, "dur": 2.140, + "args": { + "External id": 83049,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865710758.731, "dur": 1.840, + "args": { + "External id": 83050,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710800.331, "dur": 9.640, + "args": { + "External id": 83051,"Record function id": 0, "Ev Idx": 4630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865710810.921, "dur": 441.949, + "args": { + "External id": 83052,"Record function id": 0, "Ev Idx": 4631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710828.151, "dur": 2.930, + "args": { + "External id": 83053,"Record function id": 0, "Ev Idx": 4632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865710831.751, "dur": 211.729, + "args": { + "External id": 83054,"Record function id": 0, "Ev Idx": 4633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865710845.131, "dur": 197.079, + "args": { + "External id": 83055,"Sequence number": 1770953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4634 + } + }, + { + "ph": "s", "id": 249, "pid": 5714, "tid": 5714, "ts": 6300865710845.131, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710851.581, "dur": 5.660, + "args": { + "External id": 83056,"Record function id": 0, "Ev Idx": 4635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865710857.991, "dur": 175.429, + "args": { + "External id": 83057,"Record function id": 0, "Ev Idx": 4636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710882.401, "dur": 3.340, + "args": { + "External id": 83058,"Record function id": 0, "Ev Idx": 4637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865710886.491, "dur": 122.669, + "args": { + "External id": 83059,"Record function id": 0, "Ev Idx": 4638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710889.491, "dur": 3.960, + "args": { + "External id": 83060,"Record function id": 0, "Ev Idx": 4639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865710894.251, "dur": 112.589, + "args": { + "External id": 83061,"Record function id": 0, "Ev Idx": 4640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865710931.001, "dur": 5.900, + "args": { + "External id": 83062,"Record function id": 0, "Ev Idx": 4641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled 
Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865710938.041, "dur": 67.839, + "args": { + "External id": 83063,"Record function id": 0, "Ev Idx": 4642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865710970.210, "dur": 23.140, + "args": { + "External id": 83064,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711015.300, "dur": 3.470, + "args": { + "External id": 83065,"Record function id": 0, "Ev Idx": 4644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865711019.400, "dur": 13.310, + "args": { + "External id": 83066,"Record function id": 0, "Ev Idx": 4645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711047.710, "dur": 4.180, + "args": { + "External id": 83067,"Record function id": 0, "Ev Idx": 4646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 
6300865711052.580, "dur": 199.790, + "args": { + "External id": 83068,"Record function id": 0, "Ev Idx": 4647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711055.740, "dur": 2.530, + "args": { + "External id": 83069,"Record function id": 0, "Ev Idx": 4648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865711058.870, "dur": 192.550, + "args": { + "External id": 83070,"Record function id": 0, "Ev Idx": 4649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865711072.430, "dur": 177.740, + "args": { + "External id": 83071,"Sequence number": 1770954, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4650 + } + }, + { + "ph": "s", "id": 248, "pid": 5714, "tid": 5714, "ts": 6300865711072.430, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711078.200, "dur": 4.570, + "args": { + "External id": 83072,"Record function id": 0, "Ev Idx": 4651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865711083.540, "dur": 157.450, + "args": { + "External id": 83073,"Record function id": 0, "Ev Idx": 4652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711105.100, "dur": 3.120, + "args": { + "External id": 83074,"Record function id": 0, "Ev Idx": 4653 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865711109.010, "dur": 109.830, + "args": { + "External id": 83075,"Record function id": 0, "Ev Idx": 4654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711112.880, "dur": 3.930, + "args": { + "External id": 83076,"Record function id": 0, "Ev Idx": 4655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865711117.450, "dur": 99.380, + "args": { + "External id": 83077,"Record function id": 0, "Ev Idx": 4656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711149.780, "dur": 5.040, + "args": { + "External id": 83078,"Record function id": 0, "Ev Idx": 4657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865711155.700, "dur": 60.250, + "args": { + "External id": 83079,"Record function id": 0, "Ev Idx": 4658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865711184.310, "dur": 20.510, + "args": { + "External id": 83080,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 
64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711224.390, "dur": 3.060, + "args": { + "External id": 83081,"Record function id": 0, "Ev Idx": 4660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865711228.070, "dur": 12.300, + "args": { + "External id": 83082,"Record function id": 0, "Ev Idx": 4661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711262.680, "dur": 8.910, + "args": { + "External id": 83083,"Record function id": 0, "Ev Idx": 4662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865711272.500, "dur": 269.579, + "args": { + "External id": 83084,"Record function id": 0, "Ev Idx": 4663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865711294.670, "dur": 236.949, + "args": { + "External id": 83085,"Sequence number": 1770955, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4664 + } + }, + { + "ph": "s", "id": 247, "pid": 5714, "tid": 5714, "ts": 6300865711294.670, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865711329.390, "dur": 119.699, + "args": { + "External id": 83086,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], 
"Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865711364.089, "dur": 13.911, + "args": { + "External id": 83087,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865711368.309, "dur": 8.660, + "args": { + "External id": 83088,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865711379.800, "dur": 6.309, + "args": { + "External id": 83089,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865711388.200, "dur": 2.609, + "args": { + "External id": 83090,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865711393.340, "dur": 4.829, + "args": { + "External id": 83091,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865711466.449, "dur": 30.840, + "args": { + "External id": 83092,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865711552.499, "dur": 20.330, + "args": { + "External id": 83093,"Record function id": 0, "Ev Idx": 4672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865711573.839, "dur": 342.629, + "args": { + "External id": 83094,"Record function id": 0, "Ev Idx": 4673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865711599.909, "dur": 305.699, + "args": { + "External id": 83095,"Sequence number": 1770956, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4674 + } + }, + { + "ph": "s", "id": 246, 
"pid": 5714, "tid": 5714, "ts": 6300865711599.909, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865711652.399, "dur": 28.310, + "args": { + "External id": 83096,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865711695.789, "dur": 27.380, + "args": { + "External id": 83097,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865711734.179, "dur": 17.620, + "args": { + "External id": 83098,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865711776.109, "dur": 20.259, + "args": { + 
"External id": 83099,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865711808.159, "dur": 25.409, + "args": { + "External id": 83100,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865711854.048, "dur": 17.060, + "args": { + "External id": 83101,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4680 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6300865711961.388, "dur": 55.320, + "args": { + "External id": 83102,"Record function id": 0, "Ev Idx": 4681 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 
5714, + "ts": 6300865712079.938, "dur": 46.350, + "args": { + "External id": 83103,"Record function id": 0, "Ev Idx": 4682 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6300865712135.718, "dur": 967.387, + "args": { + "External id": 83104,"Record function id": 0, "Ev Idx": 4683 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6300865712143.048, "dur": 516.079, + "args": { + "External id": 83105,"Record function id": 0, "Ev Idx": 4684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865712210.578, "dur": 9.150, + "args": { + "External id": 83106,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865712230.698, "dur": 25.769, + "args": { + "External id": 83107,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712234.567, "dur": 1.480, + "args": { + "External id": 83108,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712237.487, "dur": 
1.480, + "args": { + "External id": 83109,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712239.807, "dur": 0.191, + "args": { + "External id": 83110,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712241.547, "dur": 1.151, + "args": { + "External id": 83111,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712244.658, "dur": 0.280, + "args": { + "External id": 83112,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712246.167, "dur": 0.280, + "args": { + "External id": 83113,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6300865712247.187, "dur": 0.271, + "args": { + "External id": 83114,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712249.487, "dur": 0.271, + "args": { + "External id": 83115,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712251.098, "dur": 0.240, + "args": { + "External id": 83116,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865712264.487, "dur": 25.120, + "args": { + "External id": 83117,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865712331.757, "dur": 101.810, + "args": { + "External id": 83118,"Record function id": 0, 
"Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865712342.647, "dur": 7.710, + "args": { + "External id": 83119,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865712355.057, "dur": 8.270, + "args": { + "External id": 83120,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865712357.607, "dur": 5.340, + "args": { + "External id": 83121,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712360.187, "dur": 0.850, + "args": { + "External id": 83122,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865712371.017, "dur": 22.510, + "args": { + "External id": 83123,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712373.397, "dur": 1.420, + "args": { + "External id": 83124,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712376.987, "dur": 0.190, + "args": { + "External id": 83125,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712377.907, "dur": 0.190, + "args": { + "External id": 83126,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712379.597, "dur": 1.400, + "args": { + "External id": 83127,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", 
"295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712381.757, "dur": 0.310, + "args": { + "External id": 83128,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712382.767, "dur": 0.170, + "args": { + "External id": 83129,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712385.127, "dur": 0.380, + "args": { + "External id": 83130,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712386.277, "dur": 0.450, + "args": { + "External id": 83131,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712388.217, "dur": 1.150, + "args": { + "External id": 
83132,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865712405.697, "dur": 19.360, + "args": { + "External id": 83133,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865712490.597, "dur": 95.280, + "args": { + "External id": 83134,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865712510.067, "dur": 72.660, + "args": { + "External id": 83135,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group 
Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4714, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865712522.797, "dur": 55.240, + "args": { + "External id": 83136,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865712602.027, "dur": 3.620, + "args": { + "External id": 83137,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4716, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6300865712678.766, "dur": 265.440, + "args": { + "External id": 83138,"Record function id": 0, "Ev Idx": 4717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712771.286, "dur": 4.210, + "args": { + "External id": 83139,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712779.806, "dur": 1.030, + "args": { + 
"External id": 83140,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712782.716, "dur": 0.760, + "args": { + "External id": 83141,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712785.306, "dur": 0.700, + "args": { + "External id": 83142,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712787.466, "dur": 0.750, + "args": { + "External id": 83143,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712789.736, "dur": 0.710, + "args": { + "External id": 83144,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712792.046, "dur": 0.830, + "args": { + "External id": 83145,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712795.636, "dur": 2.840, + "args": { + "External id": 83146,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712799.976, "dur": 0.980, + "args": { + "External id": 83147,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865712802.426, "dur": 0.870, + "args": { + "External id": 83148,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865712816.986, "dur": 98.200, + "args": { + "External id": 83149,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865712828.476, "dur": 82.930, + "args": { + "External id": 83150,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 
393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865712841.876, "dur": 7.140, + "args": { + "External id": 83151,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865712852.956, "dur": 34.850, + "args": { + "External id": 83152,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865712854.426, "dur": 33.010, + "args": { + "External id": 83153,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865712857.606, "dur": 7.600, + "args": { + "External id": 83154,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", 
"", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865712866.436, "dur": 20.520, + "args": { + "External id": 83155,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865713056.566, "dur": 20.600, + "args": { + "External id": 83156,"Sequence number": 1770957, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4735 + } + }, + { + "ph": "s", "id": 245, "pid": 5714, "tid": 5714, "ts": 6300865713056.566, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865713066.836, "dur": 6.480, + "args": { + "External id": 83157,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865713069.236, "dur": 3.640, + "args": { + "External id": 83158,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713133.736, "dur": 10.889, + "args": { + "External id": 83159,"Record function id": 0, "Ev Idx": 4738 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865713145.745, "dur": 1556.337, + "args": { + "External id": 83160,"Record function id": 0, "Ev Idx": 4739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865713168.336, "dur": 113.829, + "args": { + "External id": 83161,"Sequence number": 1770958, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4740 + } + }, + { + "ph": "s", "id": 244, "pid": 5714, "tid": 5714, "ts": 6300865713168.336, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865713217.095, "dur": 28.020, + "args": { + "External id": 83162,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865713258.355, "dur": 5.380, + "args": { + "External id": 83163,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 
6300865713259.695, "dur": 3.810, + "args": { + "External id": 83164,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713312.065, "dur": 10.980, + "args": { + "External id": 83165,"Record function id": 0, "Ev Idx": 4744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865713324.005, "dur": 994.468, + "args": { + "External id": 83166,"Record function id": 0, "Ev Idx": 4745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865713354.355, "dur": 198.609, + "args": { + "External id": 83167,"Sequence number": 1770959, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4746 + } + }, + { + "ph": "s", "id": 243, "pid": 5714, "tid": 5714, "ts": 6300865713354.355, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865713385.755, "dur": 34.900, + "args": { + "External id": 83168,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865713433.075, "dur": 18.110, + "args": { + "External id": 83169,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865713461.675, "dur": 16.940, + "args": { + "External id": 83170,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865713515.315, "dur": 3.130, + "args": { + "External id": 83171,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865713525.825, "dur": 2.160, + "args": { + "External id": 83172,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865713533.315, "dur": 1.570, + "args": { + "External id": 83173,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713573.404, "dur": 9.640, + "args": { + "External id": 83174,"Record function id": 0, "Ev Idx": 4753 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865713584.055, "dur": 446.408, + "args": { + "External id": 83175,"Record function id": 0, "Ev Idx": 4754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713601.764, "dur": 3.140, + "args": { + "External id": 83176,"Record function id": 0, "Ev Idx": 4755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865713605.615, "dur": 214.529, + "args": { + "External id": 83177,"Record function id": 0, "Ev Idx": 4756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865713619.344, "dur": 199.540, + "args": { + "External id": 83178,"Sequence number": 1770960, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4757 + } + }, + { + "ph": "s", "id": 242, "pid": 5714, "tid": 5714, "ts": 6300865713619.344, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713625.504, "dur": 5.660, + "args": { + "External id": 83179,"Record function id": 0, "Ev Idx": 4758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865713631.895, "dur": 178.439, + "args": { + "External id": 83180,"Record function id": 0, "Ev Idx": 4759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713655.754, "dur": 3.530, + "args": { + "External id": 
83181,"Record function id": 0, "Ev Idx": 4760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865713660.044, "dur": 126.460, + "args": { + "External id": 83182,"Record function id": 0, "Ev Idx": 4761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713662.994, "dur": 4.210, + "args": { + "External id": 83183,"Record function id": 0, "Ev Idx": 4762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865713668.364, "dur": 115.790, + "args": { + "External id": 83184,"Record function id": 0, "Ev Idx": 4763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713705.824, "dur": 5.600, + "args": { + "External id": 83185,"Record function id": 0, "Ev Idx": 4764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865713712.524, "dur": 70.630, + "args": { + "External id": 83186,"Record function id": 0, "Ev Idx": 4765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865713746.454, "dur": 24.530, + "args": { + "External id": 83187,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 
64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713792.674, "dur": 3.400, + "args": { + "External id": 83188,"Record function id": 0, "Ev Idx": 4767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865713796.754, "dur": 12.860, + "args": { + "External id": 83189,"Record function id": 0, "Ev Idx": 4768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713823.944, "dur": 3.980, + "args": { + "External id": 83190,"Record function id": 0, "Ev Idx": 4769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865713828.594, "dur": 201.389, + "args": { + "External id": 83191,"Record function id": 0, "Ev Idx": 4770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713831.674, "dur": 2.300, + "args": { + "External id": 83192,"Record function id": 0, "Ev Idx": 4771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865713835.224, "dur": 193.790, + "args": { + "External id": 83193,"Record function id": 0, "Ev Idx": 4772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865713848.424, "dur": 179.479, + "args": { + "External id": 83194,"Sequence number": 1770961, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": 
[[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4773 + } + }, + { + "ph": "s", "id": 241, "pid": 5714, "tid": 5714, "ts": 6300865713848.424, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713853.154, "dur": 4.920, + "args": { + "External id": 83195,"Record function id": 0, "Ev Idx": 4774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865713858.784, "dur": 160.879, + "args": { + "External id": 83196,"Record function id": 0, "Ev Idx": 4775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713880.354, "dur": 3.280, + "args": { + "External id": 83197,"Record function id": 0, "Ev Idx": 4776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865713885.014, "dur": 112.029, + "args": { + "External id": 83198,"Record function id": 0, "Ev Idx": 4777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713887.684, "dur": 3.850, + "args": { + "External id": 83199,"Record function id": 0, "Ev Idx": 4778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865713892.164, "dur": 102.799, + "args": { + "External id": 83200,"Record function id": 0, "Ev Idx": 4779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865713925.464, "dur": 5.340, + "args": { + "External id": 83201,"Record function id": 0, "Ev Idx": 4780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865713931.664, "dur": 62.470, + "args": { + 
"External id": 83202,"Record function id": 0, "Ev Idx": 4781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865713961.004, "dur": 21.939, + "args": { + "External id": 83203,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865714002.523, "dur": 3.091, + "args": { + "External id": 83204,"Record function id": 0, "Ev Idx": 4783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865714006.234, "dur": 12.780, + "args": { + "External id": 83205,"Record function id": 0, "Ev Idx": 4784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865714040.754, "dur": 9.180, + "args": { + "External id": 83206,"Record function id": 0, "Ev Idx": 4785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865714050.783, "dur": 266.340, + "args": { + "External id": 83207,"Record function id": 0, "Ev Idx": 4786 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865714072.654, "dur": 223.399, + "args": { + "External id": 83208,"Sequence number": 1770962, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4787 + } + }, + { + "ph": "s", "id": 240, "pid": 5714, "tid": 5714, "ts": 6300865714072.654, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865714097.243, "dur": 118.080, + "args": { + "External id": 83209,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865714131.533, "dur": 13.910, + "args": { + "External id": 83210,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865714135.703, "dur": 8.820, + "args": { + "External id": 83211,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865714147.923, "dur": 7.470, + "args": { + "External id": 83212,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865714156.633, "dur": 2.690, + "args": { + "External id": 83213,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865714161.953, "dur": 3.290, + "args": { + "External id": 83214,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865714232.783, "dur": 30.830, + "args": { + "External id": 83215,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 
6300865714328.313, "dur": 20.770, + "args": { + "External id": 83216,"Record function id": 0, "Ev Idx": 4795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865714350.163, "dur": 348.939, + "args": { + "External id": 83217,"Record function id": 0, "Ev Idx": 4796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865714377.483, "dur": 310.789, + "args": { + "External id": 83218,"Sequence number": 1770963, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4797 + } + }, + { + "ph": "s", "id": 239, "pid": 5714, "tid": 5714, "ts": 6300865714377.483, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865714433.273, "dur": 29.329, + "args": { + "External id": 83219,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4798 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865714477.473, "dur": 28.660, + "args": { + "External id": 83220,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865714517.913, "dur": 16.869, + "args": { + "External id": 83221,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865714559.642, "dur": 20.630, + "args": { + "External id": 83222,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865714592.332, "dur": 25.700, + "args": { + "External id": 83223,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4802 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865714638.172, "dur": 16.530, + "args": { + "External id": 83224,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4803 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6300865714743.142, "dur": 54.730, + "args": { + "External id": 83225,"Record function id": 0, "Ev Idx": 4804 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6300865714861.202, "dur": 44.650, + "args": { + "External id": 83226,"Record function id": 0, "Ev Idx": 4805 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6300865714914.901, "dur": 989.728, + "args": { + "External id": 83227,"Record function id": 0, "Ev Idx": 4806 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6300865714922.761, "dur": 535.449, + "args": { + "External id": 83228,"Record function id": 0, "Ev Idx": 4807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865715003.621, "dur": 9.130, + "args": { + "External id": 83229,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 
4808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865715023.211, "dur": 27.020, + "args": { + "External id": 83230,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715026.961, "dur": 1.330, + "args": { + "External id": 83231,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715030.791, "dur": 0.540, + "args": { + "External id": 83232,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715032.661, "dur": 0.200, + "args": { + "External id": 83233,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715033.631, "dur": 1.470, + "args": { + "External id": 83234,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], 
"Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715037.071, "dur": 0.220, + "args": { + "External id": 83235,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715038.751, "dur": 0.300, + "args": { + "External id": 83236,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715039.971, "dur": 1.180, + "args": { + "External id": 83237,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715043.761, "dur": 0.290, + "args": { + "External id": 83238,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715045.261, "dur": 0.180, + "args": { + "External id": 83239,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865715058.331, "dur": 24.840, + "args": { + "External id": 83240,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6300865715116.551, "dur": 101.570, + "args": { + "External id": 83241,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865715126.741, "dur": 7.530, + "args": { + "External id": 83242,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6300865715138.511, "dur": 9.210, + "args": { + "External 
id": 83243,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865715141.001, "dur": 6.300, + "args": { + "External id": 83244,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715143.591, "dur": 1.790, + "args": { + "External id": 83245,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6300865715155.141, "dur": 22.600, + "args": { + "External id": 83246,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715157.401, "dur": 1.150, + "args": { + "External id": 83247,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
5714, + "ts": 6300865715159.571, "dur": 0.270, + "args": { + "External id": 83248,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715160.611, "dur": 0.190, + "args": { + "External id": 83249,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715163.671, "dur": 0.270, + "args": { + "External id": 83250,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715164.711, "dur": 0.260, + "args": { + "External id": 83251,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715166.461, "dur": 1.310, + "args": { + "External id": 83252,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4831 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715168.531, "dur": 0.180, + "args": { + "External id": 83253,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715169.431, "dur": 0.280, + "args": { + "External id": 83254,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715172.411, "dur": 1.520, + "args": { + "External id": 83255,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6300865715190.691, "dur": 18.930, + "args": { + "External id": 83256,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6300865715275.821, "dur": 107.350, + "args": { + 
"External id": 83257,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865715295.861, "dur": 83.990, + "args": { + "External id": 83258,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4837, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6300865715320.680, "dur": 54.751, + "args": { + "External id": 83259,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300865715400.450, "dur": 3.680, + "args": { + "External id": 83260,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", 
"", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4839, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6300865715478.370, "dur": 264.740, + "args": { + "External id": 83261,"Record function id": 0, "Ev Idx": 4840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715572.820, "dur": 4.140, + "args": { + "External id": 83262,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715581.270, "dur": 0.940, + "args": { + "External id": 83263,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715583.950, "dur": 0.700, + "args": { + "External id": 83264,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715586.530, "dur": 0.710, + "args": { + "External id": 83265,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715588.820, "dur": 0.850, + 
"args": { + "External id": 83266,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715592.120, "dur": 1.020, + "args": { + "External id": 83267,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715594.570, "dur": 0.830, + "args": { + "External id": 83268,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715596.900, "dur": 3.180, + "args": { + "External id": 83269,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715601.400, "dur": 0.840, + "args": { + "External id": 83270,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715604.970, "dur": 0.740, + "args": { + "External id": 83271,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4850 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865715619.720, "dur": 94.200, + "args": { + "External id": 83272,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6300865715631.000, "dur": 79.100, + "args": { + "External id": 83273,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865715643.640, "dur": 6.970, + "args": { + "External id": 83274,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865715653.120, "dur": 34.090, + "args": { + "External id": 83275,"Record function id": 0, "Concrete Inputs": ["", "", "", 
"", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865715654.720, "dur": 32.090, + "args": { + "External id": 83276,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865715657.860, "dur": 7.430, + "args": { + "External id": 83277,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865715666.380, "dur": 19.920, + "args": { + "External id": 83278,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6300865715858.429, "dur": 20.700, + "args": { + "External id": 83279,"Sequence number": 1770964, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4858 + } + }, + { + "ph": "s", "id": 238, "pid": 5714, "tid": 5714, "ts": 6300865715858.429, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300865715868.709, "dur": 6.630, + "args": { + "External id": 83280,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865715871.069, "dur": 3.720, + "args": { + "External id": 83281,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865715935.009, "dur": 11.120, + "args": { + "External id": 83282,"Record function id": 0, "Ev Idx": 4861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6300865715947.249, "dur": 1579.657, + "args": { + "External id": 83283,"Record function id": 0, "Ev Idx": 4862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865715969.459, "dur": 114.170, + "args": { + "External id": 83284,"Sequence number": 1770965, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4863 + } + }, + { + "ph": "s", "id": 237, "pid": 5714, "tid": 5714, "ts": 6300865715969.459, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865716018.179, "dur": 27.020, + "args": { + "External id": 83285,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", 
"grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865716058.199, "dur": 5.410, + "args": { + "External id": 83286,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865716059.519, "dur": 3.870, + "args": { + "External id": 83287,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716111.889, "dur": 11.060, + "args": { + "External id": 83288,"Record function id": 0, "Ev Idx": 4867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6300865716123.909, "dur": 991.798, + "args": { + "External id": 83289,"Record function id": 0, "Ev Idx": 4868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865716146.699, "dur": 208.769, + "args": { + "External id": 83290,"Sequence number": 1770966, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4869 + } + }, + { + "ph": "s", "id": 236, "pid": 5714, "tid": 5714, "ts": 6300865716146.699, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865716174.299, "dur": 33.979, + "args": { + "External id": 83291,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865716220.598, "dur": 20.000, + "args": { + "External id": 83292,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865716252.909, "dur": 17.149, + "args": { + "External id": 83293,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865716316.218, "dur": 4.380, + "args": { + "External id": 83294,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + 
"ts": 6300865716329.568, "dur": 0.960, + "args": { + "External id": 83295,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865716335.588, "dur": 1.660, + "args": { + "External id": 83296,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716375.848, "dur": 9.760, + "args": { + "External id": 83297,"Record function id": 0, "Ev Idx": 4876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6300865716386.748, "dur": 451.669, + "args": { + "External id": 83298,"Record function id": 0, "Ev Idx": 4877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716404.738, "dur": 3.090, + "args": { + "External id": 83299,"Record function id": 0, "Ev Idx": 4878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865716409.018, "dur": 216.599, + "args": { + "External id": 83300,"Record function id": 0, "Ev Idx": 4879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865716422.598, "dur": 201.759, + "args": { + "External id": 83301,"Sequence number": 1770967, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", 
"Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4880 + } + }, + { + "ph": "s", "id": 235, "pid": 5714, "tid": 5714, "ts": 6300865716422.598, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716428.138, "dur": 5.020, + "args": { + "External id": 83302,"Record function id": 0, "Ev Idx": 4881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865716434.148, "dur": 181.200, + "args": { + "External id": 83303,"Record function id": 0, "Ev Idx": 4882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716459.068, "dur": 3.510, + "args": { + "External id": 83304,"Record function id": 0, "Ev Idx": 4883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6300865716463.608, "dur": 126.230, + "args": { + "External id": 83305,"Record function id": 0, "Ev Idx": 4884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716467.188, "dur": 4.110, + "args": { + "External id": 83306,"Record function id": 0, "Ev Idx": 4885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865716472.208, "dur": 115.130, + "args": { + "External id": 83307,"Record function id": 0, "Ev Idx": 4886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716510.058, "dur": 5.850, + "args": { + "External id": 83308,"Record function id": 0, "Ev Idx": 4887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865716517.428, 
"dur": 68.990, + "args": { + "External id": 83309,"Record function id": 0, "Ev Idx": 4888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865716549.238, "dur": 24.960, + "args": { + "External id": 83310,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716596.228, "dur": 3.490, + "args": { + "External id": 83311,"Record function id": 0, "Ev Idx": 4890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865716600.758, "dur": 13.950, + "args": { + "External id": 83312,"Record function id": 0, "Ev Idx": 4891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716629.897, "dur": 4.360, + "args": { + "External id": 83313,"Record function id": 0, "Ev Idx": 4892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6300865716635.088, "dur": 202.919, + "args": { + "External id": 83314,"Record 
function id": 0, "Ev Idx": 4893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716638.417, "dur": 2.540, + "args": { + "External id": 83315,"Record function id": 0, "Ev Idx": 4894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6300865716641.708, "dur": 195.279, + "args": { + "External id": 83316,"Record function id": 0, "Ev Idx": 4895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865716655.048, "dur": 180.819, + "args": { + "External id": 83317,"Sequence number": 1770968, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4896 + } + }, + { + "ph": "s", "id": 234, "pid": 5714, "tid": 5714, "ts": 6300865716655.048, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716660.288, "dur": 4.889, + "args": { + "External id": 83318,"Record function id": 0, "Ev Idx": 4897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6300865716666.208, "dur": 162.319, + "args": { + "External id": 83319,"Record function id": 0, "Ev Idx": 4898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716688.088, "dur": 3.080, + "args": { + "External id": 83320,"Record function id": 0, "Ev Idx": 4899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 
6300865716692.228, "dur": 113.599, + "args": { + "External id": 83321,"Record function id": 0, "Ev Idx": 4900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716695.248, "dur": 4.029, + "args": { + "External id": 83322,"Record function id": 0, "Ev Idx": 4901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6300865716700.177, "dur": 103.640, + "args": { + "External id": 83323,"Record function id": 0, "Ev Idx": 4902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716732.997, "dur": 5.110, + "args": { + "External id": 83324,"Record function id": 0, "Ev Idx": 4903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6300865716739.377, "dur": 63.510, + "args": { + "External id": 83325,"Record function id": 0, "Ev Idx": 4904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6300865716769.897, "dur": 21.330, + "args": { + "External id": 83326,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], 
[], [], [], [], [], [], [], []], "Ev Idx": 4905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716811.497, "dur": 3.040, + "args": { + "External id": 83327,"Record function id": 0, "Ev Idx": 4906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6300865716815.367, "dur": 12.460, + "args": { + "External id": 83328,"Record function id": 0, "Ev Idx": 4907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865716848.767, "dur": 8.910, + "args": { + "External id": 83329,"Record function id": 0, "Ev Idx": 4908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6300865716858.787, "dur": 255.649, + "args": { + "External id": 83330,"Record function id": 0, "Ev Idx": 4909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865716881.357, "dur": 222.379, + "args": { + "External id": 83331,"Sequence number": 1770969, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4910 + } + }, + { + "ph": "s", "id": 233, "pid": 5714, "tid": 5714, "ts": 6300865716881.357, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865716905.247, "dur": 117.730, + "args": { + "External id": 83332,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", 
"Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865716940.857, "dur": 13.060, + "args": { + "External id": 83333,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865716944.317, "dur": 8.610, + "args": { + "External id": 83334,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865716956.027, "dur": 6.400, + "args": { + "External id": 83335,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865716964.087, "dur": 2.470, + "args": { + "External id": 83336,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], 
[]], "Ev Idx": 4915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865716969.177, "dur": 3.730, + "args": { + "External id": 83337,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865717040.347, "dur": 31.380, + "args": { + "External id": 83338,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865717124.787, "dur": 20.520, + "args": { + "External id": 83339,"Record function id": 0, "Ev Idx": 4918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6300865717146.516, "dur": 377.130, + "args": { + "External id": 83340,"Record function id": 0, "Ev Idx": 4919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6300865717174.196, "dur": 338.350, + "args": { + "External id": 83341,"Sequence number": 1770970, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4920 + } + }, + { + "ph": "s", "id": 232, "pid": 5714, "tid": 5714, "ts": 6300865717174.196, + "cat": "fwdbwd", 
"name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865717235.706, "dur": 28.040, + "args": { + "External id": 83342,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865717279.166, "dur": 37.600, + "args": { + "External id": 83343,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865717330.016, "dur": 18.950, + "args": { + "External id": 83344,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6300865717372.056, "dur": 20.880, + "args": { + "External id": 83345,"kernel_hash": 
"cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865717405.156, "dur": 25.010, + "args": { + "External id": 83346,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6300865717453.976, "dur": 23.880, + "args": { + "External id": 83347,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4926 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6300865717568.535, "dur": 24.720, + "args": { + "External id": 83348,"Record function id": 0, "Ev Idx": 4927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865717653.415, "dur": 252.250, + "args": { 
+ "External id": 83349,"Sequence number": 1770971, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 4928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865717656.575, "dur": 47.550, + "args": { + "External id": 83350,"Sequence number": 1770971, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 4929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865717657.945, "dur": 45.820, + "args": { + "External id": 83351,"Sequence number": 1770971, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 4930 + } + }, + { + "ph": "s", "id": 231, "pid": 5714, "tid": 5714, "ts": 6300865717657.945, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865717663.825, "dur": 10.700, + "args": { + "External id": 83352,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + 
"ts": 6300865717675.875, "dur": 25.660, + "args": { + "External id": 83353,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865717706.345, "dur": 28.090, + "args": { + "External id": 83354,"Sequence number": 1770972, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4933 + } + }, + { + "ph": "s", "id": 230, "pid": 5714, "tid": 5714, "ts": 6300865717706.345, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865717711.485, "dur": 0.830, + "args": { + "External id": 83355,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865717713.135, "dur": 0.170, + "args": { + "External id": 83356,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 4935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865717737.535, "dur": 29.340, + "args": { + "External id": 83357,"Sequence number": 1770973, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], 
"Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 4936 + } + }, + { + "ph": "s", "id": 229, "pid": 5714, "tid": 5714, "ts": 6300865717737.535, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865717769.175, "dur": 26.760, + "args": { + "External id": 83358,"Sequence number": 1770974, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 4937 + } + }, + { + "ph": "s", "id": 228, "pid": 5714, "tid": 5714, "ts": 6300865717769.175, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865717777.055, "dur": 16.340, + "args": { + "External id": 83359,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 4938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865717799.625, "dur": 25.260, + "args": { + "External id": 83360,"Sequence number": 1770975, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 4939 + } + }, + { + "ph": "s", "id": 227, "pid": 5714, "tid": 5714, "ts": 6300865717799.625, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865717829.285, "dur": 23.410, + "args": { + "External id": 83361,"Sequence number": 1770976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev 
Idx": 4940 + } + }, + { + "ph": "s", "id": 226, "pid": 5714, "tid": 5714, "ts": 6300865717829.285, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865717854.805, "dur": 28.690, + "args": { + "External id": 83362,"Sequence number": 1770977, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865717856.845, "dur": 26.430, + "args": { + "External id": 83363,"Sequence number": 1770977, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 4942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865717857.775, "dur": 25.220, + "args": { + "External id": 83364,"Sequence number": 1770977, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 4943 + } + }, + { + "ph": "s", "id": 225, "pid": 5714, "tid": 5714, "ts": 6300865717857.775, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865717863.065, "dur": 4.760, + "args": { + "External id": 83365,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], 
"Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865717868.785, "dur": 13.220, + "args": { + "External id": 83366,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865717885.895, "dur": 19.190, + "args": { + "External id": 83367,"Sequence number": 1770978, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4946 + } + }, + { + "ph": "s", "id": 224, "pid": 5714, "tid": 5714, "ts": 6300865717885.895, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865717935.785, "dur": 80.860, + "args": { + "External id": 83368,"Sequence number": 1770979, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865717938.105, "dur": 13.680, + "args": { + "External id": 83369,"Sequence number": 1770979, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4948 + } + }, + { + "ph": "s", "id": 223, "pid": 5714, "tid": 5714, 
"ts": 6300865717938.105, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865717943.834, "dur": 6.231, + "args": { + "External id": 83370,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865717948.094, "dur": 1.560, + "args": { + "External id": 83371,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865717953.865, "dur": 62.300, + "args": { + "External id": 83372,"Sequence number": 1770980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865717957.185, "dur": 6.580, + "args": { + "External id": 83373,"Sequence number": 1770980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865717958.565, "dur": 4.789, + "args": { + "External id": 83374,"Sequence number": 1770980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4953 + } + }, + { + "ph": "s", "id": 222, "pid": 5714, "tid": 5714, "ts": 6300865717958.565, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865717966.994, "dur": 39.280, + "args": { + "External id": 83375,"Sequence number": 1770981, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4954 + } + }, + { + "ph": "s", "id": 221, "pid": 5714, "tid": 5714, "ts": 6300865717966.994, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865718009.414, "dur": 5.100, + "args": { + "External id": 83376,"Sequence number": 1770982, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4955 + } + }, + { + "ph": "s", "id": 220, "pid": 5714, "tid": 5714, "ts": 6300865718009.414, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865718030.214, "dur": 51.340, + "args": { + "External id": 83377,"Sequence number": 1770983, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865718030.994, "dur": 7.240, + "args": { + "External id": 83378,"Sequence number": 1770983, "Fwd thread id": 0, "Record function id": 
0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4957 + } + }, + { + "ph": "s", "id": 219, "pid": 5714, "tid": 5714, "ts": 6300865718030.994, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865718032.754, "dur": 4.251, + "args": { + "External id": 83379,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865718035.765, "dur": 0.900, + "args": { + "External id": 83380,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865718039.054, "dur": 42.180, + "args": { + "External id": 83381,"Sequence number": 1770984, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865718040.324, "dur": 3.120, + "args": { + "External id": 83382,"Sequence number": 1770984, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 
6300865718041.074, "dur": 2.200, + "args": { + "External id": 83383,"Sequence number": 1770984, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4962 + } + }, + { + "ph": "s", "id": 218, "pid": 5714, "tid": 5714, "ts": 6300865718041.074, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865718044.094, "dur": 32.040, + "args": { + "External id": 83384,"Sequence number": 1770985, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4963 + } + }, + { + "ph": "s", "id": 217, "pid": 5714, "tid": 5714, "ts": 6300865718044.094, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865718078.344, "dur": 2.110, + "args": { + "External id": 83385,"Sequence number": 1770986, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4964 + } + }, + { + "ph": "s", "id": 216, "pid": 5714, "tid": 5714, "ts": 6300865718078.344, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865718093.454, "dur": 50.690, + "args": { + "External id": 83386,"Sequence number": 1770987, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4965 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865718094.134, "dur": 5.640, + "args": { + "External id": 83387,"Sequence number": 1770987, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4966 + } + }, + { + "ph": "s", "id": 215, "pid": 5714, "tid": 5714, "ts": 6300865718094.134, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865718095.524, "dur": 3.010, + "args": { + "External id": 83388,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865718097.344, "dur": 0.900, + "args": { + "External id": 83389,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865718101.684, "dur": 42.150, + "args": { + "External id": 83390,"Sequence number": 1770988, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865718102.954, "dur": 2.970, + "args": { + "External id": 83391,"Sequence number": 1770988, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865718103.804, "dur": 1.920, + "args": { + "External id": 83392,"Sequence number": 1770988, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4971 + } + }, + { + "ph": "s", "id": 214, "pid": 5714, "tid": 5714, "ts": 6300865718103.804, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865718106.534, "dur": 30.850, + "args": { + "External id": 83393,"Sequence number": 1770989, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4972 + } + }, + { + "ph": "s", "id": 213, "pid": 5714, "tid": 5714, "ts": 6300865718106.534, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865718139.764, "dur": 3.360, + "args": { + "External id": 83394,"Sequence number": 1770990, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4973 + } + }, + { + "ph": "s", "id": 212, "pid": 5714, "tid": 5714, "ts": 6300865718139.764, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865718166.214, "dur": 3.670, + "args": { + "External id": 83395,"Sequence number": 1770991, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 
2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865718167.104, "dur": 2.550, + "args": { + "External id": 83396,"Sequence number": 1770991, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4975 + } + }, + { + "ph": "s", "id": 211, "pid": 5714, "tid": 5714, "ts": 6300865718167.104, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865718178.224, "dur": 5.940, + "args": { + "External id": 83397,"Sequence number": 1770992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865718178.874, "dur": 5.070, + "args": { + "External id": 83398,"Sequence number": 1770992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4977 + } + }, + { + "ph": "s", "id": 210, "pid": 5714, "tid": 5714, "ts": 6300865718178.874, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865718191.374, "dur": 3.510, + "args": { + "External id": 83399,"Sequence number": 1770993, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865718192.094, "dur": 2.460, + "args": { + "External id": 83400,"Sequence number": 1770993, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4979 + } + }, + { + "ph": "s", "id": 209, "pid": 5714, "tid": 5714, "ts": 6300865718192.094, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865718227.194, "dur": 214.179, + "args": { + "External id": 83401,"Sequence number": 1770994, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4980 + } + }, + { + "ph": "s", "id": 208, "pid": 5714, "tid": 5714, "ts": 6300865718227.194, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865718248.444, "dur": 10.510, + "args": { + "External id": 83402,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865718250.904, "dur": 7.350, + "args": { + "External id": 
83403,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865718459.873, "dur": 136.770, + "args": { + "External id": 83404,"Sequence number": 1770995, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4983 + } + }, + { + "ph": "s", "id": 207, "pid": 5714, "tid": 5714, "ts": 6300865718459.873, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865718479.804, "dur": 11.729, + "args": { + "External id": 83405,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865718482.723, "dur": 8.090, + "args": { + "External id": 83406,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6300865718627.673, "dur": 178.340, + "args": { + "External id": 83407,"Sequence number": 1770996, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 4986 + } + }, + { + "ph": "s", "id": 206, "pid": 5714, "tid": 5714, "ts": 6300865718627.673, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865718652.313, "dur": 123.190, + "args": { + "External id": 83408,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865718693.023, "dur": 13.560, + "args": { + "External id": 83409,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 
5714, + "ts": 6300865718695.963, "dur": 9.580, + "args": { + "External id": 83410,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865718708.743, "dur": 5.130, + "args": { + "External id": 83411,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865718715.083, "dur": 3.390, + "args": { + "External id": 83412,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865718721.163, "dur": 4.900, + "args": { + "External id": 83413,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6300865718788.443, "dur": 5.400, + "args": { + "External id": 83414,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 4993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, 
"tid": 5714, + "ts": 6300865718811.913, "dur": 5.730, + "args": { + "External id": 83415,"Sequence number": 1770997, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865718813.173, "dur": 4.230, + "args": { + "External id": 83416,"Sequence number": 1770997, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4995 + } + }, + { + "ph": "s", "id": 205, "pid": 5714, "tid": 5714, "ts": 6300865718813.173, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865718830.552, "dur": 63.760, + "args": { + "External id": 83417,"Sequence number": 1770998, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865718832.672, "dur": 6.660, + "args": { + "External id": 83418,"Sequence number": 1770998, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4997 + } + }, + { + "ph": "s", "id": 204, "pid": 5714, "tid": 5714, "ts": 6300865718832.672, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865718834.772, "dur": 3.560, + "args": { + "External id": 
83419,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865718836.592, "dur": 1.360, + "args": { + "External id": 83420,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865718840.303, "dur": 53.640, + "args": { + "External id": 83421,"Sequence number": 1770999, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865718841.932, "dur": 4.131, + "args": { + "External id": 83422,"Sequence number": 1770999, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865718843.923, "dur": 1.920, + "args": { + "External id": 83423,"Sequence number": 1770999, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5002 + } + }, + { + "ph": "s", "id": 203, "pid": 5714, "tid": 5714, "ts": 6300865718843.923, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865718846.863, "dur": 40.220, + "args": { + "External id": 83424,"Sequence number": 1771000, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5003 + } + }, + { + "ph": "s", "id": 202, "pid": 5714, "tid": 5714, "ts": 6300865718846.863, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865718889.852, "dur": 3.100, + "args": { + "External id": 83425,"Sequence number": 1771001, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5004 + } + }, + { + "ph": "s", "id": 201, "pid": 5714, "tid": 5714, "ts": 6300865718889.852, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865718907.303, "dur": 25.769, + "args": { + "External id": 83426,"Sequence number": 1771002, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5005 + } + }, + { + "ph": "s", "id": 200, "pid": 5714, "tid": 5714, "ts": 6300865718907.303, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865718952.592, "dur": 184.120, + "args": { + "External id": 83427,"Sequence number": 1771003, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input 
type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865718954.542, "dur": 32.330, + "args": { + "External id": 83428,"Sequence number": 1771003, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865718955.672, "dur": 30.890, + "args": { + "External id": 83429,"Sequence number": 1771003, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5008 + } + }, + { + "ph": "s", "id": 199, "pid": 5714, "tid": 5714, "ts": 6300865718955.672, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865718960.062, "dur": 7.460, + "args": { + "External id": 83430,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865718968.792, "dur": 15.890, + "args": { + "External id": 83431,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", 
"c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865718988.202, "dur": 21.110, + "args": { + "External id": 83432,"Sequence number": 1771004, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5011 + } + }, + { + "ph": "s", "id": 198, "pid": 5714, "tid": 5714, "ts": 6300865718988.202, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865718991.112, "dur": 0.450, + "args": { + "External id": 83433,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865718992.442, "dur": 0.150, + "args": { + "External id": 83434,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865719012.202, "dur": 20.010, + "args": { + "External id": 83435,"Sequence number": 1771005, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5014 + } + }, + { + "ph": "s", "id": 197, "pid": 5714, "tid": 5714, "ts": 6300865719012.202, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865719033.732, "dur": 21.230, + "args": { + "External id": 83436,"Sequence number": 1771006, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5015 + } + }, + { + "ph": "s", "id": 196, "pid": 5714, "tid": 5714, "ts": 6300865719033.732, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865719041.472, "dur": 11.390, + "args": { + "External id": 83437,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865719056.152, "dur": 15.770, + "args": { + "External id": 83438,"Sequence number": 1771007, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5017 + } + }, + { + "ph": "s", "id": 195, "pid": 5714, "tid": 5714, "ts": 6300865719056.152, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865719075.452, "dur": 16.980, + "args": { + "External id": 83439,"Sequence number": 1771008, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5018 + } + }, + { + "ph": "s", "id": 194, "pid": 5714, "tid": 5714, "ts": 6300865719075.452, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865719093.662, "dur": 24.290, + "args": { + "External id": 83440,"Sequence number": 1771009, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865719094.972, "dur": 22.710, + "args": { + "External id": 83441,"Sequence number": 1771009, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865719095.882, "dur": 21.550, + "args": { + "External id": 83442,"Sequence number": 1771009, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5021 + } + }, + { + "ph": "s", "id": 193, "pid": 5714, "tid": 5714, "ts": 6300865719095.882, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719099.472, "dur": 4.750, + "args": { + "External id": 83443,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], 
"Ev Idx": 5022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865719105.192, "dur": 11.170, + "args": { + "External id": 83444,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865719120.842, "dur": 15.330, + "args": { + "External id": 83445,"Sequence number": 1771010, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5024 + } + }, + { + "ph": "s", "id": 192, "pid": 5714, "tid": 5714, "ts": 6300865719120.842, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865719161.392, "dur": 70.270, + "args": { + "External id": 83446,"Sequence number": 1771011, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865719162.372, "dur": 7.980, + "args": { + "External id": 83447,"Sequence number": 1771011, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5026 + } + }, + { + "ph": "s", "id": 191, "pid": 5714, "tid": 5714, "ts": 6300865719162.372, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + 
"ts": 6300865719164.672, "dur": 4.140, + "args": { + "External id": 83448,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719166.932, "dur": 1.490, + "args": { + "External id": 83449,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865719171.162, "dur": 60.160, + "args": { + "External id": 83450,"Sequence number": 1771012, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719174.062, "dur": 7.860, + "args": { + "External id": 83451,"Sequence number": 1771012, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719174.972, "dur": 5.950, + "args": { + "External id": 83452,"Sequence number": 1771012, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5031 + } + }, + { + "ph": "s", "id": 
190, "pid": 5714, "tid": 5714, "ts": 6300865719174.972, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865719183.972, "dur": 40.120, + "args": { + "External id": 83453,"Sequence number": 1771013, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5032 + } + }, + { + "ph": "s", "id": 189, "pid": 5714, "tid": 5714, "ts": 6300865719183.972, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865719226.552, "dur": 3.900, + "args": { + "External id": 83454,"Sequence number": 1771014, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5033 + } + }, + { + "ph": "s", "id": 188, "pid": 5714, "tid": 5714, "ts": 6300865719226.552, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865719244.742, "dur": 76.300, + "args": { + "External id": 83455,"Sequence number": 1771015, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865719246.552, "dur": 18.899, + "args": { + "External id": 83456,"Sequence number": 1771015, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5035 + } + }, + { + 
"ph": "s", "id": 187, "pid": 5714, "tid": 5714, "ts": 6300865719246.552, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865719259.562, "dur": 4.609, + "args": { + "External id": 83457,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719262.911, "dur": 0.971, + "args": { + "External id": 83458,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865719266.331, "dur": 54.360, + "args": { + "External id": 83459,"Sequence number": 1771016, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719267.722, "dur": 4.120, + "args": { + "External id": 83460,"Sequence number": 1771016, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719269.842, "dur": 1.809, + "args": { + "External id": 83461,"Sequence number": 1771016, "Fwd thread id": 0, "Record function id": 0, "Concrete 
Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5040 + } + }, + { + "ph": "s", "id": 186, "pid": 5714, "tid": 5714, "ts": 6300865719269.842, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865719272.502, "dur": 39.840, + "args": { + "External id": 83462,"Sequence number": 1771017, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5041 + } + }, + { + "ph": "s", "id": 185, "pid": 5714, "tid": 5714, "ts": 6300865719272.502, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865719314.962, "dur": 4.749, + "args": { + "External id": 83463,"Sequence number": 1771018, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5042 + } + }, + { + "ph": "s", "id": 184, "pid": 5714, "tid": 5714, "ts": 6300865719314.962, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5714, "tid": 5714, + "ts": 6300865719349.842, "dur": 150.809, + "args": { + "External id": 83464,"Sequence number": 1771019, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5043 + } + }, + { + "ph": "s", "id": 183, "pid": 5714, "tid": 5714, "ts": 6300865719349.842, + "cat": "fwdbwd", "name": 
"fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865719384.721, "dur": 8.110, + "args": { + "External id": 83465,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865719429.811, "dur": 56.130, + "args": { + "External id": 83466,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865719430.791, "dur": 7.170, + "args": { + "External id": 83467,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865719432.541, "dur": 4.000, + "args": { + "External id": 83468,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719434.731, "dur": 1.350, + "args": { + "External id": 83469,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865719438.901, "dur": 46.480, + "args": { + "External id": 83470,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719441.981, "dur": 4.120, + "args": { + "External id": 83471,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719442.871, "dur": 3.050, + "args": { + "External id": 83472,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865719446.801, "dur": 33.870, + "args": { + "External id": 83473,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865719483.001, "dur": 1.170, + "args": { + "External id": 83474,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865719507.401, "dur": 22.770, + "args": { + "External id": 
83475,"Sequence number": 1771020, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5054 + } + }, + { + "ph": "s", "id": 182, "pid": 5714, "tid": 5714, "ts": 6300865719507.401, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865719559.631, "dur": 180.139, + "args": { + "External id": 83476,"Sequence number": 1771021, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865719561.281, "dur": 33.440, + "args": { + "External id": 83477,"Sequence number": 1771021, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865719563.801, "dur": 30.580, + "args": { + "External id": 83478,"Sequence number": 1771021, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5057 + } + }, + { + "ph": "s", "id": 181, "pid": 5714, "tid": 5714, "ts": 6300865719563.801, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719567.901, "dur": 7.300, + "args": { + "External id": 83479,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865719576.481, "dur": 16.210, + "args": { + "External id": 83480,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865719596.051, "dur": 20.060, + "args": { + "External id": 83481,"Sequence number": 1771022, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5060 + } + }, + { + "ph": "s", "id": 180, "pid": 5714, "tid": 5714, "ts": 6300865719596.051, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865719599.171, "dur": 0.470, + "args": { + "External id": 83482,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865719600.381, "dur": 0.170, + "args": { + "External id": 83483,"Record function id": 0, "Concrete Inputs": ["", "6", "False", 
"False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865719617.941, "dur": 18.940, + "args": { + "External id": 83484,"Sequence number": 1771023, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5063 + } + }, + { + "ph": "s", "id": 179, "pid": 5714, "tid": 5714, "ts": 6300865719617.941, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865719638.481, "dur": 19.590, + "args": { + "External id": 83485,"Sequence number": 1771024, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5064 + } + }, + { + "ph": "s", "id": 178, "pid": 5714, "tid": 5714, "ts": 6300865719638.481, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865719644.711, "dur": 11.320, + "args": { + "External id": 83486,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865719659.231, "dur": 18.150, + "args": { + "External id": 83487,"Sequence number": 1771025, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 
1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5066 + } + }, + { + "ph": "s", "id": 177, "pid": 5714, "tid": 5714, "ts": 6300865719659.231, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865719680.931, "dur": 15.680, + "args": { + "External id": 83488,"Sequence number": 1771026, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5067 + } + }, + { + "ph": "s", "id": 176, "pid": 5714, "tid": 5714, "ts": 6300865719680.931, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865719697.881, "dur": 24.989, + "args": { + "External id": 83489,"Sequence number": 1771027, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865719700.421, "dur": 22.160, + "args": { + "External id": 83490,"Sequence number": 1771027, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865719701.301, "dur": 21.049, + "args": { + "External id": 83491,"Sequence number": 1771027, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", 
"Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5070 + } + }, + { + "ph": "s", "id": 175, "pid": 5714, "tid": 5714, "ts": 6300865719701.301, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719705.261, "dur": 4.349, + "args": { + "External id": 83492,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865719710.610, "dur": 10.700, + "args": { + "External id": 83493,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865719724.701, "dur": 14.609, + "args": { + "External id": 83494,"Sequence number": 1771028, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5073 + } + }, + { + "ph": "s", "id": 174, "pid": 5714, "tid": 5714, "ts": 6300865719724.701, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865719765.281, "dur": 62.719, + "args": { + "External id": 83495,"Sequence number": 1771029, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865719766.221, "dur": 9.200, + "args": { + "External id": 83496,"Sequence number": 1771029, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5075 + } + }, + { + "ph": "s", "id": 173, "pid": 5714, "tid": 5714, "ts": 6300865719766.221, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865719769.741, "dur": 4.109, + "args": { + "External id": 83497,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719772.021, "dur": 1.449, + "args": { + "External id": 83498,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865719776.270, "dur": 51.350, + "args": { + "External id": 83499,"Sequence number": 1771030, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719777.910, "dur": 6.360, + 
"args": { + "External id": 83500,"Sequence number": 1771030, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719778.801, "dur": 5.169, + "args": { + "External id": 83501,"Sequence number": 1771030, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5080 + } + }, + { + "ph": "s", "id": 172, "pid": 5714, "tid": 5714, "ts": 6300865719778.801, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865719785.021, "dur": 35.209, + "args": { + "External id": 83502,"Sequence number": 1771031, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5081 + } + }, + { + "ph": "s", "id": 171, "pid": 5714, "tid": 5714, "ts": 6300865719785.021, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865719822.860, "dur": 3.810, + "args": { + "External id": 83503,"Sequence number": 1771032, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5082 + } + }, + { + "ph": "s", "id": 170, "pid": 5714, "tid": 5714, "ts": 6300865719822.860, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + 
"ts": 6300865719840.820, "dur": 52.540, + "args": { + "External id": 83504,"Sequence number": 1771033, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865719841.550, "dur": 7.600, + "args": { + "External id": 83505,"Sequence number": 1771033, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5084 + } + }, + { + "ph": "s", "id": 169, "pid": 5714, "tid": 5714, "ts": 6300865719841.550, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865719843.710, "dur": 4.170, + "args": { + "External id": 83506,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719846.750, "dur": 0.820, + "args": { + "External id": 83507,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865719849.910, "dur": 43.140, + "args": { + "External id": 83508,"Sequence number": 1771034, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], 
"Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719851.170, "dur": 3.100, + "args": { + "External id": 83509,"Sequence number": 1771034, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719852.070, "dur": 2.010, + "args": { + "External id": 83510,"Sequence number": 1771034, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5089 + } + }, + { + "ph": "s", "id": 168, "pid": 5714, "tid": 5714, "ts": 6300865719852.070, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865719856.030, "dur": 30.880, + "args": { + "External id": 83511,"Sequence number": 1771035, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5090 + } + }, + { + "ph": "s", "id": 167, "pid": 5714, "tid": 5714, "ts": 6300865719856.030, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865719889.080, "dur": 3.210, + "args": { + "External id": 83512,"Sequence number": 1771036, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5091 + } + }, + { + "ph": 
"s", "id": 166, "pid": 5714, "tid": 5714, "ts": 6300865719889.080, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865719904.520, "dur": 48.480, + "args": { + "External id": 83513,"Sequence number": 1771037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865719905.190, "dur": 6.700, + "args": { + "External id": 83514,"Sequence number": 1771037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5093 + } + }, + { + "ph": "s", "id": 165, "pid": 5714, "tid": 5714, "ts": 6300865719905.190, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865719906.730, "dur": 3.970, + "args": { + "External id": 83515,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865719909.690, "dur": 0.720, + "args": { + "External id": 83516,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865719912.690, "dur": 40.000, + "args": { + "External id": 83517,"Sequence 
number": 1771038, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719913.930, "dur": 3.650, + "args": { + "External id": 83518,"Sequence number": 1771038, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719914.740, "dur": 2.650, + "args": { + "External id": 83519,"Sequence number": 1771038, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5098 + } + }, + { + "ph": "s", "id": 164, "pid": 5714, "tid": 5714, "ts": 6300865719914.740, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865719918.260, "dur": 29.850, + "args": { + "External id": 83520,"Sequence number": 1771039, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5099 + } + }, + { + "ph": "s", "id": 163, "pid": 5714, "tid": 5714, "ts": 6300865719918.260, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865719950.190, "dur": 1.840, + "args": { + "External id": 83521,"Sequence number": 1771040, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5100 + } + }, + { + "ph": "s", "id": 162, "pid": 5714, "tid": 5714, "ts": 6300865719950.190, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719967.630, "dur": 3.660, + "args": { + "External id": 83522,"Sequence number": 1771041, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719968.570, "dur": 2.520, + "args": { + "External id": 83523,"Sequence number": 1771041, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5102 + } + }, + { + "ph": "s", "id": 161, "pid": 5714, "tid": 5714, "ts": 6300865719968.570, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719979.180, "dur": 6.040, + "args": { + "External id": 83524,"Sequence number": 1771042, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719979.960, "dur": 5.050, + "args": { + "External id": 83525,"Sequence number": 1771042, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5104 + } + }, + { + "ph": "s", "id": 160, "pid": 5714, "tid": 5714, "ts": 6300865719979.960, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865719990.570, "dur": 2.340, + "args": { + "External id": 83526,"Sequence number": 1771043, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865719991.280, "dur": 1.320, + "args": { + "External id": 83527,"Sequence number": 1771043, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5106 + } + }, + { + "ph": "s", "id": 159, "pid": 5714, "tid": 5714, "ts": 6300865719991.280, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865720022.620, "dur": 141.649, + "args": { + "External id": 83528,"Sequence number": 1771044, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5107 + } + }, + { + "ph": "s", "id": 158, "pid": 5714, "tid": 5714, "ts": 6300865720022.620, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865720040.840, "dur": 11.320, + "args": { + "External id": 83529,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720043.180, "dur": 8.240, + "args": { + "External id": 83530,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865720181.200, "dur": 137.179, + "args": { + "External id": 83531,"Sequence number": 1771045, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5110 + } + }, + { + "ph": "s", "id": 157, "pid": 5714, "tid": 5714, "ts": 6300865720181.200, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865720196.960, "dur": 12.149, + "args": { + "External id": 83532,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], 
[], [], []], "Ev Idx": 5111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720199.540, "dur": 8.849, + "args": { + "External id": 83533,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6300865720348.399, "dur": 174.760, + "args": { + "External id": 83534,"Sequence number": 1771046, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5113 + } + }, + { + "ph": "s", "id": 156, "pid": 5714, "tid": 5714, "ts": 6300865720348.399, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865720370.239, "dur": 125.910, + "args": { + "External id": 83535,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], 
[], []], "Ev Idx": 5114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865720404.989, "dur": 13.640, + "args": { + "External id": 83536,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720408.099, "dur": 9.490, + "args": { + "External id": 83537,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865720420.629, "dur": 6.220, + "args": { + "External id": 83538,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865720427.999, "dur": 2.540, + "args": { + "External id": 83539,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865720433.149, "dur": 5.280, + "args": { + "External id": 83540,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", 
"Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6300865720507.019, "dur": 4.020, + "args": { + "External id": 83541,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865720529.159, "dur": 7.230, + "args": { + "External id": 83542,"Sequence number": 1771047, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865720530.429, "dur": 5.690, + "args": { + "External id": 83543,"Sequence number": 1771047, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5122 + } + }, + { + "ph": "s", "id": 155, "pid": 5714, "tid": 5714, "ts": 6300865720530.429, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865720551.019, "dur": 62.209, + "args": { + "External id": 83544,"Sequence number": 1771048, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 
6300865720551.899, "dur": 6.730, + "args": { + "External id": 83545,"Sequence number": 1771048, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5124 + } + }, + { + "ph": "s", "id": 154, "pid": 5714, "tid": 5714, "ts": 6300865720551.899, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865720553.829, "dur": 3.650, + "args": { + "External id": 83546,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720555.849, "dur": 1.240, + "args": { + "External id": 83547,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865720559.599, "dur": 53.240, + "args": { + "External id": 83548,"Sequence number": 1771049, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865720562.579, "dur": 2.770, + "args": { + "External id": 83549,"Sequence number": 1771049, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 
2048, 768], []], "Ev Idx": 5128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865720563.279, "dur": 1.870, + "args": { + "External id": 83550,"Sequence number": 1771049, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5129 + } + }, + { + "ph": "s", "id": 153, "pid": 5714, "tid": 5714, "ts": 6300865720563.279, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865720566.139, "dur": 40.500, + "args": { + "External id": 83551,"Sequence number": 1771050, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5130 + } + }, + { + "ph": "s", "id": 152, "pid": 5714, "tid": 5714, "ts": 6300865720566.139, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865720609.268, "dur": 2.551, + "args": { + "External id": 83552,"Sequence number": 1771051, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5131 + } + }, + { + "ph": "s", "id": 151, "pid": 5714, "tid": 5714, "ts": 6300865720609.268, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865720624.819, "dur": 21.809, + "args": { + "External id": 83553,"Sequence number": 1771052, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": 
[[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5132 + } + }, + { + "ph": "s", "id": 150, "pid": 5714, "tid": 5714, "ts": 6300865720624.819, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865720665.708, "dur": 182.280, + "args": { + "External id": 83554,"Sequence number": 1771053, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865720669.748, "dur": 30.560, + "args": { + "External id": 83555,"Sequence number": 1771053, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865720671.048, "dur": 28.850, + "args": { + "External id": 83556,"Sequence number": 1771053, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5135 + } + }, + { + "ph": "s", "id": 149, "pid": 5714, "tid": 5714, "ts": 6300865720671.048, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720675.108, "dur": 6.750, + "args": { + "External id": 
83557,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865720682.938, "dur": 15.390, + "args": { + "External id": 83558,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865720701.648, "dur": 22.040, + "args": { + "External id": 83559,"Sequence number": 1771054, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5138 + } + }, + { + "ph": "s", "id": 148, "pid": 5714, "tid": 5714, "ts": 6300865720701.648, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865720704.598, "dur": 0.440, + "args": { + "External id": 83560,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865720707.178, "dur": 0.150, + "args": { + "External id": 83561,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5140 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865720725.378, "dur": 20.130, + "args": { + "External id": 83562,"Sequence number": 1771055, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5141 + } + }, + { + "ph": "s", "id": 147, "pid": 5714, "tid": 5714, "ts": 6300865720725.378, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865720747.058, "dur": 18.530, + "args": { + "External id": 83563,"Sequence number": 1771056, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5142 + } + }, + { + "ph": "s", "id": 146, "pid": 5714, "tid": 5714, "ts": 6300865720747.058, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865720752.368, "dur": 11.200, + "args": { + "External id": 83564,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865720766.778, "dur": 15.300, + "args": { + "External id": 83565,"Sequence number": 1771057, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5144 + } + }, + { + "ph": "s", "id": 145, "pid": 5714, "tid": 5714, "ts": 6300865720766.778, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865720785.678, "dur": 17.010, + "args": { + "External id": 83566,"Sequence number": 1771058, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5145 + } + }, + { + "ph": "s", "id": 144, "pid": 5714, "tid": 5714, "ts": 6300865720785.678, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865720803.928, "dur": 25.130, + "args": { + "External id": 83567,"Sequence number": 1771059, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865720805.118, "dur": 23.660, + "args": { + "External id": 83568,"Sequence number": 1771059, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865720805.998, "dur": 22.520, + "args": { + "External id": 83569,"Sequence number": 1771059, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5148 + } + }, + { + "ph": 
"s", "id": 143, "pid": 5714, "tid": 5714, "ts": 6300865720805.998, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720811.018, "dur": 4.570, + "args": { + "External id": 83570,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865720816.538, "dur": 10.930, + "args": { + "External id": 83571,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865720831.078, "dur": 16.380, + "args": { + "External id": 83572,"Sequence number": 1771060, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5151 + } + }, + { + "ph": "s", "id": 142, "pid": 5714, "tid": 5714, "ts": 6300865720831.078, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865720869.088, "dur": 59.240, + "args": { + "External id": 83573,"Sequence number": 1771061, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5152 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865720871.408, "dur": 7.790, + "args": { + "External id": 83574,"Sequence number": 1771061, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5153 + } + }, + { + "ph": "s", "id": 141, "pid": 5714, "tid": 5714, "ts": 6300865720871.408, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865720873.798, "dur": 3.980, + "args": { + "External id": 83575,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720875.938, "dur": 1.460, + "args": { + "External id": 83576,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865720880.008, "dur": 47.870, + "args": { + "External id": 83577,"Sequence number": 1771062, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865720881.578, "dur": 4.720, + "args": { + "External id": 83578,"Sequence number": 1771062, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], 
"Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865720883.318, "dur": 2.780, + "args": { + "External id": 83579,"Sequence number": 1771062, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5158 + } + }, + { + "ph": "s", "id": 140, "pid": 5714, "tid": 5714, "ts": 6300865720883.318, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865720887.018, "dur": 33.780, + "args": { + "External id": 83580,"Sequence number": 1771063, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5159 + } + }, + { + "ph": "s", "id": 139, "pid": 5714, "tid": 5714, "ts": 6300865720887.018, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865720923.318, "dur": 3.650, + "args": { + "External id": 83581,"Sequence number": 1771064, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5160 + } + }, + { + "ph": "s", "id": 138, "pid": 5714, "tid": 5714, "ts": 6300865720923.318, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865720940.518, "dur": 49.760, + "args": { + "External id": 83582,"Sequence number": 1771065, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input 
type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865720941.208, "dur": 6.710, + "args": { + "External id": 83583,"Sequence number": 1771065, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5162 + } + }, + { + "ph": "s", "id": 137, "pid": 5714, "tid": 5714, "ts": 6300865720941.208, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865720944.048, "dur": 2.640, + "args": { + "External id": 83584,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865720945.708, "dur": 0.740, + "args": { + "External id": 83585,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865720948.808, "dur": 41.170, + "args": { + "External id": 83586,"Sequence number": 1771066, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865720950.108, "dur": 
4.030, + "args": { + "External id": 83587,"Sequence number": 1771066, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865720952.148, "dur": 1.810, + "args": { + "External id": 83588,"Sequence number": 1771066, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5167 + } + }, + { + "ph": "s", "id": 136, "pid": 5714, "tid": 5714, "ts": 6300865720952.148, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865720954.758, "dur": 29.210, + "args": { + "External id": 83589,"Sequence number": 1771067, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5168 + } + }, + { + "ph": "s", "id": 135, "pid": 5714, "tid": 5714, "ts": 6300865720954.758, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865720986.068, "dur": 2.980, + "args": { + "External id": 83590,"Sequence number": 1771068, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5169 + } + }, + { + "ph": "s", "id": 134, "pid": 5714, "tid": 5714, "ts": 6300865720986.068, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 
5714, "tid": 5714, + "ts": 6300865721016.088, "dur": 134.529, + "args": { + "External id": 83591,"Sequence number": 1771069, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5170 + } + }, + { + "ph": "s", "id": 133, "pid": 5714, "tid": 5714, "ts": 6300865721016.088, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865721043.067, "dur": 8.220, + "args": { + "External id": 83592,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865721079.307, "dur": 56.910, + "args": { + "External id": 83593,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865721080.227, "dur": 8.171, + "args": { + "External id": 83594,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865721082.007, "dur": 4.971, + "args": { + "External id": 83595,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": 
[[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721085.278, "dur": 1.229, + "args": { + "External id": 83596,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865721089.367, "dur": 46.290, + "args": { + "External id": 83597,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721091.138, "dur": 5.400, + "args": { + "External id": 83598,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721093.418, "dur": 2.929, + "args": { + "External id": 83599,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865721097.367, "dur": 33.260, + "args": { + "External id": 83600,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 
5179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865721133.177, "dur": 1.310, + "args": { + "External id": 83601,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865721157.287, "dur": 21.720, + "args": { + "External id": 83602,"Sequence number": 1771070, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5181 + } + }, + { + "ph": "s", "id": 132, "pid": 5714, "tid": 5714, "ts": 6300865721157.287, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865721207.317, "dur": 188.540, + "args": { + "External id": 83603,"Sequence number": 1771071, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865721209.047, "dur": 32.970, + "args": { + "External id": 83604,"Sequence number": 1771071, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", 
"pid": 5714, "tid": 5714, + "ts": 6300865721210.407, "dur": 31.320, + "args": { + "External id": 83605,"Sequence number": 1771071, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5184 + } + }, + { + "ph": "s", "id": 131, "pid": 5714, "tid": 5714, "ts": 6300865721210.407, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721216.597, "dur": 6.750, + "args": { + "External id": 83606,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865721224.487, "dur": 15.690, + "args": { + "External id": 83607,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865721243.267, "dur": 18.600, + "args": { + "External id": 83608,"Sequence number": 1771072, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5187 + } + }, + { + "ph": "s", "id": 130, "pid": 5714, "tid": 5714, "ts": 6300865721243.267, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865721246.167, "dur": 0.420, + "args": { + "External id": 83609,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865721247.307, "dur": 0.160, + "args": { + "External id": 83610,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865721263.577, "dur": 19.400, + "args": { + "External id": 83611,"Sequence number": 1771073, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5190 + } + }, + { + "ph": "s", "id": 129, "pid": 5714, "tid": 5714, "ts": 6300865721263.577, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865721284.387, "dur": 27.490, + "args": { + "External id": 83612,"Sequence number": 1771074, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5191 + } + }, + { + "ph": "s", "id": 128, "pid": 5714, "tid": 5714, "ts": 6300865721284.387, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865721289.687, "dur": 19.790, + "args": { + 
"External id": 83613,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865721313.337, "dur": 17.470, + "args": { + "External id": 83614,"Sequence number": 1771075, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5193 + } + }, + { + "ph": "s", "id": 127, "pid": 5714, "tid": 5714, "ts": 6300865721313.337, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865721334.277, "dur": 16.060, + "args": { + "External id": 83615,"Sequence number": 1771076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5194 + } + }, + { + "ph": "s", "id": 126, "pid": 5714, "tid": 5714, "ts": 6300865721334.277, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865721351.667, "dur": 26.680, + "args": { + "External id": 83616,"Sequence number": 1771077, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865721352.897, "dur": 25.150, + "args": { + "External id": 83617,"Sequence number": 1771077, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": 
["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865721355.227, "dur": 22.560, + "args": { + "External id": 83618,"Sequence number": 1771077, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5197 + } + }, + { + "ph": "s", "id": 125, "pid": 5714, "tid": 5714, "ts": 6300865721355.227, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721360.567, "dur": 4.530, + "args": { + "External id": 83619,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865721366.137, "dur": 10.700, + "args": { + "External id": 83620,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865721380.327, "dur": 15.040, + "args": { + "External id": 83621,"Sequence number": 1771078, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5200 + } + }, + { + "ph": "s", "id": 124, "pid": 5714, "tid": 5714, "ts": 6300865721380.327, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865721421.267, "dur": 63.250, + "args": { + "External id": 83622,"Sequence number": 1771079, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865721422.187, "dur": 9.280, + "args": { + "External id": 83623,"Sequence number": 1771079, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5202 + } + }, + { + "ph": "s", "id": 123, "pid": 5714, "tid": 5714, "ts": 6300865721422.187, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865721424.677, "dur": 5.210, + "args": { + "External id": 83624,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721428.027, "dur": 1.440, + "args": { + "External id": 83625,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5204 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865721432.267, "dur": 51.859, + "args": { + "External id": 83626,"Sequence number": 1771080, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721433.877, "dur": 5.380, + "args": { + "External id": 83627,"Sequence number": 1771080, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721434.757, "dur": 4.310, + "args": { + "External id": 83628,"Sequence number": 1771080, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5207 + } + }, + { + "ph": "s", "id": 122, "pid": 5714, "tid": 5714, "ts": 6300865721434.757, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865721441.077, "dur": 36.100, + "args": { + "External id": 83629,"Sequence number": 1771081, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5208 + } + }, + { + "ph": "s", "id": 121, "pid": 5714, "tid": 5714, "ts": 6300865721441.077, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 
6300865721479.697, "dur": 3.449, + "args": { + "External id": 83630,"Sequence number": 1771082, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5209 + } + }, + { + "ph": "s", "id": 120, "pid": 5714, "tid": 5714, "ts": 6300865721479.697, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865721497.226, "dur": 50.731, + "args": { + "External id": 83631,"Sequence number": 1771083, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865721497.957, "dur": 6.649, + "args": { + "External id": 83632,"Sequence number": 1771083, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5211 + } + }, + { + "ph": "s", "id": 119, "pid": 5714, "tid": 5714, "ts": 6300865721497.957, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865721499.666, "dur": 3.800, + "args": { + "External id": 83633,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721502.337, "dur": 0.809, + "args": { + "External id": 83634,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865721505.406, "dur": 42.240, + "args": { + "External id": 83635,"Sequence number": 1771084, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721506.697, "dur": 2.889, + "args": { + "External id": 83636,"Sequence number": 1771084, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721507.626, "dur": 1.771, + "args": { + "External id": 83637,"Sequence number": 1771084, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5216 + } + }, + { + "ph": "s", "id": 118, "pid": 5714, "tid": 5714, "ts": 6300865721507.626, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865721510.257, "dur": 30.929, + "args": { + "External id": 83638,"Sequence number": 1771085, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5217 + } + }, + { + "ph": "s", "id": 117, 
"pid": 5714, "tid": 5714, "ts": 6300865721510.257, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865721543.386, "dur": 3.420, + "args": { + "External id": 83639,"Sequence number": 1771086, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5218 + } + }, + { + "ph": "s", "id": 116, "pid": 5714, "tid": 5714, "ts": 6300865721543.386, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865721559.797, "dur": 49.819, + "args": { + "External id": 83640,"Sequence number": 1771087, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865721560.516, "dur": 5.600, + "args": { + "External id": 83641,"Sequence number": 1771087, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5220 + } + }, + { + "ph": "s", "id": 115, "pid": 5714, "tid": 5714, "ts": 6300865721560.516, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865721562.186, "dur": 2.850, + "args": { + "External id": 83642,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6300865721563.796, "dur": 0.930, + "args": { + "External id": 83643,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865721568.126, "dur": 41.190, + "args": { + "External id": 83644,"Sequence number": 1771088, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721569.426, "dur": 3.780, + "args": { + "External id": 83645,"Sequence number": 1771088, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721570.286, "dur": 2.690, + "args": { + "External id": 83646,"Sequence number": 1771088, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5225 + } + }, + { + "ph": "s", "id": 114, "pid": 5714, "tid": 5714, "ts": 6300865721570.286, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865721573.866, "dur": 28.050, + "args": { + "External id": 83647,"Sequence number": 1771089, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", 
""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5226 + } + }, + { + "ph": "s", "id": 113, "pid": 5714, "tid": 5714, "ts": 6300865721573.866, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865721604.096, "dur": 4.490, + "args": { + "External id": 83648,"Sequence number": 1771090, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5227 + } + }, + { + "ph": "s", "id": 112, "pid": 5714, "tid": 5714, "ts": 6300865721604.096, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721624.166, "dur": 3.610, + "args": { + "External id": 83649,"Sequence number": 1771091, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721624.966, "dur": 2.450, + "args": { + "External id": 83650,"Sequence number": 1771091, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5229 + } + }, + { + "ph": "s", "id": 111, "pid": 5714, "tid": 5714, "ts": 6300865721624.966, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721635.346, "dur": 3.500, + "args": { + "External id": 83651,"Sequence number": 1771092, "Fwd thread 
id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721636.026, "dur": 2.640, + "args": { + "External id": 83652,"Sequence number": 1771092, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5231 + } + }, + { + "ph": "s", "id": 110, "pid": 5714, "tid": 5714, "ts": 6300865721636.026, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865721645.296, "dur": 3.630, + "args": { + "External id": 83653,"Sequence number": 1771093, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865721645.986, "dur": 2.770, + "args": { + "External id": 83654,"Sequence number": 1771093, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5233 + } + }, + { + "ph": "s", "id": 109, "pid": 5714, "tid": 5714, "ts": 6300865721645.986, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865721678.326, "dur": 139.340, + "args": { + "External id": 83655,"Sequence number": 1771094, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5234 + } + }, + { + "ph": "s", "id": 108, "pid": 5714, "tid": 5714, "ts": 6300865721678.326, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865721696.386, "dur": 10.260, + "args": { + "External id": 83656,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721698.556, "dur": 7.390, + "args": { + "External id": 83657,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865721835.256, "dur": 127.060, + "args": { + "External id": 83658,"Sequence number": 1771095, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5237 + } + }, + 
{ + "ph": "s", "id": 107, "pid": 5714, "tid": 5714, "ts": 6300865721835.256, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865721851.076, "dur": 13.300, + "args": { + "External id": 83659,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865721853.786, "dur": 9.930, + "args": { + "External id": 83660,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6300865721989.876, "dur": 168.059, + "args": { + "External id": 83661,"Sequence number": 1771096, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5240 + } + }, + { + "ph": "s", "id": 106, "pid": 5714, "tid": 5714, "ts": 6300865721989.876, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865722009.625, 
"dur": 120.540, + "args": { + "External id": 83662,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865722048.075, "dur": 12.550, + "args": { + "External id": 83663,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722051.015, "dur": 8.550, + "args": { + "External id": 83664,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865722062.815, "dur": 5.240, + "args": { + "External id": 83665,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865722069.225, "dur": 2.870, + "args": { + "External 
id": 83666,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865722074.975, "dur": 6.040, + "args": { + "External id": 83667,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6300865722140.955, "dur": 3.910, + "args": { + "External id": 83668,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865722163.675, "dur": 5.390, + "args": { + "External id": 83669,"Sequence number": 1771097, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865722164.875, "dur": 3.940, + "args": { + "External id": 83670,"Sequence number": 1771097, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5249 + } + }, + { + "ph": "s", "id": 105, "pid": 5714, "tid": 5714, "ts": 6300865722164.875, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865722183.075, "dur": 66.200, + "args": { + "External id": 83671,"Sequence number": 1771098, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865722185.265, "dur": 7.050, + "args": { + "External id": 83672,"Sequence number": 1771098, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5251 + } + }, + { + "ph": "s", "id": 104, "pid": 5714, "tid": 5714, "ts": 6300865722185.265, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865722187.635, "dur": 3.580, + "args": { + "External id": 83673,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722189.565, "dur": 1.250, + "args": { + "External id": 83674,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865722193.325, "dur": 55.550, + "args": { + "External id": 83675,"Sequence number": 1771099, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865722194.865, "dur": 3.690, + "args": { + "External id": 83676,"Sequence number": 1771099, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865722196.545, "dur": 1.800, + "args": { + "External id": 83677,"Sequence number": 1771099, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5256 + } + }, + { + "ph": "s", "id": 103, "pid": 5714, "tid": 5714, "ts": 6300865722196.545, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865722199.315, "dur": 41.870, + "args": { + "External id": 83678,"Sequence number": 1771100, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5257 + } + }, + { + "ph": "s", "id": 102, "pid": 5714, "tid": 5714, "ts": 6300865722199.315, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865722243.635, "dur": 4.280, + "args": { + "External id": 83679,"Sequence number": 1771101, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], 
"Input Dims": [[16384, 768], []], "Ev Idx": 5258 + } + }, + { + "ph": "s", "id": 101, "pid": 5714, "tid": 5714, "ts": 6300865722243.635, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865722260.925, "dur": 22.200, + "args": { + "External id": 83680,"Sequence number": 1771102, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5259 + } + }, + { + "ph": "s", "id": 100, "pid": 5714, "tid": 5714, "ts": 6300865722260.925, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865722312.025, "dur": 183.739, + "args": { + "External id": 83681,"Sequence number": 1771103, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865722313.735, "dur": 35.220, + "args": { + "External id": 83682,"Sequence number": 1771103, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865722317.065, "dur": 31.530, + "args": { + "External id": 83683,"Sequence number": 1771103, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", 
"", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5262 + } + }, + { + "ph": "s", "id": 99, "pid": 5714, "tid": 5714, "ts": 6300865722317.065, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722321.215, "dur": 7.400, + "args": { + "External id": 83684,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865722329.855, "dur": 17.080, + "args": { + "External id": 83685,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865722350.304, "dur": 20.231, + "args": { + "External id": 83686,"Sequence number": 1771104, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5265 + } + }, + { + "ph": "s", "id": 98, "pid": 5714, "tid": 5714, "ts": 6300865722350.304, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865722353.684, "dur": 0.440, + "args": { + "External id": 83687,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], 
"Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865722354.855, "dur": 0.160, + "args": { + "External id": 83688,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865722373.604, "dur": 21.680, + "args": { + "External id": 83689,"Sequence number": 1771105, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5268 + } + }, + { + "ph": "s", "id": 97, "pid": 5714, "tid": 5714, "ts": 6300865722373.604, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865722396.824, "dur": 18.240, + "args": { + "External id": 83690,"Sequence number": 1771106, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5269 + } + }, + { + "ph": "s", "id": 96, "pid": 5714, "tid": 5714, "ts": 6300865722396.824, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865722402.015, "dur": 10.960, + "args": { + "External id": 83691,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5270 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865722416.184, "dur": 15.900, + "args": { + "External id": 83692,"Sequence number": 1771107, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5271 + } + }, + { + "ph": "s", "id": 95, "pid": 5714, "tid": 5714, "ts": 6300865722416.184, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865722435.444, "dur": 15.630, + "args": { + "External id": 83693,"Sequence number": 1771108, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5272 + } + }, + { + "ph": "s", "id": 94, "pid": 5714, "tid": 5714, "ts": 6300865722435.444, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865722453.474, "dur": 24.830, + "args": { + "External id": 83694,"Sequence number": 1771109, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865722454.804, "dur": 23.200, + "args": { + "External id": 83695,"Sequence number": 1771109, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5274 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865722455.834, "dur": 21.940, + "args": { + "External id": 83696,"Sequence number": 1771109, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5275 + } + }, + { + "ph": "s", "id": 93, "pid": 5714, "tid": 5714, "ts": 6300865722455.834, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722460.504, "dur": 4.530, + "args": { + "External id": 83697,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865722465.994, "dur": 10.760, + "args": { + "External id": 83698,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865722480.174, "dur": 15.090, + "args": { + "External id": 83699,"Sequence number": 1771110, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5278 + } + }, + { + "ph": "s", "id": 92, "pid": 5714, "tid": 5714, "ts": 6300865722480.174, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865722516.494, "dur": 62.320, + "args": { + "External id": 83700,"Sequence number": 1771111, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865722517.474, "dur": 8.790, + "args": { + "External id": 83701,"Sequence number": 1771111, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5280 + } + }, + { + "ph": "s", "id": 91, "pid": 5714, "tid": 5714, "ts": 6300865722517.474, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865722520.654, "dur": 4.080, + "args": { + "External id": 83702,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722522.854, "dur": 1.500, + "args": { + "External id": 83703,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865722527.164, "dur": 51.240, + "args": { + "External id": 83704,"Sequence number": 1771112, "Fwd thread id": 0, "Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865722528.754, "dur": 5.110, + "args": { + "External id": 83705,"Sequence number": 1771112, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865722530.764, "dur": 2.890, + "args": { + "External id": 83706,"Sequence number": 1771112, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5285 + } + }, + { + "ph": "s", "id": 90, "pid": 5714, "tid": 5714, "ts": 6300865722530.764, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865722535.834, "dur": 34.920, + "args": { + "External id": 83707,"Sequence number": 1771113, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5286 + } + }, + { + "ph": "s", "id": 89, "pid": 5714, "tid": 5714, "ts": 6300865722535.834, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865722573.164, "dur": 4.340, + "args": { + "External id": 83708,"Sequence number": 1771114, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5287 + } + }, + { + "ph": "s", "id": 88, "pid": 5714, "tid": 5714, "ts": 6300865722573.164, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865722591.004, "dur": 51.310, + "args": { + "External id": 83709,"Sequence number": 1771115, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865722591.734, "dur": 6.540, + "args": { + "External id": 83710,"Sequence number": 1771115, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5289 + } + }, + { + "ph": "s", "id": 87, "pid": 5714, "tid": 5714, "ts": 6300865722591.734, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865722594.524, "dur": 2.600, + "args": { + "External id": 83711,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722596.034, "dur": 0.820, + "args": { + "External id": 83712,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865722599.124, "dur": 42.860, + "args": { + "External id": 83713,"Sequence number": 1771116, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865722600.444, "dur": 4.200, + "args": { + "External id": 83714,"Sequence number": 1771116, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865722601.414, "dur": 3.050, + "args": { + "External id": 83715,"Sequence number": 1771116, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5294 + } + }, + { + "ph": "s", "id": 86, "pid": 5714, "tid": 5714, "ts": 6300865722601.414, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865722605.274, "dur": 30.660, + "args": { + "External id": 83716,"Sequence number": 1771117, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5295 + } + }, + { + "ph": "s", "id": 85, "pid": 5714, "tid": 5714, "ts": 6300865722605.274, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 
6300865722638.274, "dur": 2.840, + "args": { + "External id": 83717,"Sequence number": 1771118, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5296 + } + }, + { + "ph": "s", "id": 84, "pid": 5714, "tid": 5714, "ts": 6300865722638.274, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5714, "tid": 5714, + "ts": 6300865722667.754, "dur": 133.129, + "args": { + "External id": 83718,"Sequence number": 1771119, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5297 + } + }, + { + "ph": "s", "id": 83, "pid": 5714, "tid": 5714, "ts": 6300865722667.754, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865722695.054, "dur": 7.440, + "args": { + "External id": 83719,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865722730.194, "dur": 56.680, + "args": { + "External id": 83720,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 
6300865722731.154, "dur": 8.100, + "args": { + "External id": 83721,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865722732.844, "dur": 4.740, + "args": { + "External id": 83722,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722735.724, "dur": 1.400, + "args": { + "External id": 83723,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865722740.204, "dur": 46.210, + "args": { + "External id": 83724,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865722741.974, "dur": 5.480, + "args": { + "External id": 83725,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865722744.114, "dur": 3.130, + "args": { + "External id": 83726,"Record function id": 0, "Concrete Inputs": ["", 
"[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865722748.094, "dur": 33.530, + "args": { + "External id": 83727,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865722784.034, "dur": 1.200, + "args": { + "External id": 83728,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865722807.514, "dur": 22.520, + "args": { + "External id": 83729,"Sequence number": 1771120, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5308 + } + }, + { + "ph": "s", "id": 82, "pid": 5714, "tid": 5714, "ts": 6300865722807.514, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865722858.794, "dur": 177.129, + "args": { + "External id": 83730,"Sequence number": 1771121, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5309 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865722860.334, "dur": 32.199, + "args": { + "External id": 83731,"Sequence number": 1771121, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865722861.643, "dur": 30.560, + "args": { + "External id": 83732,"Sequence number": 1771121, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5311 + } + }, + { + "ph": "s", "id": 81, "pid": 5714, "tid": 5714, "ts": 6300865722861.643, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865722865.703, "dur": 6.960, + "args": { + "External id": 83733,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865722874.954, "dur": 15.599, + "args": { + "External id": 83734,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5313 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865722893.823, "dur": 18.810, + "args": { + "External id": 83735,"Sequence number": 1771122, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5314 + } + }, + { + "ph": "s", "id": 80, "pid": 5714, "tid": 5714, "ts": 6300865722893.823, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865722896.653, "dur": 0.450, + "args": { + "External id": 83736,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865722897.773, "dur": 0.160, + "args": { + "External id": 83737,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865722914.343, "dur": 18.950, + "args": { + "External id": 83738,"Sequence number": 1771123, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5317 + } + }, + { + "ph": "s", "id": 79, "pid": 5714, "tid": 5714, "ts": 6300865722914.343, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865722934.813, "dur": 16.750, + "args": { + "External id": 83739,"Sequence 
number": 1771124, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5318 + } + }, + { + "ph": "s", "id": 78, "pid": 5714, "tid": 5714, "ts": 6300865722934.813, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865722938.873, "dur": 10.700, + "args": { + "External id": 83740,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865722953.893, "dur": 17.050, + "args": { + "External id": 83741,"Sequence number": 1771125, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5320 + } + }, + { + "ph": "s", "id": 77, "pid": 5714, "tid": 5714, "ts": 6300865722953.893, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865722974.663, "dur": 15.790, + "args": { + "External id": 83742,"Sequence number": 1771126, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5321 + } + }, + { + "ph": "s", "id": 76, "pid": 5714, "tid": 5714, "ts": 6300865722974.663, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865722991.663, "dur": 26.860, + "args": { + "External id": 83743,"Sequence number": 1771127, "Fwd thread id": 0, "Record 
function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865722992.893, "dur": 25.340, + "args": { + "External id": 83744,"Sequence number": 1771127, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865722993.733, "dur": 24.250, + "args": { + "External id": 83745,"Sequence number": 1771127, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5324 + } + }, + { + "ph": "s", "id": 75, "pid": 5714, "tid": 5714, "ts": 6300865722993.733, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723001.023, "dur": 4.480, + "args": { + "External id": 83746,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865723006.453, "dur": 10.540, + "args": { + "External id": 83747,"Record function 
id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865723020.523, "dur": 14.930, + "args": { + "External id": 83748,"Sequence number": 1771128, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5327 + } + }, + { + "ph": "s", "id": 74, "pid": 5714, "tid": 5714, "ts": 6300865723020.523, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865723061.033, "dur": 61.010, + "args": { + "External id": 83749,"Sequence number": 1771129, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865723062.013, "dur": 9.810, + "args": { + "External id": 83750,"Sequence number": 1771129, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5329 + } + }, + { + "ph": "s", "id": 73, "pid": 5714, "tid": 5714, "ts": 6300865723062.013, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865723064.373, "dur": 5.730, + "args": { + "External id": 83751,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": 
[[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723068.183, "dur": 1.470, + "args": { + "External id": 83752,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865723072.633, "dur": 49.010, + "args": { + "External id": 83753,"Sequence number": 1771130, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723074.243, "dur": 3.940, + "args": { + "External id": 83754,"Sequence number": 1771130, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723075.163, "dur": 2.810, + "args": { + "External id": 83755,"Sequence number": 1771130, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5334 + } + }, + { + "ph": "s", "id": 72, "pid": 5714, "tid": 5714, "ts": 6300865723075.163, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865723078.863, 
"dur": 36.800, + "args": { + "External id": 83756,"Sequence number": 1771131, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5335 + } + }, + { + "ph": "s", "id": 71, "pid": 5714, "tid": 5714, "ts": 6300865723078.863, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865723118.333, "dur": 2.460, + "args": { + "External id": 83757,"Sequence number": 1771132, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5336 + } + }, + { + "ph": "s", "id": 70, "pid": 5714, "tid": 5714, "ts": 6300865723118.333, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865723134.213, "dur": 49.840, + "args": { + "External id": 83758,"Sequence number": 1771133, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865723134.963, "dur": 5.540, + "args": { + "External id": 83759,"Sequence number": 1771133, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5338 + } + }, + { + "ph": "s", "id": 69, "pid": 5714, "tid": 5714, "ts": 6300865723134.963, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 
6300865723136.543, "dur": 2.900, + "args": { + "External id": 83760,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723138.233, "dur": 0.840, + "args": { + "External id": 83761,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865723142.503, "dur": 41.220, + "args": { + "External id": 83762,"Sequence number": 1771134, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723143.783, "dur": 2.770, + "args": { + "External id": 83763,"Sequence number": 1771134, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723144.543, "dur": 1.810, + "args": { + "External id": 83764,"Sequence number": 1771134, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5343 + } + }, + { + "ph": "s", "id": 68, "pid": 
5714, "tid": 5714, "ts": 6300865723144.543, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865723147.183, "dur": 29.630, + "args": { + "External id": 83765,"Sequence number": 1771135, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5344 + } + }, + { + "ph": "s", "id": 67, "pid": 5714, "tid": 5714, "ts": 6300865723147.183, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865723178.923, "dur": 4.040, + "args": { + "External id": 83766,"Sequence number": 1771136, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5345 + } + }, + { + "ph": "s", "id": 66, "pid": 5714, "tid": 5714, "ts": 6300865723178.923, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865723195.703, "dur": 49.179, + "args": { + "External id": 83767,"Sequence number": 1771137, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865723196.383, "dur": 6.410, + "args": { + "External id": 83768,"Sequence number": 1771137, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5347 + } + }, + { + "ph": "s", "id": 65, 
"pid": 5714, "tid": 5714, "ts": 6300865723196.383, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865723197.793, "dur": 3.900, + "args": { + "External id": 83769,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723199.663, "dur": 1.710, + "args": { + "External id": 83770,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865723203.613, "dur": 40.929, + "args": { + "External id": 83771,"Sequence number": 1771138, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723206.113, "dur": 4.450, + "args": { + "External id": 83772,"Sequence number": 1771138, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723207.223, "dur": 3.140, + "args": { + "External id": 83773,"Sequence number": 1771138, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 
768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5352 + } + }, + { + "ph": "s", "id": 64, "pid": 5714, "tid": 5714, "ts": 6300865723207.223, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865723211.223, "dur": 27.710, + "args": { + "External id": 83774,"Sequence number": 1771139, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5353 + } + }, + { + "ph": "s", "id": 63, "pid": 5714, "tid": 5714, "ts": 6300865723211.223, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865723241.002, "dur": 2.840, + "args": { + "External id": 83775,"Sequence number": 1771140, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5354 + } + }, + { + "ph": "s", "id": 62, "pid": 5714, "tid": 5714, "ts": 6300865723241.002, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723260.653, "dur": 3.849, + "args": { + "External id": 83776,"Sequence number": 1771141, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723261.862, "dur": 2.431, + "args": { + "External id": 83777,"Sequence number": 1771141, "Fwd thread id": 0, "Record function 
id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5356 + } + }, + { + "ph": "s", "id": 61, "pid": 5714, "tid": 5714, "ts": 6300865723261.862, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723272.262, "dur": 3.371, + "args": { + "External id": 83778,"Sequence number": 1771142, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723272.953, "dur": 2.500, + "args": { + "External id": 83779,"Sequence number": 1771142, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5358 + } + }, + { + "ph": "s", "id": 60, "pid": 5714, "tid": 5714, "ts": 6300865723272.953, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723280.313, "dur": 3.409, + "args": { + "External id": 83780,"Sequence number": 1771143, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723282.202, "dur": 1.351, + "args": { + "External id": 83781,"Sequence number": 1771143, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 
64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5360 + } + }, + { + "ph": "s", "id": 59, "pid": 5714, "tid": 5714, "ts": 6300865723282.202, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6300865723322.852, "dur": 145.200, + "args": { + "External id": 83782,"Sequence number": 1771144, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5361 + } + }, + { + "ph": "s", "id": 58, "pid": 5714, "tid": 5714, "ts": 6300865723322.852, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865723341.382, "dur": 13.550, + "args": { + "External id": 83783,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723343.782, "dur": 10.310, + "args": { + "External id": 83784,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 
5714, + "ts": 6300865723485.412, "dur": 125.070, + "args": { + "External id": 83785,"Sequence number": 1771145, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5364 + } + }, + { + "ph": "s", "id": 57, "pid": 5714, "tid": 5714, "ts": 6300865723485.412, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865723501.012, "dur": 12.790, + "args": { + "External id": 83786,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723503.892, "dur": 9.110, + "args": { + "External id": 83787,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6300865723639.782, "dur": 166.579, + "args": { + "External id": 83788,"Sequence number": 1771146, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", 
"Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5367 + } + }, + { + "ph": "s", "id": 56, "pid": 5714, "tid": 5714, "ts": 6300865723639.782, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6300865723659.422, "dur": 120.519, + "args": { + "External id": 83789,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865723696.712, "dur": 12.329, + "args": { + "External id": 83790,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723699.872, "dur": 8.129, + "args": { + "External id": 83791,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5370 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865723712.212, "dur": 6.489, + "args": { + "External id": 83792,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865723719.861, "dur": 2.811, + "args": { + "External id": 83793,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865723725.272, "dur": 5.669, + "args": { + "External id": 83794,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6300865723790.911, "dur": 3.990, + "args": { + "External id": 83795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723812.101, "dur": 6.560, + "args": { + "External id": 83796,"Sequence number": 1771147, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723813.301, "dur": 5.090, + "args": { + "External id": 83797,"Sequence number": 1771147, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5376 + } + }, + { + "ph": "s", "id": 55, "pid": 5714, "tid": 5714, "ts": 6300865723813.301, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865723831.081, "dur": 64.620, + "args": { + "External id": 83798,"Sequence number": 1771148, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865723833.231, "dur": 6.890, + "args": { + "External id": 83799,"Sequence number": 1771148, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5378 + } + }, + { + "ph": "s", "id": 54, "pid": 5714, "tid": 5714, "ts": 6300865723833.231, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865723835.341, "dur": 3.740, + "args": { + "External id": 83800,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723837.411, "dur": 1.280, + "args": { + "External id": 83801,"Record function id": 0, 
"Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865723841.111, "dur": 54.170, + "args": { + "External id": 83802,"Sequence number": 1771149, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865723842.771, "dur": 4.210, + "args": { + "External id": 83803,"Sequence number": 1771149, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865723844.491, "dur": 2.300, + "args": { + "External id": 83804,"Sequence number": 1771149, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5383 + } + }, + { + "ph": "s", "id": 53, "pid": 5714, "tid": 5714, "ts": 6300865723844.491, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865723847.751, "dur": 40.110, + "args": { + "External id": 83805,"Sequence number": 1771150, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], 
[768, 768]], "Ev Idx": 5384 + } + }, + { + "ph": "s", "id": 52, "pid": 5714, "tid": 5714, "ts": 6300865723847.751, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865723890.311, "dur": 3.990, + "args": { + "External id": 83806,"Sequence number": 1771151, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5385 + } + }, + { + "ph": "s", "id": 51, "pid": 5714, "tid": 5714, "ts": 6300865723890.311, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865723913.191, "dur": 23.230, + "args": { + "External id": 83807,"Sequence number": 1771152, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5386 + } + }, + { + "ph": "s", "id": 50, "pid": 5714, "tid": 5714, "ts": 6300865723913.191, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6300865723956.861, "dur": 183.510, + "args": { + "External id": 83808,"Sequence number": 1771153, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865723958.591, "dur": 30.170, + "args": { + "External id": 83809,"Sequence number": 1771153, "Fwd thread id": 0, "Record function id": 
0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865723959.831, "dur": 28.670, + "args": { + "External id": 83810,"Sequence number": 1771153, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5389 + } + }, + { + "ph": "s", "id": 49, "pid": 5714, "tid": 5714, "ts": 6300865723959.831, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865723964.151, "dur": 6.490, + "args": { + "External id": 83811,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865723971.801, "dur": 15.170, + "args": { + "External id": 83812,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300865723990.081, "dur": 20.800, + "args": { + "External id": 83813,"Sequence number": 1771154, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", 
"2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5392 + } + }, + { + "ph": "s", "id": 48, "pid": 5714, "tid": 5714, "ts": 6300865723990.081, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300865723993.031, "dur": 0.430, + "args": { + "External id": 83814,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865723994.211, "dur": 0.150, + "args": { + "External id": 83815,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6300865724013.811, "dur": 21.330, + "args": { + "External id": 83816,"Sequence number": 1771155, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5395 + } + }, + { + "ph": "s", "id": 47, "pid": 5714, "tid": 5714, "ts": 6300865724013.811, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865724036.661, "dur": 18.640, + "args": { + "External id": 83817,"Sequence number": 1771156, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 
2048, 1], [], []], "Ev Idx": 5396 + } + }, + { + "ph": "s", "id": 46, "pid": 5714, "tid": 5714, "ts": 6300865724036.661, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865724042.141, "dur": 11.120, + "args": { + "External id": 83818,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6300865724056.501, "dur": 18.100, + "args": { + "External id": 83819,"Sequence number": 1771157, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5398 + } + }, + { + "ph": "s", "id": 45, "pid": 5714, "tid": 5714, "ts": 6300865724056.501, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865724078.181, "dur": 17.350, + "args": { + "External id": 83820,"Sequence number": 1771158, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5399 + } + }, + { + "ph": "s", "id": 44, "pid": 5714, "tid": 5714, "ts": 6300865724078.181, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6300865724096.781, "dur": 24.759, + "args": { + "External id": 83821,"Sequence number": 1771159, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5400 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865724098.151, "dur": 23.149, + "args": { + "External id": 83822,"Sequence number": 1771159, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865724099.001, "dur": 22.030, + "args": { + "External id": 83823,"Sequence number": 1771159, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5402 + } + }, + { + "ph": "s", "id": 43, "pid": 5714, "tid": 5714, "ts": 6300865724099.001, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865724102.771, "dur": 5.480, + "args": { + "External id": 83824,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865724109.171, "dur": 10.809, + "args": { + "External id": 83825,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5404 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300865724123.491, "dur": 16.300, + "args": { + "External id": 83826,"Sequence number": 1771160, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5405 + } + }, + { + "ph": "s", "id": 42, "pid": 5714, "tid": 5714, "ts": 6300865724123.491, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865724161.320, "dur": 59.900, + "args": { + "External id": 83827,"Sequence number": 1771161, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865724163.400, "dur": 7.780, + "args": { + "External id": 83828,"Sequence number": 1771161, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5407 + } + }, + { + "ph": "s", "id": 41, "pid": 5714, "tid": 5714, "ts": 6300865724163.400, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865724165.640, "dur": 4.051, + "args": { + "External id": 83829,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865724167.860, "dur": 1.480, + "args": { + "External 
id": 83830,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865724171.991, "dur": 48.869, + "args": { + "External id": 83831,"Sequence number": 1771162, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865724173.500, "dur": 6.271, + "args": { + "External id": 83832,"Sequence number": 1771162, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865724175.731, "dur": 3.769, + "args": { + "External id": 83833,"Sequence number": 1771162, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5412 + } + }, + { + "ph": "s", "id": 40, "pid": 5714, "tid": 5714, "ts": 6300865724175.731, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865724180.540, "dur": 34.650, + "args": { + "External id": 83834,"Sequence number": 1771163, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 
768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5413 + } + }, + { + "ph": "s", "id": 39, "pid": 5714, "tid": 5714, "ts": 6300865724180.540, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865724217.510, "dur": 2.460, + "args": { + "External id": 83835,"Sequence number": 1771164, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5414 + } + }, + { + "ph": "s", "id": 38, "pid": 5714, "tid": 5714, "ts": 6300865724217.510, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865724233.550, "dur": 52.520, + "args": { + "External id": 83836,"Sequence number": 1771165, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865724234.250, "dur": 8.030, + "args": { + "External id": 83837,"Sequence number": 1771165, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5416 + } + }, + { + "ph": "s", "id": 37, "pid": 5714, "tid": 5714, "ts": 6300865724234.250, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865724237.460, "dur": 3.760, + "args": { + "External id": 83838,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": 
[[2048, 768], [], []], "Ev Idx": 5417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865724240.050, "dur": 0.850, + "args": { + "External id": 83839,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865724243.100, "dur": 42.640, + "args": { + "External id": 83840,"Sequence number": 1771166, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6300865724244.550, "dur": 4.080, + "args": { + "External id": 83841,"Sequence number": 1771166, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865724246.510, "dur": 1.920, + "args": { + "External id": 83842,"Sequence number": 1771166, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5421 + } + }, + { + "ph": "s", "id": 36, "pid": 5714, "tid": 5714, "ts": 6300865724246.510, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865724249.320, "dur": 29.160, + "args": { + 
"External id": 83843,"Sequence number": 1771167, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5422 + } + }, + { + "ph": "s", "id": 35, "pid": 5714, "tid": 5714, "ts": 6300865724249.320, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865724280.670, "dur": 4.180, + "args": { + "External id": 83844,"Sequence number": 1771168, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5423 + } + }, + { + "ph": "s", "id": 34, "pid": 5714, "tid": 5714, "ts": 6300865724280.670, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5714, "tid": 5714, + "ts": 6300865724322.140, "dur": 134.450, + "args": { + "External id": 83845,"Sequence number": 1771169, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5424 + } + }, + { + "ph": "s", "id": 33, "pid": 5714, "tid": 5714, "ts": 6300865724322.140, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865724348.620, "dur": 9.370, + "args": { + "External id": 83846,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5425 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865724387.300, "dur": 54.950, + "args": { + "External id": 83847,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865724388.340, "dur": 7.060, + "args": { + "External id": 83848,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865724390.050, "dur": 3.950, + "args": { + "External id": 83849,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865724392.090, "dur": 1.480, + "args": { + "External id": 83850,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865724396.270, "dur": 45.570, + "args": { + "External id": 83851,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 
6300865724398.180, "dur": 5.030, + "args": { + "External id": 83852,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865724400.010, "dur": 3.010, + "args": { + "External id": 83853,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865724403.910, "dur": 33.010, + "args": { + "External id": 83854,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6300865724439.320, "dur": 1.330, + "args": { + "External id": 83855,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300865724463.180, "dur": 22.600, + "args": { + "External id": 83856,"Sequence number": 1771170, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5435 + } + }, + { + "ph": "s", "id": 32, "pid": 5714, "tid": 5714, "ts": 6300865724463.180, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::stack", "pid": 5714, "tid": 5714, + "ts": 6300865724502.000, "dur": 38.600, + "args": { + "External id": 83857,"Sequence number": 1771171, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "-2"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[[1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1]], []], "Input Dims": [[[8, 2048, 768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 768]], []], "Ev Idx": 5436 + } + }, + { + "ph": "s", "id": 31, "pid": 5714, "tid": 5714, "ts": 6300865724502.000, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::cat", "pid": 5714, "tid": 5714, + "ts": 6300865724508.380, "dur": 26.870, + "args": { + "External id": 83858,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[[1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1]], []], "Input Dims": [[[8, 2048, 768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 768]], []], "Ev Idx": 5437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865724537.550, "dur": 1.270, + "args": { + "External id": 83859,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 3072], []], "Ev Idx": 5438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6300865724569.290, "dur": 26.129, + "args": { + "External id": 83860,"Record function id": 0, "Ev Idx": 5439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/1", "pid": 5714, "tid": 5714, + "ts": 6300865724596.970, "dur": 216.629, + "args": { + "External id": 83861,"Record function id": 0, "Ev Idx": 5440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 
5714, + "ts": 6300865724672.589, "dur": 125.580, + "args": { + "External id": 83862,"Sequence number": 1771172, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "8", "2048", "4", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "c10::BFloat16"], "Input Strides": [[1], [], [], [], [6291456, 3072, 768, 1]], "Input Dims": [[768], [], [], [], [8, 2048, 4, 768]], "Ev Idx": 5441 + } + }, + { + "ph": "s", "id": 30, "pid": 5714, "tid": 5714, "ts": 6300865724672.589, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6300865724734.799, "dur": 29.270, + "args": { + "External id": 83863,"kernel_hash": "cwefpfej5pwum5b4hu7een5otcjqe4vo2l2suze5lxgbdcyqp62t", "grid": "grid(65536,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "65536", "768"], "kernel_file": "/tmp/torchinductor_root/we/cwefpfej5pwum5b4hu7een5otcjqe4vo2l2suze5lxgbdcyqp62t.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[8192, 4, 1, 1], [6291456, 3072, 768, 1], [1], [6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 1], [8, 2048, 4, 768], [768], [8, 2048, 4, 768], [], []], "Ev Idx": 5442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865724882.279, "dur": 38.630, + "args": { + "External id": 83864,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865724885.429, "dur": 9.030, + "args": { + "External id": 83865,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": 
["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865724896.929, "dur": 23.610, + "args": { + "External id": 83866,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865724899.279, "dur": 20.290, + "args": { + "External id": 83867,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865724928.699, "dur": 16.900, + "args": { + "External id": 83868,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865724929.989, "dur": 4.200, + "args": { + "External id": 83869,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865724934.939, "dur": 10.390, + "args": { + "External id": 83870,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865724935.859, "dur": 8.640, + 
"args": { + "External id": 83871,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865724949.619, "dur": 16.760, + "args": { + "External id": 83872,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865724950.619, "dur": 4.130, + "args": { + "External id": 83873,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865724955.459, "dur": 10.620, + "args": { + "External id": 83874,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865724957.399, "dur": 7.810, + "args": { + "External id": 83875,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865724976.689, "dur": 0.320, + "args": { + "External id": 83876,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input 
Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], [], []], "Ev Idx": 5455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5714, "tid": 5714, + "ts": 6300865724983.449, "dur": 8.720, + "args": { + "External id": 83877,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "5", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 5456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865724988.638, "dur": 1.711, + "args": { + "External id": 83878,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 5457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865724997.638, "dur": 5.851, + "args": { + "External id": 83879,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725001.009, "dur": 0.849, + "args": { + "External id": 83880,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 
6300865725004.809, "dur": 3.800, + "args": { + "External id": 83881,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725006.469, "dur": 1.480, + "args": { + "External id": 83882,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725009.898, "dur": 2.591, + "args": { + "External id": 83883,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "1", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725011.538, "dur": 0.411, + "args": { + "External id": 83884,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725016.709, "dur": 2.629, + "args": { + "External id": 83885,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", 
"9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 4], [], [], [], []], "Ev Idx": 5464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725018.478, "dur": 0.300, + "args": { + "External id": 83886,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 4], [], [], []], "Ev Idx": 5465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725020.369, "dur": 2.400, + "args": { + "External id": 83887,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 4], [], [], [], []], "Ev Idx": 5466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725021.809, "dur": 0.380, + "args": { + "External id": 83888,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 4], [], [], []], "Ev Idx": 5467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725023.778, "dur": 2.500, + "args": { + "External id": 83889,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 4], [], [], [], []], 
"Ev Idx": 5468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725025.529, "dur": 0.249, + "args": { + "External id": 83890,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 2048, 4], [], [], []], "Ev Idx": 5469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865725031.329, "dur": 7.880, + "args": { + "External id": 83891,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "2"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 2048, 4], [], []], "Ev Idx": 5470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725038.009, "dur": 0.400, + "args": { + "External id": 83892,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 2048, 4], [], [], []], "Ev Idx": 5471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725044.369, "dur": 2.969, + "args": { + "External id": 83893,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 5472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725046.338, "dur": 0.371, + "args": { + "External id": 83894,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 
2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865725049.969, "dur": 6.769, + "args": { + "External id": 83895,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 5474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725054.889, "dur": 0.429, + "args": { + "External id": 83896,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725057.929, "dur": 3.440, + "args": { + "External id": 83897,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 5476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725059.318, "dur": 1.520, + "args": { + "External id": 83898,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 5477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", 
"pid": 5714, "tid": 5714, + "ts": 6300865725066.458, "dur": 5.491, + "args": { + "External id": 83899,"Sequence number": 1771173, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5478 + } + }, + { + "ph": "s", "id": 29, "pid": 5714, "tid": 5714, "ts": 6300865725066.458, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725069.538, "dur": 0.531, + "args": { + "External id": 83900,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725073.069, "dur": 3.049, + "args": { + "External id": 83901,"Sequence number": 1771174, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5480 + } + }, + { + "ph": "s", "id": 28, "pid": 5714, "tid": 5714, "ts": 6300865725073.069, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725075.009, "dur": 0.369, + "args": { + "External id": 83902,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865725077.178, "dur": 5.050, + "args": { + "External id": 83903,"Sequence number": 1771175, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 5482 + } + }, + { + "ph": "s", "id": 27, "pid": 5714, "tid": 5714, "ts": 6300865725077.178, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725081.008, "dur": 0.470, + "args": { + "External id": 83904,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865725083.358, "dur": 4.560, + "args": { + "External id": 83905,"Sequence number": 1771176, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5484 + } + }, + { + "ph": "s", "id": 26, "pid": 5714, "tid": 5714, "ts": 6300865725083.358, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725086.388, "dur": 0.780, + "args": { + "External id": 83906,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 
1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865725092.638, "dur": 35.780, + "args": { + "External id": 83907,"Sequence number": 1771177, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865725094.018, "dur": 34.040, + "args": { + "External id": 83908,"Sequence number": 1771177, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865725096.128, "dur": 10.300, + "args": { + "External id": 83909,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 5488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865725098.008, "dur": 7.850, + "args": { + "External id": 83910,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865725107.588, "dur": 19.940, + "args": { + "External id": 83911,"Record 
function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 5490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865725159.768, "dur": 4.950, + "args": { + "External id": 83912,"Sequence number": 1771177, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5491 + } + }, + { + "ph": "s", "id": 25, "pid": 5714, "tid": 5714, "ts": 6300865725159.768, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865725167.378, "dur": 1.190, + "args": { + "External id": 83913,"Sequence number": 1771178, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6300865725197.448, "dur": 213702.295, + "args": { + "External id": 83914,"Sequence number": 1771178, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 5493 + } + }, + { + "ph": "s", "id": 24, "pid": 5714, "tid": 5714, "ts": 6300865725197.448, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865725218.618, "dur": 48.590, + "args": { + 
"External id": 83915,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865725219.808, "dur": 47.060, + "args": { + "External id": 83916,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865725222.048, "dur": 17.850, + "args": { + "External id": 83917,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865725226.428, "dur": 12.180, + "args": { + "External id": 83918,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865725241.068, "dur": 25.100, + "args": { + "External id": 83919,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 5498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865725286.928, "dur": 41.290, + "args": { + "External id": 83920,"Record function 
id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865725288.278, "dur": 19.900, + "args": { + "External id": 83921,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725291.788, "dur": 15.780, + "args": { + "External id": 83922,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865725309.248, "dur": 18.690, + "args": { + "External id": 83923,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865725310.868, "dur": 16.220, + "args": { + "External id": 83924,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865725333.328, "dur": 19.490, + "args": { + "External id": 83925,"Record function id": 0, 
"Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865725334.208, "dur": 6.930, + "args": { + "External id": 83926,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725335.758, "dur": 5.010, + "args": { + "External id": 83927,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865725341.738, "dur": 10.850, + "args": { + "External id": 83928,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865725342.628, "dur": 9.060, + "args": { + "External id": 83929,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 5508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865725359.128, "dur": 18.820, + "args": { + "External id": 83930,"Record function id": 0, "Concrete Inputs": ["[16384]", 
"6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865725361.998, "dur": 4.760, + "args": { + "External id": 83931,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865725367.538, "dur": 10.150, + "args": { + "External id": 83932,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 5511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865725368.548, "dur": 8.190, + "args": { + "External id": 83933,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6300865725383.118, "dur": 26.830, + "args": { + "External id": 83934,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865725414.568, "dur": 55.420, + "args": { + "External id": 83935,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865725418.318, "dur": 
51.130, + "args": { + "External id": 83936,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725426.728, "dur": 1.160, + "args": { + "External id": 83937,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865725428.968, "dur": 23.249, + "args": { + "External id": 83938,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865725430.068, "dur": 21.920, + "args": { + "External id": 83939,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 5518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865725432.217, "dur": 4.351, + "args": { + "External id": 83940,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 
6300865725437.617, "dur": 13.980, + "args": { + "External id": 83941,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 5520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6300865725477.008, "dur": 208248.086, + "args": { + "External id": 83942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6300865725478.808, "dur": 208243.496, + "args": { + "External id": 83943,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865933745.894, "dur": 12.290, + "args": { + "External id": 83944,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865933754.324, "dur": 1.660, + "args": { + "External id": 83945,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865933764.734, "dur": 72.170, + "args": { + "External id": 83946,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865933765.984, "dur": 9.100, + "args": { + "External id": 83947,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865933768.134, "dur": 6.100, + "args": { + "External id": 83948,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865933771.534, "dur": 2.300, + "args": { + "External id": 83949,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865933776.684, "dur": 59.220, + "args": { + "External id": 83950,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865933778.274, "dur": 56.460, + "args": { + "External id": 83951,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5530 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865933842.224, "dur": 4.840, + "args": { + "External id": 83952,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865933845.084, "dur": 0.790, + "args": { + "External id": 83953,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865933857.514, "dur": 3.930, + "args": { + "External id": 83954,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865933871.474, "dur": 10.200, + "args": { + "External id": 83955,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865933874.364, "dur": 6.940, + "args": { + "External id": 83956,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], 
[]], "Ev Idx": 5535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865934002.324, "dur": 253.519, + "args": { + "External id": 83957,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865934006.694, "dur": 5.089, + "args": { + "External id": 83958,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865934015.714, "dur": 239.439, + "args": { + "External id": 83959,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865934018.514, "dur": 0.689, + "args": { + "External id": 83960,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865934022.423, "dur": 31.731, + "args": { + "External id": 83961,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865934056.714, 
"dur": 4.680, + "args": { + "External id": 83962,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934059.743, "dur": 1.060, + "args": { + "External id": 83963,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865934063.043, "dur": 32.920, + "args": { + "External id": 83964,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865934064.694, "dur": 4.040, + "args": { + "External id": 83965,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865934070.783, "dur": 24.840, + "args": { + "External id": 83966,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865934076.163, "dur": 5.500, + "args": { + "External id": 83967,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev 
Idx": 5546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865934099.403, "dur": 24.460, + "args": { + "External id": 83968,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865934139.113, "dur": 19.300, + "args": { + "External id": 83969,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865934162.373, "dur": 17.210, + "args": { + "External id": 83970,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865934181.753, "dur": 12.990, + "args": { + "External id": 83971,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865934197.713, "dur": 28.940, + "args": { + "External id": 83972,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865934200.873, "dur": 3.140, + "args": { + "External id": 83973,"Record function id": 0, "Concrete 
Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934210.523, "dur": 0.970, + "args": { + "External id": 83974,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865934230.973, "dur": 13.520, + "args": { + "External id": 83975,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865934245.613, "dur": 8.080, + "args": { + "External id": 83976,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865934266.323, "dur": 3.320, + "args": { + "External id": 83977,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865934278.433, "dur": 4.590, + "args": { + "External id": 83978,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934281.403, "dur": 0.630, + "args": { + "External id": 83979,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865934379.113, "dur": 66.329, + "args": { + "External id": 83980,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865934453.902, "dur": 8.431, + "args": { + "External id": 83981,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934459.253, "dur": 1.200, + "args": { + "External id": 83982,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865934463.762, "dur": 27.131, + "args": { + "External id": 83983,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5562 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865934499.193, "dur": 5.820, + "args": { + "External id": 83984,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865934500.682, "dur": 3.420, + "args": { + "External id": 83985,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934502.633, "dur": 1.040, + "args": { + "External id": 83986,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865934508.733, "dur": 38.659, + "args": { + "External id": 83987,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865934509.693, "dur": 36.749, + "args": { + "External id": 83988,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865934555.102, "dur": 18.350, + "args": { + "External id": 
83989,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865934581.962, "dur": 5.400, + "args": { + "External id": 83990,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934585.052, "dur": 0.920, + "args": { + "External id": 83991,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865934591.612, "dur": 43.200, + "args": { + "External id": 83992,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865934592.542, "dur": 6.770, + "args": { + "External id": 83993,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865934593.792, "dur": 4.900, + "args": { + "External id": 83994,"Record function id": 0, "Concrete Inputs": 
["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934596.712, "dur": 1.650, + "args": { + "External id": 83995,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865934600.152, "dur": 34.090, + "args": { + "External id": 83996,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865934601.002, "dur": 32.410, + "args": { + "External id": 83997,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865934641.362, "dur": 5.650, + "args": { + "External id": 83998,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934644.202, "dur": 1.560, + "args": { + "External id": 83999,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865934654.792, "dur": 2.020, + "args": { + "External id": 84000,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865934664.972, "dur": 8.430, + "args": { + "External id": 84001,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865934666.632, "dur": 6.450, + "args": { + "External id": 84002,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865934759.112, "dur": 165.569, + "args": { + "External id": 84003,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865934761.522, "dur": 4.780, + "args": { + "External id": 84004,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": 
["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865934767.992, "dur": 156.160, + "args": { + "External id": 84005,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865934769.332, "dur": 0.220, + "args": { + "External id": 84006,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865934771.882, "dur": 22.790, + "args": { + "External id": 84007,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865934797.772, "dur": 3.680, + "args": { + "External id": 84008,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934800.012, "dur": 0.960, + "args": { + "External id": 84009,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5588 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865934802.252, "dur": 22.290, + "args": { + "External id": 84010,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865934803.272, "dur": 3.950, + "args": { + "External id": 84011,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865934808.312, "dur": 15.820, + "args": { + "External id": 84012,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865934811.382, "dur": 3.530, + "args": { + "External id": 84013,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865934825.832, "dur": 19.680, + "args": { + "External id": 84014,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865934847.442, "dur": 10.060, + "args": { + "External id": 84015,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], 
"Input Dims": [[2048], [2048], []], "Ev Idx": 5594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865934860.222, "dur": 11.770, + "args": { + "External id": 84016,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865934873.322, "dur": 8.420, + "args": { + "External id": 84017,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865934884.652, "dur": 18.909, + "args": { + "External id": 84018,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865934886.721, "dur": 2.720, + "args": { + "External id": 84019,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934891.681, "dur": 0.860, + "args": { + "External id": 84020,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865934905.281, 
"dur": 8.511, + "args": { + "External id": 84021,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865934914.872, "dur": 7.780, + "args": { + "External id": 84022,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865934933.272, "dur": 3.129, + "args": { + "External id": 84023,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865934946.741, "dur": 4.391, + "args": { + "External id": 84024,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865934949.372, "dur": 0.680, + "args": { + "External id": 84025,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865935024.691, "dur": 45.820, + "args": { + "External id": 84026,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input 
Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935078.211, "dur": 8.160, + "args": { + "External id": 84027,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935082.301, "dur": 2.160, + "args": { + "External id": 84028,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865935087.621, "dur": 20.520, + "args": { + "External id": 84029,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865935115.341, "dur": 7.390, + "args": { + "External id": 84030,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865935116.931, "dur": 5.000, + "args": { + "External id": 84031,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 
5610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935120.631, "dur": 0.940, + "args": { + "External id": 84032,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865935125.261, "dur": 34.590, + "args": { + "External id": 84033,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865935126.271, "dur": 32.770, + "args": { + "External id": 84034,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865935164.751, "dur": 14.730, + "args": { + "External id": 84035,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935187.601, "dur": 5.580, + "args": { + "External id": 84036,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5615 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935190.831, "dur": 0.910, + "args": { + "External id": 84037,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865935197.201, "dur": 41.210, + "args": { + "External id": 84038,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865935199.361, "dur": 4.030, + "args": { + "External id": 84039,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865935200.431, "dur": 2.430, + "args": { + "External id": 84040,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935201.951, "dur": 0.580, + "args": { + "External id": 84041,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, 
"tid": 5714, + "ts": 6300865935204.251, "dur": 33.600, + "args": { + "External id": 84042,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865935205.191, "dur": 31.900, + "args": { + "External id": 84043,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935244.801, "dur": 4.230, + "args": { + "External id": 84044,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935247.211, "dur": 0.600, + "args": { + "External id": 84045,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865935256.401, "dur": 1.800, + "args": { + "External id": 84046,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865935266.271, "dur": 9.930, + "args": { + "External id": 
84047,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865935267.781, "dur": 8.090, + "args": { + "External id": 84048,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865935370.000, "dur": 161.780, + "args": { + "External id": 84049,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865935372.440, "dur": 5.000, + "args": { + "External id": 84050,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865935379.060, "dur": 152.160, + "args": { + "External id": 84051,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865935380.440, 
"dur": 0.220, + "args": { + "External id": 84052,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865935381.860, "dur": 22.111, + "args": { + "External id": 84053,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865935408.220, "dur": 3.991, + "args": { + "External id": 84054,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935410.731, "dur": 1.000, + "args": { + "External id": 84055,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865935413.011, "dur": 21.759, + "args": { + "External id": 84056,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865935414.031, "dur": 2.820, + "args": { + "External id": 84057,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 5636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865935417.891, "dur": 16.489, + "args": { + "External id": 84058,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865935422.120, "dur": 3.420, + "args": { + "External id": 84059,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865935436.030, "dur": 17.250, + "args": { + "External id": 84060,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865935454.870, "dur": 10.230, + "args": { + "External id": 84061,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865935467.850, "dur": 12.190, + "args": { + "External id": 84062,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865935481.290, "dur": 8.470, + "args": { + "External id": 84063,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input 
Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865935492.480, "dur": 18.610, + "args": { + "External id": 84064,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865935494.480, "dur": 2.490, + "args": { + "External id": 84065,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935499.210, "dur": 0.730, + "args": { + "External id": 84066,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865935512.820, "dur": 8.100, + "args": { + "External id": 84067,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865935522.030, "dur": 7.820, + "args": { + "External id": 84068,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865935540.530, "dur": 
2.950, + "args": { + "External id": 84069,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935554.630, "dur": 4.390, + "args": { + "External id": 84070,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935557.340, "dur": 0.620, + "args": { + "External id": 84071,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865935630.450, "dur": 45.060, + "args": { + "External id": 84072,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935683.210, "dur": 6.760, + "args": { + "External id": 84073,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935687.040, "dur": 1.130, + "args": { + "External id": 84074,"Record function 
id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865935691.200, "dur": 20.170, + "args": { + "External id": 84075,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865935718.730, "dur": 7.670, + "args": { + "External id": 84076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865935720.140, "dur": 5.350, + "args": { + "External id": 84077,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935724.060, "dur": 1.050, + "args": { + "External id": 84078,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865935728.810, "dur": 33.640, + "args": { + "External id": 84079,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865935729.850, "dur": 31.630, + "args": { + "External id": 84080,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865935767.379, "dur": 14.651, + "args": { + "External id": 84081,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935789.799, "dur": 5.671, + "args": { + "External id": 84082,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935792.939, "dur": 1.000, + "args": { + "External id": 84083,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865935800.710, "dur": 41.129, + "args": { + "External id": 84084,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865935801.530, "dur": 5.860, + "args": { + "External id": 84085,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865935802.810, "dur": 4.069, + "args": { + "External id": 84086,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935804.399, "dur": 2.160, + "args": { + "External id": 84087,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865935808.199, "dur": 33.100, + "args": { + "External id": 84088,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865935809.110, "dur": 31.409, + "args": { + "External id": 84089,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5668 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865935848.670, "dur": 4.309, + "args": { + "External id": 84090,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865935851.090, "dur": 0.680, + "args": { + "External id": 84091,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865935860.379, "dur": 1.771, + "args": { + "External id": 84092,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865935869.059, "dur": 8.490, + "args": { + "External id": 84093,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865935870.679, "dur": 6.480, + "args": { + "External id": 84094,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": 
[[], [], [], [], [], []], "Ev Idx": 5673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865935960.889, "dur": 166.910, + "args": { + "External id": 84095,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865935963.149, "dur": 6.050, + "args": { + "External id": 84096,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865935970.819, "dur": 156.440, + "args": { + "External id": 84097,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865935972.039, "dur": 0.220, + "args": { + "External id": 84098,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865935973.269, "dur": 22.010, + "args": { + "External id": 84099,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 
6300865935998.819, "dur": 4.890, + "args": { + "External id": 84100,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936002.339, "dur": 0.910, + "args": { + "External id": 84101,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865936004.459, "dur": 24.030, + "args": { + "External id": 84102,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865936006.889, "dur": 3.080, + "args": { + "External id": 84103,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865936010.989, "dur": 17.130, + "args": { + "External id": 84104,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865936014.169, "dur": 4.980, + "args": { + "External id": 84105,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": 
[[0], [], []], "Ev Idx": 5684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865936029.729, "dur": 18.140, + "args": { + "External id": 84106,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865936049.399, "dur": 9.980, + "args": { + "External id": 84107,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865936062.479, "dur": 11.810, + "args": { + "External id": 84108,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865936075.539, "dur": 8.610, + "args": { + "External id": 84109,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865936085.929, "dur": 21.610, + "args": { + "External id": 84110,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865936089.249, "dur": 2.890, + "args": { + "External id": 84111,"Record function id": 0, 
"Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936095.799, "dur": 0.860, + "args": { + "External id": 84112,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865936109.259, "dur": 8.030, + "args": { + "External id": 84113,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865936118.369, "dur": 7.540, + "args": { + "External id": 84114,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865936136.459, "dur": 3.200, + "args": { + "External id": 84115,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865936149.739, "dur": 4.410, + "args": { + "External id": 84116,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5695 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936152.519, "dur": 0.630, + "args": { + "External id": 84117,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865936224.338, "dur": 44.811, + "args": { + "External id": 84118,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865936276.858, "dur": 8.140, + "args": { + "External id": 84119,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936282.049, "dur": 1.169, + "args": { + "External id": 84120,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865936286.218, "dur": 30.890, + "args": { + "External id": 84121,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5700 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865936325.308, "dur": 7.640, + "args": { + "External id": 84122,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865936326.828, "dur": 5.170, + "args": { + "External id": 84123,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936329.318, "dur": 2.270, + "args": { + "External id": 84124,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865936335.488, "dur": 35.070, + "args": { + "External id": 84125,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865936336.458, "dur": 33.110, + "args": { + "External id": 84126,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865936376.888, "dur": 15.110, + "args": { + 
"External id": 84127,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865936400.268, "dur": 5.610, + "args": { + "External id": 84128,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936403.488, "dur": 0.970, + "args": { + "External id": 84129,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865936409.848, "dur": 40.610, + "args": { + "External id": 84130,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865936410.698, "dur": 5.240, + "args": { + "External id": 84131,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865936411.868, "dur": 3.480, + "args": { + "External id": 84132,"Record function id": 0, 
"Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936414.438, "dur": 0.480, + "args": { + "External id": 84133,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865936416.778, "dur": 33.110, + "args": { + "External id": 84134,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865936417.698, "dur": 31.420, + "args": { + "External id": 84135,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865936456.708, "dur": 4.710, + "args": { + "External id": 84136,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936459.418, "dur": 0.700, + "args": { + "External id": 84137,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input 
type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865936469.008, "dur": 1.780, + "args": { + "External id": 84138,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865936477.518, "dur": 10.650, + "args": { + "External id": 84139,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865936480.368, "dur": 7.470, + "args": { + "External id": 84140,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865936570.228, "dur": 163.840, + "args": { + "External id": 84141,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865936572.428, "dur": 4.780, + "args": { + "External id": 84142,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], 
"Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865936579.998, "dur": 153.510, + "args": { + "External id": 84143,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865936581.378, "dur": 0.190, + "args": { + "External id": 84144,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865936582.868, "dur": 21.740, + "args": { + "External id": 84145,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865936606.208, "dur": 4.990, + "args": { + "External id": 84146,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936608.238, "dur": 2.470, + "args": { + "External id": 84147,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5726 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865936612.008, "dur": 22.440, + "args": { + "External id": 84148,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865936613.078, "dur": 3.030, + "args": { + "External id": 84149,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865936618.668, "dur": 15.420, + "args": { + "External id": 84150,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865936621.558, "dur": 3.570, + "args": { + "External id": 84151,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865936635.678, "dur": 17.339, + "args": { + "External id": 84152,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865936655.837, "dur": 9.960, + "args": { + "External id": 84153,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], 
[1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865936668.668, "dur": 12.200, + "args": { + "External id": 84154,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865936682.177, "dur": 8.480, + "args": { + "External id": 84155,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865936692.468, "dur": 19.569, + "args": { + "External id": 84156,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865936695.377, "dur": 2.551, + "args": { + "External id": 84157,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936700.068, "dur": 0.840, + "args": { + "External id": 84158,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 
6300865936714.877, "dur": 8.491, + "args": { + "External id": 84159,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865936724.508, "dur": 7.909, + "args": { + "External id": 84160,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865936742.817, "dur": 3.040, + "args": { + "External id": 84161,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865936755.867, "dur": 4.730, + "args": { + "External id": 84162,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936758.927, "dur": 0.630, + "args": { + "External id": 84163,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865936831.457, "dur": 44.890, + "args": { + "External id": 84164,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 
1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865936883.907, "dur": 8.480, + "args": { + "External id": 84165,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936889.447, "dur": 1.150, + "args": { + "External id": 84166,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865936893.657, "dur": 20.340, + "args": { + "External id": 84167,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865936921.507, "dur": 6.770, + "args": { + "External id": 84168,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865936922.977, "dur": 4.390, + "args": { + "External id": 84169,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 
32000], [], []], "Ev Idx": 5748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865936926.047, "dur": 0.990, + "args": { + "External id": 84170,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865936930.847, "dur": 45.770, + "args": { + "External id": 84171,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865936931.737, "dur": 44.020, + "args": { + "External id": 84172,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865936982.607, "dur": 14.920, + "args": { + "External id": 84173,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937005.557, "dur": 5.450, + "args": { + "External id": 84174,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev 
Idx": 5753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937008.617, "dur": 0.980, + "args": { + "External id": 84175,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865937014.957, "dur": 41.580, + "args": { + "External id": 84176,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865937015.777, "dur": 6.270, + "args": { + "External id": 84177,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865937016.907, "dur": 4.610, + "args": { + "External id": 84178,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937019.417, "dur": 1.800, + "args": { + "External id": 84179,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865937022.837, "dur": 33.120, + "args": { + "External id": 84180,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865937023.707, "dur": 31.570, + "args": { + "External id": 84181,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937062.417, "dur": 4.620, + "args": { + "External id": 84182,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937065.107, "dur": 0.700, + "args": { + "External id": 84183,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865937074.277, "dur": 1.760, + "args": { + "External id": 84184,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865937083.747, "dur": 
8.689, + "args": { + "External id": 84185,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865937085.407, "dur": 6.660, + "args": { + "External id": 84186,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865937174.256, "dur": 176.210, + "args": { + "External id": 84187,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865937176.487, "dur": 5.909, + "args": { + "External id": 84188,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865937183.936, "dur": 165.900, + "args": { + "External id": 84189,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 
5714, + "ts": 6300865937185.156, "dur": 0.210, + "args": { + "External id": 84190,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865937186.396, "dur": 22.100, + "args": { + "External id": 84191,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865937211.646, "dur": 4.790, + "args": { + "External id": 84192,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937214.966, "dur": 1.020, + "args": { + "External id": 84193,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865937217.326, "dur": 22.050, + "args": { + "External id": 84194,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865937218.506, "dur": 2.950, + "args": { + "External id": 84195,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": 
[[], [], [], [], [], []], "Ev Idx": 5774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865937222.606, "dur": 16.430, + "args": { + "External id": 84196,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865937226.076, "dur": 3.700, + "args": { + "External id": 84197,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865937240.606, "dur": 17.910, + "args": { + "External id": 84198,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865937260.236, "dur": 9.820, + "args": { + "External id": 84199,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865937272.956, "dur": 11.650, + "args": { + "External id": 84200,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865937285.836, "dur": 8.420, + "args": { + "External id": 84201,"Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865937306.686, "dur": 22.590, + "args": { + "External id": 84202,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865937310.446, "dur": 2.950, + "args": { + "External id": 84203,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937315.686, "dur": 0.940, + "args": { + "External id": 84204,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865937331.246, "dur": 8.310, + "args": { + "External id": 84205,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865937340.726, "dur": 7.800, + "args": { + "External id": 84206,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + 
"ts": 6300865937359.236, "dur": 3.040, + "args": { + "External id": 84207,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937372.826, "dur": 5.020, + "args": { + "External id": 84208,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937375.976, "dur": 0.680, + "args": { + "External id": 84209,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865937449.096, "dur": 45.550, + "args": { + "External id": 84210,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937502.136, "dur": 7.410, + "args": { + "External id": 84211,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937506.566, "dur": 1.110, + "args": { + 
"External id": 84212,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865937510.766, "dur": 19.780, + "args": { + "External id": 84213,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865937537.726, "dur": 8.109, + "args": { + "External id": 84214,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865937539.215, "dur": 5.791, + "args": { + "External id": 84215,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937542.435, "dur": 2.160, + "args": { + "External id": 84216,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865937548.355, "dur": 33.560, + "args": { + "External id": 84217,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865937549.335, "dur": 31.680, + "args": { + "External id": 84218,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865937599.155, "dur": 14.431, + "args": { + "External id": 84219,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937621.575, "dur": 5.800, + "args": { + "External id": 84220,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937625.065, "dur": 0.890, + "args": { + "External id": 84221,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865937631.375, "dur": 39.800, + "args": { + "External id": 84222,"Record function id": 0, "Concrete 
Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865937633.375, "dur": 4.100, + "args": { + "External id": 84223,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865937634.555, "dur": 2.390, + "args": { + "External id": 84224,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937636.085, "dur": 0.560, + "args": { + "External id": 84225,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865937638.275, "dur": 32.340, + "args": { + "External id": 84226,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865937639.105, "dur": 30.810, + "args": { + "External id": 84227,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 
768], [768, 32000]], "Ev Idx": 5806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937677.485, "dur": 5.110, + "args": { + "External id": 84228,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937679.765, "dur": 1.530, + "args": { + "External id": 84229,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865937689.925, "dur": 1.840, + "args": { + "External id": 84230,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865937699.345, "dur": 8.110, + "args": { + "External id": 84231,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865937701.025, "dur": 6.090, + "args": { + "External id": 84232,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865937789.415, "dur": 162.760, + "args": { + "External id": 84233,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865937791.855, "dur": 5.080, + "args": { + "External id": 84234,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865937799.835, "dur": 151.870, + "args": { + "External id": 84235,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865937801.255, "dur": 0.190, + "args": { + "External id": 84236,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865937804.085, "dur": 21.600, + "args": { + "External id": 84237,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865937827.295, "dur": 4.290, + "args": { + "External id": 84238,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937829.275, "dur": 1.830, + "args": { + "External id": 84239,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865937832.395, "dur": 21.020, + "args": { + "External id": 84240,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865937833.395, "dur": 3.090, + "args": { + "External id": 84241,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865937837.485, "dur": 15.610, + "args": { + "External id": 84242,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865937840.575, "dur": 3.460, + "args": { + "External id": 84243,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], 
"Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865937854.645, "dur": 18.590, + "args": { + "External id": 84244,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865937874.905, "dur": 9.980, + "args": { + "External id": 84245,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865937887.695, "dur": 11.270, + "args": { + "External id": 84246,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865937900.325, "dur": 8.570, + "args": { + "External id": 84247,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865937911.795, "dur": 20.070, + "args": { + "External id": 84248,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865937913.995, "dur": 2.530, + "args": { + 
"External id": 84249,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937918.595, "dur": 2.160, + "args": { + "External id": 84250,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865937933.615, "dur": 8.170, + "args": { + "External id": 84251,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865937942.915, "dur": 7.750, + "args": { + "External id": 84252,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865937962.475, "dur": 2.970, + "args": { + "External id": 84253,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865937976.034, "dur": 4.500, + "args": { + "External id": 84254,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 
5833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865937978.765, "dur": 0.720, + "args": { + "External id": 84255,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865938050.414, "dur": 44.860, + "args": { + "External id": 84256,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938102.984, "dur": 7.220, + "args": { + "External id": 84257,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938107.244, "dur": 1.100, + "args": { + "External id": 84258,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865938111.454, "dur": 19.980, + "args": { + "External id": 84259,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], 
[2048, 768], []], "Ev Idx": 5838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865938138.654, "dur": 6.540, + "args": { + "External id": 84260,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865938140.154, "dur": 4.230, + "args": { + "External id": 84261,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938143.194, "dur": 0.860, + "args": { + "External id": 84262,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865938147.604, "dur": 33.580, + "args": { + "External id": 84263,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865938148.564, "dur": 31.660, + "args": { + "External id": 84264,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + 
"ts": 6300865938186.074, "dur": 14.550, + "args": { + "External id": 84265,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938208.834, "dur": 5.360, + "args": { + "External id": 84266,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938212.014, "dur": 0.850, + "args": { + "External id": 84267,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865938219.604, "dur": 41.010, + "args": { + "External id": 84268,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865938220.414, "dur": 5.870, + "args": { + "External id": 84269,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865938221.634, "dur": 4.050, + 
"args": { + "External id": 84270,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938224.704, "dur": 0.660, + "args": { + "External id": 84271,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865938227.134, "dur": 32.930, + "args": { + "External id": 84272,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865938227.964, "dur": 31.280, + "args": { + "External id": 84273,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938267.744, "dur": 4.250, + "args": { + "External id": 84274,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938270.314, "dur": 0.520, + "args": { + "External id": 84275,"Record function id": 0, 
"Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865938279.794, "dur": 1.770, + "args": { + "External id": 84276,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865938288.254, "dur": 16.730, + "args": { + "External id": 84277,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865938289.814, "dur": 6.470, + "args": { + "External id": 84278,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865938389.814, "dur": 161.279, + "args": { + "External id": 84279,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865938392.244, "dur": 6.630, + "args": { + "External id": 84280,"Record function 
id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865938400.334, "dur": 150.239, + "args": { + "External id": 84281,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865938402.634, "dur": 0.300, + "args": { + "External id": 84282,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865938404.234, "dur": 22.499, + "args": { + "External id": 84283,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865938428.453, "dur": 3.680, + "args": { + "External id": 84284,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938430.653, "dur": 1.031, + "args": { + "External id": 84285,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": 
[[2048, 1], [], [], []], "Ev Idx": 5864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865938432.884, "dur": 23.669, + "args": { + "External id": 84286,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865938435.393, "dur": 2.811, + "args": { + "External id": 84287,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865938439.204, "dur": 17.020, + "args": { + "External id": 84288,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865938443.593, "dur": 3.711, + "args": { + "External id": 84289,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865938457.784, "dur": 16.889, + "args": { + "External id": 84290,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865938476.324, "dur": 9.840, + "args": { + "External id": 84291,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input 
type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865938488.944, "dur": 11.409, + "args": { + "External id": 84292,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865938501.624, "dur": 8.319, + "args": { + "External id": 84293,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865938511.613, "dur": 18.440, + "args": { + "External id": 84294,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865938513.593, "dur": 2.650, + "args": { + "External id": 84295,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938518.473, "dur": 0.730, + "args": { + "External id": 84296,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5875 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865938532.883, "dur": 7.890, + "args": { + "External id": 84297,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865938541.843, "dur": 7.500, + "args": { + "External id": 84298,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865938560.133, "dur": 3.000, + "args": { + "External id": 84299,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938573.033, "dur": 4.480, + "args": { + "External id": 84300,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938575.813, "dur": 0.690, + "args": { + "External id": 84301,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865938647.723, "dur": 44.920, + "args": { + "External id": 84302,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938700.163, "dur": 8.210, + "args": { + "External id": 84303,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938704.213, "dur": 2.290, + "args": { + "External id": 84304,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865938710.843, "dur": 21.210, + "args": { + "External id": 84305,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865938739.293, "dur": 5.720, + "args": { + "External id": 84306,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865938740.913, "dur": 3.230, + "args": { + "External id": 84307,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], 
"Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938742.853, "dur": 0.930, + "args": { + "External id": 84308,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865938747.423, "dur": 33.880, + "args": { + "External id": 84309,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865938748.373, "dur": 32.060, + "args": { + "External id": 84310,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865938786.203, "dur": 14.440, + "args": { + "External id": 84311,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865938807.523, "dur": 26.860, + "args": { + "External id": 84312,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5891 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865938810.253, "dur": 23.650, + "args": { + "External id": 84313,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938816.623, "dur": 1.920, + "args": { + "External id": 84314,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865938841.073, "dur": 25.259, + "args": { + "External id": 84315,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865938842.273, "dur": 23.759, + "args": { + "External id": 84316,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 5895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938846.092, "dur": 5.920, + "args": { + "External id": 84317,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 5896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865938853.023, "dur": 12.469, + "args": { + "External id": 84318,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865938875.852, "dur": 5.131, + "args": { + "External id": 84319,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865938878.332, "dur": 2.320, + "args": { + "External id": 84320,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865938881.983, "dur": 1.109, + "args": { + "External id": 84321,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865938882.392, "dur": 0.531, + "args": { + "External id": 84322,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865938916.972, "dur": 25.171, + "args": { + "External id": 84323,"Sequence number": 1771179, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", 
"1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 5902 + } + }, + { + "ph": "s", "id": 23, "pid": 5714, "tid": 5714, "ts": 6300865938916.972, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938950.032, "dur": 7.490, + "args": { + "External id": 84324,"Sequence number": 1771180, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 5903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938954.252, "dur": 1.480, + "args": { + "External id": 84325,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865938959.912, "dur": 6.220, + "args": { + "External id": 84326,"Sequence number": 1771180, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "1"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 5905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938964.632, "dur": 0.410, + "args": { + "External id": 84327,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "2"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5906 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938967.572, "dur": 2.280, + "args": { + "External id": 84328,"Sequence number": 1771180, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 5907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938969.082, "dur": 0.250, + "args": { + "External id": 84329,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "2"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 5908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938975.492, "dur": 4.560, + "args": { + "External id": 84330,"Sequence number": 1771180, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5909 + } + }, + { + "ph": "s", "id": 22, "pid": 5714, "tid": 5714, "ts": 6300865938975.492, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938978.132, "dur": 0.580, + "args": { + "External id": 84331,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5910 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938982.502, "dur": 2.960, + "args": { + "External id": 84332,"Sequence number": 1771181, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5911 + } + }, + { + "ph": "s", "id": 21, "pid": 5714, "tid": 5714, "ts": 6300865938982.502, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938984.382, "dur": 0.330, + "args": { + "External id": 84333,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865938986.492, "dur": 5.110, + "args": { + "External id": 84334,"Sequence number": 1771182, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 5913 + } + }, + { + "ph": "s", "id": 20, "pid": 5714, "tid": 5714, "ts": 6300865938986.492, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938989.352, "dur": 1.490, + "args": { + "External id": 84335,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 
768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865938992.782, "dur": 4.860, + "args": { + "External id": 84336,"Sequence number": 1771183, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5915 + } + }, + { + "ph": "s", "id": 19, "pid": 5714, "tid": 5714, "ts": 6300865938992.782, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865938995.112, "dur": 1.770, + "args": { + "External id": 84337,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865939001.382, "dur": 46.100, + "args": { + "External id": 84338,"Sequence number": 1771184, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865939003.722, "dur": 41.880, + "args": { + "External id": 84339,"Sequence number": 1771184, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865939006.122, "dur": 9.920, + "args": { + "External id": 84340,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 5919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865939008.462, "dur": 6.940, + "args": { + "External id": 84341,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865939017.022, "dur": 26.370, + "args": { + "External id": 84342,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 5921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865939076.882, "dur": 5.200, + "args": { + "External id": 84343,"Sequence number": 1771184, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5922 + } + }, + { + "ph": "s", "id": 18, "pid": 5714, "tid": 5714, "ts": 6300865939076.882, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865939084.702, "dur": 2.040, + "args": { + "External id": 84344,"Sequence number": 1771185, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": 
["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6300865939112.472, "dur": 21378.231, + "args": { + "External id": 84345,"Sequence number": 1771185, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 5924 + } + }, + { + "ph": "s", "id": 17, "pid": 5714, "tid": 5714, "ts": 6300865939112.472, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865939125.822, "dur": 31.480, + "args": { + "External id": 84346,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865939126.562, "dur": 30.470, + "args": { + "External id": 84347,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865939128.072, "dur": 8.560, + "args": { + "External id": 84348,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5927 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865939129.682, "dur": 6.300, + "args": { + "External id": 84349,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865939137.562, "dur": 18.950, + "args": { + "External id": 84350,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 5929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865939173.292, "dur": 26.390, + "args": { + "External id": 84351,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865939174.342, "dur": 8.120, + "args": { + "External id": 84352,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865939176.342, "dur": 5.670, + "args": { + "External id": 84353,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865939183.452, "dur": 15.960, + "args": { + "External id": 84354,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865939184.932, "dur": 13.570, + "args": { + "External id": 84355,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865939203.992, "dur": 20.810, + "args": { + "External id": 84356,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865939204.732, "dur": 7.550, + "args": { + "External id": 84357,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865939207.532, "dur": 4.390, + "args": { + "External id": 84358,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": 
[[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865939212.852, "dur": 11.740, + "args": { + "External id": 84359,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865939213.752, "dur": 10.060, + "args": { + "External id": 84360,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 5939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865939230.602, "dur": 19.680, + "args": { + "External id": 84361,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865939231.972, "dur": 7.710, + "args": { + "External id": 84362,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865939240.422, "dur": 9.580, + "args": { + "External id": 84363,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 5942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865939241.302, "dur": 7.890, + "args": { 
+ "External id": 84364,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6300865939255.272, "dur": 21.110, + "args": { + "External id": 84365,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865939279.342, "dur": 60.160, + "args": { + "External id": 84366,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865939282.202, "dur": 56.740, + "args": { + "External id": 84367,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865939288.051, "dur": 1.031, + "args": { + "External id": 84368,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865939290.211, "dur": 35.120, + "args": { + "External id": 84369,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5948 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865939291.351, "dur": 33.700, + "args": { + "External id": 84370,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 5949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865939294.902, "dur": 13.640, + "args": { + "External id": 84371,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865939309.731, "dur": 14.851, + "args": { + "External id": 84372,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 5951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6300865939345.151, "dur": 16217.374, + "args": { + "External id": 84373,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6300865939346.791, "dur": 16214.464, + "args": { + "External id": 84374,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865955575.914, "dur": 8.111, + "args": { + 
"External id": 84375,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865955580.674, "dur": 1.351, + "args": { + "External id": 84376,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865955589.874, "dur": 53.780, + "args": { + "External id": 84377,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865955590.874, "dur": 5.360, + "args": { + "External id": 84378,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865955592.465, "dur": 2.960, + "args": { + "External id": 84379,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865955594.205, "dur": 0.849, + "args": { + "External id": 84380,"Record function id": 0, "Concrete Inputs": 
["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865955597.225, "dur": 45.580, + "args": { + "External id": 84381,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865955598.354, "dur": 43.420, + "args": { + "External id": 84382,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865955648.825, "dur": 4.420, + "args": { + "External id": 84383,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865955651.305, "dur": 0.629, + "args": { + "External id": 84384,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865955661.565, "dur": 2.089, + "args": { + "External id": 84385,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865955670.684, "dur": 8.430, + "args": { + "External id": 84386,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865955672.294, "dur": 6.430, + "args": { + "External id": 84387,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865955768.644, "dur": 170.610, + "args": { + "External id": 84388,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865955772.174, "dur": 6.030, + "args": { + "External id": 84389,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865955781.114, "dur": 157.610, + "args": { + "External id": 84390,"Record function id": 0, "Concrete Inputs": ["", "[-1]", 
"False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865955782.354, "dur": 0.220, + "args": { + "External id": 84391,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865955783.734, "dur": 24.180, + "args": { + "External id": 84392,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865955809.874, "dur": 4.780, + "args": { + "External id": 84393,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865955813.204, "dur": 0.970, + "args": { + "External id": 84394,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865955815.434, "dur": 21.190, + "args": { + "External id": 84395,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 
6300865955816.714, "dur": 2.940, + "args": { + "External id": 84396,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865955820.734, "dur": 15.560, + "args": { + "External id": 84397,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865955823.694, "dur": 3.450, + "args": { + "External id": 84398,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865955838.964, "dur": 18.110, + "args": { + "External id": 84399,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865955858.804, "dur": 10.190, + "args": { + "External id": 84400,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865955872.024, "dur": 12.520, + "args": { + "External id": 84401,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": 
[[2048, 1], [2048, 1], []], "Ev Idx": 5980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865955885.874, "dur": 8.500, + "args": { + "External id": 84402,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865955896.094, "dur": 20.690, + "args": { + "External id": 84403,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865955899.494, "dur": 2.790, + "args": { + "External id": 84404,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865955904.594, "dur": 0.850, + "args": { + "External id": 84405,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865955919.784, "dur": 8.580, + "args": { + "External id": 84406,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865955929.494, "dur": 7.930, + "args": { + "External id": 84407,"Record function 
id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865955948.294, "dur": 2.930, + "args": { + "External id": 84408,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865955959.324, "dur": 4.350, + "args": { + "External id": 84409,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865955961.924, "dur": 0.640, + "args": { + "External id": 84410,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865956036.933, "dur": 46.291, + "args": { + "External id": 84411,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956090.753, "dur": 8.271, + "args": { + "External id": 84412,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input 
Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956095.004, "dur": 1.169, + "args": { + "External id": 84413,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865956101.164, "dur": 22.539, + "args": { + "External id": 84414,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865956131.253, "dur": 6.920, + "args": { + "External id": 84415,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865956132.733, "dur": 4.540, + "args": { + "External id": 84416,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956134.723, "dur": 2.190, + "args": { + "External id": 84417,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input 
Dims": [[2048, 32000], [], [], []], "Ev Idx": 5996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865956140.713, "dur": 33.760, + "args": { + "External id": 84418,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865956141.713, "dur": 31.850, + "args": { + "External id": 84419,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865956179.533, "dur": 14.890, + "args": { + "External id": 84420,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956203.653, "dur": 5.470, + "args": { + "External id": 84421,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956206.823, "dur": 0.920, + "args": { + "External id": 84422,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 
768], [], [], []], "Ev Idx": 6001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865956212.973, "dur": 42.300, + "args": { + "External id": 84423,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865956213.803, "dur": 5.340, + "args": { + "External id": 84424,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865956215.013, "dur": 3.480, + "args": { + "External id": 84425,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956216.503, "dur": 1.700, + "args": { + "External id": 84426,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865956221.033, "dur": 33.720, + "args": { + "External id": 84427,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + 
"ts": 6300865956221.953, "dur": 31.960, + "args": { + "External id": 84428,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956261.193, "dur": 4.320, + "args": { + "External id": 84429,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956263.583, "dur": 0.750, + "args": { + "External id": 84430,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865956273.103, "dur": 1.840, + "args": { + "External id": 84431,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865956281.833, "dur": 9.610, + "args": { + "External id": 84432,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 
6300865956284.723, "dur": 6.360, + "args": { + "External id": 84433,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865956385.173, "dur": 164.939, + "args": { + "External id": 84434,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865956387.493, "dur": 5.200, + "args": { + "External id": 84435,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865956395.103, "dur": 154.469, + "args": { + "External id": 84436,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865956396.443, "dur": 0.190, + "args": { + "External id": 84437,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865956397.623, "dur": 22.280, + "args": { + "External id": 84438,"Record function id": 0, 
"Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865956421.613, "dur": 4.930, + "args": { + "External id": 84439,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956423.703, "dur": 2.320, + "args": { + "External id": 84440,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865956428.533, "dur": 20.799, + "args": { + "External id": 84441,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865956429.523, "dur": 2.820, + "args": { + "External id": 84442,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865956433.433, "dur": 15.559, + "args": { + "External id": 84443,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865956436.423, "dur": 3.760, + "args": { + "External id": 84444,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865956450.623, "dur": 19.380, + "args": { + "External id": 84445,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865956471.983, "dur": 10.309, + "args": { + "External id": 84446,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865956485.452, "dur": 12.520, + "args": { + "External id": 84447,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865956499.283, "dur": 8.620, + "args": { + "External id": 84448,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865956509.652, "dur": 20.080, + "args": { + "External id": 84449,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], 
[], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865956512.952, "dur": 2.711, + "args": { + "External id": 84450,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956517.563, "dur": 0.940, + "args": { + "External id": 84451,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865956531.483, "dur": 8.129, + "args": { + "External id": 84452,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865956540.623, "dur": 7.689, + "args": { + "External id": 84453,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865956559.272, "dur": 3.110, + "args": { + "External id": 84454,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956572.642, "dur": 4.270, + "args": { + "External id": 84455,"Record 
function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956575.262, "dur": 0.660, + "args": { + "External id": 84456,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865956648.682, "dur": 46.450, + "args": { + "External id": 84457,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956704.332, "dur": 6.620, + "args": { + "External id": 84458,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956708.012, "dur": 1.120, + "args": { + "External id": 84459,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865956712.232, "dur": 20.120, + "args": { + 
"External id": 84460,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865956739.722, "dur": 5.710, + "args": { + "External id": 84461,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865956741.282, "dur": 3.180, + "args": { + "External id": 84462,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956743.002, "dur": 0.980, + "args": { + "External id": 84463,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865956748.902, "dur": 33.370, + "args": { + "External id": 84464,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865956749.882, "dur": 31.530, + "args": { + "External id": 84465,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865956787.082, "dur": 14.450, + "args": { + "External id": 84466,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956809.782, "dur": 5.030, + "args": { + "External id": 84467,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956812.542, "dur": 0.860, + "args": { + "External id": 84468,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865956818.702, "dur": 41.010, + "args": { + "External id": 84469,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865956819.492, "dur": 6.270, + "args": { + "External id": 84470,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865956821.852, "dur": 3.350, + "args": { + "External id": 84471,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956823.282, "dur": 1.580, + "args": { + "External id": 84472,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865956826.572, "dur": 32.610, + "args": { + "External id": 84473,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865956827.452, "dur": 30.960, + "args": { + "External id": 84474,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865956866.002, "dur": 4.290, + "args": { + "External id": 84475,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev 
Idx": 6054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865956868.392, "dur": 0.580, + "args": { + "External id": 84476,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865956877.532, "dur": 1.750, + "args": { + "External id": 84477,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865956887.092, "dur": 7.930, + "args": { + "External id": 84478,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865956888.582, "dur": 6.089, + "args": { + "External id": 84479,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865956978.331, "dur": 175.800, + "args": { + "External id": 84480,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], 
[], []], "Ev Idx": 6059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865956995.711, "dur": 6.020, + "args": { + "External id": 84481,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865957003.461, "dur": 150.160, + "args": { + "External id": 84482,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865957004.691, "dur": 0.220, + "args": { + "External id": 84483,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865957006.871, "dur": 22.070, + "args": { + "External id": 84484,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865957030.501, "dur": 3.530, + "args": { + "External id": 84485,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957032.481, "dur": 1.020, + "args": { + 
"External id": 84486,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865957036.051, "dur": 21.260, + "args": { + "External id": 84487,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865957037.011, "dur": 3.850, + "args": { + "External id": 84488,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865957042.031, "dur": 14.930, + "args": { + "External id": 84489,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865957044.841, "dur": 3.260, + "args": { + "External id": 84490,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865957058.561, "dur": 18.440, + "args": { + "External id": 84491,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6070 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865957078.651, "dur": 10.440, + "args": { + "External id": 84492,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865957091.781, "dur": 11.570, + "args": { + "External id": 84493,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865957104.711, "dur": 8.310, + "args": { + "External id": 84494,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865957114.881, "dur": 19.100, + "args": { + "External id": 84495,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865957117.971, "dur": 2.720, + "args": { + "External id": 84496,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957122.511, "dur": 0.840, + "args": { + "External id": 84497,"Record function id": 0, "Concrete Inputs": ["", "[2048, 
1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865957135.671, "dur": 8.060, + "args": { + "External id": 84498,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865957144.821, "dur": 7.470, + "args": { + "External id": 84499,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865957162.661, "dur": 3.000, + "args": { + "External id": 84500,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865957175.391, "dur": 6.680, + "args": { + "External id": 84501,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957180.381, "dur": 0.650, + "args": { + "External id": 84502,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6081 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865957254.081, "dur": 54.700, + "args": { + "External id": 84503,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865957317.081, "dur": 8.470, + "args": { + "External id": 84504,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957322.511, "dur": 1.210, + "args": { + "External id": 84505,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865957326.901, "dur": 20.960, + "args": { + "External id": 84506,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865957355.321, "dur": 6.789, + "args": { + "External id": 84507,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, 
"tid": 5714, + "ts": 6300865957356.821, "dur": 4.489, + "args": { + "External id": 84508,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957359.941, "dur": 0.909, + "args": { + "External id": 84509,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865957364.521, "dur": 34.580, + "args": { + "External id": 84510,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865957365.501, "dur": 32.529, + "args": { + "External id": 84511,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865957404.110, "dur": 14.511, + "args": { + "External id": 84512,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865957426.790, "dur": 5.200, + "args": { + 
"External id": 84513,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957429.670, "dur": 0.950, + "args": { + "External id": 84514,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865957435.800, "dur": 40.160, + "args": { + "External id": 84515,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865957437.740, "dur": 3.700, + "args": { + "External id": 84516,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865957438.870, "dur": 2.050, + "args": { + "External id": 84517,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957440.120, "dur": 0.490, + "args": { + "External id": 84518,"Record function id": 0, "Concrete 
Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865957442.250, "dur": 33.160, + "args": { + "External id": 84519,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865957443.160, "dur": 31.530, + "args": { + "External id": 84520,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865957481.790, "dur": 5.380, + "args": { + "External id": 84521,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957484.090, "dur": 1.920, + "args": { + "External id": 84522,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865957495.470, "dur": 1.870, + "args": { + "External id": 84523,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865957504.000, "dur": 9.370, + "args": { + "External id": 84524,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865957505.630, "dur": 7.390, + "args": { + "External id": 84525,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865957594.940, "dur": 166.060, + "args": { + "External id": 84526,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865957597.130, "dur": 5.000, + "args": { + "External id": 84527,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865957604.790, "dur": 155.660, + "args": { + "External id": 84528,"Record function id": 0, "Concrete Inputs": ["", 
"[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865957606.020, "dur": 0.200, + "args": { + "External id": 84529,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865957607.290, "dur": 22.950, + "args": { + "External id": 84530,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865957631.920, "dur": 3.240, + "args": { + "External id": 84531,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957633.700, "dur": 0.920, + "args": { + "External id": 84532,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865957635.990, "dur": 22.980, + "args": { + "External id": 84533,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 
6300865957636.900, "dur": 2.920, + "args": { + "External id": 84534,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865957640.880, "dur": 17.730, + "args": { + "External id": 84535,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865957644.950, "dur": 4.550, + "args": { + "External id": 84536,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865957661.510, "dur": 20.770, + "args": { + "External id": 84537,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865957683.960, "dur": 10.320, + "args": { + "External id": 84538,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865957697.350, "dur": 11.540, + "args": { + "External id": 84539,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": 
[[2048, 1], [2048, 1], []], "Ev Idx": 6118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865957710.080, "dur": 8.380, + "args": { + "External id": 84540,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865957720.120, "dur": 18.520, + "args": { + "External id": 84541,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865957722.110, "dur": 2.600, + "args": { + "External id": 84542,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957726.500, "dur": 0.930, + "args": { + "External id": 84543,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865957741.580, "dur": 8.120, + "args": { + "External id": 84544,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865957750.830, "dur": 8.320, + "args": { + "External id": 84545,"Record function 
id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865957769.680, "dur": 2.989, + "args": { + "External id": 84546,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865957782.469, "dur": 4.500, + "args": { + "External id": 84547,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957785.249, "dur": 0.711, + "args": { + "External id": 84548,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865957857.909, "dur": 45.050, + "args": { + "External id": 84549,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865957910.489, "dur": 8.220, + "args": { + "External id": 84550,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], 
"Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957914.789, "dur": 2.030, + "args": { + "External id": 84551,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865957919.969, "dur": 20.070, + "args": { + "External id": 84552,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865957948.279, "dur": 5.620, + "args": { + "External id": 84553,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865957949.749, "dur": 3.320, + "args": { + "External id": 84554,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865957951.799, "dur": 0.930, + "args": { + "External id": 84555,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], 
"Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865957956.189, "dur": 33.220, + "args": { + "External id": 84556,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865957957.099, "dur": 31.400, + "args": { + "External id": 84557,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865957994.279, "dur": 14.500, + "args": { + "External id": 84558,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958017.019, "dur": 6.780, + "args": { + "External id": 84559,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958021.359, "dur": 1.060, + "args": { + "External id": 84560,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": 
[[16384, 768], [], [], []], "Ev Idx": 6139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865958027.889, "dur": 39.620, + "args": { + "External id": 84561,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865958028.679, "dur": 3.880, + "args": { + "External id": 84562,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865958029.759, "dur": 2.260, + "args": { + "External id": 84563,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958031.239, "dur": 0.480, + "args": { + "External id": 84564,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865958033.349, "dur": 33.530, + "args": { + "External id": 84565,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 
5714, + "ts": 6300865958035.369, "dur": 30.710, + "args": { + "External id": 84566,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958073.509, "dur": 4.400, + "args": { + "External id": 84567,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958076.079, "dur": 0.580, + "args": { + "External id": 84568,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865958085.299, "dur": 1.760, + "args": { + "External id": 84569,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865958093.629, "dur": 9.740, + "args": { + "External id": 84570,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, 
+ "ts": 6300865958095.299, "dur": 7.710, + "args": { + "External id": 84571,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865958185.659, "dur": 173.729, + "args": { + "External id": 84572,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865958188.089, "dur": 5.860, + "args": { + "External id": 84573,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865958195.489, "dur": 163.299, + "args": { + "External id": 84574,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865958196.849, "dur": 0.210, + "args": { + "External id": 84575,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865958198.129, "dur": 21.150, + "args": { + "External id": 84576,"Record function 
id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865958222.028, "dur": 3.591, + "args": { + "External id": 84577,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958224.228, "dur": 0.940, + "args": { + "External id": 84578,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865958226.319, "dur": 21.860, + "args": { + "External id": 84579,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865958227.388, "dur": 2.851, + "args": { + "External id": 84580,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865958231.328, "dur": 16.511, + "args": { + "External id": 84581,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6160 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865958235.568, "dur": 3.531, + "args": { + "External id": 84582,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865958250.908, "dur": 17.620, + "args": { + "External id": 84583,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865958270.199, "dur": 9.809, + "args": { + "External id": 84584,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865958282.739, "dur": 11.480, + "args": { + "External id": 84585,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865958295.488, "dur": 19.180, + "args": { + "External id": 84586,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865958316.908, "dur": 19.990, + "args": { + "External id": 84587,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": 
[[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865958319.298, "dur": 2.760, + "args": { + "External id": 84588,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958324.298, "dur": 1.120, + "args": { + "External id": 84589,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865958339.848, "dur": 8.410, + "args": { + "External id": 84590,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865958349.348, "dur": 8.110, + "args": { + "External id": 84591,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865958368.358, "dur": 3.100, + "args": { + "External id": 84592,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958381.658, "dur": 4.480, + "args": { + "External id": 
84593,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958384.508, "dur": 0.640, + "args": { + "External id": 84594,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865958457.278, "dur": 45.170, + "args": { + "External id": 84595,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958510.118, "dur": 6.930, + "args": { + "External id": 84596,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958514.188, "dur": 1.090, + "args": { + "External id": 84597,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865958519.408, "dur": 
20.980, + "args": { + "External id": 84598,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865958547.858, "dur": 5.660, + "args": { + "External id": 84599,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865958549.318, "dur": 3.310, + "args": { + "External id": 84600,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958551.418, "dur": 0.850, + "args": { + "External id": 84601,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865958555.988, "dur": 35.240, + "args": { + "External id": 84602,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865958558.498, "dur": 31.760, + "args": { + "External id": 84603,"Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865958597.168, "dur": 14.440, + "args": { + "External id": 84604,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958619.638, "dur": 5.570, + "args": { + "External id": 84605,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958622.908, "dur": 0.890, + "args": { + "External id": 84606,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865958629.208, "dur": 40.599, + "args": { + "External id": 84607,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865958630.038, "dur": 5.530, + "args": { + "External id": 84608,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865958631.148, "dur": 3.830, + "args": { + "External id": 84609,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958634.278, "dur": 0.440, + "args": { + "External id": 84610,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865958636.378, "dur": 32.789, + "args": { + "External id": 84611,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865958637.238, "dur": 31.149, + "args": { + "External id": 84612,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958676.007, "dur": 5.731, + "args": { + "External id": 84613,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": 
[[16384], [], [], [], []], "Ev Idx": 6192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958678.798, "dur": 1.580, + "args": { + "External id": 84614,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865958689.187, "dur": 1.840, + "args": { + "External id": 84615,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865958699.107, "dur": 8.131, + "args": { + "External id": 84616,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865958700.818, "dur": 6.089, + "args": { + "External id": 84617,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865958788.787, "dur": 165.530, + "args": { + "External id": 84618,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], 
[]], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865958791.077, "dur": 6.040, + "args": { + "External id": 84619,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865958798.877, "dur": 154.940, + "args": { + "External id": 84620,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865958800.137, "dur": 0.230, + "args": { + "External id": 84621,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865958801.597, "dur": 22.840, + "args": { + "External id": 84622,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865958827.377, "dur": 3.520, + "args": { + "External id": 84623,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6300865958829.457, "dur": 0.960, + "args": { + "External id": 84624,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865958831.727, "dur": 23.210, + "args": { + "External id": 84625,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865958832.687, "dur": 3.000, + "args": { + "External id": 84626,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865958836.717, "dur": 17.860, + "args": { + "External id": 84627,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865958841.037, "dur": 4.760, + "args": { + "External id": 84628,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865958856.267, "dur": 17.250, + "args": { + "External id": 84629,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev 
Idx": 6208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865958875.267, "dur": 10.270, + "args": { + "External id": 84630,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865958888.437, "dur": 12.160, + "args": { + "External id": 84631,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865958901.817, "dur": 8.400, + "args": { + "External id": 84632,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865958914.857, "dur": 18.950, + "args": { + "External id": 84633,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865958916.817, "dur": 2.520, + "args": { + "External id": 84634,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958921.847, "dur": 0.810, + "args": { + "External id": 84635,"Record function 
id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865958935.617, "dur": 8.240, + "args": { + "External id": 84636,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865958944.997, "dur": 7.590, + "args": { + "External id": 84637,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865958963.367, "dur": 3.040, + "args": { + "External id": 84638,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865958976.527, "dur": 4.380, + "args": { + "External id": 84639,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865958979.277, "dur": 0.640, + "args": { + "External id": 84640,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev 
Idx": 6219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865959050.897, "dur": 45.400, + "args": { + "External id": 84641,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865959103.866, "dur": 7.031, + "args": { + "External id": 84642,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959107.866, "dur": 1.211, + "args": { + "External id": 84643,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865959112.186, "dur": 20.231, + "args": { + "External id": 84644,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865959139.677, "dur": 6.660, + "args": { + "External id": 84645,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6224 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865959141.006, "dur": 4.460, + "args": { + "External id": 84646,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959144.197, "dur": 0.929, + "args": { + "External id": 84647,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865959148.757, "dur": 33.720, + "args": { + "External id": 84648,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865959149.657, "dur": 31.840, + "args": { + "External id": 84649,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865959187.376, "dur": 14.520, + "args": { + "External id": 84650,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 
6300865959209.976, "dur": 5.480, + "args": { + "External id": 84651,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959213.096, "dur": 0.920, + "args": { + "External id": 84652,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865959219.306, "dur": 41.980, + "args": { + "External id": 84653,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865959221.266, "dur": 5.630, + "args": { + "External id": 84654,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865959222.406, "dur": 3.880, + "args": { + "External id": 84655,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959223.956, "dur": 1.990, + "args": { + "External 
id": 84656,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865959227.746, "dur": 32.930, + "args": { + "External id": 84657,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865959228.576, "dur": 31.220, + "args": { + "External id": 84658,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865959267.386, "dur": 4.670, + "args": { + "External id": 84659,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959270.076, "dur": 0.590, + "args": { + "External id": 84660,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865959279.666, "dur": 1.840, + "args": { + "External id": 84661,"Record function id": 0, 
"Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865959289.386, "dur": 18.050, + "args": { + "External id": 84662,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865959291.176, "dur": 15.800, + "args": { + "External id": 84663,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865959389.426, "dur": 162.019, + "args": { + "External id": 84664,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865959391.916, "dur": 5.770, + "args": { + "External id": 84665,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865959399.186, "dur": 151.750, + "args": { + "External id": 
84666,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865959400.486, "dur": 0.200, + "args": { + "External id": 84667,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865959404.086, "dur": 22.510, + "args": { + "External id": 84668,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865959428.256, "dur": 3.650, + "args": { + "External id": 84669,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959430.466, "dur": 0.960, + "args": { + "External id": 84670,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865959432.626, "dur": 22.550, + "args": { + "External id": 84671,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6250 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865959433.616, "dur": 4.310, + "args": { + "External id": 84672,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865959439.046, "dur": 15.770, + "args": { + "External id": 84673,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865959442.086, "dur": 3.450, + "args": { + "External id": 84674,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865959456.396, "dur": 17.100, + "args": { + "External id": 84675,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865959475.406, "dur": 9.950, + "args": { + "External id": 84676,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865959487.996, "dur": 11.660, + "args": { + "External id": 84677,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", 
"Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865959500.956, "dur": 8.450, + "args": { + "External id": 84678,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865959512.416, "dur": 18.449, + "args": { + "External id": 84679,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865959514.476, "dur": 2.480, + "args": { + "External id": 84680,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959519.126, "dur": 0.780, + "args": { + "External id": 84681,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865959532.565, "dur": 8.471, + "args": { + "External id": 84682,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865959542.076, 
"dur": 7.729, + "args": { + "External id": 84683,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865959561.405, "dur": 3.040, + "args": { + "External id": 84684,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865959574.345, "dur": 4.620, + "args": { + "External id": 84685,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959577.305, "dur": 0.680, + "args": { + "External id": 84686,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865959649.455, "dur": 44.880, + "args": { + "External id": 84687,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865959701.965, "dur": 8.130, + "args": { + "External id": 84688,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], 
"Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959705.985, "dur": 2.250, + "args": { + "External id": 84689,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865959711.315, "dur": 20.350, + "args": { + "External id": 84690,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865959739.145, "dur": 6.760, + "args": { + "External id": 84691,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865959740.605, "dur": 4.520, + "args": { + "External id": 84692,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959743.855, "dur": 0.930, + "args": { + "External id": 84693,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865959748.405, "dur": 33.870, + "args": { + "External id": 84694,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865959749.355, "dur": 32.050, + "args": { + "External id": 84695,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865959787.085, "dur": 14.570, + "args": { + "External id": 84696,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865959809.505, "dur": 5.380, + "args": { + "External id": 84697,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959812.595, "dur": 0.920, + "args": { + "External id": 84698,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865959820.245, "dur": 39.170, + "args": { + "External id": 84699,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865959821.105, "dur": 3.940, + "args": { + "External id": 84700,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865959822.205, "dur": 2.310, + "args": { + "External id": 84701,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959823.695, "dur": 0.520, + "args": { + "External id": 84702,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865959825.885, "dur": 32.970, + "args": { + "External id": 84703,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6282 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865959826.805, "dur": 31.160, + "args": { + "External id": 84704,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865959866.705, "dur": 4.460, + "args": { + "External id": 84705,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865959869.355, "dur": 0.600, + "args": { + "External id": 84706,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865959878.285, "dur": 1.780, + "args": { + "External id": 84707,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865959886.745, "dur": 9.980, + "args": { + "External id": 84708,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6287 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865959888.295, "dur": 8.060, + "args": { + "External id": 84709,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865959978.184, "dur": 160.120, + "args": { + "External id": 84710,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865959980.435, "dur": 4.680, + "args": { + "External id": 84711,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865959986.675, "dur": 151.069, + "args": { + "External id": 84712,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865959988.064, "dur": 0.191, + "args": { + "External id": 84713,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 
6300865959989.295, "dur": 22.229, + "args": { + "External id": 84714,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865960014.175, "dur": 3.600, + "args": { + "External id": 84715,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960016.295, "dur": 0.940, + "args": { + "External id": 84716,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865960018.475, "dur": 23.580, + "args": { + "External id": 84717,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865960019.595, "dur": 2.780, + "args": { + "External id": 84718,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865960023.375, "dur": 18.329, + "args": { + "External id": 84719,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], 
"Input Dims": [[2048], [0]], "Ev Idx": 6298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865960028.904, "dur": 3.631, + "args": { + "External id": 84720,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865960043.284, "dur": 17.080, + "args": { + "External id": 84721,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865960062.115, "dur": 9.869, + "args": { + "External id": 84722,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865960074.884, "dur": 11.990, + "args": { + "External id": 84723,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865960088.134, "dur": 8.360, + "args": { + "External id": 84724,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865960098.164, "dur": 18.520, + "args": { + "External id": 84725,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", 
"", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865960100.234, "dur": 2.500, + "args": { + "External id": 84726,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960104.964, "dur": 0.820, + "args": { + "External id": 84727,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865960119.644, "dur": 8.280, + "args": { + "External id": 84728,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865960128.974, "dur": 7.510, + "args": { + "External id": 84729,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865960146.984, "dur": 3.000, + "args": { + "External id": 84730,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, 
"tid": 5714, + "ts": 6300865960159.744, "dur": 4.550, + "args": { + "External id": 84731,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960162.654, "dur": 0.660, + "args": { + "External id": 84732,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865960234.004, "dur": 45.450, + "args": { + "External id": 84733,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865960286.934, "dur": 6.880, + "args": { + "External id": 84734,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960290.794, "dur": 1.160, + "args": { + "External id": 84735,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6314 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865960296.334, "dur": 30.070, + "args": { + "External id": 84736,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865960334.614, "dur": 7.080, + "args": { + "External id": 84737,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865960336.114, "dur": 4.650, + "args": { + "External id": 84738,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960339.364, "dur": 1.070, + "args": { + "External id": 84739,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865960344.294, "dur": 34.360, + "args": { + "External id": 84740,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865960345.294, "dur": 32.520, + "args": { + 
"External id": 84741,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865960383.594, "dur": 14.730, + "args": { + "External id": 84742,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865960404.754, "dur": 23.160, + "args": { + "External id": 84743,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865960406.644, "dur": 20.830, + "args": { + "External id": 84744,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960413.223, "dur": 0.740, + "args": { + "External id": 84745,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865960432.974, "dur": 24.569, + "args": { + "External id": 84746,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], 
"Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865960434.063, "dur": 23.171, + "args": { + "External id": 84747,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 6326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960437.574, "dur": 6.140, + "args": { + "External id": 84748,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865960444.743, "dur": 11.931, + "args": { + "External id": 84749,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865960467.134, "dur": 5.400, + "args": { + "External id": 84750,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865960469.894, "dur": 2.280, + "args": { + "External id": 84751,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[16384, 768]], "Ev Idx": 6330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865960473.594, "dur": 1.169, + "args": { + "External id": 84752,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865960474.043, "dur": 0.560, + "args": { + "External id": 84753,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865960504.243, "dur": 21.190, + "args": { + "External id": 84754,"Sequence number": 1771186, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865960527.993, "dur": 11.030, + "args": { + "External id": 84755,"Sequence number": 1771187, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6334 + } + }, + { + "ph": "s", "id": 16, "pid": 5714, "tid": 5714, "ts": 6300865960527.993, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865960545.803, "dur": 7.780, + "args": { + "External id": 84756,"Sequence number": 1771188, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], 
[], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 6335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960550.253, "dur": 1.570, + "args": { + "External id": 84757,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865960555.583, "dur": 6.290, + "args": { + "External id": 84758,"Sequence number": 1771188, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "2"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 6337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960560.333, "dur": 0.360, + "args": { + "External id": 84759,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "3"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865960563.243, "dur": 2.280, + "args": { + "External id": 84760,"Sequence number": 1771188, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 6339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960564.743, "dur": 0.280, + "args": { + "External id": 
84761,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "3"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 6340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865960570.783, "dur": 4.470, + "args": { + "External id": 84762,"Sequence number": 1771188, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6341 + } + }, + { + "ph": "s", "id": 15, "pid": 5714, "tid": 5714, "ts": 6300865960570.783, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960573.343, "dur": 0.540, + "args": { + "External id": 84763,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865960577.523, "dur": 4.380, + "args": { + "External id": 84764,"Sequence number": 1771189, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6343 + } + }, + { + "ph": "s", "id": 14, "pid": 5714, "tid": 5714, "ts": 6300865960577.523, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960580.813, "dur": 0.300, + "args": { + "External id": 84765,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865960582.933, "dur": 3.780, + "args": { + "External id": 84766,"Sequence number": 1771190, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 6345 + } + }, + { + "ph": "s", "id": 13, "pid": 5714, "tid": 5714, "ts": 6300865960582.933, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960585.583, "dur": 0.400, + "args": { + "External id": 84767,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865960587.883, "dur": 4.340, + "args": { + "External id": 84768,"Sequence number": 1771191, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 6347 + } + }, + { + "ph": "s", "id": 12, "pid": 5714, 
"tid": 5714, "ts": 6300865960587.883, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960590.123, "dur": 1.400, + "args": { + "External id": 84769,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 6348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865960596.093, "dur": 31.840, + "args": { + "External id": 84770,"Sequence number": 1771192, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865960598.103, "dur": 29.490, + "args": { + "External id": 84771,"Sequence number": 1771192, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865960599.813, "dur": 9.150, + "args": { + "External id": 84772,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 6351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865960601.563, "dur": 6.850, + "args": { + "External id": 84773,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": 
["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865960610.093, "dur": 16.950, + "args": { + "External id": 84774,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 6353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865960653.193, "dur": 5.060, + "args": { + "External id": 84775,"Sequence number": 1771192, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 6354 + } + }, + { + "ph": "s", "id": 11, "pid": 5714, "tid": 5714, "ts": 6300865960653.193, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865960660.863, "dur": 2.060, + "args": { + "External id": 84776,"Sequence number": 1771193, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6300865960686.073, "dur": 21577.901, + "args": { + "External id": 84777,"Sequence number": 1771193, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], 
[16384], [32000, 768], [], [], [], [], []], "Ev Idx": 6356 + } + }, + { + "ph": "s", "id": 10, "pid": 5714, "tid": 5714, "ts": 6300865960686.073, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865960698.803, "dur": 30.790, + "args": { + "External id": 84778,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865960699.573, "dur": 29.770, + "args": { + "External id": 84779,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865960701.043, "dur": 8.820, + "args": { + "External id": 84780,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865960702.723, "dur": 6.430, + "args": { + "External id": 84781,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865960710.803, "dur": 17.980, + "args": { + "External id": 84782,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 6361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865960744.673, "dur": 24.550, + "args": { + "External id": 84783,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865960745.693, "dur": 8.300, + "args": { + "External id": 84784,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960747.623, "dur": 5.930, + "args": { + "External id": 84785,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865960755.003, "dur": 13.980, + "args": { + "External id": 84786,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865960756.433, "dur": 11.650, + "args": { + "External id": 84787,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input 
type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865960773.413, "dur": 19.660, + "args": { + "External id": 84788,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865960774.203, "dur": 7.540, + "args": { + "External id": 84789,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960776.843, "dur": 4.450, + "args": { + "External id": 84790,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865960782.393, "dur": 10.470, + "args": { + "External id": 84791,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865960783.303, "dur": 8.730, + "args": { + "External id": 84792,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", 
"Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 6371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865960798.763, "dur": 17.650, + "args": { + "External id": 84793,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 6372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865960800.103, "dur": 5.560, + "args": { + "External id": 84794,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865960806.423, "dur": 9.700, + "args": { + "External id": 84795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 6374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865960807.373, "dur": 7.990, + "args": { + "External id": 84796,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6300865960820.513, "dur": 19.920, + "args": { + "External id": 84797,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865960843.193, "dur": 
46.129, + "args": { + "External id": 84798,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865960845.043, "dur": 43.770, + "args": { + "External id": 84799,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960851.073, "dur": 1.020, + "args": { + "External id": 84800,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865960854.213, "dur": 22.669, + "args": { + "External id": 84801,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865960855.313, "dur": 21.309, + "args": { + "External id": 84802,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 6381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865960858.613, "dur": 4.360, + "args": { + "External id": 84803,"Record function id": 0, "Concrete Inputs": 
["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865960863.953, "dur": 12.220, + "args": { + "External id": 84804,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 6383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6300865960894.062, "dur": 16473.673, + "args": { + "External id": 84805,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6300865960895.322, "dur": 16471.763, + "args": { + "External id": 84806,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865977377.005, "dur": 6.910, + "args": { + "External id": 84807,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977380.875, "dur": 1.050, + "args": { + "External id": 84808,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], 
"Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865977388.215, "dur": 51.360, + "args": { + "External id": 84809,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865977390.285, "dur": 5.240, + "args": { + "External id": 84810,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865977391.835, "dur": 2.910, + "args": { + "External id": 84811,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977393.565, "dur": 0.840, + "args": { + "External id": 84812,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865977396.535, "dur": 42.240, + "args": { + "External id": 84813,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 
5714, "tid": 5714, + "ts": 6300865977397.705, "dur": 40.080, + "args": { + "External id": 84814,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865977444.775, "dur": 4.550, + "args": { + "External id": 84815,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977447.515, "dur": 0.540, + "args": { + "External id": 84816,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865977457.975, "dur": 2.130, + "args": { + "External id": 84817,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865977467.265, "dur": 8.360, + "args": { + "External id": 84818,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 
5714, + "ts": 6300865977468.855, "dur": 6.440, + "args": { + "External id": 84819,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865977563.245, "dur": 171.019, + "args": { + "External id": 84820,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865977565.705, "dur": 4.690, + "args": { + "External id": 84821,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865977572.904, "dur": 160.810, + "args": { + "External id": 84822,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865977575.435, "dur": 0.209, + "args": { + "External id": 84823,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865977576.744, "dur": 23.580, + "args": { + "External id": 84824,"Record 
function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865977602.104, "dur": 9.020, + "args": { + "External id": 84825,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977609.484, "dur": 1.111, + "args": { + "External id": 84826,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865977611.964, "dur": 22.560, + "args": { + "External id": 84827,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865977613.824, "dur": 3.251, + "args": { + "External id": 84828,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865977618.115, "dur": 16.080, + "args": { + "External id": 84829,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6408 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865977621.115, "dur": 3.469, + "args": { + "External id": 84830,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865977635.744, "dur": 17.671, + "args": { + "External id": 84831,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865977655.164, "dur": 9.991, + "args": { + "External id": 84832,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865977667.894, "dur": 12.440, + "args": { + "External id": 84833,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865977682.764, "dur": 8.460, + "args": { + "External id": 84834,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865977693.034, "dur": 19.790, + "args": { + "External id": 84835,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input 
Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865977695.244, "dur": 2.500, + "args": { + "External id": 84836,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977699.924, "dur": 1.940, + "args": { + "External id": 84837,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865977714.794, "dur": 8.650, + "args": { + "External id": 84838,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865977724.504, "dur": 7.800, + "args": { + "External id": 84839,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865977744.004, "dur": 3.130, + "args": { + "External id": 84840,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865977755.344, "dur": 4.360, + "args": { + "External 
id": 84841,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977758.094, "dur": 0.670, + "args": { + "External id": 84842,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865977832.014, "dur": 45.150, + "args": { + "External id": 84843,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865977885.064, "dur": 8.370, + "args": { + "External id": 84844,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977889.474, "dur": 1.070, + "args": { + "External id": 84845,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865977894.714, "dur": 21.530, + "args": 
{ + "External id": 84846,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865977923.714, "dur": 6.850, + "args": { + "External id": 84847,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865977926.424, "dur": 3.270, + "args": { + "External id": 84848,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865977928.474, "dur": 0.870, + "args": { + "External id": 84849,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865977933.014, "dur": 34.090, + "args": { + "External id": 84850,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865977933.944, "dur": 32.190, + "args": { + "External id": 84851,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865977971.914, "dur": 14.550, + "args": { + "External id": 84852,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865977994.724, "dur": 7.730, + "args": { + "External id": 84853,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978000.034, "dur": 0.950, + "args": { + "External id": 84854,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865978006.534, "dur": 41.789, + "args": { + "External id": 84855,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865978007.364, "dur": 5.730, + "args": { + "External id": 84856,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865978008.544, "dur": 3.959, + "args": { + "External id": 84857,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978011.543, "dur": 0.640, + "args": { + "External id": 84858,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865978013.983, "dur": 33.780, + "args": { + "External id": 84859,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865978014.923, "dur": 32.040, + "args": { + "External id": 84860,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865978055.534, "dur": 5.480, + "args": { + "External id": 84861,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], 
[], [], [], []], "Ev Idx": 6440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978058.283, "dur": 1.540, + "args": { + "External id": 84862,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865978068.714, "dur": 1.960, + "args": { + "External id": 84863,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865978077.394, "dur": 8.129, + "args": { + "External id": 84864,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865978078.983, "dur": 6.211, + "args": { + "External id": 84865,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865978170.043, "dur": 175.620, + "args": { + "External id": 84866,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input 
Dims": [[2048, 1], [], []], "Ev Idx": 6445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865978173.283, "dur": 4.730, + "args": { + "External id": 84867,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865978179.623, "dur": 165.470, + "args": { + "External id": 84868,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865978181.813, "dur": 0.370, + "args": { + "External id": 84869,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865978183.303, "dur": 21.570, + "args": { + "External id": 84870,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865978206.533, "dur": 3.480, + "args": { + "External id": 84871,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978208.593, "dur": 
0.970, + "args": { + "External id": 84872,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865978210.763, "dur": 24.080, + "args": { + "External id": 84873,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865978214.123, "dur": 2.730, + "args": { + "External id": 84874,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865978217.913, "dur": 16.630, + "args": { + "External id": 84875,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865978220.963, "dur": 4.720, + "args": { + "External id": 84876,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865978236.053, "dur": 18.740, + "args": { + "External id": 84877,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6456 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865978256.463, "dur": 10.410, + "args": { + "External id": 84878,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865978269.853, "dur": 11.930, + "args": { + "External id": 84879,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865978283.143, "dur": 8.490, + "args": { + "External id": 84880,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865978293.363, "dur": 30.630, + "args": { + "External id": 84881,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865978295.443, "dur": 11.580, + "args": { + "External id": 84882,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978310.293, "dur": 0.980, + "args": { + "External id": 84883,"Record function id": 0, "Concrete 
Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865978325.913, "dur": 8.790, + "args": { + "External id": 84884,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865978335.723, "dur": 8.020, + "args": { + "External id": 84885,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865978354.763, "dur": 3.100, + "args": { + "External id": 84886,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865978368.613, "dur": 4.600, + "args": { + "External id": 84887,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978371.493, "dur": 0.700, + "args": { + "External id": 84888,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6467 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865978445.463, "dur": 45.919, + "args": { + "External id": 84889,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865978498.793, "dur": 8.249, + "args": { + "External id": 84890,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978504.102, "dur": 1.200, + "args": { + "External id": 84891,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865978508.242, "dur": 20.000, + "args": { + "External id": 84892,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865978535.773, "dur": 5.769, + "args": { + "External id": 84893,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865978537.322, "dur": 3.391, + "args": { + "External id": 84894,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978539.293, "dur": 0.960, + "args": { + "External id": 84895,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865978543.793, "dur": 34.859, + "args": { + "External id": 84896,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865978545.953, "dur": 31.829, + "args": { + "External id": 84897,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865978583.712, "dur": 14.380, + "args": { + "External id": 84898,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865978606.172, 
"dur": 5.590, + "args": { + "External id": 84899,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978609.342, "dur": 0.970, + "args": { + "External id": 84900,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865978615.692, "dur": 41.840, + "args": { + "External id": 84901,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865978616.542, "dur": 7.000, + "args": { + "External id": 84902,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865978617.852, "dur": 5.100, + "args": { + "External id": 84903,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978620.982, "dur": 1.630, + "args": { + "External id": 84904,"Record 
function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865978624.402, "dur": 32.550, + "args": { + "External id": 84905,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865978625.322, "dur": 30.820, + "args": { + "External id": 84906,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865978663.992, "dur": 4.560, + "args": { + "External id": 84907,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978666.762, "dur": 0.490, + "args": { + "External id": 84908,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865978675.952, "dur": 1.880, + "args": { + "External id": 84909,"Record function id": 0, "Concrete Inputs": 
["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865978684.362, "dur": 8.720, + "args": { + "External id": 84910,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865978687.022, "dur": 5.740, + "args": { + "External id": 84911,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865978774.742, "dur": 162.550, + "args": { + "External id": 84912,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865978777.192, "dur": 4.790, + "args": { + "External id": 84913,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865978783.582, "dur": 153.150, + "args": { + "External id": 84914,"Record function id": 0, 
"Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865978784.792, "dur": 0.220, + "args": { + "External id": 84915,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865978787.172, "dur": 23.480, + "args": { + "External id": 84916,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865978812.282, "dur": 3.680, + "args": { + "External id": 84917,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978814.622, "dur": 0.850, + "args": { + "External id": 84918,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865978816.672, "dur": 23.090, + "args": { + "External id": 84919,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, 
"tid": 5714, + "ts": 6300865978818.722, "dur": 3.960, + "args": { + "External id": 84920,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865978823.922, "dur": 15.530, + "args": { + "External id": 84921,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865978826.942, "dur": 3.630, + "args": { + "External id": 84922,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865978840.972, "dur": 18.000, + "args": { + "External id": 84923,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865978860.772, "dur": 9.990, + "args": { + "External id": 84924,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865978873.452, "dur": 11.450, + "args": { + "External id": 84925,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], 
[]], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865978886.172, "dur": 8.780, + "args": { + "External id": 84926,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865978896.741, "dur": 19.811, + "args": { + "External id": 84927,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865978898.861, "dur": 2.460, + "args": { + "External id": 84928,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978904.761, "dur": 0.851, + "args": { + "External id": 84929,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865978918.272, "dur": 8.289, + "args": { + "External id": 84930,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865978927.652, "dur": 7.809, + "args": { + "External id": 
84931,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865978945.932, "dur": 2.940, + "args": { + "External id": 84932,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865978958.841, "dur": 4.460, + "args": { + "External id": 84933,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865978961.692, "dur": 0.640, + "args": { + "External id": 84934,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865979032.951, "dur": 45.840, + "args": { + "External id": 84935,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979087.611, "dur": 8.220, + "args": { + "External id": 84936,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", 
"Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979091.731, "dur": 2.180, + "args": { + "External id": 84937,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865979097.081, "dur": 21.080, + "args": { + "External id": 84938,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865979125.401, "dur": 5.580, + "args": { + "External id": 84939,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865979126.871, "dur": 3.270, + "args": { + "External id": 84940,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979128.961, "dur": 0.850, + "args": { + "External id": 84941,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input 
Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865979135.031, "dur": 34.210, + "args": { + "External id": 84942,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865979136.011, "dur": 32.250, + "args": { + "External id": 84943,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865979174.241, "dur": 15.120, + "args": { + "External id": 84944,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979197.391, "dur": 5.460, + "args": { + "External id": 84945,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979200.511, "dur": 0.940, + "args": { + "External id": 84946,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": 
[[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865979206.681, "dur": 39.890, + "args": { + "External id": 84947,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865979207.521, "dur": 5.070, + "args": { + "External id": 84948,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865979209.801, "dur": 2.290, + "args": { + "External id": 84949,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979211.261, "dur": 0.520, + "args": { + "External id": 84950,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865979213.441, "dur": 32.580, + "args": { + "External id": 84951,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6530 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865979214.321, "dur": 30.830, + "args": { + "External id": 84952,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979252.711, "dur": 4.380, + "args": { + "External id": 84953,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979255.261, "dur": 0.620, + "args": { + "External id": 84954,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865979265.501, "dur": 1.870, + "args": { + "External id": 84955,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865979273.851, "dur": 9.240, + "args": { + "External id": 84956,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865979275.411, "dur": 7.290, + "args": { + "External id": 84957,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865979373.871, "dur": 161.889, + "args": { + "External id": 84958,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865979376.171, "dur": 6.140, + "args": { + "External id": 84959,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865979384.251, "dur": 151.019, + "args": { + "External id": 84960,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865979386.640, "dur": 0.220, + "args": { + "External id": 84961,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865979387.851, "dur": 21.769, + "args": 
{ + "External id": 84962,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865979412.440, "dur": 3.840, + "args": { + "External id": 84963,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979414.700, "dur": 1.011, + "args": { + "External id": 84964,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865979417.000, "dur": 21.810, + "args": { + "External id": 84965,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865979418.031, "dur": 2.800, + "args": { + "External id": 84966,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865979421.831, "dur": 16.579, + "args": { + "External id": 84967,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6546 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865979425.751, "dur": 3.629, + "args": { + "External id": 84968,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865979440.070, "dur": 17.330, + "args": { + "External id": 84969,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865979459.130, "dur": 9.930, + "args": { + "External id": 84970,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865979471.860, "dur": 11.620, + "args": { + "External id": 84971,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865979486.130, "dur": 8.440, + "args": { + "External id": 84972,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865979496.290, "dur": 18.830, + "args": { + "External id": 84973,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", 
"Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865979498.420, "dur": 2.540, + "args": { + "External id": 84974,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979503.180, "dur": 0.850, + "args": { + "External id": 84975,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865979516.870, "dur": 8.650, + "args": { + "External id": 84976,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865979526.540, "dur": 7.500, + "args": { + "External id": 84977,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865979545.760, "dur": 3.020, + "args": { + "External id": 84978,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979559.340, "dur": 
4.490, + "args": { + "External id": 84979,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979562.140, "dur": 0.680, + "args": { + "External id": 84980,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865979633.510, "dur": 44.470, + "args": { + "External id": 84981,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979685.890, "dur": 6.760, + "args": { + "External id": 84982,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979689.820, "dur": 1.070, + "args": { + "External id": 84983,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + 
"ts": 6300865979693.880, "dur": 20.080, + "args": { + "External id": 84984,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865979721.260, "dur": 8.640, + "args": { + "External id": 84985,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865979724.010, "dur": 5.060, + "args": { + "External id": 84986,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979727.780, "dur": 0.960, + "args": { + "External id": 84987,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865979732.390, "dur": 33.910, + "args": { + "External id": 84988,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865979733.330, "dur": 32.010, + "args": { + "External id": 84989,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865979770.930, "dur": 14.340, + "args": { + "External id": 84990,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979793.290, "dur": 5.349, + "args": { + "External id": 84991,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979796.370, "dur": 0.909, + "args": { + "External id": 84992,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865979815.599, "dur": 40.380, + "args": { + "External id": 84993,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865979816.530, "dur": 4.360, + "args": { + "External id": 84994,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865979817.719, "dur": 2.491, + "args": { + "External id": 84995,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979819.350, "dur": 0.569, + "args": { + "External id": 84996,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865979821.739, "dur": 33.611, + "args": { + "External id": 84997,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865979822.679, "dur": 31.911, + "args": { + "External id": 84998,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865979862.330, "dur": 5.429, + "args": { + "External id": 84999,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], 
[], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865979864.890, "dur": 1.620, + "args": { + "External id": 85000,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865979876.629, "dur": 1.890, + "args": { + "External id": 85001,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865979885.109, "dur": 7.940, + "args": { + "External id": 85002,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865979886.729, "dur": 6.000, + "args": { + "External id": 85003,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865979973.229, "dur": 162.630, + "args": { + "External id": 85004,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], 
"Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865979975.429, "dur": 4.720, + "args": { + "External id": 85005,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865979981.819, "dur": 153.520, + "args": { + "External id": 85006,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865979984.619, "dur": 0.230, + "args": { + "External id": 85007,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865979985.859, "dur": 23.560, + "args": { + "External id": 85008,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865980011.169, "dur": 3.500, + "args": { + "External id": 85009,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
5714, + "ts": 6300865980013.309, "dur": 0.910, + "args": { + "External id": 85010,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865980015.379, "dur": 21.790, + "args": { + "External id": 85011,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865980016.379, "dur": 2.860, + "args": { + "External id": 85012,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865980020.289, "dur": 16.550, + "args": { + "External id": 85013,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865980024.509, "dur": 3.500, + "args": { + "External id": 85014,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865980038.469, "dur": 18.230, + "args": { + "External id": 85015,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": 
[[2048], []], "Ev Idx": 6594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865980058.439, "dur": 10.100, + "args": { + "External id": 85016,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865980071.289, "dur": 11.590, + "args": { + "External id": 85017,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865980085.299, "dur": 8.470, + "args": { + "External id": 85018,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865980095.549, "dur": 18.710, + "args": { + "External id": 85019,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865980097.649, "dur": 2.510, + "args": { + "External id": 85020,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980102.249, "dur": 0.940, + "args": { + "External id": 
85021,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865980116.059, "dur": 8.110, + "args": { + "External id": 85022,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865980125.289, "dur": 7.760, + "args": { + "External id": 85023,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865980144.349, "dur": 3.010, + "args": { + "External id": 85024,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865980157.299, "dur": 4.430, + "args": { + "External id": 85025,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980160.099, "dur": 0.640, + "args": { + "External id": 85026,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], 
[], [], []], "Ev Idx": 6605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865980231.809, "dur": 44.840, + "args": { + "External id": 85027,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865980283.958, "dur": 7.931, + "args": { + "External id": 85028,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980288.109, "dur": 1.980, + "args": { + "External id": 85029,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865980293.138, "dur": 28.960, + "args": { + "External id": 85030,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865980330.378, "dur": 7.650, + "args": { + "External id": 85031,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6610 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865980333.248, "dur": 3.940, + "args": { + "External id": 85032,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980335.728, "dur": 1.110, + "args": { + "External id": 85033,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865980340.508, "dur": 34.350, + "args": { + "External id": 85034,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865980341.428, "dur": 32.530, + "args": { + "External id": 85035,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865980379.828, "dur": 14.710, + "args": { + "External id": 85036,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + 
"ts": 6300865980402.788, "dur": 7.130, + "args": { + "External id": 85037,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980407.648, "dur": 0.880, + "args": { + "External id": 85038,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865980413.898, "dur": 39.050, + "args": { + "External id": 85039,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865980414.708, "dur": 4.020, + "args": { + "External id": 85040,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865980415.868, "dur": 2.370, + "args": { + "External id": 85041,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980417.448, "dur": 0.500, + "args": { + 
"External id": 85042,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865980419.508, "dur": 32.890, + "args": { + "External id": 85043,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865980420.438, "dur": 31.100, + "args": { + "External id": 85044,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865980459.928, "dur": 4.530, + "args": { + "External id": 85045,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980462.618, "dur": 0.640, + "args": { + "External id": 85046,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865980471.738, "dur": 1.870, + "args": { + "External id": 85047,"Record 
function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865980480.408, "dur": 9.360, + "args": { + "External id": 85048,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865980481.998, "dur": 7.360, + "args": { + "External id": 85049,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865980571.538, "dur": 160.999, + "args": { + "External id": 85050,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865980573.848, "dur": 6.090, + "args": { + "External id": 85051,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865980581.428, "dur": 150.600, + "args": { + "External 
id": 85052,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865980582.798, "dur": 0.210, + "args": { + "External id": 85053,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865980584.088, "dur": 22.040, + "args": { + "External id": 85054,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865980609.108, "dur": 3.650, + "args": { + "External id": 85055,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980611.378, "dur": 0.940, + "args": { + "External id": 85056,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865980613.468, "dur": 23.410, + "args": { + "External id": 85057,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6636 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865980614.568, "dur": 2.870, + "args": { + "External id": 85058,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865980619.548, "dur": 16.990, + "args": { + "External id": 85059,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865980623.518, "dur": 3.760, + "args": { + "External id": 85060,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865980638.148, "dur": 17.029, + "args": { + "External id": 85061,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865980656.897, "dur": 9.691, + "args": { + "External id": 85062,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865980669.297, "dur": 11.351, + "args": { + "External id": 85063,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", 
"Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865980681.848, "dur": 8.409, + "args": { + "External id": 85064,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865980692.057, "dur": 20.260, + "args": { + "External id": 85065,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865980694.057, "dur": 2.611, + "args": { + "External id": 85066,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980700.048, "dur": 0.929, + "args": { + "External id": 85067,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865980714.037, "dur": 8.031, + "args": { + "External id": 85068,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865980723.097, 
"dur": 7.751, + "args": { + "External id": 85069,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865980741.197, "dur": 3.011, + "args": { + "External id": 85070,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865980754.357, "dur": 4.630, + "args": { + "External id": 85071,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980757.367, "dur": 0.630, + "args": { + "External id": 85072,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865980828.907, "dur": 44.850, + "args": { + "External id": 85073,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865980880.927, "dur": 8.380, + "args": { + "External id": 85074,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], 
"Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980886.217, "dur": 1.180, + "args": { + "External id": 85075,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865980890.537, "dur": 21.480, + "args": { + "External id": 85076,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865980919.337, "dur": 5.410, + "args": { + "External id": 85077,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865980920.787, "dur": 3.140, + "args": { + "External id": 85078,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980922.707, "dur": 0.870, + "args": { + "External id": 85079,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865980927.157, "dur": 33.500, + "args": { + "External id": 85080,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865980928.087, "dur": 31.710, + "args": { + "External id": 85081,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865980966.267, "dur": 15.180, + "args": { + "External id": 85082,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865980989.467, "dur": 5.550, + "args": { + "External id": 85083,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865980992.737, "dur": 0.910, + "args": { + "External id": 85084,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865980998.997, "dur": 39.780, + "args": { + "External id": 85085,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865980999.827, "dur": 5.350, + "args": { + "External id": 85086,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865981000.987, "dur": 3.680, + "args": { + "External id": 85087,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981003.737, "dur": 0.470, + "args": { + "External id": 85088,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865981005.997, "dur": 32.250, + "args": { + "External id": 85089,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6668 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865981006.907, "dur": 30.530, + "args": { + "External id": 85090,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865981045.067, "dur": 4.530, + "args": { + "External id": 85091,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981047.647, "dur": 0.620, + "args": { + "External id": 85092,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865981057.097, "dur": 1.830, + "args": { + "External id": 85093,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865981065.487, "dur": 9.210, + "args": { + "External id": 85094,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6673 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865981068.237, "dur": 6.140, + "args": { + "External id": 85095,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865981155.387, "dur": 176.699, + "args": { + "External id": 85096,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865981158.856, "dur": 4.551, + "args": { + "External id": 85097,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865981164.996, "dur": 166.550, + "args": { + "External id": 85098,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865981166.216, "dur": 0.211, + "args": { + "External id": 85099,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 
6300865981167.347, "dur": 23.059, + "args": { + "External id": 85100,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865981192.226, "dur": 6.640, + "args": { + "External id": 85101,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981197.466, "dur": 0.950, + "args": { + "External id": 85102,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865981199.636, "dur": 22.470, + "args": { + "External id": 85103,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865981202.076, "dur": 3.010, + "args": { + "External id": 85104,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865981206.136, "dur": 15.650, + "args": { + "External id": 85105,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], 
"Input Dims": [[2048], [0]], "Ev Idx": 6684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865981209.316, "dur": 3.590, + "args": { + "External id": 85106,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865981223.376, "dur": 17.990, + "args": { + "External id": 85107,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865981242.986, "dur": 10.030, + "args": { + "External id": 85108,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865981255.636, "dur": 11.500, + "args": { + "External id": 85109,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865981268.416, "dur": 8.790, + "args": { + "External id": 85110,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865981280.036, "dur": 29.770, + "args": { + "External id": 85111,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", 
"", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865981283.336, "dur": 2.730, + "args": { + "External id": 85112,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981289.386, "dur": 0.840, + "args": { + "External id": 85113,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865981312.166, "dur": 9.020, + "args": { + "External id": 85114,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865981322.296, "dur": 8.060, + "args": { + "External id": 85115,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865981341.236, "dur": 3.150, + "args": { + "External id": 85116,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, 
"tid": 5714, + "ts": 6300865981354.576, "dur": 4.510, + "args": { + "External id": 85117,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981357.356, "dur": 0.680, + "args": { + "External id": 85118,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865981429.946, "dur": 45.240, + "args": { + "External id": 85119,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865981484.146, "dur": 7.220, + "args": { + "External id": 85120,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981488.386, "dur": 1.110, + "args": { + "External id": 85121,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6700 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865981492.596, "dur": 20.330, + "args": { + "External id": 85122,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865981520.216, "dur": 6.770, + "args": { + "External id": 85123,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865981521.716, "dur": 4.450, + "args": { + "External id": 85124,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981523.766, "dur": 2.010, + "args": { + "External id": 85125,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865981530.606, "dur": 33.049, + "args": { + "External id": 85126,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865981531.575, "dur": 31.120, + "args": { + 
"External id": 85127,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865981568.486, "dur": 14.640, + "args": { + "External id": 85128,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865981591.095, "dur": 5.351, + "args": { + "External id": 85129,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981594.215, "dur": 0.960, + "args": { + "External id": 85130,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865981600.366, "dur": 40.419, + "args": { + "External id": 85131,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865981601.186, "dur": 5.189, + "args": { + 
"External id": 85132,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865981603.555, "dur": 2.280, + "args": { + "External id": 85133,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981605.055, "dur": 0.520, + "args": { + "External id": 85134,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865981607.186, "dur": 33.049, + "args": { + "External id": 85135,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865981608.046, "dur": 31.329, + "args": { + "External id": 85136,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865981649.165, "dur": 6.200, + "args": { + "External id": 85137,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", 
"Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981652.085, "dur": 1.950, + "args": { + "External id": 85138,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865981663.905, "dur": 1.870, + "args": { + "External id": 85139,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865981672.435, "dur": 8.120, + "args": { + "External id": 85140,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865981673.855, "dur": 6.380, + "args": { + "External id": 85141,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865981760.895, "dur": 162.250, + "args": { + "External id": 85142,"Record function id": 0, "Concrete Inputs": ["", "[-1]", 
"False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865981763.075, "dur": 5.570, + "args": { + "External id": 85143,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865981770.255, "dur": 152.330, + "args": { + "External id": 85144,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865981772.695, "dur": 0.210, + "args": { + "External id": 85145,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865981773.885, "dur": 23.160, + "args": { + "External id": 85146,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865981798.695, "dur": 3.760, + "args": { + "External id": 85147,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6726 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981800.805, "dur": 0.970, + "args": { + "External id": 85148,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865981803.195, "dur": 23.250, + "args": { + "External id": 85149,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865981804.195, "dur": 2.910, + "args": { + "External id": 85150,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865981808.095, "dur": 18.020, + "args": { + "External id": 85151,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865981812.205, "dur": 4.610, + "args": { + "External id": 85152,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865981827.705, "dur": 17.250, + "args": { + "External id": 85153,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": 
["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865981846.555, "dur": 9.970, + "args": { + "External id": 85154,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865981860.455, "dur": 11.580, + "args": { + "External id": 85155,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865981873.305, "dur": 8.370, + "args": { + "External id": 85156,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865981883.355, "dur": 18.300, + "args": { + "External id": 85157,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865981885.455, "dur": 2.440, + "args": { + "External id": 85158,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6300865981889.965, "dur": 0.760, + "args": { + "External id": 85159,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865981903.445, "dur": 7.990, + "args": { + "External id": 85160,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865981913.695, "dur": 7.650, + "args": { + "External id": 85161,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865981931.755, "dur": 3.010, + "args": { + "External id": 85162,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865981944.805, "dur": 4.410, + "args": { + "External id": 85163,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865981947.555, "dur": 0.620, + "args": { + "External id": 85164,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865982018.194, "dur": 44.631, + "args": { + "External id": 85165,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865982070.614, "dur": 6.930, + "args": { + "External id": 85166,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982074.614, "dur": 1.090, + "args": { + "External id": 85167,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865982078.764, "dur": 21.220, + "args": { + "External id": 85168,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865982107.524, "dur": 6.610, + "args": { + "External id": 85169,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], 
"Input Dims": [[2048, 32000]], "Ev Idx": 6748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865982110.064, "dur": 3.260, + "args": { + "External id": 85170,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982112.064, "dur": 0.890, + "args": { + "External id": 85171,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865982116.694, "dur": 34.410, + "args": { + "External id": 85172,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865982117.614, "dur": 32.510, + "args": { + "External id": 85173,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865982155.694, "dur": 14.380, + "args": { + "External id": 85174,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6753 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865982175.134, "dur": 24.620, + "args": { + "External id": 85175,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865982177.404, "dur": 21.850, + "args": { + "External id": 85176,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982184.744, "dur": 0.660, + "args": { + "External id": 85177,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865982206.094, "dur": 24.370, + "args": { + "External id": 85178,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865982207.234, "dur": 22.940, + "args": { + "External id": 85179,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 6758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 
5714, "tid": 5714, + "ts": 6300865982210.624, "dur": 5.880, + "args": { + "External id": 85180,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865982217.584, "dur": 12.110, + "args": { + "External id": 85181,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865982240.144, "dur": 5.100, + "args": { + "External id": 85182,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865982241.464, "dur": 3.430, + "args": { + "External id": 85183,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300865982246.254, "dur": 2.600, + "args": { + "External id": 85184,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300865982248.044, "dur": 0.630, + "args": { + "External id": 85185,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865982278.754, "dur": 29.620, + "args": { + "External id": 85186,"Sequence number": 1771194, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865982311.274, "dur": 12.790, + "args": { + "External id": 85187,"Sequence number": 1771195, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6766 + } + }, + { + "ph": "s", "id": 9, "pid": 5714, "tid": 5714, "ts": 6300865982311.274, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865982330.984, "dur": 8.140, + "args": { + "External id": 85188,"Sequence number": 1771196, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 6767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982335.634, "dur": 1.620, + "args": { + "External id": 85189,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865982341.094, 
"dur": 5.650, + "args": { + "External id": 85190,"Sequence number": 1771196, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "3"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 6769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982345.314, "dur": 0.360, + "args": { + "External id": 85191,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "4"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865982348.244, "dur": 2.280, + "args": { + "External id": 85192,"Sequence number": 1771196, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 6771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982349.744, "dur": 0.260, + "args": { + "External id": 85193,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "4"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 6772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865982356.144, "dur": 4.320, + "args": { + "External id": 85194,"Sequence number": 1771196, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input 
Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6773 + } + }, + { + "ph": "s", "id": 8, "pid": 5714, "tid": 5714, "ts": 6300865982356.144, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982358.684, "dur": 0.540, + "args": { + "External id": 85195,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865982362.774, "dur": 2.980, + "args": { + "External id": 85196,"Sequence number": 1771197, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6775 + } + }, + { + "ph": "s", "id": 7, "pid": 5714, "tid": 5714, "ts": 6300865982362.774, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982364.714, "dur": 0.270, + "args": { + "External id": 85197,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6300865982366.774, "dur": 3.700, + "args": { + "External id": 85198,"Sequence number": 1771198, "Fwd thread 
id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 6777 + } + }, + { + "ph": "s", "id": 6, "pid": 5714, "tid": 5714, "ts": 6300865982366.774, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982369.314, "dur": 0.400, + "args": { + "External id": 85199,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865982371.614, "dur": 3.330, + "args": { + "External id": 85200,"Sequence number": 1771199, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 6779 + } + }, + { + "ph": "s", "id": 5, "pid": 5714, "tid": 5714, "ts": 6300865982371.614, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982373.884, "dur": 0.370, + "args": { + "External id": 85201,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 6780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865982378.644, 
"dur": 35.729, + "args": { + "External id": 85202,"Sequence number": 1771200, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300865982380.704, "dur": 33.340, + "args": { + "External id": 85203,"Sequence number": 1771200, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865982382.464, "dur": 11.030, + "args": { + "External id": 85204,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 6783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865982384.214, "dur": 8.700, + "args": { + "External id": 85205,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865982394.534, "dur": 18.979, + "args": { + "External id": 85206,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 6785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 
6300865982440.113, "dur": 5.240, + "args": { + "External id": 85207,"Sequence number": 1771200, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 6786 + } + }, + { + "ph": "s", "id": 4, "pid": 5714, "tid": 5714, "ts": 6300865982440.113, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865982447.884, "dur": 1.029, + "args": { + "External id": 85208,"Sequence number": 1771201, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6300865982471.013, "dur": 21748.881, + "args": { + "External id": 85209,"Sequence number": 1771201, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 6788 + } + }, + { + "ph": "s", "id": 3, "pid": 5714, "tid": 5714, "ts": 6300865982471.013, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6300865982485.413, "dur": 30.880, + "args": { + "External id": 85210,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 
6300865982486.164, "dur": 29.759, + "args": { + "External id": 85211,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300865982487.804, "dur": 8.900, + "args": { + "External id": 85212,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865982489.493, "dur": 6.560, + "args": { + "External id": 85213,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865982497.644, "dur": 17.759, + "args": { + "External id": 85214,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 6793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865982531.173, "dur": 24.860, + "args": { + "External id": 85215,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", 
"pid": 5714, "tid": 5714, + "ts": 6300865982532.233, "dur": 8.550, + "args": { + "External id": 85216,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982534.183, "dur": 6.160, + "args": { + "External id": 85217,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865982541.743, "dur": 14.020, + "args": { + "External id": 85218,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865982543.233, "dur": 11.630, + "args": { + "External id": 85219,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6300865982560.173, "dur": 18.670, + "args": { + "External id": 85220,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 
5714, "tid": 5714, + "ts": 6300865982560.923, "dur": 6.790, + "args": { + "External id": 85221,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982563.463, "dur": 3.870, + "args": { + "External id": 85222,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865982568.323, "dur": 10.320, + "args": { + "External id": 85223,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865982569.193, "dur": 8.670, + "args": { + "External id": 85224,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 6803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300865982584.403, "dur": 17.710, + "args": { + "External id": 85225,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 6804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865982586.953, "dur": 4.460, + 
"args": { + "External id": 85226,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300865982592.343, "dur": 9.500, + "args": { + "External id": 85227,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 6806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300865982593.313, "dur": 7.750, + "args": { + "External id": 85228,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6300865982606.123, "dur": 19.710, + "args": { + "External id": 85229,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865982628.633, "dur": 45.750, + "args": { + "External id": 85230,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865982631.383, "dur": 42.450, + "args": { + "External id": 85231,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982637.273, "dur": 0.960, + "args": { + "External id": 85232,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300865982639.253, "dur": 22.760, + "args": { + "External id": 85233,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300865982641.443, "dur": 20.300, + "args": { + "External id": 85234,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 6813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300865982643.743, "dur": 4.540, + "args": { + "External id": 85235,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865982649.313, "dur": 12.000, + "args": { + "External id": 85236,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 6815 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6300865982679.153, "dur": 16578.112, + "args": { + "External id": 85237,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6300865982680.393, "dur": 16576.182, + "args": { + "External id": 85238,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865999266.545, "dur": 10.740, + "args": { + "External id": 85239,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999273.825, "dur": 1.090, + "args": { + "External id": 85240,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865999281.845, "dur": 59.120, + "args": { + "External id": 85241,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865999282.785, "dur": 5.370, + "args": { + "External id": 85242,"Record function 
id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865999284.525, "dur": 2.870, + "args": { + "External id": 85243,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999286.305, "dur": 0.750, + "args": { + "External id": 85244,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865999289.205, "dur": 50.950, + "args": { + "External id": 85245,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865999291.705, "dur": 47.380, + "args": { + "External id": 85246,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865999346.415, "dur": 4.920, + "args": { + "External id": 85247,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": 
[[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999349.295, "dur": 0.580, + "args": { + "External id": 85248,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865999358.285, "dur": 2.050, + "args": { + "External id": 85249,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865999367.355, "dur": 8.500, + "args": { + "External id": 85250,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865999368.985, "dur": 6.520, + "args": { + "External id": 85251,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865999464.845, "dur": 165.909, + "args": { + "External id": 85252,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", 
"Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865999468.305, "dur": 5.310, + "args": { + "External id": 85253,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300865999475.205, "dur": 155.000, + "args": { + "External id": 85254,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300865999476.405, "dur": 0.230, + "args": { + "External id": 85255,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300865999477.665, "dur": 24.200, + "args": { + "External id": 85256,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300865999503.895, "dur": 4.940, + "args": { + "External id": 85257,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5714, "tid": 5714, + "ts": 6300865999507.265, "dur": 1.000, + "args": { + "External id": 85258,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865999509.545, "dur": 22.630, + "args": { + "External id": 85259,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865999510.625, "dur": 2.880, + "args": { + "External id": 85260,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300865999515.845, "dur": 16.020, + "args": { + "External id": 85261,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865999519.205, "dur": 3.400, + "args": { + "External id": 85262,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300865999533.415, "dur": 16.960, + "args": { + "External id": 85263,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input 
Dims": [[2048], []], "Ev Idx": 6842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300865999552.095, "dur": 10.000, + "args": { + "External id": 85264,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300865999565.015, "dur": 11.539, + "args": { + "External id": 85265,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300865999577.934, "dur": 8.400, + "args": { + "External id": 85266,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300865999588.085, "dur": 21.660, + "args": { + "External id": 85267,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300865999591.194, "dur": 2.611, + "args": { + "External id": 85268,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999597.514, "dur": 0.831, + "args": { + "External 
id": 85269,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300865999611.625, "dur": 8.309, + "args": { + "External id": 85270,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865999621.125, "dur": 7.689, + "args": { + "External id": 85271,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865999639.505, "dur": 3.080, + "args": { + "External id": 85272,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865999650.645, "dur": 4.329, + "args": { + "External id": 85273,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999653.385, "dur": 0.620, + "args": { + "External id": 85274,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], 
[], [], []], "Ev Idx": 6853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865999726.154, "dur": 45.350, + "args": { + "External id": 85275,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865999779.204, "dur": 9.930, + "args": { + "External id": 85276,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999785.004, "dur": 1.220, + "args": { + "External id": 85277,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300865999790.344, "dur": 23.160, + "args": { + "External id": 85278,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865999821.034, "dur": 6.940, + "args": { + "External id": 85279,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6858 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865999822.494, "dur": 4.560, + "args": { + "External id": 85280,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999825.674, "dur": 1.000, + "args": { + "External id": 85281,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865999830.464, "dur": 35.240, + "args": { + "External id": 85282,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865999832.674, "dur": 32.140, + "args": { + "External id": 85283,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300865999870.854, "dur": 14.930, + "args": { + "External id": 85284,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 
6300865999893.914, "dur": 5.520, + "args": { + "External id": 85285,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999897.024, "dur": 1.010, + "args": { + "External id": 85286,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300865999903.414, "dur": 40.700, + "args": { + "External id": 85287,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300865999904.224, "dur": 5.270, + "args": { + "External id": 85288,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300865999905.444, "dur": 3.540, + "args": { + "External id": 85289,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999908.014, "dur": 0.580, + "args": { + "External 
id": 85290,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300865999910.314, "dur": 33.170, + "args": { + "External id": 85291,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300865999911.264, "dur": 31.340, + "args": { + "External id": 85292,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300865999950.104, "dur": 5.320, + "args": { + "External id": 85293,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300865999952.644, "dur": 1.570, + "args": { + "External id": 85294,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300865999962.944, "dur": 1.840, + "args": { + "External id": 85295,"Record function id": 0, 
"Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300865999973.064, "dur": 8.240, + "args": { + "External id": 85296,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300865999974.774, "dur": 6.220, + "args": { + "External id": 85297,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866000065.453, "dur": 163.650, + "args": { + "External id": 85298,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866000067.753, "dur": 5.871, + "args": { + "External id": 85299,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866000075.213, "dur": 153.380, + "args": { + "External id": 
85300,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866000076.393, "dur": 0.220, + "args": { + "External id": 85301,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300866000079.004, "dur": 21.900, + "args": { + "External id": 85302,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866000102.564, "dur": 4.709, + "args": { + "External id": 85303,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000105.804, "dur": 1.009, + "args": { + "External id": 85304,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866000108.064, "dur": 21.229, + "args": { + "External id": 85305,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6884 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866000109.184, "dur": 2.869, + "args": { + "External id": 85306,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866000113.083, "dur": 15.840, + "args": { + "External id": 85307,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866000116.063, "dur": 3.600, + "args": { + "External id": 85308,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866000130.503, "dur": 18.580, + "args": { + "External id": 85309,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866000151.113, "dur": 9.900, + "args": { + "External id": 85310,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866000164.053, "dur": 12.000, + "args": { + "External id": 85311,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", 
"Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866000178.443, "dur": 8.560, + "args": { + "External id": 85312,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866000188.773, "dur": 19.890, + "args": { + "External id": 85313,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866000190.953, "dur": 2.610, + "args": { + "External id": 85314,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000195.793, "dur": 2.070, + "args": { + "External id": 85315,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866000210.453, "dur": 8.310, + "args": { + "External id": 85316,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866000219.833, 
"dur": 7.540, + "args": { + "External id": 85317,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866000238.793, "dur": 2.950, + "args": { + "External id": 85318,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866000251.973, "dur": 4.540, + "args": { + "External id": 85319,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000254.883, "dur": 0.640, + "args": { + "External id": 85320,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866000338.473, "dur": 45.640, + "args": { + "External id": 85321,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866000391.793, "dur": 7.500, + "args": { + "External id": 85322,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input 
type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000396.143, "dur": 1.220, + "args": { + "External id": 85323,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866000400.603, "dur": 20.450, + "args": { + "External id": 85324,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866000428.733, "dur": 7.130, + "args": { + "External id": 85325,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866000430.203, "dur": 4.830, + "args": { + "External id": 85326,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000433.703, "dur": 0.970, + "args": { + "External id": 85327,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866000438.213, "dur": 33.739, + "args": { + "External id": 85328,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866000439.173, "dur": 31.790, + "args": { + "External id": 85329,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866000476.872, "dur": 14.891, + "args": { + "External id": 85330,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866000500.052, "dur": 5.300, + "args": { + "External id": 85331,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000503.092, "dur": 0.900, + "args": { + "External id": 85332,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300866000510.803, "dur": 40.179, + "args": { + "External id": 85333,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866000511.683, "dur": 5.329, + "args": { + "External id": 85334,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866000512.992, "dur": 3.440, + "args": { + "External id": 85335,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000515.543, "dur": 0.580, + "args": { + "External id": 85336,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866000517.883, "dur": 32.519, + "args": { + "External id": 85337,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6916 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866000518.803, "dur": 30.859, + "args": { + "External id": 85338,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866000557.482, "dur": 4.430, + "args": { + "External id": 85339,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000559.942, "dur": 0.540, + "args": { + "External id": 85340,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866000570.462, "dur": 1.810, + "args": { + "External id": 85341,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300866000579.122, "dur": 8.080, + "args": { + "External id": 85342,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6921 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866000580.792, "dur": 6.060, + "args": { + "External id": 85343,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866000668.972, "dur": 164.500, + "args": { + "External id": 85344,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866000672.392, "dur": 4.590, + "args": { + "External id": 85345,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866000678.542, "dur": 154.390, + "args": { + "External id": 85346,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866000681.162, "dur": 0.320, + "args": { + "External id": 85347,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 
6300866000683.632, "dur": 23.490, + "args": { + "External id": 85348,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866000708.742, "dur": 3.670, + "args": { + "External id": 85349,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000710.932, "dur": 1.000, + "args": { + "External id": 85350,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866000713.262, "dur": 21.780, + "args": { + "External id": 85351,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866000715.302, "dur": 2.910, + "args": { + "External id": 85352,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866000719.302, "dur": 15.410, + "args": { + "External id": 85353,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], 
"Input Dims": [[2048], [0]], "Ev Idx": 6932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866000722.362, "dur": 3.410, + "args": { + "External id": 85354,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866000736.302, "dur": 18.330, + "args": { + "External id": 85355,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866000756.442, "dur": 9.830, + "args": { + "External id": 85356,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866000770.222, "dur": 11.710, + "args": { + "External id": 85357,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866000784.492, "dur": 8.350, + "args": { + "External id": 85358,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866000794.592, "dur": 18.390, + "args": { + "External id": 85359,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", 
"", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866000796.642, "dur": 2.500, + "args": { + "External id": 85360,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000801.312, "dur": 0.810, + "args": { + "External id": 85361,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866000814.692, "dur": 8.110, + "args": { + "External id": 85362,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866000823.952, "dur": 7.800, + "args": { + "External id": 85363,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866000843.542, "dur": 2.990, + "args": { + "External id": 85364,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, 
"tid": 5714, + "ts": 6300866000856.652, "dur": 4.530, + "args": { + "External id": 85365,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000859.522, "dur": 0.680, + "args": { + "External id": 85366,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866000931.722, "dur": 46.320, + "args": { + "External id": 85367,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866000985.462, "dur": 6.999, + "args": { + "External id": 85368,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866000989.491, "dur": 1.160, + "args": { + "External id": 85369,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866000993.681, "dur": 20.020, + "args": { + "External id": 85370,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866001020.901, "dur": 6.950, + "args": { + "External id": 85371,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866001023.571, "dur": 3.480, + "args": { + "External id": 85372,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001025.771, "dur": 0.940, + "args": { + "External id": 85373,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866001030.231, "dur": 33.780, + "args": { + "External id": 85374,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866001031.171, "dur": 31.920, + "args": { + "External 
id": 85375,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866001068.801, "dur": 14.440, + "args": { + "External id": 85376,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866001091.301, "dur": 7.020, + "args": { + "External id": 85377,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001095.971, "dur": 0.930, + "args": { + "External id": 85378,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300866001102.321, "dur": 40.210, + "args": { + "External id": 85379,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866001103.181, "dur": 5.110, + "args": { + "External id": 
85380,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866001104.361, "dur": 3.420, + "args": { + "External id": 85381,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001105.851, "dur": 1.580, + "args": { + "External id": 85382,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866001109.131, "dur": 32.860, + "args": { + "External id": 85383,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866001110.021, "dur": 31.110, + "args": { + "External id": 85384,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866001162.001, "dur": 4.130, + "args": { + "External id": 85385,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", 
"Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001164.461, "dur": 0.510, + "args": { + "External id": 85386,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866001173.411, "dur": 1.860, + "args": { + "External id": 85387,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300866001181.891, "dur": 8.360, + "args": { + "External id": 85388,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866001183.501, "dur": 6.440, + "args": { + "External id": 85389,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866001271.931, "dur": 173.219, + "args": { + "External id": 85390,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": 
["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866001274.331, "dur": 5.830, + "args": { + "External id": 85391,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866001281.691, "dur": 162.929, + "args": { + "External id": 85392,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866001282.871, "dur": 0.230, + "args": { + "External id": 85393,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300866001285.251, "dur": 31.190, + "args": { + "External id": 85394,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866001318.381, "dur": 3.810, + "args": { + "External id": 85395,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001320.521, "dur": 1.100, + "args": { + "External id": 85396,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866001323.081, "dur": 24.309, + "args": { + "External id": 85397,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866001324.351, "dur": 4.230, + "args": { + "External id": 85398,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866001330.861, "dur": 16.129, + "args": { + "External id": 85399,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866001334.050, "dur": 4.100, + "args": { + "External id": 85400,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866001348.650, "dur": 18.171, + "args": { + "External id": 85401,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input 
Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866001368.490, "dur": 10.231, + "args": { + "External id": 85402,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866001381.681, "dur": 11.560, + "args": { + "External id": 85403,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866001394.530, "dur": 8.491, + "args": { + "External id": 85404,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866001404.770, "dur": 19.931, + "args": { + "External id": 85405,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866001407.070, "dur": 2.480, + "args": { + "External id": 85406,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001412.850, "dur": 
0.940, + "args": { + "External id": 85407,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866001426.410, "dur": 8.280, + "args": { + "External id": 85408,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866001435.720, "dur": 7.540, + "args": { + "External id": 85409,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866001454.160, "dur": 3.020, + "args": { + "External id": 85410,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866001467.060, "dur": 4.480, + "args": { + "External id": 85411,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001469.910, "dur": 0.620, + "args": { + "External id": 85412,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866001542.760, "dur": 45.160, + "args": { + "External id": 85413,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866001595.640, "dur": 9.800, + "args": { + "External id": 85414,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001602.390, "dur": 1.210, + "args": { + "External id": 85415,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866001606.770, "dur": 20.090, + "args": { + "External id": 85416,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866001634.320, "dur": 5.750, + "args": { + "External id": 85417,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev 
Idx": 6996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866001635.810, "dur": 3.350, + "args": { + "External id": 85418,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001637.800, "dur": 1.010, + "args": { + "External id": 85419,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866001642.480, "dur": 33.170, + "args": { + "External id": 85420,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866001643.450, "dur": 31.150, + "args": { + "External id": 85421,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866001681.460, "dur": 14.380, + "args": { + "External id": 85422,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866001703.780, "dur": 5.310, + "args": { + "External id": 85423,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001706.820, "dur": 0.900, + "args": { + "External id": 85424,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300866001712.970, "dur": 39.280, + "args": { + "External id": 85425,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866001713.750, "dur": 5.140, + "args": { + "External id": 85426,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866001714.910, "dur": 3.440, + "args": { + "External id": 85427,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6300866001717.590, "dur": 0.490, + "args": { + "External id": 85428,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866001719.670, "dur": 32.030, + "args": { + "External id": 85429,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866001720.530, "dur": 30.370, + "args": { + "External id": 85430,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866001758.880, "dur": 5.530, + "args": { + "External id": 85431,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001761.350, "dur": 1.670, + "args": { + "External id": 85432,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866001771.789, "dur": 1.811, + 
"args": { + "External id": 85433,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300866001780.209, "dur": 9.080, + "args": { + "External id": 85434,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866001782.760, "dur": 6.180, + "args": { + "External id": 85435,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866001869.919, "dur": 161.530, + "args": { + "External id": 85436,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866001872.219, "dur": 4.860, + "args": { + "External id": 85437,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866001878.709, 
"dur": 152.210, + "args": { + "External id": 85438,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866001879.949, "dur": 0.210, + "args": { + "External id": 85439,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300866001881.179, "dur": 23.750, + "args": { + "External id": 85440,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866001906.689, "dur": 3.530, + "args": { + "External id": 85441,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001908.809, "dur": 0.940, + "args": { + "External id": 85442,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866001910.949, "dur": 23.300, + "args": { + "External id": 85443,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7022 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866001912.049, "dur": 2.880, + "args": { + "External id": 85444,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866001917.149, "dur": 16.770, + "args": { + "External id": 85445,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866001921.269, "dur": 3.510, + "args": { + "External id": 85446,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866001935.559, "dur": 17.500, + "args": { + "External id": 85447,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866001954.699, "dur": 10.130, + "args": { + "External id": 85448,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866001967.489, "dur": 11.310, + "args": { + "External id": 85449,"Record function id": 0, "Concrete Inputs": ["", "", "1"], 
"Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866001980.079, "dur": 8.580, + "args": { + "External id": 85450,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866001990.369, "dur": 19.730, + "args": { + "External id": 85451,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866001992.389, "dur": 2.500, + "args": { + "External id": 85452,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866001998.349, "dur": 0.780, + "args": { + "External id": 85453,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866002011.749, "dur": 9.020, + "args": { + "External id": 85454,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 
5714, + "ts": 6300866002021.829, "dur": 7.940, + "args": { + "External id": 85455,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866002039.899, "dur": 2.990, + "args": { + "External id": 85456,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002053.149, "dur": 4.390, + "args": { + "External id": 85457,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002055.949, "dur": 0.640, + "args": { + "External id": 85458,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866002126.969, "dur": 44.940, + "args": { + "External id": 85459,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002179.639, "dur": 8.020, + "args": { + "External id": 85460,"Record function id": 0, "Concrete Inputs": ["", 
"0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002184.779, "dur": 1.090, + "args": { + "External id": 85461,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866002188.879, "dur": 20.890, + "args": { + "External id": 85462,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866002216.959, "dur": 5.529, + "args": { + "External id": 85463,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866002218.399, "dur": 3.240, + "args": { + "External id": 85464,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002220.368, "dur": 0.820, + "args": { + "External id": 85465,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866002224.908, "dur": 35.171, + "args": { + "External id": 85466,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866002227.068, "dur": 32.020, + "args": { + "External id": 85467,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866002264.988, "dur": 14.600, + "args": { + "External id": 85468,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002287.408, "dur": 5.371, + "args": { + "External id": 85469,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002290.519, "dur": 0.929, + "args": { + "External id": 85470,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300866002305.339, "dur": 43.129, + "args": { + "External id": 85471,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866002306.219, "dur": 5.679, + "args": { + "External id": 85472,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866002307.448, "dur": 3.850, + "args": { + "External id": 85473,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002310.378, "dur": 0.620, + "args": { + "External id": 85474,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866002312.728, "dur": 35.180, + "args": { + "External id": 85475,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], 
[768, 32000]], "Ev Idx": 7054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866002313.678, "dur": 33.380, + "args": { + "External id": 85476,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002354.738, "dur": 4.440, + "args": { + "External id": 85477,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002357.308, "dur": 0.640, + "args": { + "External id": 85478,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866002366.748, "dur": 1.850, + "args": { + "External id": 85479,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300866002376.098, "dur": 8.540, + "args": { + "External id": 85480,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], 
[], []], "Ev Idx": 7059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866002377.678, "dur": 6.610, + "args": { + "External id": 85481,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866002464.548, "dur": 161.040, + "args": { + "External id": 85482,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866002468.408, "dur": 5.590, + "args": { + "External id": 85483,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866002475.428, "dur": 149.620, + "args": { + "External id": 85484,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866002476.638, "dur": 0.210, + "args": { + "External id": 85485,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", 
"pid": 5714, "tid": 5714, + "ts": 6300866002479.118, "dur": 21.650, + "args": { + "External id": 85486,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866002502.388, "dur": 4.780, + "args": { + "External id": 85487,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002505.778, "dur": 0.930, + "args": { + "External id": 85488,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866002507.898, "dur": 20.700, + "args": { + "External id": 85489,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866002508.908, "dur": 2.870, + "args": { + "External id": 85490,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866002512.808, "dur": 15.480, + "args": { + "External id": 85491,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", 
"float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866002515.838, "dur": 3.510, + "args": { + "External id": 85492,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866002529.828, "dur": 17.080, + "args": { + "External id": 85493,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866002548.558, "dur": 9.790, + "args": { + "External id": 85494,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866002561.148, "dur": 11.570, + "args": { + "External id": 85495,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866002574.048, "dur": 8.220, + "args": { + "External id": 85496,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866002585.348, "dur": 19.810, + "args": { + "External id": 85497,"Record function id": 0, 
"Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866002588.588, "dur": 2.430, + "args": { + "External id": 85498,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002593.218, "dur": 0.860, + "args": { + "External id": 85499,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866002606.918, "dur": 8.080, + "args": { + "External id": 85500,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866002616.158, "dur": 7.570, + "args": { + "External id": 85501,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866002634.368, "dur": 3.120, + "args": { + "External id": 85502,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7081 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002647.378, "dur": 4.640, + "args": { + "External id": 85503,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002650.248, "dur": 0.670, + "args": { + "External id": 85504,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866002734.987, "dur": 45.700, + "args": { + "External id": 85505,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002788.197, "dur": 8.290, + "args": { + "External id": 85506,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002793.427, "dur": 1.210, + "args": { + "External id": 85507,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7086 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866002797.797, "dur": 20.330, + "args": { + "External id": 85508,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866002825.427, "dur": 6.980, + "args": { + "External id": 85509,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866002826.867, "dur": 4.750, + "args": { + "External id": 85510,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002830.307, "dur": 0.950, + "args": { + "External id": 85511,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866002834.797, "dur": 33.520, + "args": { + "External id": 85512,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 
6300866002835.727, "dur": 31.670, + "args": { + "External id": 85513,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866002873.147, "dur": 14.620, + "args": { + "External id": 85514,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002895.707, "dur": 5.420, + "args": { + "External id": 85515,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002898.817, "dur": 0.940, + "args": { + "External id": 85516,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300866002905.167, "dur": 39.370, + "args": { + "External id": 85517,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 
6300866002907.137, "dur": 3.900, + "args": { + "External id": 85518,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866002908.187, "dur": 2.350, + "args": { + "External id": 85519,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002909.797, "dur": 0.450, + "args": { + "External id": 85520,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866002911.807, "dur": 32.170, + "args": { + "External id": 85521,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866002912.627, "dur": 30.570, + "args": { + "External id": 85522,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866002950.967, "dur": 5.270, + "args": { + "External id": 85523,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", 
"1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866002953.367, "dur": 1.500, + "args": { + "External id": 85524,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866002963.707, "dur": 1.870, + "args": { + "External id": 85525,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300866002973.217, "dur": 8.590, + "args": { + "External id": 85526,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866002975.157, "dur": 6.290, + "args": { + "External id": 85527,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866003061.877, "dur": 169.139, + "args": { + "External id": 85528,"Record function 
id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866003064.277, "dur": 4.710, + "args": { + "External id": 85529,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866003070.767, "dur": 159.699, + "args": { + "External id": 85530,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866003071.967, "dur": 0.200, + "args": { + "External id": 85531,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300866003074.307, "dur": 32.850, + "args": { + "External id": 85532,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866003108.857, "dur": 3.540, + "args": { + "External id": 85533,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev 
Idx": 7112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003110.997, "dur": 0.909, + "args": { + "External id": 85534,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866003113.086, "dur": 21.520, + "args": { + "External id": 85535,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866003114.017, "dur": 2.960, + "args": { + "External id": 85536,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866003117.966, "dur": 16.351, + "args": { + "External id": 85537,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866003121.977, "dur": 3.409, + "args": { + "External id": 85538,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866003135.877, "dur": 18.240, + "args": { + "External id": 85539,"Record function id": 0, "Concrete 
Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866003155.737, "dur": 9.889, + "args": { + "External id": 85540,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866003168.406, "dur": 11.471, + "args": { + "External id": 85541,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866003181.197, "dur": 8.269, + "args": { + "External id": 85542,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866003192.326, "dur": 18.410, + "args": { + "External id": 85543,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866003194.426, "dur": 2.490, + "args": { + "External id": 85544,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6300866003199.076, "dur": 0.890, + "args": { + "External id": 85545,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866003212.486, "dur": 8.100, + "args": { + "External id": 85546,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866003221.656, "dur": 7.630, + "args": { + "External id": 85547,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866003240.966, "dur": 2.970, + "args": { + "External id": 85548,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866003253.756, "dur": 4.520, + "args": { + "External id": 85549,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003256.556, "dur": 0.690, + "args": { + "External id": 85550,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866003338.046, "dur": 44.810, + "args": { + "External id": 85551,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866003390.496, "dur": 8.830, + "args": { + "External id": 85552,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003394.746, "dur": 2.590, + "args": { + "External id": 85553,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866003400.576, "dur": 19.760, + "args": { + "External id": 85554,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866003427.746, "dur": 6.750, + "args": { + "External id": 85555,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], 
"Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866003429.226, "dur": 4.450, + "args": { + "External id": 85556,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003432.396, "dur": 0.940, + "args": { + "External id": 85557,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866003436.826, "dur": 32.890, + "args": { + "External id": 85558,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866003437.806, "dur": 31.030, + "args": { + "External id": 85559,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866003474.346, "dur": 14.500, + "args": { + "External id": 85560,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev 
Idx": 7139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866003496.766, "dur": 5.340, + "args": { + "External id": 85561,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003499.776, "dur": 0.980, + "args": { + "External id": 85562,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6300866003507.166, "dur": 38.550, + "args": { + "External id": 85563,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866003508.036, "dur": 3.970, + "args": { + "External id": 85564,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866003509.146, "dur": 2.360, + "args": { + "External id": 85565,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7144 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003510.746, "dur": 0.480, + "args": { + "External id": 85566,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866003512.796, "dur": 32.429, + "args": { + "External id": 85567,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866003513.666, "dur": 30.770, + "args": { + "External id": 85568,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866003552.905, "dur": 4.340, + "args": { + "External id": 85569,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003555.345, "dur": 0.691, + "args": { + "External id": 85570,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 
5714, "tid": 5714, + "ts": 6300866003564.505, "dur": 1.880, + "args": { + "External id": 85571,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6300866003573.045, "dur": 9.180, + "args": { + "External id": 85572,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866003574.696, "dur": 7.129, + "args": { + "External id": 85573,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866003663.885, "dur": 173.670, + "args": { + "External id": 85574,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866003666.125, "dur": 5.700, + "args": { + "External id": 85575,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6300866003673.435, "dur": 163.610, + "args": { + "External id": 85576,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6300866003674.795, "dur": 0.200, + "args": { + "External id": 85577,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6300866003676.035, "dur": 22.400, + "args": { + "External id": 85578,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6300866003701.155, "dur": 3.640, + "args": { + "External id": 85579,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003703.405, "dur": 0.940, + "args": { + "External id": 85580,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866003705.505, "dur": 23.390, + "args": { + "External id": 85581,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866003706.595, "dur": 2.950, + "args": { + "External id": 85582,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866003710.625, "dur": 17.890, + "args": { + "External id": 85583,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866003715.965, "dur": 3.640, + "args": { + "External id": 85584,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866003730.175, "dur": 16.870, + "args": { + "External id": 85585,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6300866003748.675, "dur": 9.940, + "args": { + "External id": 85586,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6300866003761.525, "dur": 11.790, + "args": { + 
"External id": 85587,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6300866003774.625, "dur": 8.240, + "args": { + "External id": 85588,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866003784.585, "dur": 30.580, + "args": { + "External id": 85589,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866003786.705, "dur": 6.240, + "args": { + "External id": 85590,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003802.125, "dur": 1.780, + "args": { + "External id": 85591,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6300866003818.555, "dur": 8.230, + "args": { + "External id": 85592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7171 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866003827.875, "dur": 7.970, + "args": { + "External id": 85593,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866003846.225, "dur": 3.050, + "args": { + "External id": 85594,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866003861.005, "dur": 6.450, + "args": { + "External id": 85595,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866003864.835, "dur": 0.990, + "args": { + "External id": 85596,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866003968.405, "dur": 46.970, + "args": { + "External id": 85597,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6300866004023.435, "dur": 8.140, + 
"args": { + "External id": 85598,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866004028.155, "dur": 1.320, + "args": { + "External id": 85599,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866004034.004, "dur": 21.711, + "args": { + "External id": 85600,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6300866004063.204, "dur": 5.711, + "args": { + "External id": 85601,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6300866004064.764, "dur": 3.311, + "args": { + "External id": 85602,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866004066.844, "dur": 0.880, + "args": { + "External id": 85603,"Record 
function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6300866004071.344, "dur": 34.940, + "args": { + "External id": 85604,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6300866004073.644, "dur": 31.750, + "args": { + "External id": 85605,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866004112.134, "dur": 15.030, + "args": { + "External id": 85606,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866004132.234, "dur": 24.910, + "args": { + "External id": 85607,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 7186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6300866004134.344, "dur": 22.330, + "args": { + "External id": 85608,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": 
[[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866004142.074, "dur": 0.780, + "args": { + "External id": 85609,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866004162.304, "dur": 24.040, + "args": { + "External id": 85610,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 7189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6300866004163.414, "dur": 22.660, + "args": { + "External id": 85611,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 7190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6300866004166.874, "dur": 5.760, + "args": { + "External id": 85612,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866004173.714, "dur": 11.740, + "args": { + "External id": 85613,"Record function id": 0, "Concrete Inputs": ["", "", "False"], 
"Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300866004196.094, "dur": 4.890, + "args": { + "External id": 85614,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 7193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300866004198.504, "dur": 2.160, + "args": { + "External id": 85615,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 7194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6300866004202.014, "dur": 1.220, + "args": { + "External id": 85616,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6300866004202.464, "dur": 0.500, + "args": { + "External id": 85617,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866004234.514, "dur": 20.380, + "args": { + "External id": 85618,"Sequence number": 1771202, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6300866004257.534, "dur": 10.590, + "args": { + 
"External id": 85619,"Sequence number": 1771203, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7198 + } + }, + { + "ph": "s", "id": 2, "pid": 5714, "tid": 5714, "ts": 6300866004257.534, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward", "pid": 5714, "tid": 5714, + "ts": 6300866004390.834, "dur": 64.229, + "args": { + "External id": 85620,"Record function id": 0, "Ev Idx": 7199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 5714, + "ts": 6300866004584.133, "dur": 39.270, + "args": { + "External id": 85621,"Sequence number": 1771204, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7200 + } + }, + { + "ph": "s", "id": 1, "pid": 5714, "tid": 5714, "ts": 6300866004584.133, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 5714, "tid": 5714, + "ts": 6300866004681.093, "dur": 28.210, + "args": { + "External id": 85622,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "1"], "Input type": ["float", "", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[1], [], [], [], [], []], "Ev Idx": 7201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300866004682.303, "dur": 11.210, + "args": { + "External id": 85623,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "1"], "Input type": ["float", "", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[1], [], [], [], [], []], "Ev Idx": 7202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + 
"ts": 6300866004685.833, "dur": 6.930, + "args": { + "External id": 85624,"Record function id": 0, "Concrete Inputs": ["[1]", "[1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300866004694.513, "dur": 14.460, + "args": { + "External id": 85625,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 5714, + "ts": 6300866110375.553, "dur": 315.669, + "args": { + "External id": 85626,"Sequence number": 1771205, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 5714, + "ts": 6300866110757.522, "dur": 310.939, + "args": { + "External id": 85627,"Sequence number": 1771206, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300866111116.721, "dur": 123.770, + "args": { + "External id": 85628,"Sequence number": 1771207, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300866112975.637, "dur": 155.990, + "args": { + "External id": 85629,"Sequence number": 
1771208, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300866113182.356, "dur": 95.850, + "args": { + "External id": 85630,"Sequence number": 1771209, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_norm", "pid": 5714, "tid": 5714, + "ts": 6300866119100.153, "dur": 10622.576, + "args": { + "External id": 85631,"Record function id": 0, "Concrete Inputs": ["", "2.", ""], "Input type": ["TensorList", "Scalar", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_norm", "pid": 5714, "tid": 5714, + "ts": 6300866121038.478, "dur": 2017.576, + "args": { + "External id": 85632,"Record function id": 0, "Concrete Inputs": ["", "2.", ""], "Input type": ["TensorList", "Scalar", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6300866121093.049, "dur": 197.639, + "args": { + "External id": 85633,"Record function id": 0, "Concrete Inputs": ["[12032]", "6", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866121106.269, "dur": 46.029, + "args": { + "External id": 85634,"Record function id": 0, "Concrete Inputs": ["[12032]", "6", "0", "", "", ""], "Input type": ["ScalarList", 
"Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6300866121162.338, "dur": 125.810, + "args": { + "External id": 85635,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[12032]], "Ev Idx": 7214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6300866121173.898, "dur": 108.070, + "args": { + "External id": 85636,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[12032], []], "Ev Idx": 7215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129834.658, "dur": 14.040, + "args": { + "External id": 85637,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129864.838, "dur": 2.611, + "args": { + "External id": 85638,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129877.158, "dur": 1.920, + "args": { + "External id": 85639,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], 
"Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129887.778, "dur": 1.840, + "args": { + "External id": 85640,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129898.109, "dur": 1.920, + "args": { + "External id": 85641,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129909.829, "dur": 1.789, + "args": { + "External id": 85642,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129920.289, "dur": 1.789, + "args": { + "External id": 85643,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 
6300866129934.978, "dur": 1.920, + "args": { + "External id": 85644,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129945.498, "dur": 1.830, + "args": { + "External id": 85645,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129956.768, "dur": 1.880, + "args": { + "External id": 85646,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129967.678, "dur": 1.820, + "args": { + "External id": 85647,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129977.888, "dur": 1.810, + "args": { + "External id": 85648,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", 
"Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129988.058, "dur": 1.850, + "args": { + "External id": 85649,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866129998.528, "dur": 1.790, + "args": { + "External id": 85650,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130008.908, "dur": 1.810, + "args": { + "External id": 85651,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130023.148, "dur": 1.920, + "args": { + "External id": 85652,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, 
"tid": 5714, + "ts": 6300866130033.248, "dur": 1.810, + "args": { + "External id": 85653,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130044.308, "dur": 1.890, + "args": { + "External id": 85654,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130054.668, "dur": 1.820, + "args": { + "External id": 85655,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130064.748, "dur": 1.770, + "args": { + "External id": 85656,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130074.958, "dur": 1.880, + "args": { + "External id": 85657,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", 
"Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130085.028, "dur": 1.790, + "args": { + "External id": 85658,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130095.118, "dur": 1.790, + "args": { + "External id": 85659,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130109.478, "dur": 1.920, + "args": { + "External id": 85660,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130119.528, "dur": 1.920, + "args": { + "External id": 85661,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130130.848, "dur": 1.810, + "args": { + "External id": 85662,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130141.208, "dur": 1.770, + "args": { + "External id": 85663,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130151.388, "dur": 1.840, + "args": { + "External id": 85664,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130161.228, "dur": 1.720, + "args": { + "External id": 85665,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130170.908, "dur": 1.750, + "args": { + "External id": 85666,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input 
type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130180.388, "dur": 1.900, + "args": { + "External id": 85667,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130193.458, "dur": 1.750, + "args": { + "External id": 85668,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130203.218, "dur": 1.820, + "args": { + "External id": 85669,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130213.648, "dur": 1.700, + "args": { + "External id": 85670,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7249 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130223.318, "dur": 1.700, + "args": { + "External id": 85671,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130232.958, "dur": 1.750, + "args": { + "External id": 85672,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130242.608, "dur": 1.700, + "args": { + "External id": 85673,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130252.108, "dur": 1.740, + "args": { + "External id": 85674,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130261.798, "dur": 1.720, + "args": { + "External id": 85675,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", 
"False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130274.457, "dur": 1.691, + "args": { + "External id": 85676,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130283.968, "dur": 1.749, + "args": { + "External id": 85677,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130294.468, "dur": 1.869, + "args": { + "External id": 85678,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130334.748, "dur": 1.929, + "args": { + "External id": 85679,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7258 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130344.868, "dur": 1.740, + "args": { + "External id": 85680,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130354.357, "dur": 1.671, + "args": { + "External id": 85681,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130363.837, "dur": 1.710, + "args": { + "External id": 85682,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130373.347, "dur": 1.720, + "args": { + "External id": 85683,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130386.347, "dur": 1.750, + "args": { + "External id": 85684,"Record function id": 0, "Concrete Inputs": ["", "6", "0", 
"", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130395.937, "dur": 1.790, + "args": { + "External id": 85685,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130405.747, "dur": 1.840, + "args": { + "External id": 85686,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130415.517, "dur": 1.740, + "args": { + "External id": 85687,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130425.157, "dur": 1.690, + "args": { + "External id": 85688,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev 
Idx": 7267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130434.977, "dur": 1.710, + "args": { + "External id": 85689,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130445.647, "dur": 1.760, + "args": { + "External id": 85690,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130466.867, "dur": 1.780, + "args": { + "External id": 85691,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130480.817, "dur": 1.730, + "args": { + "External id": 85692,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130490.537, "dur": 1.720, + "args": { + "External id": 85693,"Record function id": 0, "Concrete 
Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130500.237, "dur": 1.670, + "args": { + "External id": 85694,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130509.857, "dur": 1.610, + "args": { + "External id": 85695,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130519.207, "dur": 1.720, + "args": { + "External id": 85696,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130528.727, "dur": 1.710, + "args": { + "External id": 85697,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], 
[], [], [], []], "Ev Idx": 7276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130538.517, "dur": 1.720, + "args": { + "External id": 85698,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130548.157, "dur": 1.710, + "args": { + "External id": 85699,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130565.177, "dur": 1.890, + "args": { + "External id": 85700,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130575.307, "dur": 1.750, + "args": { + "External id": 85701,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130586.327, "dur": 1.700, + "args": { + "External id": 85702,"Record function 
id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130596.077, "dur": 1.720, + "args": { + "External id": 85703,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130605.907, "dur": 1.640, + "args": { + "External id": 85704,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130615.647, "dur": 1.670, + "args": { + "External id": 85705,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130625.547, "dur": 1.700, + "args": { + "External id": 85706,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": 
[[], [], [], [], [], [], [], []], "Ev Idx": 7285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130635.367, "dur": 1.930, + "args": { + "External id": 85707,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130648.947, "dur": 1.750, + "args": { + "External id": 85708,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130658.937, "dur": 1.660, + "args": { + "External id": 85709,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130668.497, "dur": 1.720, + "args": { + "External id": 85710,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130678.417, "dur": 1.650, + "args": { + "External id": 
85711,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130688.347, "dur": 1.670, + "args": { + "External id": 85712,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130698.097, "dur": 1.650, + "args": { + "External id": 85713,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130710.436, "dur": 1.680, + "args": { + "External id": 85714,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130720.227, "dur": 1.840, + "args": { + "External id": 85715,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], 
[], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130733.527, "dur": 1.709, + "args": { + "External id": 85716,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130743.327, "dur": 1.709, + "args": { + "External id": 85717,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130753.047, "dur": 1.669, + "args": { + "External id": 85718,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130763.296, "dur": 1.680, + "args": { + "External id": 85719,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130773.076, "dur": 1.671, + "args": { + 
"External id": 85720,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130782.696, "dur": 1.671, + "args": { + "External id": 85721,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130792.547, "dur": 1.680, + "args": { + "External id": 85722,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130802.476, "dur": 1.720, + "args": { + "External id": 85723,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130815.426, "dur": 1.710, + "args": { + "External id": 85724,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], 
[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130825.086, "dur": 1.750, + "args": { + "External id": 85725,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130834.996, "dur": 1.770, + "args": { + "External id": 85726,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130844.966, "dur": 1.710, + "args": { + "External id": 85727,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130854.666, "dur": 1.710, + "args": { + "External id": 85728,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130864.406, "dur": 
1.650, + "args": { + "External id": 85729,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130875.816, "dur": 1.680, + "args": { + "External id": 85730,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130885.646, "dur": 1.700, + "args": { + "External id": 85731,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130898.486, "dur": 1.680, + "args": { + "External id": 85732,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130908.296, "dur": 1.720, + "args": { + "External id": 85733,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130918.016, "dur": 1.650, + "args": { + "External id": 85734,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130927.856, "dur": 1.770, + "args": { + "External id": 85735,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130937.566, "dur": 1.690, + "args": { + "External id": 85736,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130947.206, "dur": 1.690, + "args": { + "External id": 85737,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 
6300866130956.916, "dur": 1.650, + "args": { + "External id": 85738,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130966.536, "dur": 1.700, + "args": { + "External id": 85739,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130980.096, "dur": 1.690, + "args": { + "External id": 85740,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130989.646, "dur": 1.790, + "args": { + "External id": 85741,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866130999.306, "dur": 1.710, + "args": { + "External id": 85742,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", 
"Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131008.976, "dur": 1.760, + "args": { + "External id": 85743,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131018.646, "dur": 1.730, + "args": { + "External id": 85744,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131028.196, "dur": 1.710, + "args": { + "External id": 85745,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131039.416, "dur": 1.740, + "args": { + "External id": 85746,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, 
"tid": 5714, + "ts": 6300866131049.076, "dur": 1.700, + "args": { + "External id": 85747,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131062.306, "dur": 1.730, + "args": { + "External id": 85748,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131072.196, "dur": 1.760, + "args": { + "External id": 85749,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131081.986, "dur": 1.680, + "args": { + "External id": 85750,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131091.516, "dur": 1.710, + "args": { + "External id": 85751,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", 
"Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131101.356, "dur": 1.750, + "args": { + "External id": 85752,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131111.016, "dur": 1.740, + "args": { + "External id": 85753,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131120.766, "dur": 1.740, + "args": { + "External id": 85754,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131130.476, "dur": 1.730, + "args": { + "External id": 85755,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131144.096, "dur": 1.799, + "args": { + "External id": 85756,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131154.386, "dur": 1.849, + "args": { + "External id": 85757,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131164.635, "dur": 1.820, + "args": { + "External id": 85758,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131175.046, "dur": 1.749, + "args": { + "External id": 85759,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131185.306, "dur": 1.740, + "args": { + "External id": 85760,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input 
type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131195.526, "dur": 1.800, + "args": { + "External id": 85761,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131205.895, "dur": 1.800, + "args": { + "External id": 85762,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131216.195, "dur": 1.851, + "args": { + "External id": 85763,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866131230.115, "dur": 1.851, + "args": { + "External id": 85764,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7343 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::stack", "pid": 5714, "tid": 5714, + "ts": 6300866131490.035, "dur": 4236.150, + "args": { + "External id": 85765,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5714, "tid": 5714, + "ts": 6300866133488.620, "dur": 2017.976, + "args": { + "External id": 85766,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133516.750, "dur": 37.240, + "args": { + "External id": 85767,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133535.960, "dur": 12.890, + "args": { + "External id": 85768,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133558.780, "dur": 13.310, + "args": { + "External id": 85769,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133568.460, "dur": 2.310, + "args": { + "External id": 85770,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], 
[], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133576.380, "dur": 10.220, + "args": { + "External id": 85771,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133583.780, "dur": 1.690, + "args": { + "External id": 85772,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133590.250, "dur": 8.060, + "args": { + "External id": 85773,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133595.390, "dur": 1.810, + "args": { + "External id": 85774,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133601.470, "dur": 13.270, + "args": { + "External id": 85775,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133611.880, "dur": 1.750, + "args": { + "External id": 85776,"Record function 
id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133617.980, "dur": 7.480, + "args": { + "External id": 85777,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133622.520, "dur": 1.740, + "args": { + "External id": 85778,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133629.250, "dur": 10.950, + "args": { + "External id": 85779,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133633.700, "dur": 5.220, + "args": { + "External id": 85780,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133647.400, "dur": 9.650, + "args": { + "External id": 85781,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133654.220, "dur": 1.690, + "args": { + "External id": 85782,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133660.450, "dur": 7.700, + "args": { + "External id": 85783,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133665.270, "dur": 1.780, + "args": { + "External id": 85784,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133671.410, "dur": 10.010, + "args": { + "External id": 85785,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133678.530, "dur": 1.770, + "args": { + "External id": 85786,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133685.230, "dur": 9.240, + "args": { + "External id": 85787,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", 
"Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133691.650, "dur": 1.700, + "args": { + "External id": 85788,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133697.720, "dur": 7.370, + "args": { + "External id": 85789,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133702.300, "dur": 1.710, + "args": { + "External id": 85790,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133708.440, "dur": 12.980, + "args": { + "External id": 85791,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133718.500, "dur": 1.750, + "args": { + "External id": 85792,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133724.710, 
"dur": 7.050, + "args": { + "External id": 85793,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133729.060, "dur": 1.630, + "args": { + "External id": 85794,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133735.060, "dur": 10.150, + "args": { + "External id": 85795,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133739.440, "dur": 4.650, + "args": { + "External id": 85796,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133752.130, "dur": 9.200, + "args": { + "External id": 85797,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133758.490, "dur": 1.740, + "args": { + "External id": 85798,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 
7377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133764.580, "dur": 7.240, + "args": { + "External id": 85799,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133768.980, "dur": 1.770, + "args": { + "External id": 85800,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133775.210, "dur": 10.510, + "args": { + "External id": 85801,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133782.700, "dur": 1.850, + "args": { + "External id": 85802,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133789.269, "dur": 10.020, + "args": { + "External id": 85803,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133795.909, "dur": 2.311, + "args": { + "External id": 85804,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input 
type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133802.500, "dur": 7.440, + "args": { + "External id": 85805,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133806.909, "dur": 1.911, + "args": { + "External id": 85806,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133813.369, "dur": 12.891, + "args": { + "External id": 85807,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133823.429, "dur": 1.711, + "args": { + "External id": 85808,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133829.500, "dur": 7.180, + "args": { + "External id": 85809,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6300866133833.860, "dur": 1.740, + "args": { + "External id": 85810,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133839.949, "dur": 13.811, + "args": { + "External id": 85811,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133850.740, "dur": 1.840, + "args": { + "External id": 85812,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133861.000, "dur": 9.709, + "args": { + "External id": 85813,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133867.320, "dur": 2.280, + "args": { + "External id": 85814,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133873.969, "dur": 7.151, + "args": { + "External id": 85815,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], 
[]], "Ev Idx": 7394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133878.469, "dur": 1.620, + "args": { + "External id": 85816,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133884.439, "dur": 11.360, + "args": { + "External id": 85817,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133892.579, "dur": 1.900, + "args": { + "External id": 85818,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133899.059, "dur": 9.200, + "args": { + "External id": 85819,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133905.569, "dur": 1.630, + "args": { + "External id": 85820,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133911.539, "dur": 9.850, + "args": { + "External id": 85821,"Record 
function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133915.939, "dur": 4.340, + "args": { + "External id": 85822,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133924.929, "dur": 9.630, + "args": { + "External id": 85823,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133931.349, "dur": 2.060, + "args": { + "External id": 85824,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133937.819, "dur": 7.370, + "args": { + "External id": 85825,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133942.489, "dur": 1.640, + "args": { + "External id": 85826,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133948.559, "dur": 10.290, + "args": { + "External id": 85827,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133955.709, "dur": 2.050, + "args": { + "External id": 85828,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133966.199, "dur": 9.200, + "args": { + "External id": 85829,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133972.579, "dur": 1.660, + "args": { + "External id": 85830,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133978.529, "dur": 7.170, + "args": { + "External id": 85831,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133982.959, "dur": 1.620, + "args": { + "External id": 85832,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input 
Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866133988.969, "dur": 10.760, + "args": { + "External id": 85833,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866133996.739, "dur": 1.690, + "args": { + "External id": 85834,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134002.979, "dur": 9.360, + "args": { + "External id": 85835,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134009.489, "dur": 1.680, + "args": { + "External id": 85836,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134015.659, "dur": 9.950, + "args": { + "External id": 85837,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134020.099, "dur": 4.380, + "args": { + "External id": 
85838,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134029.159, "dur": 9.530, + "args": { + "External id": 85839,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134036.039, "dur": 1.490, + "args": { + "External id": 85840,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134041.969, "dur": 7.420, + "args": { + "External id": 85841,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134046.619, "dur": 1.660, + "args": { + "External id": 85842,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134052.639, "dur": 10.340, + "args": { + "External id": 85843,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7422 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134059.899, "dur": 2.020, + "args": { + "External id": 85844,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134070.089, "dur": 8.870, + "args": { + "External id": 85845,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134076.229, "dur": 1.590, + "args": { + "External id": 85846,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134082.279, "dur": 7.310, + "args": { + "External id": 85847,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134086.789, "dur": 1.720, + "args": { + "External id": 85848,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134092.799, "dur": 10.510, + "args": { + "External id": 85849,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input 
type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134100.459, "dur": 1.680, + "args": { + "External id": 85850,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134106.649, "dur": 9.800, + "args": { + "External id": 85851,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134113.189, "dur": 2.030, + "args": { + "External id": 85852,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134119.719, "dur": 10.200, + "args": { + "External id": 85853,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134124.099, "dur": 4.720, + "args": { + "External id": 85854,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 
6300866134133.319, "dur": 7.970, + "args": { + "External id": 85855,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134138.609, "dur": 1.570, + "args": { + "External id": 85856,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134144.709, "dur": 7.230, + "args": { + "External id": 85857,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134149.159, "dur": 1.660, + "args": { + "External id": 85858,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134155.249, "dur": 11.870, + "args": { + "External id": 85859,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134164.049, "dur": 1.930, + "args": { + "External id": 85860,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], 
[], []], "Ev Idx": 7439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134173.569, "dur": 8.230, + "args": { + "External id": 85861,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134178.779, "dur": 1.710, + "args": { + "External id": 85862,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134185.009, "dur": 7.760, + "args": { + "External id": 85863,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134189.979, "dur": 1.730, + "args": { + "External id": 85864,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134196.089, "dur": 11.530, + "args": { + "External id": 85865,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134204.859, "dur": 1.690, + "args": { + "External id": 85866,"Record function id": 0, "Concrete Inputs": ["", "[1]", 
"[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134210.709, "dur": 9.200, + "args": { + "External id": 85867,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134217.109, "dur": 1.760, + "args": { + "External id": 85868,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134223.129, "dur": 11.579, + "args": { + "External id": 85869,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134228.368, "dur": 5.131, + "args": { + "External id": 85870,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134237.788, "dur": 7.191, + "args": { + "External id": 85871,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + 
"ts": 6300866134242.439, "dur": 1.540, + "args": { + "External id": 85872,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134247.888, "dur": 7.180, + "args": { + "External id": 85873,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134252.788, "dur": 1.360, + "args": { + "External id": 85874,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134257.899, "dur": 9.660, + "args": { + "External id": 85875,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134265.048, "dur": 1.531, + "args": { + "External id": 85876,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134273.348, "dur": 7.291, + "args": { + "External id": 85877,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": 
[[], []], "Ev Idx": 7456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134278.308, "dur": 1.431, + "args": { + "External id": 85878,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134283.248, "dur": 6.991, + "args": { + "External id": 85879,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134288.048, "dur": 1.351, + "args": { + "External id": 85880,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134292.779, "dur": 35.089, + "args": { + "External id": 85881,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134325.268, "dur": 1.480, + "args": { + "External id": 85882,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134330.328, "dur": 5.880, + "args": { + "External id": 
85883,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134334.198, "dur": 1.250, + "args": { + "External id": 85884,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134338.458, "dur": 7.890, + "args": { + "External id": 85885,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134342.058, "dur": 3.530, + "args": { + "External id": 85886,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134348.848, "dur": 5.640, + "args": { + "External id": 85887,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134352.648, "dur": 1.110, + "args": { + "External id": 85888,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7467 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134356.798, "dur": 6.250, + "args": { + "External id": 85889,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134360.618, "dur": 1.650, + "args": { + "External id": 85890,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134365.348, "dur": 7.980, + "args": { + "External id": 85891,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134371.248, "dur": 1.330, + "args": { + "External id": 85892,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134378.688, "dur": 5.800, + "args": { + "External id": 85893,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134382.568, "dur": 1.140, + "args": { + "External id": 85894,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", 
"ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134387.028, "dur": 6.020, + "args": { + "External id": 85895,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134391.078, "dur": 1.210, + "args": { + "External id": 85896,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134395.308, "dur": 8.910, + "args": { + "External id": 85897,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134402.118, "dur": 1.240, + "args": { + "External id": 85898,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134406.478, "dur": 5.770, + "args": { + "External id": 85899,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134410.228, "dur": 1.200, + "args": { + 
"External id": 85900,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134414.488, "dur": 8.480, + "args": { + "External id": 85901,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134418.548, "dur": 3.610, + "args": { + "External id": 85902,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134425.368, "dur": 6.120, + "args": { + "External id": 85903,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134429.198, "dur": 1.490, + "args": { + "External id": 85904,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134433.718, "dur": 6.040, + "args": { + "External id": 85905,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7484 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134437.768, "dur": 1.210, + "args": { + "External id": 85906,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134442.038, "dur": 8.320, + "args": { + "External id": 85907,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134448.248, "dur": 1.320, + "args": { + "External id": 85908,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134455.378, "dur": 6.270, + "args": { + "External id": 85909,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134459.318, "dur": 1.550, + "args": { + "External id": 85910,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134464.058, "dur": 6.240, + "args": { + "External id": 85911,"Record function id": 0, "Concrete Inputs": ["", "0"], 
"Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134468.288, "dur": 1.250, + "args": { + "External id": 85912,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134472.728, "dur": 8.580, + "args": { + "External id": 85913,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134479.228, "dur": 1.290, + "args": { + "External id": 85914,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134483.598, "dur": 5.640, + "args": { + "External id": 85915,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134487.188, "dur": 1.230, + "args": { + "External id": 85916,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + 
"ts": 6300866134491.468, "dur": 8.580, + "args": { + "External id": 85917,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134495.528, "dur": 3.720, + "args": { + "External id": 85918,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134502.478, "dur": 5.810, + "args": { + "External id": 85919,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134506.338, "dur": 1.200, + "args": { + "External id": 85920,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134510.608, "dur": 6.350, + "args": { + "External id": 85921,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134514.588, "dur": 1.610, + "args": { + "External id": 85922,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], 
[], [], []], "Ev Idx": 7501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134519.338, "dur": 8.580, + "args": { + "External id": 85923,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134525.748, "dur": 1.400, + "args": { + "External id": 85924,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134533.028, "dur": 5.640, + "args": { + "External id": 85925,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134536.668, "dur": 1.220, + "args": { + "External id": 85926,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134540.938, "dur": 6.260, + "args": { + "External id": 85927,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134545.158, "dur": 1.270, + "args": { + "External id": 85928,"Record function id": 0, "Concrete Inputs": ["", 
"[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134549.518, "dur": 7.870, + "args": { + "External id": 85929,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134555.408, "dur": 1.210, + "args": { + "External id": 85930,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134559.648, "dur": 6.250, + "args": { + "External id": 85931,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134563.568, "dur": 1.600, + "args": { + "External id": 85932,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134568.128, "dur": 7.790, + "args": { + "External id": 85933,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, 
+ "ts": 6300866134572.148, "dur": 2.950, + "args": { + "External id": 85934,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134578.298, "dur": 5.820, + "args": { + "External id": 85935,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134582.188, "dur": 1.140, + "args": { + "External id": 85936,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134586.388, "dur": 6.020, + "args": { + "External id": 85937,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134590.458, "dur": 1.220, + "args": { + "External id": 85938,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134594.708, "dur": 8.950, + "args": { + "External id": 85939,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input 
Dims": [[], []], "Ev Idx": 7518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134601.588, "dur": 1.320, + "args": { + "External id": 85940,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134608.718, "dur": 5.840, + "args": { + "External id": 85941,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134612.598, "dur": 1.200, + "args": { + "External id": 85942,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134616.808, "dur": 6.000, + "args": { + "External id": 85943,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134620.788, "dur": 1.260, + "args": { + "External id": 85944,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134625.058, "dur": 8.320, + "args": { + "External id": 
85945,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134631.388, "dur": 1.180, + "args": { + "External id": 85946,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134635.648, "dur": 6.310, + "args": { + "External id": 85947,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134639.548, "dur": 1.660, + "args": { + "External id": 85948,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134644.238, "dur": 8.940, + "args": { + "External id": 85949,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134648.218, "dur": 4.140, + "args": { + "External id": 85950,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7529 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134655.578, "dur": 6.040, + "args": { + "External id": 85951,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134659.668, "dur": 1.220, + "args": { + "External id": 85952,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134663.978, "dur": 6.009, + "args": { + "External id": 85953,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134667.947, "dur": 1.291, + "args": { + "External id": 85954,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134672.247, "dur": 8.100, + "args": { + "External id": 85955,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134678.298, "dur": 1.260, + "args": { + "External id": 85956,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", 
"ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134685.018, "dur": 5.820, + "args": { + "External id": 85957,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134688.647, "dur": 1.440, + "args": { + "External id": 85958,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134693.327, "dur": 5.680, + "args": { + "External id": 85959,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134697.158, "dur": 1.109, + "args": { + "External id": 85960,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134701.398, "dur": 8.609, + "args": { + "External id": 85961,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134707.927, "dur": 1.240, + "args": { + 
"External id": 85962,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134712.227, "dur": 6.051, + "args": { + "External id": 85963,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134716.067, "dur": 1.311, + "args": { + "External id": 85964,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134720.518, "dur": 8.149, + "args": { + "External id": 85965,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134724.398, "dur": 3.480, + "args": { + "External id": 85966,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134731.087, "dur": 5.720, + "args": { + "External id": 85967,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7546 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134734.847, "dur": 1.140, + "args": { + "External id": 85968,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134739.087, "dur": 6.120, + "args": { + "External id": 85969,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134743.058, "dur": 1.380, + "args": { + "External id": 85970,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134747.487, "dur": 8.271, + "args": { + "External id": 85971,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134753.578, "dur": 1.400, + "args": { + "External id": 85972,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134760.138, "dur": 5.739, + "args": { + "External id": 85973,"Record function id": 0, "Concrete Inputs": ["", "0"], 
"Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134763.887, "dur": 1.150, + "args": { + "External id": 85974,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134768.167, "dur": 6.280, + "args": { + "External id": 85975,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134772.427, "dur": 1.260, + "args": { + "External id": 85976,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134776.707, "dur": 7.990, + "args": { + "External id": 85977,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134782.817, "dur": 1.140, + "args": { + "External id": 85978,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + 
"ts": 6300866134786.977, "dur": 6.140, + "args": { + "External id": 85979,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134791.107, "dur": 1.210, + "args": { + "External id": 85980,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134795.407, "dur": 7.710, + "args": { + "External id": 85981,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134799.097, "dur": 3.250, + "args": { + "External id": 85982,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134805.537, "dur": 5.990, + "args": { + "External id": 85983,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134809.497, "dur": 1.300, + "args": { + "External id": 85984,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], 
[], [], []], "Ev Idx": 7563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134814.017, "dur": 5.840, + "args": { + "External id": 85985,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134817.967, "dur": 1.120, + "args": { + "External id": 85986,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134822.207, "dur": 8.000, + "args": { + "External id": 85987,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134828.137, "dur": 1.310, + "args": { + "External id": 85988,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134834.997, "dur": 6.100, + "args": { + "External id": 85989,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134838.757, "dur": 1.560, + "args": { + "External id": 85990,"Record function id": 0, "Concrete Inputs": ["", 
"[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134843.447, "dur": 5.560, + "args": { + "External id": 85991,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134847.117, "dur": 1.160, + "args": { + "External id": 85992,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134851.347, "dur": 8.470, + "args": { + "External id": 85993,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134857.637, "dur": 1.350, + "args": { + "External id": 85994,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134862.127, "dur": 6.360, + "args": { + "External id": 85995,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, 
+ "ts": 6300866134866.047, "dur": 1.640, + "args": { + "External id": 85996,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134870.717, "dur": 8.020, + "args": { + "External id": 85997,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134874.587, "dur": 3.310, + "args": { + "External id": 85998,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134881.107, "dur": 5.700, + "args": { + "External id": 85999,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134884.767, "dur": 1.230, + "args": { + "External id": 86000,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134889.097, "dur": 6.050, + "args": { + "External id": 86001,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input 
Dims": [[], []], "Ev Idx": 7580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134893.177, "dur": 1.230, + "args": { + "External id": 86002,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134897.417, "dur": 8.500, + "args": { + "External id": 86003,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134903.757, "dur": 1.380, + "args": { + "External id": 86004,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134910.817, "dur": 6.080, + "args": { + "External id": 86005,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134914.537, "dur": 1.610, + "args": { + "External id": 86006,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134919.187, "dur": 5.660, + "args": { + "External id": 
86007,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134922.897, "dur": 1.210, + "args": { + "External id": 86008,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134927.077, "dur": 8.780, + "args": { + "External id": 86009,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134933.927, "dur": 1.190, + "args": { + "External id": 86010,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134938.187, "dur": 5.790, + "args": { + "External id": 86011,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134942.007, "dur": 1.190, + "args": { + "External id": 86012,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7591 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134946.197, "dur": 8.030, + "args": { + "External id": 86013,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134949.957, "dur": 3.510, + "args": { + "External id": 86014,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866134956.717, "dur": 5.780, + "args": { + "External id": 86015,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866134960.567, "dur": 1.160, + "args": { + "External id": 86016,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866135011.337, "dur": 10.370, + "args": { + "External id": 89601,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866135019.407, "dur": 1.530, + "args": { + "External id": 89602,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", 
"ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866135024.427, "dur": 5.820, + "args": { + "External id": 89603,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866135028.237, "dur": 1.240, + "args": { + "External id": 89604,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6300866135035.127, "dur": 5.900, + "args": { + "External id": 89605,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866135039.067, "dur": 1.210, + "args": { + "External id": 89606,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::cat", "pid": 5714, "tid": 5714, + "ts": 6300866135082.497, "dur": 399.339, + "args": { + "External id": 89607,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linalg_vector_norm", "pid": 5714, "tid": 5714, + "ts": 6300866135769.375, "dur": 611.339, + 
"args": { + "External id": 89608,"Record function id": 0, "Concrete Inputs": ["", "2.", "", "False", ""], "Input type": ["float", "Scalar", "", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[128], [], [], [], []], "Ev Idx": 7603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linalg_vector_norm", "pid": 5714, "tid": 5714, + "ts": 6300866135983.475, "dur": 205.039, + "args": { + "External id": 89609,"Record function id": 0, "Concrete Inputs": ["", "2.", "", "False", ""], "Input type": ["float", "Scalar", "", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[128], [], [], [], []], "Ev Idx": 7604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6300866136054.555, "dur": 6.049, + "args": { + "External id": 89610,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Redistribute", "pid": 5714, "tid": 5714, + "ts": 6300866137048.382, "dur": 3015.333, + "args": { + "External id": 89611,"Sequence number": 1771210, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "False"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300866137215.522, "dur": 222.049, + "args": { + "External id": 89612,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866137227.322, "dur": 3.410, + "args": { + "External id": 89613,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", 
"Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866137238.782, "dur": 1.610, + "args": { + "External id": 89614,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_c10d_functional::all_reduce", "pid": 5714, "tid": 5714, + "ts": 6300866137533.191, "dur": 949.088, + "args": { + "External id": 89615,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["float", "", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6300866137548.511, "dur": 255.069, + "args": { + "External id": 89616,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6300866137559.311, "dur": 127.090, + "args": { + "External id": 89617,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "0"], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866137649.011, "dur": 33.660, + "args": { + "External id": 89618,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7613 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6300866137693.251, "dur": 106.760, + "args": { + "External id": 89619,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::allreduce_", "pid": 5714, "tid": 5714, + "ts": 6300866137834.691, "dur": 632.708, + "args": { + "External id": 89620,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "-1"], "Input type": ["TensorList", "", "", "", "Scalar"], "Input Strides": [[[]], [], [], [], []], "Input Dims": [[[]], [], [], [], []], "Ev Idx": 7615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300866137950.750, "dur": 484.959, + "args": { + "External id": 89621,"Record function id": 0, "Collective name": "allreduce", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[[]], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1, "Process Group Name": "0", "Input type": ["TensorList", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[[]], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 7616, "In msg nelems": 1 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:all_reduce", "pid": 5714, "tid": 5714, + "ts": 6300866138024.330, "dur": 378.989, + "args": { + "External id": 89622,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300866138720.629, "dur": 1107.887, + 
"args": { + "External id": 89623,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_c10d_functional::wait_tensor", "pid": 5714, "tid": 5714, + "ts": 6300866139071.788, "dur": 192.559, + "args": { + "External id": 89624,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6300866139150.508, "dur": 49.199, + "args": { + "External id": 89625,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 7620, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6300866139549.096, "dur": 231.370, + "args": { + "External id": 89626,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866139560.387, "dur": 4.369, + "args": { + "External id": 89627,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 
5714, + "ts": 6300866139573.476, "dur": 2.171, + "args": { + "External id": 89628,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_ToTorchTensor", "pid": 5714, "tid": 5714, + "ts": 6300866140159.205, "dur": 109.830, + "args": { + "External id": 89629,"Sequence number": 1771211, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6300866140204.095, "dur": 41.660, + "args": { + "External id": 89630,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6300866140219.685, "dur": 24.000, + "args": { + "External id": 89631,"Record function id": 0, "Concrete Inputs": ["", "[]"], "Input type": ["float", "ScalarList"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6300866141197.413, "dur": 252.849, + "args": { + "External id": 89632,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reciprocal", "pid": 5714, "tid": 5714, + "ts": 6300866141499.322, "dur": 115.300, + "args": { + "External id": 89633,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7628 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6300866141656.872, "dur": 109.090, + "args": { + "External id": 89634,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clamp", "pid": 5714, "tid": 5714, + "ts": 6300866141839.251, "dur": 134.530, + "args": { + "External id": 89635,"Record function id": 0, "Concrete Inputs": ["", "", "1."], "Input type": ["float", "", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866141853.841, "dur": 2.210, + "args": { + "External id": 89636,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6300866142068.331, "dur": 2.770, + "args": { + "External id": 89637,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_mul_", "pid": 5714, "tid": 5714, + "ts": 6300866142569.520, "dur": 2474.724, + "args": { + "External id": 89638,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["TensorList", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_mul_", "pid": 5714, "tid": 5714, + "ts": 6300866144707.375, "dur": 205.909, + "args": { + "External id": 
89639,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["TensorList", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::isnan", "pid": 5714, "tid": 5714, + "ts": 6300866145212.184, "dur": 211.919, + "args": { + "External id": 89640,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6300866145224.864, "dur": 193.739, + "args": { + "External id": 89641,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 5714, + "ts": 6300866145454.323, "dur": 117345.613, + "args": { + "External id": 89642,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6300866145462.143, "dur": 117334.844, + "args": { + "External id": 89643,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6300866145470.933, "dur": 117315.023, + "args": { + "External id": 89644,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::isinf", "pid": 5714, "tid": 
5714, + "ts": 6300866262865.216, "dur": 486.749, + "args": { + "External id": 89645,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866262878.726, "dur": 277.230, + "args": { + "External id": 89646,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6300866262906.526, "dur": 40.740, + "args": { + "External id": 89647,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6300866262956.496, "dur": 196.580, + "args": { + "External id": 89648,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], [1]], "Input Dims": [[], [0]], "Ev Idx": 7643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6300866262993.026, "dur": 30.690, + "args": { + "External id": 89649,"Record function id": 0, "Concrete Inputs": ["", "[]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6300866263166.535, "dur": 174.090, + "args": { + "External id": 89650,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7645 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 5714, + "ts": 6300866263384.105, "dur": 208.150, + "args": { + "External id": 89651,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6300866263391.045, "dur": 199.270, + "args": { + "External id": 89652,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6300866263398.665, "dur": 188.550, + "args": { + "External id": 89653,"Sequence number": 1771212, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7648 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.step#OptimizersContainer.step", "pid": 5714, "tid": 5714, + "ts": 6300866263765.254, "dur": 6765.455, + "args": { + "External id": 89654,"Record function id": 0, "Ev Idx": 7649 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.step#AdamW.step", "pid": 5714, "tid": 5714, + "ts": 6300866263925.024, "dur": 6581.455, + "args": { + "External id": 89655,"Record function id": 0, "Ev Idx": 7650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_add_", "pid": 5714, "tid": 5714, + "ts": 6300866266217.459, "dur": 336.629, + "args": { + "External id": 89656,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266242.599, "dur": 
1.740, + "args": { + "External id": 89657,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266247.279, "dur": 0.180, + "args": { + "External id": 89658,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266249.119, "dur": 0.169, + "args": { + "External id": 89659,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266250.699, "dur": 0.169, + "args": { + "External id": 89660,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266252.199, "dur": 0.220, + "args": { + "External id": 89661,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266253.768, "dur": 0.180, + "args": { + "External id": 89662,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266255.488, "dur": 0.160, + "args": { 
+ "External id": 89663,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266257.028, "dur": 0.180, + "args": { + "External id": 89664,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266258.639, "dur": 0.160, + "args": { + "External id": 89665,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266260.139, "dur": 0.180, + "args": { + "External id": 89666,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266261.679, "dur": 0.220, + "args": { + "External id": 89667,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266263.208, "dur": 0.180, + "args": { + "External id": 89668,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266264.779, "dur": 0.220, + "args": { + "External id": 
89669,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266266.388, "dur": 0.171, + "args": { + "External id": 89670,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266267.939, "dur": 0.200, + "args": { + "External id": 89671,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266269.448, "dur": 0.180, + "args": { + "External id": 89672,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266270.968, "dur": 0.171, + "args": { + "External id": 89673,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266272.468, "dur": 0.191, + "args": { + "External id": 89674,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266273.988, "dur": 0.211, + "args": { + "External id": 89675,"Record 
function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266275.528, "dur": 0.180, + "args": { + "External id": 89676,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266277.099, "dur": 0.160, + "args": { + "External id": 89677,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266278.479, "dur": 0.189, + "args": { + "External id": 89678,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266280.019, "dur": 0.180, + "args": { + "External id": 89679,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266281.608, "dur": 0.180, + "args": { + "External id": 89680,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266283.099, "dur": 0.169, + "args": { + "External id": 89681,"Record function id": 0, 
"Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266284.648, "dur": 0.160, + "args": { + "External id": 89682,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266286.179, "dur": 0.169, + "args": { + "External id": 89683,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266287.708, "dur": 0.171, + "args": { + "External id": 89684,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266289.099, "dur": 0.189, + "args": { + "External id": 89685,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266290.639, "dur": 0.169, + "args": { + "External id": 89686,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266292.188, "dur": 0.180, + "args": { + "External id": 89687,"Record function id": 0, "Concrete Inputs": 
["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266293.668, "dur": 0.171, + "args": { + "External id": 89688,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266295.159, "dur": 0.200, + "args": { + "External id": 89689,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266314.488, "dur": 0.391, + "args": { + "External id": 89690,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266316.588, "dur": 0.180, + "args": { + "External id": 89691,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266318.188, "dur": 0.200, + "args": { + "External id": 89692,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266319.768, "dur": 0.171, + "args": { + "External id": 89693,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input 
type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266321.379, "dur": 0.169, + "args": { + "External id": 89694,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266322.888, "dur": 0.191, + "args": { + "External id": 89695,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266324.479, "dur": 0.180, + "args": { + "External id": 89696,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266326.039, "dur": 0.189, + "args": { + "External id": 89697,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266327.679, "dur": 0.180, + "args": { + "External id": 89698,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266329.168, "dur": 0.171, + "args": { + "External id": 89699,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", 
"Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266330.759, "dur": 0.169, + "args": { + "External id": 89700,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266332.308, "dur": 0.191, + "args": { + "External id": 89701,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266334.199, "dur": 0.180, + "args": { + "External id": 89702,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266335.708, "dur": 0.171, + "args": { + "External id": 89703,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266337.248, "dur": 0.170, + "args": { + "External id": 89704,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266338.778, "dur": 0.170, + "args": { + "External id": 89705,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input 
Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266340.288, "dur": 0.170, + "args": { + "External id": 89706,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266342.018, "dur": 0.180, + "args": { + "External id": 89707,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266343.518, "dur": 0.180, + "args": { + "External id": 89708,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266345.068, "dur": 0.180, + "args": { + "External id": 89709,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266346.558, "dur": 0.180, + "args": { + "External id": 89710,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266348.048, "dur": 0.170, + "args": { + "External id": 89711,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], 
"Input Dims": [[], []], "Ev Idx": 7706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266349.798, "dur": 0.190, + "args": { + "External id": 89712,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266351.288, "dur": 0.170, + "args": { + "External id": 89713,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266352.798, "dur": 0.190, + "args": { + "External id": 89714,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266354.278, "dur": 0.170, + "args": { + "External id": 89715,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266355.728, "dur": 0.180, + "args": { + "External id": 89716,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266357.248, "dur": 0.170, + "args": { + "External id": 89717,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], 
[]], "Ev Idx": 7712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266358.788, "dur": 0.190, + "args": { + "External id": 89718,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266360.278, "dur": 0.180, + "args": { + "External id": 89719,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266361.818, "dur": 0.190, + "args": { + "External id": 89720,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266363.298, "dur": 0.180, + "args": { + "External id": 89721,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266364.788, "dur": 0.180, + "args": { + "External id": 89722,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266366.308, "dur": 0.170, + "args": { + "External id": 89723,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7718 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266367.838, "dur": 0.200, + "args": { + "External id": 89724,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266369.348, "dur": 0.160, + "args": { + "External id": 89725,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266370.938, "dur": 0.180, + "args": { + "External id": 89726,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266372.508, "dur": 0.170, + "args": { + "External id": 89727,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266374.028, "dur": 0.200, + "args": { + "External id": 89728,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266375.578, "dur": 0.170, + "args": { + "External id": 89729,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7724 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266377.038, "dur": 0.180, + "args": { + "External id": 89730,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266378.578, "dur": 0.180, + "args": { + "External id": 89731,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266380.168, "dur": 0.170, + "args": { + "External id": 89732,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266381.658, "dur": 0.180, + "args": { + "External id": 89733,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266383.268, "dur": 0.160, + "args": { + "External id": 89734,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266384.778, "dur": 0.180, + "args": { + "External id": 89735,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7730 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266386.368, "dur": 0.160, + "args": { + "External id": 89736,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266388.078, "dur": 0.190, + "args": { + "External id": 89737,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266389.658, "dur": 0.180, + "args": { + "External id": 89738,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266391.198, "dur": 0.160, + "args": { + "External id": 89739,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266392.748, "dur": 0.190, + "args": { + "External id": 89740,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266394.328, "dur": 0.160, + "args": { + "External id": 89741,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266395.928, "dur": 0.180, + "args": { + "External id": 89742,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266397.508, "dur": 0.160, + "args": { + "External id": 89743,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266399.108, "dur": 0.200, + "args": { + "External id": 89744,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266400.688, "dur": 0.170, + "args": { + "External id": 89745,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266402.408, "dur": 0.170, + "args": { + "External id": 89746,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266404.068, "dur": 0.190, + "args": { + "External id": 89747,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266405.708, "dur": 0.170, + "args": { + "External id": 89748,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266407.248, "dur": 0.180, + "args": { + "External id": 89749,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266408.818, "dur": 0.170, + "args": { + "External id": 89750,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266410.338, "dur": 0.190, + "args": { + "External id": 89751,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266412.288, "dur": 0.200, + "args": { + "External id": 89752,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266413.838, "dur": 0.180, + "args": { + "External id": 89753,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266415.588, "dur": 0.160, + "args": { + "External id": 89754,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266417.088, "dur": 0.180, + "args": { + "External id": 89755,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266418.678, "dur": 0.180, + "args": { + "External id": 89756,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266420.178, "dur": 0.190, + "args": { + "External id": 89757,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266421.748, "dur": 0.180, + "args": { + "External id": 89758,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266423.318, "dur": 0.180, + "args": { + "External id": 89759,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266424.948, "dur": 0.170, + "args": { + "External id": 89760,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266426.618, "dur": 0.180, + "args": { + "External id": 89761,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266428.268, "dur": 0.190, + "args": { + "External id": 89762,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266429.858, "dur": 0.170, + "args": { + "External id": 89763,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266431.438, "dur": 0.170, + "args": { + "External id": 89764,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266432.988, "dur": 0.170, + "args": { + "External id": 89765,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266434.638, "dur": 0.180, + "args": { + "External id": 89766,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266436.188, "dur": 0.190, + "args": { + "External id": 89767,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266437.778, "dur": 0.170, + "args": { + "External id": 89768,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266439.348, "dur": 0.200, + "args": { + "External id": 89769,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266440.958, "dur": 0.190, + "args": { + "External id": 89770,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266442.508, "dur": 0.170, + "args": { + "External id": 89771,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266444.368, "dur": 0.170, + "args": { + "External id": 89772,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266445.978, "dur": 0.170, + "args": { + "External id": 89773,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266447.528, "dur": 0.180, + "args": { + "External id": 89774,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266449.088, "dur": 0.170, + "args": { + "External id": 89775,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266450.728, "dur": 0.180, + "args": { + "External id": 89776,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266452.178, "dur": 0.190, + "args": { + "External id": 89777,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266453.638, "dur": 0.190, + "args": { + "External id": 89778,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266455.268, "dur": 0.160, + "args": { + "External id": 89779,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266456.758, "dur": 0.230, + "args": { + "External id": 89780,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266458.368, "dur": 0.190, + "args": { + "External id": 89781,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266460.068, "dur": 0.170, + "args": { + "External id": 89782,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266461.668, "dur": 0.170, + "args": { + "External id": 89783,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6300866266463.188, "dur": 0.170, + "args": { + "External id": 89784,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_fused_adamw_", "pid": 5714, "tid": 5714, + "ts": 6300866267202.366, "dur": 3238.033, + "args": { + "External id": 89785,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "0.00038902679890793812", "0.90000000000000002", "0.94999999999999996", "0.10000000000000001", "1.0000000000000001e-15", "False", "False", "", ""], "Input type": ["TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 7780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_fused_adamw_", "pid": 5714, "tid": 5714, + "ts": 6300866270243.999, "dur": 123.650, + "args": { + "External id": 89786,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "0.00038902679890793812", "0.90000000000000002", "0.94999999999999996", "0.10000000000000001", "1.0000000000000001e-15", "False", "False", "", ""], "Input type": ["TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 7781 + } + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, 
bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300865683396.682, "dur": 260.259, + "args": { + "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137593, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161137593, "pid": 0, "tid": 7, "ts": 6300865683396.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300865683657.645, "dur": 260.835, + "args": { + "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137596, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161137596, "pid": 0, "tid": 7, "ts": 6300865683657.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300865683919.152, "dur": 258.468, + "args": { + "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137599, "registers per thread": 64, "shared memory": 0, "blocks per SM": 0.554688, "warps per SM": 8.875000, "grid": [71, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 18 + } + }, + { + "ph": "f", "id": 161137599, "pid": 0, "tid": 7, "ts": 6300865683919.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6300865686198.059, "dur": 6.560, + "args": { + "External id": 81947, "device": 0, "context": 1, "stream": 7, "correlation": 161137613, "bytes": 131072, "memory bandwidth (GB/s)": 19.98048780487805 + } + }, + { + "ph": "f", "id": 161137613, "pid": 0, "tid": 7, "ts": 6300865686198.059, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865686152.817, "dur": 45.890, + "args": { + "External id": 81947, "cbid": 41, "correlation": 161137613 + } + }, + { + "ph": "s", "id": 161137613, "pid": 5714, "tid": 5714, "ts": 6300865686152.817, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 
6300865686199.377, "dur": 8.330, + "args": { + "External id": 81947, "cbid": 131, "correlation": 161137614 + } + }, + { + "ph": "s", "id": 161137614, "pid": 5714, "tid": 5714, "ts": 6300865686199.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6300865686433.006, "dur": 11.552, + "args": { + "External id": 81951, "device": 0, "context": 1, "stream": 7, "correlation": 161137627, "bytes": 262144, "memory bandwidth (GB/s)": 22.69252077562327 + } + }, + { + "ph": "f", "id": 161137627, "pid": 0, "tid": 7, "ts": 6300865686433.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865686241.897, "dur": 191.279, + "args": { + "External id": 81951, "cbid": 41, "correlation": 161137627 + } + }, + { + "ph": "s", "id": 161137627, "pid": 5714, "tid": 5714, "ts": 6300865686241.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300865686433.636, "dur": 13.620, + "args": { + "External id": 81951, "cbid": 131, "correlation": 161137628 + } + }, + { + "ph": "s", "id": 161137628, "pid": 5714, "tid": 5714, "ts": 6300865686433.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 0, "tid": 7, + "ts": 6300865686514.095, "dur": 1.120, + "args": { + "External id": 81954, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137644, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 0.500000, "grid": [32, 1, 1], 
"block": [64, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161137644, "pid": 0, "tid": 7, "ts": 6300865686514.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865686498.616, "dur": 17.120, + "args": { + "External id": 81954, "cbid": 211, "correlation": 161137644 + } + }, + { + "ph": "s", "id": 161137644, "pid": 5714, "tid": 5714, "ts": 6300865686498.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865686591.856, "dur": 1.472, + "args": { + "External id": 81968, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137657, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 161137657, "pid": 0, "tid": 7, "ts": 6300865686591.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865686581.196, "dur": 10.970, + "args": { + "External id": 81968, "cbid": 211, "correlation": 161137657 + } + }, + { + "ph": "s", "id": 161137657, "pid": 5714, "tid": 5714, "ts": 6300865686581.196, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#3}::operator()() const::{lambda(int)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865686630.128, "dur": 1.632, + "args": { + "External id": 81972, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137671, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161137671, "pid": 0, "tid": 7, "ts": 6300865686630.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865686621.586, "dur": 9.060, + "args": { + "External id": 81972, "cbid": 211, "correlation": 161137671 + } + }, + { + "ph": "s", "id": 161137671, "pid": 5714, "tid": 5714, "ts": 6300865686621.586, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865686783.285, "dur": 2.711, + "args": { + "cbid": 135, "correlation": 161137683 + } + }, + { + "ph": "f", "id": 161137683, "pid": 5714, "tid": 5714, "ts": 6300865686783.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865686790.116, "dur": 2.060, + "args": { + "cbid": 147, "correlation": 161137687 + } + }, + { + "ph": "s", "id": 161137687, "pid": 5714, "tid": 5714, "ts": 6300865686790.116, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865686801.325, "dur": 0.900, + "args": { + "cbid": 135, "correlation": 161137699 + } + }, + { + "ph": "f", "id": 161137699, "pid": 5714, "tid": 5714, "ts": 6300865686801.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865686804.385, "dur": 1.500, + "args": { + "cbid": 147, "correlation": 161137703 + } + }, + { + "ph": "s", "id": 161137703, "pid": 5714, "tid": 5714, "ts": 6300865686804.385, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous 
namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865687419.834, "dur": 68.064, + "args": { + "External id": 82018, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161137723, "registers per thread": 40, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 161137723, "pid": 0, "tid": 17, "ts": 6300865687419.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865687404.634, "dur": 15.520, + "args": { + "External id": 82018, "cbid": 211, "correlation": 161137723 + } + }, + { + "ph": "s", "id": 161137723, "pid": 5714, "tid": 5714, "ts": 6300865687404.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865687689.245, "dur": 31.904, + "args": { + "External id": 82063, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161137736, "registers per thread": 36, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 161137736, "pid": 0, "tid": 17, "ts": 6300865687689.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865687674.123, "dur": 16.040, + "args": { + "External id": 82063, "cbid": 211, "correlation": 161137736 + } + }, + { + "ph": "s", "id": 161137736, "pid": 5714, "tid": 5714, "ts": 6300865687674.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687738.593, "dur": 1.760, + "args": { + "cbid": 135, "correlation": 161137746 + } + }, + { + "ph": "f", "id": 161137746, "pid": 5714, "tid": 5714, "ts": 6300865687738.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865687742.543, "dur": 1.780, + "args": { + "cbid": 147, "correlation": 161137750 + } + }, + { + "ph": "s", "id": 161137750, "pid": 5714, "tid": 5714, "ts": 6300865687742.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865687837.443, "dur": 0.940, + "args": { + "External id": 82065, "cbid": 317, "correlation": 161137763 + } + }, + { + "ph": "f", "id": 161137763, "pid": 5714, "tid": 5714, "ts": 6300865687837.443, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687840.963, "dur": 1.340, + "args": { + "External id": 82065, "cbid": 135, "correlation": 161137765 + } + }, + { + "ph": "f", "id": 161137765, "pid": 5714, "tid": 5714, "ts": 6300865687840.963, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865687844.253, "dur": 1.540, + 
"args": { + "External id": 82065, "cbid": 147, "correlation": 161137769 + } + }, + { + "ph": "s", "id": 161137769, "pid": 5714, "tid": 5714, "ts": 6300865687844.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865687873.423, "dur": 0.990, + "args": { + "External id": 82065, "cbid": 409, "correlation": 161137772 + } + }, + { + "ph": "f", "id": 161137772, "pid": 5714, "tid": 5714, "ts": 6300865687873.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687881.223, "dur": 1.250, + "args": { + "External id": 82065, "cbid": 135, "correlation": 161137775 + } + }, + { + "ph": "f", "id": 161137775, "pid": 5714, "tid": 5714, "ts": 6300865687881.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865687882.663, "dur": 1.000, + "args": { + "External id": 82065, "cbid": 147, "correlation": 161137776 + } + }, + { + "ph": "s", "id": 161137776, "pid": 5714, "tid": 5714, "ts": 6300865687882.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865687899.455, "dur": 172525.290, + "args": { + "External id": 82065, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161137778, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 13223616, "Out msg nelems": 52894464, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161137778, "pid": 0, "tid": 20, "ts": 6300865687899.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865687886.113, "dur": 12.960, + "args": { + "External id": 82065, "cbid": 430, "correlation": 161137778 + } + }, + { + "ph": "s", "id": 161137778, "pid": 5714, "tid": 5714, "ts": 6300865687886.113, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687900.413, "dur": 0.450, + "args": { + "External id": 82065, "cbid": 135, "correlation": 161137780 + } + }, + { + "ph": "f", "id": 161137780, "pid": 5714, "tid": 5714, "ts": 6300865687900.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865687900.983, "dur": 0.550, + "args": { + "External id": 82065, "cbid": 147, "correlation": 161137781 + } + }, + { + "ph": "s", "id": 161137781, "pid": 5714, "tid": 5714, "ts": 6300865687900.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687903.123, "dur": 1.030, + "args": { + "External id": 82065, "cbid": 135, "correlation": 161137784 + } + }, + { + "ph": "f", "id": 161137784, "pid": 5714, "tid": 5714, "ts": 6300865687903.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687914.333, "dur": 0.490, + "args": { + "External id": 
82065, "cbid": 135, "correlation": 161137791 + } + }, + { + "ph": "f", "id": 161137791, "pid": 5714, "tid": 5714, "ts": 6300865687914.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865687953.403, "dur": 1.110, + "args": { + "External id": 82067, "cbid": 147, "correlation": 161137796 + } + }, + { + "ph": "s", "id": 161137796, "pid": 5714, "tid": 5714, "ts": 6300865687953.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865687972.873, "dur": 1.080, + "args": { + "cbid": 135, "correlation": 161137811 + } + }, + { + "ph": "f", "id": 161137811, "pid": 5714, "tid": 5714, "ts": 6300865687972.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865688067.373, "dur": 5.589, + "args": { + "cbid": 147, "correlation": 161137818 + } + }, + { + "ph": "s", "id": 161137818, "pid": 5714, "tid": 5714, "ts": 6300865688067.373, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865688465.632, "dur": 1.409, + "args": { + "External id": 82109, "cbid": 317, "correlation": 161137975 + } + }, + { + "ph": "f", "id": 161137975, "pid": 5714, "tid": 5714, "ts": 6300865688465.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865860430.761, "dur": 2.496, + "args": { + "External id": 82114, "device": 0, "context": 1, "stream": 7, "correlation": 161137987, "bytes": 22000, "memory bandwidth (GB/s)": 8.814102564102564 + } + }, + { + "ph": "f", "id": 161137987, "pid": 0, "tid": 7, "ts": 6300865860430.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865688508.652, "dur": 14.349, + "args": { + "External id": 82114, "cbid": 41, "correlation": 161137987 + } + }, + { + "ph": "s", "id": 161137987, "pid": 5714, "tid": 5714, "ts": 6300865688508.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865688528.281, "dur": 1.951, + "args": { + "External id": 82109, "cbid": 135, "correlation": 161137991 + } + }, + { + "ph": "f", "id": 161137991, "pid": 5714, "tid": 5714, "ts": 6300865688528.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865860435.337, "dur": 1043.117, + "args": { + "External id": 82109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161137995, "registers per thread": 38, "shared memory": 0, "blocks per SM": 20.289062, "warps per SM": 81.156250, "grid": [2597, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161137995, "pid": 0, "tid": 7, "ts": 6300865860435.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865688533.601, "dur": 14.260, + "args": { + "External id": 82109, "cbid": 211, "correlation": 161137995 + } + }, + { + "ph": "s", "id": 161137995, "pid": 5714, "tid": 5714, "ts": 6300865688533.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865688733.871, "dur": 1.600, + "args": { + "cbid": 135, "correlation": 161138006 + } + }, + { + "ph": "f", "id": 161138006, "pid": 5714, "tid": 5714, "ts": 6300865688733.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_0", "pid": 0, "tid": 7, + "ts": 6300865861500.246, "dur": 280.003, + "args": { + "External id": 82118, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138019, "registers per thread": 32, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138019, "pid": 0, "tid": 7, "ts": 6300865861500.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865689016.040, "dur": 13.830, + "args": { + "External id": 82118, "cbid": 307, "correlation": 161138019 + } + }, + { + "ph": "s", "id": 161138019, "pid": 5714, "tid": 5714, "ts": 6300865689016.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865689520.466, "dur": 557.031, + "args": { + "External id": 82133, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161138034, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161138034, "pid": 0, "tid": 17, "ts": 6300865689520.466, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865689372.059, "dur": 12.911, + "args": { + "External id": 82133, "cbid": 211, "correlation": 161138034 + } + }, + { + "ph": "s", "id": 161138034, "pid": 5714, "tid": 5714, "ts": 6300865689372.059, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865690078.649, "dur": 8.576, + "args": { + "External id": 82149, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161138047, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161138047, "pid": 0, "tid": 17, "ts": 6300865690078.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865689509.619, "dur": 13.360, + "args": { + "External id": 82149, "cbid": 211, "correlation": 161138047 + } + }, + { + "ph": "s", "id": 161138047, "pid": 5714, "tid": 5714, "ts": 6300865689509.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689555.819, "dur": 1.670, + "args": { + "cbid": 135, "correlation": 161138057 + } + }, + { + "ph": "f", "id": 161138057, "pid": 5714, "tid": 5714, "ts": 6300865689555.819, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689559.659, "dur": 1.460, + "args": { + "cbid": 147, "correlation": 161138061 + } + }, + { + "ph": "s", "id": 161138061, "pid": 5714, "tid": 5714, "ts": 6300865689559.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865689621.349, "dur": 1.020, + "args": { + "External id": 82151, "cbid": 317, "correlation": 161138074 + } + }, + { + "ph": "f", "id": 161138074, "pid": 5714, "tid": 5714, "ts": 6300865689621.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689624.409, "dur": 1.160, + "args": { + "External id": 82151, "cbid": 135, "correlation": 161138076 + } + }, + { + "ph": "f", "id": 161138076, "pid": 5714, "tid": 5714, "ts": 6300865689624.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689627.159, "dur": 1.250, + 
"args": { + "External id": 82151, "cbid": 147, "correlation": 161138080 + } + }, + { + "ph": "s", "id": 161138080, "pid": 5714, "tid": 5714, "ts": 6300865689627.159, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865689647.679, "dur": 0.700, + "args": { + "External id": 82151, "cbid": 409, "correlation": 161138083 + } + }, + { + "ph": "f", "id": 161138083, "pid": 5714, "tid": 5714, "ts": 6300865689647.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689652.769, "dur": 0.990, + "args": { + "External id": 82151, "cbid": 135, "correlation": 161138086 + } + }, + { + "ph": "f", "id": 161138086, "pid": 5714, "tid": 5714, "ts": 6300865689652.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689653.939, "dur": 0.910, + "args": { + "External id": 82151, "cbid": 147, "correlation": 161138087 + } + }, + { + "ph": "s", "id": 161138087, "pid": 5714, "tid": 5714, "ts": 6300865689653.939, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865860426.409, "dur": 3205.638, + "args": { + "External id": 82151, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161138089, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161138089, "pid": 0, "tid": 20, "ts": 6300865860426.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865689656.079, "dur": 11.000, + "args": { + "External id": 82151, "cbid": 430, "correlation": 161138089 + } + }, + { + "ph": "s", "id": 161138089, "pid": 5714, "tid": 5714, "ts": 6300865689656.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689668.129, "dur": 0.400, + "args": { + "External id": 82151, "cbid": 135, "correlation": 161138091 + } + }, + { + "ph": "f", "id": 161138091, "pid": 5714, "tid": 5714, "ts": 6300865689668.129, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689668.649, "dur": 0.530, + "args": { + "External id": 82151, "cbid": 147, "correlation": 161138092 + } + }, + { + "ph": "s", "id": 161138092, "pid": 5714, "tid": 5714, "ts": 6300865689668.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689670.969, "dur": 0.980, + "args": { + "External id": 82151, "cbid": 135, "correlation": 161138095 + } + }, + { + "ph": "f", "id": 161138095, "pid": 5714, "tid": 5714, "ts": 6300865689670.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689682.019, "dur": 0.520, + "args": { + "External id": 
82151, "cbid": 135, "correlation": 161138102 + } + }, + { + "ph": "f", "id": 161138102, "pid": 5714, "tid": 5714, "ts": 6300865689682.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689713.099, "dur": 1.060, + "args": { + "External id": 82153, "cbid": 147, "correlation": 161138107 + } + }, + { + "ph": "s", "id": 161138107, "pid": 5714, "tid": 5714, "ts": 6300865689713.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865689731.779, "dur": 1.010, + "args": { + "cbid": 135, "correlation": 161138122 + } + }, + { + "ph": "f", "id": 161138122, "pid": 5714, "tid": 5714, "ts": 6300865689731.779, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689777.619, "dur": 1.100, + "args": { + "cbid": 147, "correlation": 161138127 + } + }, + { + "ph": "s", "id": 161138127, "pid": 5714, "tid": 5714, "ts": 6300865689777.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689780.889, "dur": 0.700, + "args": { + "cbid": 147, "correlation": 161138131 + } + }, + { + "ph": "s", "id": 161138131, "pid": 5714, "tid": 5714, "ts": 6300865689780.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865689821.938, "dur": 2.420, + "args": { + "cbid": 147, "correlation": 161138137 + } + }, + { + "ph": "s", "id": 161138137, "pid": 5714, "tid": 5714, "ts": 6300865689821.938, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865689959.458, "dur": 1.480, + "args": { + "External id": 82166, 
"cbid": 317, "correlation": 161138178 + } + }, + { + "ph": "f", "id": 161138178, "pid": 5714, "tid": 5714, "ts": 6300865689959.458, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865689978.718, "dur": 2.830, + "args": { + "External id": 82167, "cbid": 138, "correlation": 161138181 + } + }, + { + "ph": "f", "id": 161138181, "pid": 5714, "tid": 5714, "ts": 6300865689978.718, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865863632.847, "dur": 1.952, + "args": { + "External id": 82171, "device": 0, "context": 1, "stream": 7, "correlation": 161138192, "bytes": 7224, "memory bandwidth (GB/s)": 3.7008196721311477 + } + }, + { + "ph": "f", "id": 161138192, "pid": 0, "tid": 7, "ts": 6300865863632.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865690006.008, "dur": 13.570, + "args": { + "External id": 82171, "cbid": 41, "correlation": 161138192 + } + }, + { + "ph": "s", "id": 161138192, "pid": 5714, "tid": 5714, "ts": 6300865690006.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865690024.478, "dur": 1.860, + "args": { + "External id": 82166, "cbid": 135, "correlation": 161138196 + } + }, + { + "ph": "f", "id": 161138196, "pid": 5714, "tid": 5714, "ts": 6300865690024.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865863637.263, "dur": 12.800, + "args": { + "External id": 82166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161138200, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138200, "pid": 0, "tid": 7, "ts": 6300865863637.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865690029.088, "dur": 11.090, + "args": { + "External id": 82166, "cbid": 211, "correlation": 161138200 + } + }, + { + "ph": "s", "id": 161138200, "pid": 5714, "tid": 5714, "ts": 6300865690029.088, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865690136.688, "dur": 1.470, + "args": { + "cbid": 135, "correlation": 161138211 + } + }, + { + "ph": "f", "id": 161138211, "pid": 5714, "tid": 5714, "ts": 6300865690136.688, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865863650.703, "dur": 42.465, + "args": { + "External id": 82178, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138237, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138237, "pid": 0, "tid": 7, "ts": 6300865863650.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865690424.537, "dur": 13.160, + "args": { + "External id": 82178, "cbid": 307, "correlation": 161138237 + } + }, + { + "ph": "s", "id": 161138237, "pid": 5714, "tid": 5714, "ts": 6300865690424.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865863693.776, "dur": 567.750, + "args": { + "External id": 82184, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138260, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138260, "pid": 0, "tid": 7, "ts": 6300865863693.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865690647.597, "dur": 13.450, + "args": { + "External id": 82184, "cbid": 211, "correlation": 161138260 + } + }, + { + "ph": "s", "id": 161138260, "pid": 5714, "tid": 5714, "ts": 6300865690647.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865864262.230, "dur": 142.978, + "args": { + "External id": 82185, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138283, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138283, "pid": 0, "tid": 7, "ts": 6300865864262.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865690689.916, "dur": 6.760, + "args": { + "External id": 82185, "cbid": 211, "correlation": 161138283 + } + }, + { + "ph": "s", "id": 161138283, "pid": 5714, "tid": 5714, "ts": 6300865690689.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865864405.816, "dur": 571.687, + "args": { + "External id": 82186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138306, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138306, "pid": 0, "tid": 7, "ts": 6300865864405.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865690720.667, "dur": 5.669, + "args": { + "External id": 82186, "cbid": 211, "correlation": 161138306 + } + }, + { + "ph": "s", "id": 161138306, "pid": 5714, "tid": 5714, "ts": 6300865690720.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865864978.111, "dur": 118.529, + "args": { + "External id": 82203, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138326, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138326, "pid": 0, "tid": 7, "ts": 6300865864978.111, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865691112.066, "dur": 13.020, + "args": { + "External id": 82203, "cbid": 307, "correlation": 161138326 + } + }, + { + "ph": "s", "id": 161138326, "pid": 5714, "tid": 5714, "ts": 6300865691112.066, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691158.515, "dur": 5.600, + "args": { + "cbid": 138, "correlation": 161138329 + } + }, + { + "ph": "f", "id": 161138329, "pid": 5714, "tid": 1822426688, "ts": 6300865691158.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691164.615, "dur": 0.600, + "args": { + "cbid": 138, "correlation": 161138330 + } + }, + { + "ph": "f", "id": 161138330, "pid": 5714, "tid": 1822426688, "ts": 6300865691164.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691168.466, "dur": 0.569, + "args": { + "cbid": 138, "correlation": 161138331 + } + }, + { + "ph": "f", "id": 161138331, "pid": 5714, "tid": 1822426688, "ts": 6300865691168.466, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691172.055, "dur": 0.691, + "args": { + "cbid": 138, "correlation": 161138332 + } + }, + { + "ph": "f", "id": 161138332, "pid": 5714, "tid": 1822426688, "ts": 6300865691172.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691173.006, "dur": 0.389, + 
"args": { + "cbid": 138, "correlation": 161138333 + } + }, + { + "ph": "f", "id": 161138333, "pid": 5714, "tid": 1822426688, "ts": 6300865691173.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691174.755, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 161138334 + } + }, + { + "ph": "f", "id": 161138334, "pid": 5714, "tid": 1822426688, "ts": 6300865691174.755, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691176.355, "dur": 0.740, + "args": { + "cbid": 138, "correlation": 161138335 + } + }, + { + "ph": "f", "id": 161138335, "pid": 5714, "tid": 1822426688, "ts": 6300865691176.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691177.346, "dur": 0.400, + "args": { + "cbid": 138, "correlation": 161138336 + } + }, + { + "ph": "f", "id": 161138336, "pid": 5714, "tid": 1822426688, "ts": 6300865691177.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691178.566, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 161138337 + } + }, + { + "ph": "f", "id": 161138337, "pid": 5714, "tid": 1822426688, "ts": 6300865691178.566, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691180.526, "dur": 0.860, + "args": { + "cbid": 138, "correlation": 161138338 + } + }, + { + "ph": "f", "id": 161138338, "pid": 5714, "tid": 1822426688, "ts": 6300865691180.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 
6300865691181.535, "dur": 0.391, + "args": { + "cbid": 138, "correlation": 161138339 + } + }, + { + "ph": "f", "id": 161138339, "pid": 5714, "tid": 1822426688, "ts": 6300865691181.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691183.126, "dur": 0.469, + "args": { + "cbid": 138, "correlation": 161138340 + } + }, + { + "ph": "f", "id": 161138340, "pid": 5714, "tid": 1822426688, "ts": 6300865691183.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691186.526, "dur": 1.320, + "args": { + "cbid": 138, "correlation": 161138341 + } + }, + { + "ph": "f", "id": 161138341, "pid": 5714, "tid": 1822426688, "ts": 6300865691186.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865691189.915, "dur": 1.320, + "args": { + "cbid": 138, "correlation": 161138343 + } + }, + { + "ph": "f", "id": 161138343, "pid": 5714, "tid": 1822426688, "ts": 6300865691189.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865865097.312, "dur": 206.850, + "args": { + "External id": 82219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138360, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138360, "pid": 0, "tid": 7, "ts": 6300865865097.312, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865691365.135, "dur": 10.520, + "args": { + "External id": 82219, "cbid": 307, "correlation": 161138360 + } + }, + { + "ph": "s", "id": 161138360, "pid": 5714, "tid": 5714, "ts": 6300865691365.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865691578.174, "dur": 1.631, + "args": { + "External id": 82225, "cbid": 200, "correlation": 161138367 + } + }, + { + "ph": "f", "id": 161138367, "pid": 5714, "tid": 5714, "ts": 6300865691578.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865691579.954, "dur": 0.331, + "args": { + "External id": 82225, "cbid": 200, "correlation": 161138368 + } + }, + { + "ph": "f", "id": 161138368, "pid": 5714, "tid": 5714, "ts": 6300865691579.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865691612.565, "dur": 0.580, + "args": { + "External id": 82225, "cbid": 200, "correlation": 161138391 + } + }, + { + "ph": "f", "id": 161138391, "pid": 5714, "tid": 5714, "ts": 6300865691612.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865691623.354, "dur": 3.211, + "args": { + "External id": 82225, "cbid": 273, "correlation": 161138400 + } + }, + { + "ph": "f", "id": 161138400, "pid": 5714, "tid": 5714, "ts": 6300865691623.354, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel 
>, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865865304.771, "dur": 544.326, + "args": { + "External id": 82225, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138401, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138401, "pid": 0, "tid": 7, "ts": 6300865865304.771, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865691627.485, "dur": 12.579, + "args": { + "External id": 82225, "cbid": 211, "correlation": 161138401 + } + }, + { + "ph": "s", "id": 161138401, "pid": 5714, "tid": 5714, "ts": 6300865691627.485, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865865849.737, "dur": 145.537, + "args": { + "External id": 82231, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138424, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138424, "pid": 0, "tid": 7, "ts": 6300865865849.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865691718.794, "dur": 8.250, + "args": { + "External id": 82231, "cbid": 211, "correlation": 161138424 + } + }, + { + "ph": "s", "id": 161138424, "pid": 5714, "tid": 5714, "ts": 6300865691718.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865865995.882, "dur": 90.722, + "args": { + "External id": 82235, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138450, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138450, "pid": 0, "tid": 7, "ts": 6300865865995.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865691953.744, "dur": 10.790, + "args": { + "External id": 82235, "cbid": 307, "correlation": 161138450 + } + }, + { + "ph": "s", "id": 161138450, "pid": 5714, "tid": 5714, "ts": 6300865691953.744, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865866087.308, "dur": 341.988, + "args": { + "External id": 82236, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138470, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138470, "pid": 0, "tid": 7, "ts": 6300865866087.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692003.904, "dur": 8.300, + "args": { + "External id": 82236, "cbid": 211, "correlation": 161138470 + } + }, + { + "ph": "s", "id": 161138470, "pid": 5714, "tid": 5714, "ts": 6300865692003.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865866429.968, "dur": 559.718, + "args": { + "External id": 82237, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138493, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138493, "pid": 0, "tid": 7, "ts": 6300865866429.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692037.084, "dur": 6.289, + "args": { + "External id": 82237, "cbid": 211, "correlation": 161138493 + } + }, + { + "ph": "s", "id": 161138493, "pid": 5714, "tid": 5714, "ts": 6300865692037.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865866990.294, "dur": 214.595, + "args": { + "External id": 82238, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138505, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138505, "pid": 0, "tid": 7, "ts": 6300865866990.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692086.533, "dur": 7.910, + "args": { + "External id": 82238, "cbid": 307, "correlation": 161138505 + } + }, + { + "ph": "s", "id": 161138505, "pid": 5714, "tid": 5714, "ts": 6300865692086.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865692125.493, "dur": 2.900, + "args": { + "External id": 82239, "cbid": 210, "correlation": 161138525 + } + }, + { + "ph": "f", "id": 161138525, "pid": 5714, "tid": 5714, "ts": 6300865692125.493, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865867205.593, "dur": 475.013, + "args": { + "External id": 82239, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138526, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138526, "pid": 0, "tid": 7, "ts": 6300865867205.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692131.233, "dur": 7.180, + "args": { + "External id": 82239, "cbid": 211, "correlation": 161138526 + } + }, + { + "ph": "s", "id": 161138526, "pid": 5714, "tid": 5714, "ts": 6300865692131.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865867681.374, "dur": 60.481, + "args": { + "External id": 82240, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138533, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138533, "pid": 0, "tid": 7, "ts": 6300865867681.374, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692178.953, "dur": 7.050, + "args": { + "External id": 82240, "cbid": 307, "correlation": 161138533 + } + }, + { + "ph": "s", "id": 161138533, "pid": 5714, "tid": 5714, "ts": 6300865692178.953, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865861535.254, "dur": 156.130, + "args": { + "External id": 82256, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161138548, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": 
[32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161138548, "pid": 0, "tid": 17, "ts": 6300865861535.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692642.472, "dur": 13.830, + "args": { + "External id": 82256, "cbid": 211, "correlation": 161138548 + } + }, + { + "ph": "s", "id": 161138548, "pid": 5714, "tid": 5714, "ts": 6300865692642.472, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865861715.128, "dur": 278.116, + "args": { + "External id": 82272, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161138561, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161138561, "pid": 0, "tid": 17, "ts": 6300865861715.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865692774.182, "dur": 10.400, + "args": { + "External id": 82272, "cbid": 211, "correlation": 161138561 + } + }, + { + "ph": "s", "id": 161138561, "pid": 5714, "tid": 5714, "ts": 6300865692774.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692810.872, "dur": 1.430, + "args": { + "cbid": 135, "correlation": 161138571 + } + }, + { + "ph": "f", "id": 161138571, "pid": 5714, "tid": 5714, "ts": 6300865692810.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865692814.432, "dur": 1.410, + "args": { + "cbid": 147, "correlation": 161138575 + } + }, + { + "ph": "s", "id": 161138575, "pid": 5714, "tid": 5714, "ts": 6300865692814.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865692874.942, "dur": 0.949, + "args": { + "External id": 82274, "cbid": 317, "correlation": 161138588 + } + }, + { + "ph": "f", "id": 161138588, "pid": 5714, "tid": 5714, "ts": 6300865692874.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692878.151, "dur": 1.300, + "args": { + "External id": 82274, "cbid": 135, "correlation": 161138590 + } + }, + { + "ph": "f", "id": 161138590, "pid": 5714, "tid": 5714, "ts": 6300865692878.151, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865692880.982, "dur": 1.220, + 
"args": { + "External id": 82274, "cbid": 147, "correlation": 161138594 + } + }, + { + "ph": "s", "id": 161138594, "pid": 5714, "tid": 5714, "ts": 6300865692880.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865692899.271, "dur": 0.711, + "args": { + "External id": 82274, "cbid": 409, "correlation": 161138597 + } + }, + { + "ph": "f", "id": 161138597, "pid": 5714, "tid": 5714, "ts": 6300865692899.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692904.391, "dur": 0.951, + "args": { + "External id": 82274, "cbid": 135, "correlation": 161138600 + } + }, + { + "ph": "f", "id": 161138600, "pid": 5714, "tid": 5714, "ts": 6300865692904.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865692905.542, "dur": 0.929, + "args": { + "External id": 82274, "cbid": 147, "correlation": 161138601 + } + }, + { + "ph": "s", "id": 161138601, "pid": 5714, "tid": 5714, "ts": 6300865692905.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865863633.679, "dur": 4295.346, + "args": { + "External id": 82274, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161138603, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161138603, "pid": 0, "tid": 20, "ts": 6300865863633.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865692908.031, "dur": 10.651, + "args": { + "External id": 82274, "cbid": 430, "correlation": 161138603 + } + }, + { + "ph": "s", "id": 161138603, "pid": 5714, "tid": 5714, "ts": 6300865692908.031, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692919.702, "dur": 0.400, + "args": { + "External id": 82274, "cbid": 135, "correlation": 161138605 + } + }, + { + "ph": "f", "id": 161138605, "pid": 5714, "tid": 5714, "ts": 6300865692919.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865692920.211, "dur": 0.491, + "args": { + "External id": 82274, "cbid": 147, "correlation": 161138606 + } + }, + { + "ph": "s", "id": 161138606, "pid": 5714, "tid": 5714, "ts": 6300865692920.211, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692922.362, "dur": 1.089, + "args": { + "External id": 82274, "cbid": 135, "correlation": 161138609 + } + }, + { + "ph": "f", "id": 161138609, "pid": 5714, "tid": 5714, "ts": 6300865692922.362, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692933.942, "dur": 0.500, + "args": { + "External id": 
82274, "cbid": 135, "correlation": 161138616 + } + }, + { + "ph": "f", "id": 161138616, "pid": 5714, "tid": 5714, "ts": 6300865692933.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865692966.331, "dur": 1.060, + "args": { + "External id": 82276, "cbid": 147, "correlation": 161138621 + } + }, + { + "ph": "s", "id": 161138621, "pid": 5714, "tid": 5714, "ts": 6300865692966.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865692985.031, "dur": 0.980, + "args": { + "cbid": 135, "correlation": 161138636 + } + }, + { + "ph": "f", "id": 161138636, "pid": 5714, "tid": 5714, "ts": 6300865692985.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865693029.351, "dur": 1.290, + "args": { + "cbid": 147, "correlation": 161138641 + } + }, + { + "ph": "s", "id": 161138641, "pid": 5714, "tid": 5714, "ts": 6300865693029.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865693033.281, "dur": 0.850, + "args": { + "cbid": 147, "correlation": 161138645 + } + }, + { + "ph": "s", "id": 161138645, "pid": 5714, "tid": 5714, "ts": 6300865693033.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865693075.491, "dur": 2.470, + "args": { + "cbid": 147, "correlation": 161138651 + } + }, + { + "ph": "s", "id": 161138651, "pid": 5714, "tid": 5714, "ts": 6300865693075.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865693219.561, "dur": 1.230, + "args": { + "External id": 82289, 
"cbid": 317, "correlation": 161138692 + } + }, + { + "ph": "f", "id": 161138692, "pid": 5714, "tid": 5714, "ts": 6300865693219.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865693230.251, "dur": 2.920, + "args": { + "External id": 82290, "cbid": 138, "correlation": 161138695 + } + }, + { + "ph": "f", "id": 161138695, "pid": 5714, "tid": 5714, "ts": 6300865693230.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865867929.601, "dur": 2.688, + "args": { + "External id": 82294, "device": 0, "context": 1, "stream": 7, "correlation": 161138706, "bytes": 7224, "memory bandwidth (GB/s)": 2.6875 + } + }, + { + "ph": "f", "id": 161138706, "pid": 0, "tid": 7, "ts": 6300865867929.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865693258.331, "dur": 13.140, + "args": { + "External id": 82294, "cbid": 41, "correlation": 161138706 + } + }, + { + "ph": "s", "id": 161138706, "pid": 5714, "tid": 5714, "ts": 6300865693258.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865693277.171, "dur": 2.240, + "args": { + "External id": 82289, "cbid": 135, "correlation": 161138710 + } + }, + { + "ph": "f", "id": 161138710, "pid": 5714, "tid": 5714, "ts": 6300865693277.171, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865867934.433, "dur": 19.361, + "args": { + "External id": 82289, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138714, "registers 
per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138714, "pid": 0, "tid": 7, "ts": 6300865867934.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865693282.211, "dur": 10.570, + "args": { + "External id": 82289, "cbid": 211, "correlation": 161138714 + } + }, + { + "ph": "s", "id": 161138714, "pid": 5714, "tid": 5714, "ts": 6300865693282.211, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865693399.520, "dur": 1.450, + "args": { + "cbid": 135, "correlation": 161138725 + } + }, + { + "ph": "f", "id": 161138725, "pid": 5714, "tid": 5714, "ts": 6300865693399.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865867954.466, "dur": 33.088, + "args": { + "External id": 82301, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138751, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138751, "pid": 0, "tid": 7, "ts": 6300865867954.466, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865693631.200, "dur": 12.230, + "args": { + "External id": 82301, "cbid": 307, "correlation": 161138751 + } + }, + { + "ph": "s", "id": 161138751, "pid": 5714, "tid": 5714, "ts": 6300865693631.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865867988.162, "dur": 618.951, + "args": { + "External id": 82307, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138774, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138774, "pid": 0, "tid": 7, "ts": 6300865867988.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865693789.320, "dur": 10.769, + "args": { + "External id": 82307, "cbid": 211, "correlation": 161138774 + } + }, + { + "ph": "s", "id": 161138774, "pid": 5714, "tid": 5714, "ts": 6300865693789.320, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865868607.785, "dur": 178.370, + "args": { + "External id": 82308, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138797, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138797, "pid": 0, "tid": 7, "ts": 6300865868607.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865693826.500, "dur": 6.069, + "args": { + "External id": 82308, "cbid": 211, "correlation": 161138797 + } + }, + { + "ph": "s", "id": 161138797, "pid": 5714, "tid": 5714, "ts": 6300865693826.500, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865868786.763, "dur": 580.423, + "args": { + "External id": 82309, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138820, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138820, "pid": 0, "tid": 7, "ts": 6300865868786.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865693856.999, "dur": 6.790, + "args": { + "External id": 82309, "cbid": 211, "correlation": 161138820 + } + }, + { + "ph": "s", "id": 161138820, "pid": 5714, "tid": 5714, "ts": 6300865693856.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865869367.890, "dur": 52.193, + "args": { + "External id": 82326, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138840, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138840, "pid": 0, "tid": 7, "ts": 6300865869367.890, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694148.359, "dur": 10.590, + "args": { + "External id": 82326, "cbid": 307, "correlation": 161138840 + } + }, + { + "ph": "s", "id": 161138840, "pid": 5714, "tid": 5714, "ts": 6300865694148.359, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865869420.755, "dur": 62.432, + "args": { + "External id": 82342, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138858, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138858, "pid": 0, "tid": 7, "ts": 6300865869420.755, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694382.358, "dur": 9.780, + "args": { + "External id": 82342, "cbid": 307, "correlation": 161138858 + } + }, + { + "ph": "s", "id": 161138858, "pid": 5714, "tid": 5714, "ts": 6300865694382.358, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865694544.828, "dur": 0.930, + "args": { + "External id": 82348, "cbid": 200, "correlation": 161138865 + } + }, + { + "ph": "f", "id": 161138865, "pid": 5714, "tid": 5714, "ts": 6300865694544.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865694545.908, "dur": 0.220, + "args": { + "External id": 82348, "cbid": 200, "correlation": 161138866 + } + }, + { + "ph": "f", 
"id": 161138866, "pid": 5714, "tid": 5714, "ts": 6300865694545.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865694574.378, "dur": 0.440, + "args": { + "External id": 82348, "cbid": 200, "correlation": 161138889 + } + }, + { + "ph": "f", "id": 161138889, "pid": 5714, "tid": 5714, "ts": 6300865694574.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865694582.928, "dur": 2.230, + "args": { + "External id": 82348, "cbid": 273, "correlation": 161138898 + } + }, + { + "ph": "f", "id": 161138898, "pid": 5714, "tid": 5714, "ts": 6300865694582.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865869483.795, "dur": 424.454, + "args": { + "External id": 82348, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138899, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138899, "pid": 0, "tid": 7, "ts": 6300865869483.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694585.838, "dur": 11.730, + "args": { + "External id": 82348, "cbid": 211, "correlation": 161138899 + } + }, + { + "ph": "s", "id": 161138899, "pid": 5714, "tid": 5714, "ts": 6300865694585.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865869908.953, "dur": 144.705, + "args": { + "External id": 82354, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138922, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138922, "pid": 0, "tid": 7, "ts": 6300865869908.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694670.118, "dur": 8.369, + "args": { + "External id": 82354, "cbid": 211, "correlation": 161138922 + } + }, + { + "ph": "s", "id": 161138922, "pid": 5714, "tid": 5714, "ts": 6300865694670.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865870054.298, "dur": 90.113, + "args": { + "External id": 82358, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138948, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161138948, "pid": 0, "tid": 7, "ts": 6300865870054.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694856.347, "dur": 10.280, + "args": { + "External id": 82358, "cbid": 307, "correlation": 161138948 + } + }, + { + "ph": "s", "id": 161138948, "pid": 5714, "tid": 5714, "ts": 6300865694856.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865870145.019, "dur": 342.308, + "args": { + "External id": 82359, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138968, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138968, "pid": 0, "tid": 7, "ts": 6300865870145.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694902.617, "dur": 6.680, + "args": { + "External id": 82359, "cbid": 211, "correlation": 161138968 + } + }, + { + "ph": "s", "id": 161138968, "pid": 5714, "tid": 5714, "ts": 6300865694902.617, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865870488.063, "dur": 337.733, + "args": { + "External id": 82360, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161138991, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161138991, "pid": 0, "tid": 7, "ts": 6300865870488.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694933.627, "dur": 5.680, + "args": { + "External id": 82360, "cbid": 211, "correlation": 161138991 + } + }, + { + "ph": "s", "id": 161138991, "pid": 5714, "tid": 5714, "ts": 6300865694933.627, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865870826.500, "dur": 215.618, + "args": { + "External id": 82361, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139003, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139003, "pid": 0, "tid": 7, "ts": 6300865870826.500, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865694978.117, "dur": 6.780, + "args": { + "External id": 82361, "cbid": 307, "correlation": 161139003 + } + }, + { + "ph": "s", "id": 161139003, "pid": 5714, "tid": 5714, "ts": 6300865694978.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865695012.477, "dur": 1.840, + "args": { + "External id": 82362, "cbid": 210, "correlation": 161139023 + } + }, + { + "ph": "f", "id": 161139023, "pid": 5714, "tid": 5714, "ts": 6300865695012.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865871042.790, "dur": 394.021, + "args": { + "External id": 82362, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139024, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139024, "pid": 0, "tid": 7, "ts": 6300865871042.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865695016.377, "dur": 6.090, + "args": { + "External id": 82362, "cbid": 211, "correlation": 161139024 + } + }, + { + "ph": "s", "id": 161139024, "pid": 5714, "tid": 5714, "ts": 6300865695016.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865871437.547, "dur": 43.456, + "args": { + "External id": 82363, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139031, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139031, "pid": 0, "tid": 7, "ts": 6300865871437.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865695055.847, "dur": 5.950, + "args": { + "External id": 82363, "cbid": 307, "correlation": 161139031 + } + }, + { + "ph": "s", "id": 161139031, "pid": 5714, "tid": 5714, "ts": 6300865695055.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865863651.823, "dur": 48.481, + "args": { + "External id": 82379, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161139046, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161139046, "pid": 0, "tid": 17, "ts": 6300865863651.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865695488.956, "dur": 13.200, + "args": { + "External id": 82379, "cbid": 211, "correlation": 161139046 + } + }, + { + "ph": "s", "id": 161139046, "pid": 5714, "tid": 5714, "ts": 6300865695488.956, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865863723.056, "dur": 13.312, + "args": { + "External id": 82395, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161139059, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161139059, "pid": 0, "tid": 17, "ts": 6300865863723.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865695619.035, "dur": 10.470, + "args": { + "External id": 82395, "cbid": 211, "correlation": 161139059 + } + }, + { + "ph": "s", "id": 161139059, "pid": 5714, "tid": 5714, "ts": 6300865695619.035, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695656.735, "dur": 1.420, + "args": { + "cbid": 135, "correlation": 161139069 + } + }, + { + "ph": "f", "id": 161139069, "pid": 5714, "tid": 5714, "ts": 6300865695656.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695660.185, "dur": 1.440, + "args": { + "cbid": 147, "correlation": 161139073 + } + }, + { + "ph": "s", "id": 161139073, "pid": 5714, "tid": 5714, "ts": 6300865695660.185, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865695720.075, "dur": 0.970, + "args": { + "External id": 82397, "cbid": 317, "correlation": 161139086 + } + }, + { + "ph": "f", "id": 161139086, "pid": 5714, "tid": 5714, "ts": 6300865695720.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695723.085, "dur": 1.310, + "args": { + "External id": 82397, "cbid": 135, "correlation": 161139088 + } + }, + { + "ph": "f", "id": 161139088, "pid": 5714, "tid": 5714, "ts": 6300865695723.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695725.925, "dur": 1.190, + 
"args": { + "External id": 82397, "cbid": 147, "correlation": 161139092 + } + }, + { + "ph": "s", "id": 161139092, "pid": 5714, "tid": 5714, "ts": 6300865695725.925, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865695744.625, "dur": 0.770, + "args": { + "External id": 82397, "cbid": 409, "correlation": 161139095 + } + }, + { + "ph": "f", "id": 161139095, "pid": 5714, "tid": 5714, "ts": 6300865695744.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695749.765, "dur": 0.850, + "args": { + "External id": 82397, "cbid": 135, "correlation": 161139098 + } + }, + { + "ph": "f", "id": 161139098, "pid": 5714, "tid": 5714, "ts": 6300865695749.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695750.795, "dur": 1.010, + "args": { + "External id": 82397, "cbid": 147, "correlation": 161139099 + } + }, + { + "ph": "s", "id": 161139099, "pid": 5714, "tid": 5714, "ts": 6300865695750.795, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865867931.009, "dur": 6474.893, + "args": { + "External id": 82397, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161139101, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161139101, "pid": 0, "tid": 20, "ts": 6300865867931.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865695753.005, "dur": 10.610, + "args": { + "External id": 82397, "cbid": 430, "correlation": 161139101 + } + }, + { + "ph": "s", "id": 161139101, "pid": 5714, "tid": 5714, "ts": 6300865695753.005, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695764.645, "dur": 0.400, + "args": { + "External id": 82397, "cbid": 135, "correlation": 161139103 + } + }, + { + "ph": "f", "id": 161139103, "pid": 5714, "tid": 5714, "ts": 6300865695764.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695765.155, "dur": 0.740, + "args": { + "External id": 82397, "cbid": 147, "correlation": 161139104 + } + }, + { + "ph": "s", "id": 161139104, "pid": 5714, "tid": 5714, "ts": 6300865695765.155, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695767.635, "dur": 1.130, + "args": { + "External id": 82397, "cbid": 135, "correlation": 161139107 + } + }, + { + "ph": "f", "id": 161139107, "pid": 5714, "tid": 5714, "ts": 6300865695767.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695778.945, "dur": 0.480, + "args": { + "External id": 
82397, "cbid": 135, "correlation": 161139114 + } + }, + { + "ph": "f", "id": 161139114, "pid": 5714, "tid": 5714, "ts": 6300865695778.945, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695810.115, "dur": 1.160, + "args": { + "External id": 82399, "cbid": 147, "correlation": 161139119 + } + }, + { + "ph": "s", "id": 161139119, "pid": 5714, "tid": 5714, "ts": 6300865695810.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865695829.565, "dur": 1.150, + "args": { + "cbid": 135, "correlation": 161139134 + } + }, + { + "ph": "f", "id": 161139134, "pid": 5714, "tid": 5714, "ts": 6300865695829.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695873.945, "dur": 1.120, + "args": { + "cbid": 147, "correlation": 161139139 + } + }, + { + "ph": "s", "id": 161139139, "pid": 5714, "tid": 5714, "ts": 6300865695873.945, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695877.145, "dur": 0.670, + "args": { + "cbid": 147, "correlation": 161139143 + } + }, + { + "ph": "s", "id": 161139143, "pid": 5714, "tid": 5714, "ts": 6300865695877.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865695935.075, "dur": 2.420, + "args": { + "cbid": 147, "correlation": 161139149 + } + }, + { + "ph": "s", "id": 161139149, "pid": 5714, "tid": 5714, "ts": 6300865695935.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865696059.144, "dur": 1.280, + "args": { + "External id": 82412, 
"cbid": 317, "correlation": 161139190 + } + }, + { + "ph": "f", "id": 161139190, "pid": 5714, "tid": 5714, "ts": 6300865696059.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865696069.064, "dur": 2.490, + "args": { + "External id": 82413, "cbid": 138, "correlation": 161139193 + } + }, + { + "ph": "f", "id": 161139193, "pid": 5714, "tid": 5714, "ts": 6300865696069.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865874408.046, "dur": 2.112, + "args": { + "External id": 82417, "device": 0, "context": 1, "stream": 7, "correlation": 161139204, "bytes": 7224, "memory bandwidth (GB/s)": 3.4204545454545454 + } + }, + { + "ph": "f", "id": 161139204, "pid": 0, "tid": 7, "ts": 6300865874408.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865696095.334, "dur": 13.170, + "args": { + "External id": 82417, "cbid": 41, "correlation": 161139204 + } + }, + { + "ph": "s", "id": 161139204, "pid": 5714, "tid": 5714, "ts": 6300865696095.334, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865696114.734, "dur": 1.970, + "args": { + "External id": 82412, "cbid": 135, "correlation": 161139208 + } + }, + { + "ph": "f", "id": 161139208, "pid": 5714, "tid": 5714, "ts": 6300865696114.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865874412.462, "dur": 12.352, + "args": { + "External id": 82412, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161139212, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139212, "pid": 0, "tid": 7, "ts": 6300865874412.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865696119.554, "dur": 10.760, + "args": { + "External id": 82412, "cbid": 211, "correlation": 161139212 + } + }, + { + "ph": "s", "id": 161139212, "pid": 5714, "tid": 5714, "ts": 6300865696119.554, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865696223.504, "dur": 1.460, + "args": { + "cbid": 135, "correlation": 161139223 + } + }, + { + "ph": "f", "id": 161139223, "pid": 5714, "tid": 5714, "ts": 6300865696223.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865874425.422, "dur": 508.774, + "args": { + "External id": 82424, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139249, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139249, "pid": 0, "tid": 7, "ts": 6300865874425.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865696466.783, "dur": 12.290, + "args": { + "External id": 82424, "cbid": 307, "correlation": 161139249 + } + }, + { + "ph": "s", "id": 161139249, "pid": 5714, "tid": 5714, "ts": 6300865696466.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865875045.013, "dur": 411.365, + "args": { + "External id": 82430, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139272, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139272, "pid": 0, "tid": 7, "ts": 6300865875045.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865696622.993, "dur": 10.920, + "args": { + "External id": 82430, "cbid": 211, "correlation": 161139272 + } + }, + { + "ph": "s", "id": 161139272, "pid": 5714, "tid": 5714, "ts": 6300865696622.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865875457.050, "dur": 142.913, + "args": { + "External id": 82431, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139295, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139295, "pid": 0, "tid": 7, "ts": 6300865875457.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865696660.133, "dur": 5.730, + "args": { + "External id": 82431, "cbid": 211, "correlation": 161139295 + } + }, + { + "ph": "s", "id": 161139295, "pid": 5714, "tid": 5714, "ts": 6300865696660.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865875600.635, "dur": 143.170, + "args": { + "External id": 82432, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139318, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139318, "pid": 0, "tid": 7, "ts": 6300865875600.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865696690.353, "dur": 5.970, + "args": { + "External id": 82432, "cbid": 211, "correlation": 161139318 + } + }, + { + "ph": "s", "id": 161139318, "pid": 5714, "tid": 5714, "ts": 6300865696690.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865875744.445, "dur": 52.513, + "args": { + "External id": 82449, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139338, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139338, "pid": 0, "tid": 7, "ts": 6300865875744.445, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865696996.832, "dur": 10.530, + "args": { + "External id": 82449, "cbid": 307, "correlation": 161139338 + } + }, + { + "ph": "s", "id": 161139338, "pid": 5714, "tid": 5714, "ts": 6300865696996.832, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865875797.662, "dur": 61.216, + "args": { + "External id": 82465, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139356, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139356, "pid": 0, "tid": 7, "ts": 6300865875797.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697217.522, "dur": 9.590, + "args": { + "External id": 82465, "cbid": 307, "correlation": 161139356 + } + }, + { + "ph": "s", "id": 161139356, "pid": 5714, "tid": 5714, "ts": 6300865697217.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865697388.591, "dur": 0.580, + "args": { + "External id": 82471, "cbid": 200, "correlation": 161139363 + } + }, + { + "ph": "f", "id": 161139363, "pid": 5714, "tid": 5714, "ts": 6300865697388.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865697389.301, "dur": 0.230, + "args": { + "External id": 82471, "cbid": 200, "correlation": 161139364 + } + }, + { + "ph": "f", 
"id": 161139364, "pid": 5714, "tid": 5714, "ts": 6300865697389.301, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865697418.491, "dur": 0.430, + "args": { + "External id": 82471, "cbid": 200, "correlation": 161139387 + } + }, + { + "ph": "f", "id": 161139387, "pid": 5714, "tid": 5714, "ts": 6300865697418.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865697425.371, "dur": 2.140, + "args": { + "External id": 82471, "cbid": 273, "correlation": 161139396 + } + }, + { + "ph": "f", "id": 161139396, "pid": 5714, "tid": 5714, "ts": 6300865697425.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865875859.550, "dur": 426.662, + "args": { + "External id": 82471, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139397, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139397, "pid": 0, "tid": 7, "ts": 6300865875859.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697428.171, "dur": 11.630, + "args": { + "External id": 82471, "cbid": 211, "correlation": 161139397 + } + }, + { + "ph": "s", "id": 161139397, "pid": 5714, "tid": 5714, "ts": 6300865697428.171, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865876286.852, "dur": 144.737, + "args": { + "External id": 82477, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139420, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139420, "pid": 0, "tid": 7, "ts": 6300865876286.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697512.081, "dur": 8.410, + "args": { + "External id": 82477, "cbid": 211, "correlation": 161139420 + } + }, + { + "ph": "s", "id": 161139420, "pid": 5714, "tid": 5714, "ts": 6300865697512.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865876432.261, "dur": 92.641, + "args": { + "External id": 82481, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139446, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139446, "pid": 0, "tid": 7, "ts": 6300865876432.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697699.261, "dur": 9.839, + "args": { + "External id": 82481, "cbid": 307, "correlation": 161139446 + } + }, + { + "ph": "s", "id": 161139446, "pid": 5714, "tid": 5714, "ts": 6300865697699.261, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865876525.542, "dur": 342.500, + "args": { + "External id": 82482, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139466, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139466, "pid": 0, "tid": 7, "ts": 6300865876525.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697744.980, "dur": 6.531, + "args": { + "External id": 82482, "cbid": 211, "correlation": 161139466 + } + }, + { + "ph": "s", "id": 161139466, "pid": 5714, "tid": 5714, "ts": 6300865697744.980, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865876868.778, "dur": 338.916, + "args": { + "External id": 82483, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139489, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139489, "pid": 0, "tid": 7, "ts": 6300865876868.778, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697776.940, "dur": 6.360, + "args": { + "External id": 82483, "cbid": 211, "correlation": 161139489 + } + }, + { + "ph": "s", "id": 161139489, "pid": 5714, "tid": 5714, "ts": 6300865697776.940, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865877208.398, "dur": 214.915, + "args": { + "External id": 82484, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139501, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139501, "pid": 0, "tid": 7, "ts": 6300865877208.398, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697820.970, "dur": 6.190, + "args": { + "External id": 82484, "cbid": 307, "correlation": 161139501 + } + }, + { + "ph": "s", "id": 161139501, "pid": 5714, "tid": 5714, "ts": 6300865697820.970, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865697854.370, "dur": 1.370, + "args": { + "External id": 82485, "cbid": 210, "correlation": 161139521 + } + }, + { + "ph": "f", "id": 161139521, "pid": 5714, "tid": 5714, "ts": 6300865697854.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865877423.953, "dur": 347.940, + "args": { + "External id": 82485, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139522, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139522, "pid": 0, "tid": 7, "ts": 6300865877423.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697857.790, "dur": 6.570, + "args": { + "External id": 82485, "cbid": 211, "correlation": 161139522 + } + }, + { + "ph": "s", "id": 161139522, "pid": 5714, "tid": 5714, "ts": 6300865697857.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865877772.533, "dur": 44.480, + "args": { + "External id": 82486, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139529, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139529, "pid": 0, "tid": 7, "ts": 6300865877772.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865697898.650, "dur": 5.660, + "args": { + "External id": 82486, "cbid": 307, "correlation": 161139529 + } + }, + { + "ph": "s", "id": 161139529, "pid": 5714, "tid": 5714, "ts": 6300865697898.650, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865867956.706, "dur": 42.624, + "args": { + "External id": 82502, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161139544, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161139544, "pid": 0, "tid": 17, "ts": 6300865867956.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865698338.699, "dur": 17.660, + "args": { + "External id": 82502, "cbid": 211, "correlation": 161139544 + } + }, + { + "ph": "s", "id": 161139544, "pid": 5714, "tid": 5714, "ts": 6300865698338.699, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865868018.146, "dur": 12.608, + "args": { + "External id": 82518, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161139557, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161139557, "pid": 0, "tid": 17, "ts": 6300865868018.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865698477.889, "dur": 10.350, + "args": { + "External id": 82518, "cbid": 211, "correlation": 161139557 + } + }, + { + "ph": "s", "id": 161139557, "pid": 5714, "tid": 5714, "ts": 6300865698477.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698514.429, "dur": 1.510, + "args": { + "cbid": 135, "correlation": 161139567 + } + }, + { + "ph": "f", "id": 161139567, "pid": 5714, "tid": 5714, "ts": 6300865698514.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698518.039, "dur": 1.460, + "args": { + "cbid": 147, "correlation": 161139571 + } + }, + { + "ph": "s", "id": 161139571, "pid": 5714, "tid": 5714, "ts": 6300865698518.039, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865698579.789, "dur": 1.040, + "args": { + "External id": 82520, "cbid": 317, "correlation": 161139584 + } + }, + { + "ph": "f", "id": 161139584, "pid": 5714, "tid": 5714, "ts": 6300865698579.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698582.849, "dur": 1.349, + "args": { + "External id": 82520, "cbid": 135, "correlation": 161139586 + } + }, + { + "ph": "f", "id": 161139586, "pid": 5714, "tid": 5714, "ts": 6300865698582.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698585.769, "dur": 1.280, + 
"args": { + "External id": 82520, "cbid": 147, "correlation": 161139590 + } + }, + { + "ph": "s", "id": 161139590, "pid": 5714, "tid": 5714, "ts": 6300865698585.769, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865698603.269, "dur": 0.749, + "args": { + "External id": 82520, "cbid": 409, "correlation": 161139593 + } + }, + { + "ph": "f", "id": 161139593, "pid": 5714, "tid": 5714, "ts": 6300865698603.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698608.389, "dur": 0.829, + "args": { + "External id": 82520, "cbid": 135, "correlation": 161139596 + } + }, + { + "ph": "f", "id": 161139596, "pid": 5714, "tid": 5714, "ts": 6300865698608.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698609.418, "dur": 0.920, + "args": { + "External id": 82520, "cbid": 147, "correlation": 161139597 + } + }, + { + "ph": "s", "id": 161139597, "pid": 5714, "tid": 5714, "ts": 6300865698609.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865874408.142, "dur": 13071.065, + "args": { + "External id": 82520, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161139599, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161139599, "pid": 0, "tid": 20, "ts": 6300865874408.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865698611.578, "dur": 10.420, + "args": { + "External id": 82520, "cbid": 430, "correlation": 161139599 + } + }, + { + "ph": "s", "id": 161139599, "pid": 5714, "tid": 5714, "ts": 6300865698611.578, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698623.269, "dur": 0.440, + "args": { + "External id": 82520, "cbid": 135, "correlation": 161139601 + } + }, + { + "ph": "f", "id": 161139601, "pid": 5714, "tid": 5714, "ts": 6300865698623.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698623.818, "dur": 0.600, + "args": { + "External id": 82520, "cbid": 147, "correlation": 161139602 + } + }, + { + "ph": "s", "id": 161139602, "pid": 5714, "tid": 5714, "ts": 6300865698623.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698626.098, "dur": 0.831, + "args": { + "External id": 82520, "cbid": 135, "correlation": 161139605 + } + }, + { + "ph": "f", "id": 161139605, "pid": 5714, "tid": 5714, "ts": 6300865698626.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698639.009, "dur": 0.449, + "args": { + "External id": 
82520, "cbid": 135, "correlation": 161139612 + } + }, + { + "ph": "f", "id": 161139612, "pid": 5714, "tid": 5714, "ts": 6300865698639.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698672.218, "dur": 1.031, + "args": { + "External id": 82522, "cbid": 147, "correlation": 161139617 + } + }, + { + "ph": "s", "id": 161139617, "pid": 5714, "tid": 5714, "ts": 6300865698672.218, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698691.228, "dur": 1.040, + "args": { + "cbid": 135, "correlation": 161139632 + } + }, + { + "ph": "f", "id": 161139632, "pid": 5714, "tid": 5714, "ts": 6300865698691.228, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698735.448, "dur": 1.150, + "args": { + "cbid": 147, "correlation": 161139637 + } + }, + { + "ph": "s", "id": 161139637, "pid": 5714, "tid": 5714, "ts": 6300865698735.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698738.558, "dur": 0.640, + "args": { + "cbid": 147, "correlation": 161139641 + } + }, + { + "ph": "s", "id": 161139641, "pid": 5714, "tid": 5714, "ts": 6300865698738.558, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865698780.758, "dur": 2.350, + "args": { + "cbid": 147, "correlation": 161139647 + } + }, + { + "ph": "s", "id": 161139647, "pid": 5714, "tid": 5714, "ts": 6300865698780.758, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865698903.138, "dur": 1.320, + "args": { + "External id": 82535, 
"cbid": 317, "correlation": 161139688 + } + }, + { + "ph": "f", "id": 161139688, "pid": 5714, "tid": 5714, "ts": 6300865698903.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865698914.588, "dur": 2.390, + "args": { + "External id": 82536, "cbid": 138, "correlation": 161139691 + } + }, + { + "ph": "f", "id": 161139691, "pid": 5714, "tid": 5714, "ts": 6300865698914.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865887486.599, "dur": 2.272, + "args": { + "External id": 82540, "device": 0, "context": 1, "stream": 7, "correlation": 161139702, "bytes": 7224, "memory bandwidth (GB/s)": 3.1795774647887325 + } + }, + { + "ph": "f", "id": 161139702, "pid": 0, "tid": 7, "ts": 6300865887486.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865698940.838, "dur": 13.100, + "args": { + "External id": 82540, "cbid": 41, "correlation": 161139702 + } + }, + { + "ph": "s", "id": 161139702, "pid": 5714, "tid": 5714, "ts": 6300865698940.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865698958.398, "dur": 2.120, + "args": { + "External id": 82535, "cbid": 135, "correlation": 161139706 + } + }, + { + "ph": "f", "id": 161139706, "pid": 5714, "tid": 5714, "ts": 6300865698958.398, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865887490.887, "dur": 623.367, + "args": { + "External id": 82535, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161139710, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139710, "pid": 0, "tid": 7, "ts": 6300865887490.887, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865698963.338, "dur": 10.790, + "args": { + "External id": 82535, "cbid": 211, "correlation": 161139710 + } + }, + { + "ph": "s", "id": 161139710, "pid": 5714, "tid": 5714, "ts": 6300865698963.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865699069.357, "dur": 1.460, + "args": { + "cbid": 135, "correlation": 161139721 + } + }, + { + "ph": "f", "id": 161139721, "pid": 5714, "tid": 5714, "ts": 6300865699069.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865888237.968, "dur": 162.434, + "args": { + "External id": 82547, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139747, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139747, "pid": 0, "tid": 7, "ts": 6300865888237.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865699309.517, "dur": 11.860, + "args": { + "External id": 82547, "cbid": 307, "correlation": 161139747 + } + }, + { + "ph": "s", "id": 161139747, "pid": 5714, "tid": 5714, "ts": 6300865699309.517, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865888401.138, "dur": 147.362, + "args": { + "External id": 82553, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139770, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139770, "pid": 0, "tid": 7, "ts": 6300865888401.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865699466.927, "dur": 10.660, + "args": { + "External id": 82553, "cbid": 211, "correlation": 161139770 + } + }, + { + "ph": "s", "id": 161139770, "pid": 5714, "tid": 5714, "ts": 6300865699466.927, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865888549.140, "dur": 143.617, + "args": { + "External id": 82554, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139793, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139793, "pid": 0, "tid": 7, "ts": 6300865888549.140, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865699503.127, "dur": 6.549, + "args": { + "External id": 82554, "cbid": 211, "correlation": 161139793 + } + }, + { + "ph": "s", "id": 161139793, "pid": 5714, "tid": 5714, "ts": 6300865699503.127, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865888693.397, "dur": 143.138, + "args": { + "External id": 82555, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139816, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139816, "pid": 0, "tid": 7, "ts": 6300865888693.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865699532.036, "dur": 5.531, + "args": { + "External id": 82555, "cbid": 211, "correlation": 161139816 + } + }, + { + "ph": "s", "id": 161139816, "pid": 5714, "tid": 5714, "ts": 6300865699532.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865888837.175, "dur": 52.832, + "args": { + "External id": 82572, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139836, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139836, "pid": 0, "tid": 7, "ts": 6300865888837.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865699816.006, "dur": 10.450, + "args": { + "External id": 82572, "cbid": 307, "correlation": 161139836 + } + }, + { + "ph": "s", "id": 161139836, "pid": 5714, "tid": 5714, "ts": 6300865699816.006, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865888890.679, "dur": 61.633, + "args": { + "External id": 82588, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139854, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139854, "pid": 0, "tid": 7, "ts": 6300865888890.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700048.375, "dur": 9.770, + "args": { + "External id": 82588, "cbid": 307, "correlation": 161139854 + } + }, + { + "ph": "s", "id": 161139854, "pid": 5714, "tid": 5714, "ts": 6300865700048.375, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865700206.825, "dur": 0.600, + "args": { + "External id": 82594, "cbid": 200, "correlation": 161139861 + } + }, + { + "ph": "f", "id": 161139861, "pid": 5714, "tid": 5714, "ts": 6300865700206.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865700207.565, "dur": 0.220, + "args": { + "External id": 82594, "cbid": 200, "correlation": 161139862 + } + }, + { + "ph": "f", 
"id": 161139862, "pid": 5714, "tid": 5714, "ts": 6300865700207.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865700237.215, "dur": 0.430, + "args": { + "External id": 82594, "cbid": 200, "correlation": 161139885 + } + }, + { + "ph": "f", "id": 161139885, "pid": 5714, "tid": 5714, "ts": 6300865700237.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865700243.845, "dur": 2.190, + "args": { + "External id": 82594, "cbid": 273, "correlation": 161139894 + } + }, + { + "ph": "f", "id": 161139894, "pid": 5714, "tid": 5714, "ts": 6300865700243.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865888952.920, "dur": 424.133, + "args": { + "External id": 82594, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139895, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139895, "pid": 0, "tid": 7, "ts": 6300865888952.920, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700246.675, "dur": 11.560, + "args": { + "External id": 82594, "cbid": 211, "correlation": 161139895 + } + }, + { + "ph": "s", "id": 161139895, "pid": 5714, "tid": 5714, "ts": 6300865700246.675, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865889377.789, "dur": 144.290, + "args": { + "External id": 82600, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139918, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139918, "pid": 0, "tid": 7, "ts": 6300865889377.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700338.985, "dur": 8.829, + "args": { + "External id": 82600, "cbid": 211, "correlation": 161139918 + } + }, + { + "ph": "s", "id": 161139918, "pid": 5714, "tid": 5714, "ts": 6300865700338.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865889522.751, "dur": 90.561, + "args": { + "External id": 82604, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139944, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139944, "pid": 0, "tid": 7, "ts": 6300865889522.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700529.634, "dur": 10.050, + "args": { + "External id": 82604, "cbid": 307, "correlation": 161139944 + } + }, + { + "ph": "s", "id": 161139944, "pid": 5714, "tid": 5714, "ts": 6300865700529.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865889613.984, "dur": 348.260, + "args": { + "External id": 82605, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139964, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139964, "pid": 0, "tid": 7, "ts": 6300865889613.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700576.054, "dur": 6.710, + "args": { + "External id": 82605, "cbid": 211, "correlation": 161139964 + } + }, + { + "ph": "s", "id": 161139964, "pid": 5714, "tid": 5714, "ts": 6300865700576.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865889962.980, "dur": 338.852, + "args": { + "External id": 82606, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139987, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161139987, "pid": 0, "tid": 7, "ts": 6300865889962.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700606.734, "dur": 6.060, + "args": { + "External id": 82606, "cbid": 211, "correlation": 161139987 + } + }, + { + "ph": "s", "id": 161139987, "pid": 5714, "tid": 5714, "ts": 6300865700606.734, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865890302.472, "dur": 214.371, + "args": { + "External id": 82607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161139999, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161139999, "pid": 0, "tid": 7, "ts": 6300865890302.472, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700649.234, "dur": 6.420, + "args": { + "External id": 82607, "cbid": 307, "correlation": 161139999 + } + }, + { + "ph": "s", "id": 161139999, "pid": 5714, "tid": 5714, "ts": 6300865700649.234, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865700683.034, "dur": 1.370, + "args": { + "External id": 82608, "cbid": 210, "correlation": 161140019 + } + }, + { + "ph": "f", "id": 161140019, "pid": 5714, "tid": 5714, "ts": 6300865700683.034, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865890517.483, "dur": 350.180, + "args": { + "External id": 82608, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140020, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140020, "pid": 0, "tid": 7, "ts": 6300865890517.483, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700686.384, "dur": 5.840, + "args": { + "External id": 82608, "cbid": 211, "correlation": 161140020 + } + }, + { + "ph": "s", "id": 161140020, "pid": 5714, "tid": 5714, "ts": 6300865700686.384, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865890868.335, "dur": 41.120, + "args": { + "External id": 82609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140027, "pid": 0, "tid": 7, "ts": 6300865890868.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865700724.424, "dur": 5.700, + "args": { + "External id": 82609, "cbid": 307, "correlation": 161140027 + } + }, + { + "ph": "s", "id": 161140027, "pid": 5714, "tid": 5714, "ts": 6300865700724.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865874428.654, "dur": 785.769, + "args": { + "External id": 82625, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161140042, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161140042, "pid": 0, "tid": 17, "ts": 6300865874428.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865701136.833, "dur": 13.120, + "args": { + "External id": 82625, "cbid": 211, "correlation": 161140042 + } + }, + { + "ph": "s", "id": 161140042, "pid": 5714, "tid": 5714, "ts": 6300865701136.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865875339.768, "dur": 12.608, + "args": { + "External id": 82641, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161140055, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161140055, "pid": 0, "tid": 17, "ts": 6300865875339.768, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865701264.872, "dur": 10.520, + "args": { + "External id": 82641, "cbid": 211, "correlation": 161140055 + } + }, + { + "ph": "s", "id": 161140055, "pid": 5714, "tid": 5714, "ts": 6300865701264.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701311.972, "dur": 1.480, + "args": { + "cbid": 135, "correlation": 161140065 + } + }, + { + "ph": "f", "id": 161140065, "pid": 5714, "tid": 5714, "ts": 6300865701311.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701315.472, "dur": 1.471, + "args": { + "cbid": 147, "correlation": 161140069 + } + }, + { + "ph": "s", "id": 161140069, "pid": 5714, "tid": 5714, "ts": 6300865701315.472, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865701377.382, "dur": 1.010, + "args": { + "External id": 82643, "cbid": 317, "correlation": 161140082 + } + }, + { + "ph": "f", "id": 161140082, "pid": 5714, "tid": 5714, "ts": 6300865701377.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701380.442, "dur": 1.260, + "args": { + "External id": 82643, "cbid": 135, "correlation": 161140084 + } + }, + { + "ph": "f", "id": 161140084, "pid": 5714, "tid": 5714, "ts": 6300865701380.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701383.162, "dur": 1.240, + 
"args": { + "External id": 82643, "cbid": 147, "correlation": 161140088 + } + }, + { + "ph": "s", "id": 161140088, "pid": 5714, "tid": 5714, "ts": 6300865701383.162, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865701402.462, "dur": 0.660, + "args": { + "External id": 82643, "cbid": 409, "correlation": 161140091 + } + }, + { + "ph": "f", "id": 161140091, "pid": 5714, "tid": 5714, "ts": 6300865701402.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701407.552, "dur": 0.800, + "args": { + "External id": 82643, "cbid": 135, "correlation": 161140094 + } + }, + { + "ph": "f", "id": 161140094, "pid": 5714, "tid": 5714, "ts": 6300865701407.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701408.552, "dur": 0.890, + "args": { + "External id": 82643, "cbid": 147, "correlation": 161140095 + } + }, + { + "ph": "s", "id": 161140095, "pid": 5714, "tid": 5714, "ts": 6300865701408.552, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865887481.543, "dur": 6061.319, + "args": { + "External id": 82643, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161140097, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161140097, "pid": 0, "tid": 20, "ts": 6300865887481.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865701410.662, "dur": 10.870, + "args": { + "External id": 82643, "cbid": 430, "correlation": 161140097 + } + }, + { + "ph": "s", "id": 161140097, "pid": 5714, "tid": 5714, "ts": 6300865701410.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701422.702, "dur": 0.440, + "args": { + "External id": 82643, "cbid": 135, "correlation": 161140099 + } + }, + { + "ph": "f", "id": 161140099, "pid": 5714, "tid": 5714, "ts": 6300865701422.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701423.372, "dur": 0.530, + "args": { + "External id": 82643, "cbid": 147, "correlation": 161140100 + } + }, + { + "ph": "s", "id": 161140100, "pid": 5714, "tid": 5714, "ts": 6300865701423.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701425.512, "dur": 0.890, + "args": { + "External id": 82643, "cbid": 135, "correlation": 161140103 + } + }, + { + "ph": "f", "id": 161140103, "pid": 5714, "tid": 5714, "ts": 6300865701425.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701436.932, "dur": 0.480, + "args": { + "External id": 
82643, "cbid": 135, "correlation": 161140110 + } + }, + { + "ph": "f", "id": 161140110, "pid": 5714, "tid": 5714, "ts": 6300865701436.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701468.492, "dur": 1.100, + "args": { + "External id": 82645, "cbid": 147, "correlation": 161140115 + } + }, + { + "ph": "s", "id": 161140115, "pid": 5714, "tid": 5714, "ts": 6300865701468.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701487.502, "dur": 0.900, + "args": { + "cbid": 135, "correlation": 161140130 + } + }, + { + "ph": "f", "id": 161140130, "pid": 5714, "tid": 5714, "ts": 6300865701487.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701531.652, "dur": 1.200, + "args": { + "cbid": 147, "correlation": 161140135 + } + }, + { + "ph": "s", "id": 161140135, "pid": 5714, "tid": 5714, "ts": 6300865701531.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701534.982, "dur": 0.690, + "args": { + "cbid": 147, "correlation": 161140139 + } + }, + { + "ph": "s", "id": 161140139, "pid": 5714, "tid": 5714, "ts": 6300865701534.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865701576.132, "dur": 2.260, + "args": { + "cbid": 147, "correlation": 161140145 + } + }, + { + "ph": "s", "id": 161140145, "pid": 5714, "tid": 5714, "ts": 6300865701576.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865701711.531, "dur": 1.311, + "args": { + "External id": 82658, 
"cbid": 317, "correlation": 161140186 + } + }, + { + "ph": "f", "id": 161140186, "pid": 5714, "tid": 5714, "ts": 6300865701711.531, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865701721.322, "dur": 2.509, + "args": { + "External id": 82659, "cbid": 138, "correlation": 161140189 + } + }, + { + "ph": "f", "id": 161140189, "pid": 5714, "tid": 5714, "ts": 6300865701721.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865893549.774, "dur": 2.336, + "args": { + "External id": 82663, "device": 0, "context": 1, "stream": 7, "correlation": 161140200, "bytes": 7224, "memory bandwidth (GB/s)": 3.0924657534246576 + } + }, + { + "ph": "f", "id": 161140200, "pid": 0, "tid": 7, "ts": 6300865893549.774, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865701747.502, "dur": 13.239, + "args": { + "External id": 82663, "cbid": 41, "correlation": 161140200 + } + }, + { + "ph": "s", "id": 161140200, "pid": 5714, "tid": 5714, "ts": 6300865701747.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701765.731, "dur": 1.940, + "args": { + "External id": 82658, "cbid": 135, "correlation": 161140204 + } + }, + { + "ph": "f", "id": 161140204, "pid": 5714, "tid": 5714, "ts": 6300865701765.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865893554.511, "dur": 464.709, + "args": { + "External id": 82658, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161140208, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140208, "pid": 0, "tid": 7, "ts": 6300865893554.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865701770.811, "dur": 10.560, + "args": { + "External id": 82658, "cbid": 211, "correlation": 161140208 + } + }, + { + "ph": "s", "id": 161140208, "pid": 5714, "tid": 5714, "ts": 6300865701770.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865701876.881, "dur": 1.390, + "args": { + "cbid": 135, "correlation": 161140219 + } + }, + { + "ph": "f", "id": 161140219, "pid": 5714, "tid": 5714, "ts": 6300865701876.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865894105.557, "dur": 367.332, + "args": { + "External id": 82670, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140245, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140245, "pid": 0, "tid": 7, "ts": 6300865894105.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865702112.010, "dur": 11.620, + "args": { + "External id": 82670, "cbid": 307, "correlation": 161140245 + } + }, + { + "ph": "s", "id": 161140245, "pid": 5714, "tid": 5714, "ts": 6300865702112.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865894473.529, "dur": 145.250, + "args": { + "External id": 82676, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140268, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140268, "pid": 0, "tid": 7, "ts": 6300865894473.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865702268.780, "dur": 10.750, + "args": { + "External id": 82676, "cbid": 211, "correlation": 161140268 + } + }, + { + "ph": "s", "id": 161140268, "pid": 5714, "tid": 5714, "ts": 6300865702268.780, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865894619.387, "dur": 143.362, + "args": { + "External id": 82677, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140291, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140291, "pid": 0, "tid": 7, "ts": 6300865894619.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865702314.510, "dur": 7.260, + "args": { + "External id": 82677, "cbid": 211, "correlation": 161140291 + } + }, + { + "ph": "s", "id": 161140291, "pid": 5714, "tid": 5714, "ts": 6300865702314.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865894763.453, "dur": 143.265, + "args": { + "External id": 82678, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140314, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140314, "pid": 0, "tid": 7, "ts": 6300865894763.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865702351.200, "dur": 5.480, + "args": { + "External id": 82678, "cbid": 211, "correlation": 161140314 + } + }, + { + "ph": "s", "id": 161140314, "pid": 5714, "tid": 5714, "ts": 6300865702351.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865894907.358, "dur": 53.441, + "args": { + "External id": 82695, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140334, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140334, "pid": 0, "tid": 7, "ts": 6300865894907.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865702636.749, "dur": 10.450, + "args": { + "External id": 82695, "cbid": 307, "correlation": 161140334 + } + }, + { + "ph": "s", "id": 161140334, "pid": 5714, "tid": 5714, "ts": 6300865702636.749, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865894961.439, "dur": 60.833, + "args": { + "External id": 82711, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140352, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140352, "pid": 0, "tid": 7, "ts": 6300865894961.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865702851.809, "dur": 9.380, + "args": { + "External id": 82711, "cbid": 307, "correlation": 161140352 + } + }, + { + "ph": "s", "id": 161140352, "pid": 5714, "tid": 5714, "ts": 6300865702851.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865703010.408, "dur": 0.540, + "args": { + "External id": 82717, "cbid": 200, "correlation": 161140359 + } + }, + { + "ph": "f", "id": 161140359, "pid": 5714, "tid": 5714, "ts": 6300865703010.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865703011.199, "dur": 0.220, + "args": { + "External id": 82717, "cbid": 200, "correlation": 161140360 + } + }, + { + "ph": "f", 
"id": 161140360, "pid": 5714, "tid": 5714, "ts": 6300865703011.199, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865703040.499, "dur": 0.429, + "args": { + "External id": 82717, "cbid": 200, "correlation": 161140383 + } + }, + { + "ph": "f", "id": 161140383, "pid": 5714, "tid": 5714, "ts": 6300865703040.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865703048.099, "dur": 2.209, + "args": { + "External id": 82717, "cbid": 273, "correlation": 161140392 + } + }, + { + "ph": "f", "id": 161140392, "pid": 5714, "tid": 5714, "ts": 6300865703048.099, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865895022.912, "dur": 423.748, + "args": { + "External id": 82717, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140393, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140393, "pid": 0, "tid": 7, "ts": 6300865895022.912, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703050.948, "dur": 11.431, + "args": { + "External id": 82717, "cbid": 211, "correlation": 161140393 + } + }, + { + "ph": "s", "id": 161140393, "pid": 5714, "tid": 5714, "ts": 6300865703050.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865895447.396, "dur": 145.154, + "args": { + "External id": 82723, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140416, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140416, "pid": 0, "tid": 7, "ts": 6300865895447.396, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703134.998, "dur": 8.520, + "args": { + "External id": 82723, "cbid": 211, "correlation": 161140416 + } + }, + { + "ph": "s", "id": 161140416, "pid": 5714, "tid": 5714, "ts": 6300865703134.998, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865895593.190, "dur": 89.697, + "args": { + "External id": 82727, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140442, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140442, "pid": 0, "tid": 7, "ts": 6300865895593.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703327.278, "dur": 10.230, + "args": { + "External id": 82727, "cbid": 307, "correlation": 161140442 + } + }, + { + "ph": "s", "id": 161140442, "pid": 5714, "tid": 5714, "ts": 6300865703327.278, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865895683.495, "dur": 341.700, + "args": { + "External id": 82728, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140462, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140462, "pid": 0, "tid": 7, "ts": 6300865895683.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703373.908, "dur": 6.570, + "args": { + "External id": 82728, "cbid": 211, "correlation": 161140462 + } + }, + { + "ph": "s", "id": 161140462, "pid": 5714, "tid": 5714, "ts": 6300865703373.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865896025.835, "dur": 338.692, + "args": { + "External id": 82729, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140485, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140485, "pid": 0, "tid": 7, "ts": 6300865896025.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703405.068, "dur": 6.540, + "args": { + "External id": 82729, "cbid": 211, "correlation": 161140485 + } + }, + { + "ph": "s", "id": 161140485, "pid": 5714, "tid": 5714, "ts": 6300865703405.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865896365.263, "dur": 214.339, + "args": { + "External id": 82730, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140497, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140497, "pid": 0, "tid": 7, "ts": 6300865896365.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703450.047, "dur": 6.060, + "args": { + "External id": 82730, "cbid": 307, "correlation": 161140497 + } + }, + { + "ph": "s", "id": 161140497, "pid": 5714, "tid": 5714, "ts": 6300865703450.047, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865703484.798, "dur": 1.409, + "args": { + "External id": 82731, "cbid": 210, "correlation": 161140517 + } + }, + { + "ph": "f", "id": 161140517, "pid": 5714, "tid": 5714, "ts": 6300865703484.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865896580.338, "dur": 349.988, + "args": { + "External id": 82731, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140518, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140518, "pid": 0, "tid": 7, "ts": 6300865896580.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703488.187, "dur": 5.951, + "args": { + "External id": 82731, "cbid": 211, "correlation": 161140518 + } + }, + { + "ph": "s", "id": 161140518, "pid": 5714, "tid": 5714, "ts": 6300865703488.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865896930.966, "dur": 43.136, + "args": { + "External id": 82732, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140525, "pid": 0, "tid": 7, "ts": 6300865896930.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703528.077, "dur": 5.700, + "args": { + "External id": 82732, "cbid": 307, "correlation": 161140525 + } + }, + { + "ph": "s", "id": 161140525, "pid": 5714, "tid": 5714, "ts": 6300865703528.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865888399.314, "dur": 23.744, + "args": { + "External id": 82748, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161140540, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161140540, "pid": 0, "tid": 17, "ts": 6300865888399.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865703935.606, "dur": 13.271, + "args": { + "External id": 82748, "cbid": 211, "correlation": 161140540 + } + }, + { + "ph": "s", "id": 161140540, "pid": 5714, "tid": 5714, "ts": 6300865703935.606, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865888431.986, "dur": 14.208, + "args": { + "External id": 82764, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161140553, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161140553, "pid": 0, "tid": 17, "ts": 6300865888431.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865704071.396, "dur": 10.600, + "args": { + "External id": 82764, "cbid": 211, "correlation": 161140553 + } + }, + { + "ph": "s", "id": 161140553, "pid": 5714, "tid": 5714, "ts": 6300865704071.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704108.746, "dur": 1.480, + "args": { + "cbid": 135, "correlation": 161140563 + } + }, + { + "ph": "f", "id": 161140563, "pid": 5714, "tid": 5714, "ts": 6300865704108.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704112.266, "dur": 1.420, + "args": { + "cbid": 147, "correlation": 161140567 + } + }, + { + "ph": "s", "id": 161140567, "pid": 5714, "tid": 5714, "ts": 6300865704112.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865704173.076, "dur": 1.020, + "args": { + "External id": 82766, "cbid": 317, "correlation": 161140580 + } + }, + { + "ph": "f", "id": 161140580, "pid": 5714, "tid": 5714, "ts": 6300865704173.076, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704176.096, "dur": 1.210, + "args": { + "External id": 82766, "cbid": 135, "correlation": 161140582 + } + }, + { + "ph": "f", "id": 161140582, "pid": 5714, "tid": 5714, "ts": 6300865704176.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704178.826, "dur": 1.340, + 
"args": { + "External id": 82766, "cbid": 147, "correlation": 161140586 + } + }, + { + "ph": "s", "id": 161140586, "pid": 5714, "tid": 5714, "ts": 6300865704178.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865704196.576, "dur": 0.680, + "args": { + "External id": 82766, "cbid": 409, "correlation": 161140589 + } + }, + { + "ph": "f", "id": 161140589, "pid": 5714, "tid": 5714, "ts": 6300865704196.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704201.626, "dur": 0.850, + "args": { + "External id": 82766, "cbid": 135, "correlation": 161140592 + } + }, + { + "ph": "f", "id": 161140592, "pid": 5714, "tid": 5714, "ts": 6300865704201.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704202.666, "dur": 0.970, + "args": { + "External id": 82766, "cbid": 147, "correlation": 161140593 + } + }, + { + "ph": "s", "id": 161140593, "pid": 5714, "tid": 5714, "ts": 6300865704202.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865893545.038, "dur": 6049.063, + "args": { + "External id": 82766, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161140595, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161140595, "pid": 0, "tid": 20, "ts": 6300865893545.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865704204.876, "dur": 10.540, + "args": { + "External id": 82766, "cbid": 430, "correlation": 161140595 + } + }, + { + "ph": "s", "id": 161140595, "pid": 5714, "tid": 5714, "ts": 6300865704204.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704216.436, "dur": 0.390, + "args": { + "External id": 82766, "cbid": 135, "correlation": 161140597 + } + }, + { + "ph": "f", "id": 161140597, "pid": 5714, "tid": 5714, "ts": 6300865704216.436, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704216.946, "dur": 0.560, + "args": { + "External id": 82766, "cbid": 147, "correlation": 161140598 + } + }, + { + "ph": "s", "id": 161140598, "pid": 5714, "tid": 5714, "ts": 6300865704216.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704219.376, "dur": 0.850, + "args": { + "External id": 82766, "cbid": 135, "correlation": 161140601 + } + }, + { + "ph": "f", "id": 161140601, "pid": 5714, "tid": 5714, "ts": 6300865704219.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704231.456, "dur": 0.490, + "args": { + "External id": 
82766, "cbid": 135, "correlation": 161140608 + } + }, + { + "ph": "f", "id": 161140608, "pid": 5714, "tid": 5714, "ts": 6300865704231.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704262.516, "dur": 1.090, + "args": { + "External id": 82768, "cbid": 147, "correlation": 161140613 + } + }, + { + "ph": "s", "id": 161140613, "pid": 5714, "tid": 5714, "ts": 6300865704262.516, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704281.466, "dur": 0.960, + "args": { + "cbid": 135, "correlation": 161140628 + } + }, + { + "ph": "f", "id": 161140628, "pid": 5714, "tid": 5714, "ts": 6300865704281.466, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704335.405, "dur": 1.200, + "args": { + "cbid": 147, "correlation": 161140633 + } + }, + { + "ph": "s", "id": 161140633, "pid": 5714, "tid": 5714, "ts": 6300865704335.405, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704338.565, "dur": 0.711, + "args": { + "cbid": 147, "correlation": 161140637 + } + }, + { + "ph": "s", "id": 161140637, "pid": 5714, "tid": 5714, "ts": 6300865704338.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865704380.996, "dur": 2.320, + "args": { + "cbid": 147, "correlation": 161140643 + } + }, + { + "ph": "s", "id": 161140643, "pid": 5714, "tid": 5714, "ts": 6300865704380.996, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865704505.695, "dur": 1.350, + "args": { + "External id": 82781, 
"cbid": 317, "correlation": 161140684 + } + }, + { + "ph": "f", "id": 161140684, "pid": 5714, "tid": 5714, "ts": 6300865704505.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865704516.665, "dur": 2.870, + "args": { + "External id": 82782, "cbid": 138, "correlation": 161140687 + } + }, + { + "ph": "f", "id": 161140687, "pid": 5714, "tid": 5714, "ts": 6300865704516.665, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865899600.661, "dur": 1.632, + "args": { + "External id": 82786, "device": 0, "context": 1, "stream": 7, "correlation": 161140698, "bytes": 7224, "memory bandwidth (GB/s)": 4.426470588235294 + } + }, + { + "ph": "f", "id": 161140698, "pid": 0, "tid": 7, "ts": 6300865899600.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865704548.405, "dur": 13.030, + "args": { + "External id": 82786, "cbid": 41, "correlation": 161140698 + } + }, + { + "ph": "s", "id": 161140698, "pid": 5714, "tid": 5714, "ts": 6300865704548.405, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704566.115, "dur": 2.000, + "args": { + "External id": 82781, "cbid": 135, "correlation": 161140702 + } + }, + { + "ph": "f", "id": 161140702, "pid": 5714, "tid": 5714, "ts": 6300865704566.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865899604.917, "dur": 454.598, + "args": { + "External id": 82781, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161140706, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140706, "pid": 0, "tid": 7, "ts": 6300865899604.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865704570.865, "dur": 10.930, + "args": { + "External id": 82781, "cbid": 211, "correlation": 161140706 + } + }, + { + "ph": "s", "id": 161140706, "pid": 5714, "tid": 5714, "ts": 6300865704570.865, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865704679.825, "dur": 1.340, + "args": { + "cbid": 135, "correlation": 161140717 + } + }, + { + "ph": "f", "id": 161140717, "pid": 5714, "tid": 5714, "ts": 6300865704679.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865900091.643, "dur": 433.381, + "args": { + "External id": 82793, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140743, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140743, "pid": 0, "tid": 7, "ts": 6300865900091.643, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865704907.524, "dur": 11.470, + "args": { + "External id": 82793, "cbid": 307, "correlation": 161140743 + } + }, + { + "ph": "s", "id": 161140743, "pid": 5714, "tid": 5714, "ts": 6300865704907.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865900525.632, "dur": 145.506, + "args": { + "External id": 82799, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140766, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140766, "pid": 0, "tid": 7, "ts": 6300865900525.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705062.884, "dur": 10.940, + "args": { + "External id": 82799, "cbid": 211, "correlation": 161140766 + } + }, + { + "ph": "s", "id": 161140766, "pid": 5714, "tid": 5714, "ts": 6300865705062.884, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865900671.842, "dur": 143.234, + "args": { + "External id": 82800, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140789, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140789, "pid": 0, "tid": 7, "ts": 6300865900671.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705099.044, "dur": 6.740, + "args": { + "External id": 82800, "cbid": 211, "correlation": 161140789 + } + }, + { + "ph": "s", "id": 161140789, "pid": 5714, "tid": 5714, "ts": 6300865705099.044, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865900815.780, "dur": 142.945, + "args": { + "External id": 82801, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140812, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140812, "pid": 0, "tid": 7, "ts": 6300865900815.780, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705129.054, "dur": 5.230, + "args": { + "External id": 82801, "cbid": 211, "correlation": 161140812 + } + }, + { + "ph": "s", "id": 161140812, "pid": 5714, "tid": 5714, "ts": 6300865705129.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865900959.397, "dur": 54.305, + "args": { + "External id": 82818, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140832, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140832, "pid": 0, "tid": 7, "ts": 6300865900959.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705423.203, "dur": 10.740, + "args": { + "External id": 82818, "cbid": 307, "correlation": 161140832 + } + }, + { + "ph": "s", "id": 161140832, "pid": 5714, "tid": 5714, "ts": 6300865705423.203, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865901014.406, "dur": 61.729, + "args": { + "External id": 82834, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140850, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140850, "pid": 0, "tid": 7, "ts": 6300865901014.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705638.902, "dur": 9.971, + "args": { + "External id": 82834, "cbid": 307, "correlation": 161140850 + } + }, + { + "ph": "s", "id": 161140850, "pid": 5714, "tid": 5714, "ts": 6300865705638.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865705798.562, "dur": 0.580, + "args": { + "External id": 82840, "cbid": 200, "correlation": 161140857 + } + }, + { + "ph": "f", "id": 161140857, "pid": 5714, "tid": 5714, "ts": 6300865705798.562, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865705799.272, "dur": 0.220, + "args": { + "External id": 82840, "cbid": 200, "correlation": 161140858 + } + }, + { + "ph": "f", 
"id": 161140858, "pid": 5714, "tid": 5714, "ts": 6300865705799.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865705828.992, "dur": 0.440, + "args": { + "External id": 82840, "cbid": 200, "correlation": 161140881 + } + }, + { + "ph": "f", "id": 161140881, "pid": 5714, "tid": 5714, "ts": 6300865705828.992, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865705835.392, "dur": 2.200, + "args": { + "External id": 82840, "cbid": 273, "correlation": 161140890 + } + }, + { + "ph": "f", "id": 161140890, "pid": 5714, "tid": 5714, "ts": 6300865705835.392, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865901076.839, "dur": 426.693, + "args": { + "External id": 82840, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140891, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140891, "pid": 0, "tid": 7, "ts": 6300865901076.839, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705838.212, "dur": 11.400, + "args": { + "External id": 82840, "cbid": 211, "correlation": 161140891 + } + }, + { + "ph": "s", "id": 161140891, "pid": 5714, "tid": 5714, "ts": 6300865705838.212, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865901504.204, "dur": 144.801, + "args": { + "External id": 82846, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140914, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140914, "pid": 0, "tid": 7, "ts": 6300865901504.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865705921.102, "dur": 8.410, + "args": { + "External id": 82846, "cbid": 211, "correlation": 161140914 + } + }, + { + "ph": "s", "id": 161140914, "pid": 5714, "tid": 5714, "ts": 6300865705921.102, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865901649.677, "dur": 88.834, + "args": { + "External id": 82850, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140940, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140940, "pid": 0, "tid": 7, "ts": 6300865901649.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706103.781, "dur": 10.191, + "args": { + "External id": 82850, "cbid": 307, "correlation": 161140940 + } + }, + { + "ph": "s", "id": 161140940, "pid": 5714, "tid": 5714, "ts": 6300865706103.781, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865901739.119, "dur": 344.804, + "args": { + "External id": 82851, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140960, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140960, "pid": 0, "tid": 7, "ts": 6300865901739.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706149.081, "dur": 6.631, + "args": { + "External id": 82851, "cbid": 211, "correlation": 161140960 + } + }, + { + "ph": "s", "id": 161140960, "pid": 5714, "tid": 5714, "ts": 6300865706149.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865902084.595, "dur": 338.083, + "args": { + "External id": 82852, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140983, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161140983, "pid": 0, "tid": 7, "ts": 6300865902084.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706180.331, "dur": 6.370, + "args": { + "External id": 82852, "cbid": 211, "correlation": 161140983 + } + }, + { + "ph": "s", "id": 161140983, "pid": 5714, "tid": 5714, "ts": 6300865706180.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865902423.318, "dur": 349.476, + "args": { + "External id": 82853, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161140995, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161140995, "pid": 0, "tid": 7, "ts": 6300865902423.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706227.461, "dur": 6.340, + "args": { + "External id": 82853, "cbid": 307, "correlation": 161140995 + } + }, + { + "ph": "s", "id": 161140995, "pid": 5714, "tid": 5714, "ts": 6300865706227.461, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865706261.041, "dur": 1.380, + "args": { + "External id": 82854, "cbid": 210, "correlation": 161141015 + } + }, + { + "ph": "f", "id": 161141015, "pid": 5714, "tid": 5714, "ts": 6300865706261.041, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865902773.402, "dur": 669.928, + "args": { + "External id": 82854, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141016, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141016, "pid": 0, "tid": 7, "ts": 6300865902773.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706264.401, "dur": 6.230, + "args": { + "External id": 82854, "cbid": 211, "correlation": 161141016 + } + }, + { + "ph": "s", "id": 161141016, "pid": 5714, "tid": 5714, "ts": 6300865706264.401, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865903444.034, "dur": 227.267, + "args": { + "External id": 82855, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141023, "pid": 0, "tid": 7, "ts": 6300865903444.034, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706311.781, "dur": 6.660, + "args": { + "External id": 82855, "cbid": 307, "correlation": 161141023 + } + }, + { + "ph": "s", "id": 161141023, "pid": 5714, "tid": 5714, "ts": 6300865706311.781, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865894315.095, "dur": 160.194, + "args": { + "External id": 82871, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161141038, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161141038, "pid": 0, "tid": 17, "ts": 6300865894315.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706722.520, "dur": 13.170, + "args": { + "External id": 82871, "cbid": 211, "correlation": 161141038 + } + }, + { + "ph": "s", "id": 161141038, "pid": 5714, "tid": 5714, "ts": 6300865706722.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865894502.009, "dur": 14.049, + "args": { + "External id": 82887, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161141051, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161141051, "pid": 0, "tid": 17, "ts": 6300865894502.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865706851.850, "dur": 10.460, + "args": { + "External id": 82887, "cbid": 211, "correlation": 161141051 + } + }, + { + "ph": "s", "id": 161141051, "pid": 5714, "tid": 5714, "ts": 6300865706851.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865706888.990, "dur": 1.420, + "args": { + "cbid": 135, "correlation": 161141061 + } + }, + { + "ph": "f", "id": 161141061, "pid": 5714, "tid": 5714, "ts": 6300865706888.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865706892.380, "dur": 1.470, + "args": { + "cbid": 147, "correlation": 161141065 + } + }, + { + "ph": "s", "id": 161141065, "pid": 5714, "tid": 5714, "ts": 6300865706892.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865706953.490, "dur": 0.989, + "args": { + "External id": 82889, "cbid": 317, "correlation": 161141078 + } + }, + { + "ph": "f", "id": 161141078, "pid": 5714, "tid": 5714, "ts": 6300865706953.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865706956.490, "dur": 1.289, + "args": { + "External id": 82889, "cbid": 135, "correlation": 161141080 + } + }, + { + "ph": "f", "id": 161141080, "pid": 5714, "tid": 5714, "ts": 6300865706956.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865706959.290, "dur": 1.340, + 
"args": { + "External id": 82889, "cbid": 147, "correlation": 161141084 + } + }, + { + "ph": "s", "id": 161141084, "pid": 5714, "tid": 5714, "ts": 6300865706959.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865706978.079, "dur": 0.711, + "args": { + "External id": 82889, "cbid": 409, "correlation": 161141087 + } + }, + { + "ph": "f", "id": 161141087, "pid": 5714, "tid": 5714, "ts": 6300865706978.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865706983.210, "dur": 0.800, + "args": { + "External id": 82889, "cbid": 135, "correlation": 161141090 + } + }, + { + "ph": "f", "id": 161141090, "pid": 5714, "tid": 5714, "ts": 6300865706983.210, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865706984.199, "dur": 0.891, + "args": { + "External id": 82889, "cbid": 147, "correlation": 161141091 + } + }, + { + "ph": "s", "id": 161141091, "pid": 5714, "tid": 5714, "ts": 6300865706984.199, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865899596.693, "dur": 4839.705, + "args": { + "External id": 82889, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161141093, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161141093, "pid": 0, "tid": 20, "ts": 6300865899596.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865706986.199, "dur": 10.791, + "args": { + "External id": 82889, "cbid": 430, "correlation": 161141093 + } + }, + { + "ph": "s", "id": 161141093, "pid": 5714, "tid": 5714, "ts": 6300865706986.199, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865706997.990, "dur": 0.409, + "args": { + "External id": 82889, "cbid": 135, "correlation": 161141095 + } + }, + { + "ph": "f", "id": 161141095, "pid": 5714, "tid": 5714, "ts": 6300865706997.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865706998.530, "dur": 0.520, + "args": { + "External id": 82889, "cbid": 147, "correlation": 161141096 + } + }, + { + "ph": "s", "id": 161141096, "pid": 5714, "tid": 5714, "ts": 6300865706998.530, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865707000.690, "dur": 0.909, + "args": { + "External id": 82889, "cbid": 135, "correlation": 161141099 + } + }, + { + "ph": "f", "id": 161141099, "pid": 5714, "tid": 5714, "ts": 6300865707000.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865707011.499, "dur": 0.471, + "args": { + "External id": 
82889, "cbid": 135, "correlation": 161141106 + } + }, + { + "ph": "f", "id": 161141106, "pid": 5714, "tid": 5714, "ts": 6300865707011.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865707043.879, "dur": 1.000, + "args": { + "External id": 82891, "cbid": 147, "correlation": 161141111 + } + }, + { + "ph": "s", "id": 161141111, "pid": 5714, "tid": 5714, "ts": 6300865707043.879, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865707062.309, "dur": 0.940, + "args": { + "cbid": 135, "correlation": 161141126 + } + }, + { + "ph": "f", "id": 161141126, "pid": 5714, "tid": 5714, "ts": 6300865707062.309, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865707105.789, "dur": 1.170, + "args": { + "cbid": 147, "correlation": 161141131 + } + }, + { + "ph": "s", "id": 161141131, "pid": 5714, "tid": 5714, "ts": 6300865707105.789, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865707109.019, "dur": 0.670, + "args": { + "cbid": 147, "correlation": 161141135 + } + }, + { + "ph": "s", "id": 161141135, "pid": 5714, "tid": 5714, "ts": 6300865707109.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865707150.709, "dur": 2.240, + "args": { + "cbid": 147, "correlation": 161141141 + } + }, + { + "ph": "s", "id": 161141141, "pid": 5714, "tid": 5714, "ts": 6300865707150.709, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865707271.789, "dur": 1.320, + "args": { + "External id": 82904, 
"cbid": 317, "correlation": 161141182 + } + }, + { + "ph": "f", "id": 161141182, "pid": 5714, "tid": 5714, "ts": 6300865707271.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865707282.819, "dur": 2.910, + "args": { + "External id": 82905, "cbid": 138, "correlation": 161141185 + } + }, + { + "ph": "f", "id": 161141185, "pid": 5714, "tid": 5714, "ts": 6300865707282.819, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865904439.118, "dur": 1.984, + "args": { + "External id": 82909, "device": 0, "context": 1, "stream": 7, "correlation": 161141196, "bytes": 7224, "memory bandwidth (GB/s)": 3.6411290322580645 + } + }, + { + "ph": "f", "id": 161141196, "pid": 0, "tid": 7, "ts": 6300865904439.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865707318.899, "dur": 13.290, + "args": { + "External id": 82909, "cbid": 41, "correlation": 161141196 + } + }, + { + "ph": "s", "id": 161141196, "pid": 5714, "tid": 5714, "ts": 6300865707318.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865707337.009, "dur": 2.000, + "args": { + "External id": 82904, "cbid": 135, "correlation": 161141200 + } + }, + { + "ph": "f", "id": 161141200, "pid": 5714, "tid": 5714, "ts": 6300865707337.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865904443.150, "dur": 15.712, + "args": { + "External id": 82904, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161141204, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141204, "pid": 0, "tid": 7, "ts": 6300865904443.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865707341.909, "dur": 10.800, + "args": { + "External id": 82904, "cbid": 211, "correlation": 161141204 + } + }, + { + "ph": "s", "id": 161141204, "pid": 5714, "tid": 5714, "ts": 6300865707341.909, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865707449.618, "dur": 1.451, + "args": { + "cbid": 135, "correlation": 161141215 + } + }, + { + "ph": "f", "id": 161141215, "pid": 5714, "tid": 5714, "ts": 6300865707449.618, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865904459.791, "dur": 734.376, + "args": { + "External id": 82916, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141241, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141241, "pid": 0, "tid": 7, "ts": 6300865904459.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865707675.478, "dur": 11.650, + "args": { + "External id": 82916, "cbid": 307, "correlation": 161141241 + } + }, + { + "ph": "s", "id": 161141241, "pid": 5714, "tid": 5714, "ts": 6300865707675.478, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865905249.880, "dur": 242.083, + "args": { + "External id": 82922, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141264, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141264, "pid": 0, "tid": 7, "ts": 6300865905249.880, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865707832.477, "dur": 10.811, + "args": { + "External id": 82922, "cbid": 211, "correlation": 161141264 + } + }, + { + "ph": "s", "id": 161141264, "pid": 5714, "tid": 5714, "ts": 6300865707832.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865905492.635, "dur": 143.745, + "args": { + "External id": 82923, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141287, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141287, "pid": 0, "tid": 7, "ts": 6300865905492.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865707868.397, "dur": 6.691, + "args": { + "External id": 82923, "cbid": 211, "correlation": 161141287 + } + }, + { + "ph": "s", "id": 161141287, "pid": 5714, "tid": 5714, "ts": 6300865707868.397, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865905636.988, "dur": 142.562, + "args": { + "External id": 82924, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141310, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141310, "pid": 0, "tid": 7, "ts": 6300865905636.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865707899.128, "dur": 5.080, + "args": { + "External id": 82924, "cbid": 211, "correlation": 161141310 + } + }, + { + "ph": "s", "id": 161141310, "pid": 5714, "tid": 5714, "ts": 6300865707899.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865905780.190, "dur": 52.737, + "args": { + "External id": 82941, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141330, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141330, "pid": 0, "tid": 7, "ts": 6300865905780.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708180.197, "dur": 10.340, + "args": { + "External id": 82941, "cbid": 307, "correlation": 161141330 + } + }, + { + "ph": "s", "id": 161141330, "pid": 5714, "tid": 5714, "ts": 6300865708180.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865905833.631, "dur": 62.784, + "args": { + "External id": 82957, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141348, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141348, "pid": 0, "tid": 7, "ts": 6300865905833.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708422.666, "dur": 10.470, + "args": { + "External id": 82957, "cbid": 307, "correlation": 161141348 + } + }, + { + "ph": "s", "id": 161141348, "pid": 5714, "tid": 5714, "ts": 6300865708422.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865708582.566, "dur": 0.560, + "args": { + "External id": 82963, "cbid": 200, "correlation": 161141355 + } + }, + { + "ph": "f", "id": 161141355, "pid": 5714, "tid": 5714, "ts": 6300865708582.566, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865708583.246, "dur": 0.220, + "args": { + "External id": 82963, "cbid": 200, "correlation": 161141356 + } + }, + { + "ph": 
"f", "id": 161141356, "pid": 5714, "tid": 5714, "ts": 6300865708583.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865708613.576, "dur": 0.440, + "args": { + "External id": 82963, "cbid": 200, "correlation": 161141379 + } + }, + { + "ph": "f", "id": 161141379, "pid": 5714, "tid": 5714, "ts": 6300865708613.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865708620.116, "dur": 2.200, + "args": { + "External id": 82963, "cbid": 273, "correlation": 161141388 + } + }, + { + "ph": "f", "id": 161141388, "pid": 5714, "tid": 5714, "ts": 6300865708620.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865905897.119, "dur": 439.718, + "args": { + "External id": 82963, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141389, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141389, "pid": 0, "tid": 7, "ts": 6300865905897.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708622.946, "dur": 11.260, + "args": { + "External id": 82963, "cbid": 211, "correlation": 161141389 + } + }, + { + "ph": "s", "id": 161141389, "pid": 5714, "tid": 5714, "ts": 6300865708622.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865906337.541, "dur": 145.153, + "args": { + "External id": 82969, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141412, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141412, "pid": 0, "tid": 7, "ts": 6300865906337.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708705.226, "dur": 8.440, + "args": { + "External id": 82969, "cbid": 211, "correlation": 161141412 + } + }, + { + "ph": "s", "id": 161141412, "pid": 5714, "tid": 5714, "ts": 6300865708705.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865906483.430, "dur": 89.761, + "args": { + "External id": 82973, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141438, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141438, "pid": 0, "tid": 7, "ts": 6300865906483.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708888.005, "dur": 10.130, + "args": { + "External id": 82973, "cbid": 307, "correlation": 161141438 + } + }, + { + "ph": "s", "id": 161141438, "pid": 5714, "tid": 5714, "ts": 6300865708888.005, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865906573.831, "dur": 348.036, + "args": { + "External id": 82974, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141458, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141458, "pid": 0, "tid": 7, "ts": 6300865906573.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708933.885, "dur": 7.180, + "args": { + "External id": 82974, "cbid": 211, "correlation": 161141458 + } + }, + { + "ph": "s", "id": 161141458, "pid": 5714, "tid": 5714, "ts": 6300865708933.885, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865906922.571, "dur": 339.684, + "args": { + "External id": 82975, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141481, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141481, "pid": 0, "tid": 7, "ts": 6300865906922.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865708965.735, "dur": 6.100, + "args": { + "External id": 82975, "cbid": 211, "correlation": 161141481 + } + }, + { + "ph": "s", "id": 161141481, "pid": 5714, "tid": 5714, "ts": 6300865708965.735, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865907262.991, "dur": 248.035, + "args": { + "External id": 82976, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141493, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141493, "pid": 0, "tid": 7, "ts": 6300865907262.991, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865709010.295, "dur": 5.890, + "args": { + "External id": 82976, "cbid": 307, "correlation": 161141493 + } + }, + { + "ph": "s", "id": 161141493, "pid": 5714, "tid": 5714, "ts": 6300865709010.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865709043.725, "dur": 1.320, + "args": { + "External id": 82977, "cbid": 210, "correlation": 161141513 + } + }, + { + "ph": "f", "id": 161141513, "pid": 5714, "tid": 5714, "ts": 6300865709043.725, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865907511.698, "dur": 575.943, + "args": { + "External id": 82977, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141514, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141514, "pid": 0, "tid": 7, "ts": 6300865907511.698, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865709046.985, "dur": 5.920, + "args": { + "External id": 82977, "cbid": 211, "correlation": 161141514 + } + }, + { + "ph": "s", "id": 161141514, "pid": 5714, "tid": 5714, "ts": 6300865709046.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865908088.249, "dur": 197.378, + "args": { + "External id": 82978, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141521, "pid": 0, "tid": 7, "ts": 6300865908088.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865709085.905, "dur": 5.700, + "args": { + "External id": 82978, "cbid": 307, "correlation": 161141521 + } + }, + { + "ph": "s", "id": 161141521, "pid": 5714, "tid": 5714, "ts": 6300865709085.905, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865900133.244, "dur": 394.596, + "args": { + "External id": 82994, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161141536, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161141536, "pid": 0, "tid": 17, "ts": 6300865900133.244, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865709508.864, "dur": 13.330, + "args": { + "External id": 82994, "cbid": 211, "correlation": 161141536 + } + }, + { + "ph": "s", "id": 161141536, "pid": 5714, "tid": 5714, "ts": 6300865709508.864, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865900552.160, "dur": 17.665, + "args": { + "External id": 83010, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161141549, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161141549, "pid": 0, "tid": 17, "ts": 6300865900552.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865709638.753, "dur": 10.651, + "args": { + "External id": 83010, "cbid": 211, "correlation": 161141549 + } + }, + { + "ph": "s", "id": 161141549, "pid": 5714, "tid": 5714, "ts": 6300865709638.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709676.744, "dur": 1.489, + "args": { + "cbid": 135, "correlation": 161141559 + } + }, + { + "ph": "f", "id": 161141559, "pid": 5714, "tid": 5714, "ts": 6300865709676.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709680.323, "dur": 1.380, + "args": { + "cbid": 147, "correlation": 161141563 + } + }, + { + "ph": "s", "id": 161141563, "pid": 5714, "tid": 5714, "ts": 6300865709680.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865709739.533, "dur": 0.990, + "args": { + "External id": 83012, "cbid": 317, "correlation": 161141576 + } + }, + { + "ph": "f", "id": 161141576, "pid": 5714, "tid": 5714, "ts": 6300865709739.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709742.543, "dur": 1.240, + "args": { + "External id": 83012, "cbid": 135, "correlation": 161141578 + } + }, + { + "ph": "f", "id": 161141578, "pid": 5714, "tid": 5714, "ts": 6300865709742.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709745.323, "dur": 1.300, + 
"args": { + "External id": 83012, "cbid": 147, "correlation": 161141582 + } + }, + { + "ph": "s", "id": 161141582, "pid": 5714, "tid": 5714, "ts": 6300865709745.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865709764.203, "dur": 0.750, + "args": { + "External id": 83012, "cbid": 409, "correlation": 161141585 + } + }, + { + "ph": "f", "id": 161141585, "pid": 5714, "tid": 5714, "ts": 6300865709764.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709769.453, "dur": 0.840, + "args": { + "External id": 83012, "cbid": 135, "correlation": 161141588 + } + }, + { + "ph": "f", "id": 161141588, "pid": 5714, "tid": 5714, "ts": 6300865709769.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709770.503, "dur": 1.000, + "args": { + "External id": 83012, "cbid": 147, "correlation": 161141589 + } + }, + { + "ph": "s", "id": 161141589, "pid": 5714, "tid": 5714, "ts": 6300865709770.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865904437.902, "dur": 4936.922, + "args": { + "External id": 83012, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161141591, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161141591, "pid": 0, "tid": 20, "ts": 6300865904437.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865709772.753, "dur": 10.930, + "args": { + "External id": 83012, "cbid": 430, "correlation": 161141591 + } + }, + { + "ph": "s", "id": 161141591, "pid": 5714, "tid": 5714, "ts": 6300865709772.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709784.723, "dur": 0.430, + "args": { + "External id": 83012, "cbid": 135, "correlation": 161141593 + } + }, + { + "ph": "f", "id": 161141593, "pid": 5714, "tid": 5714, "ts": 6300865709784.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709785.273, "dur": 0.540, + "args": { + "External id": 83012, "cbid": 147, "correlation": 161141594 + } + }, + { + "ph": "s", "id": 161141594, "pid": 5714, "tid": 5714, "ts": 6300865709785.273, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709787.573, "dur": 0.840, + "args": { + "External id": 83012, "cbid": 135, "correlation": 161141597 + } + }, + { + "ph": "f", "id": 161141597, "pid": 5714, "tid": 5714, "ts": 6300865709787.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709798.093, "dur": 0.490, + "args": { + "External id": 
83012, "cbid": 135, "correlation": 161141604 + } + }, + { + "ph": "f", "id": 161141604, "pid": 5714, "tid": 5714, "ts": 6300865709798.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709829.253, "dur": 1.180, + "args": { + "External id": 83014, "cbid": 147, "correlation": 161141609 + } + }, + { + "ph": "s", "id": 161141609, "pid": 5714, "tid": 5714, "ts": 6300865709829.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865709848.683, "dur": 1.000, + "args": { + "cbid": 135, "correlation": 161141624 + } + }, + { + "ph": "f", "id": 161141624, "pid": 5714, "tid": 5714, "ts": 6300865709848.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709893.403, "dur": 1.220, + "args": { + "cbid": 147, "correlation": 161141629 + } + }, + { + "ph": "s", "id": 161141629, "pid": 5714, "tid": 5714, "ts": 6300865709893.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709896.663, "dur": 0.680, + "args": { + "cbid": 147, "correlation": 161141633 + } + }, + { + "ph": "s", "id": 161141633, "pid": 5714, "tid": 5714, "ts": 6300865709896.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865709938.483, "dur": 2.300, + "args": { + "cbid": 147, "correlation": 161141639 + } + }, + { + "ph": "s", "id": 161141639, "pid": 5714, "tid": 5714, "ts": 6300865709938.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865710063.163, "dur": 1.289, + "args": { + "External id": 83027, 
"cbid": 317, "correlation": 161141680 + } + }, + { + "ph": "f", "id": 161141680, "pid": 5714, "tid": 5714, "ts": 6300865710063.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865710073.983, "dur": 2.869, + "args": { + "External id": 83028, "cbid": 138, "correlation": 161141683 + } + }, + { + "ph": "f", "id": 161141683, "pid": 5714, "tid": 5714, "ts": 6300865710073.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865909378.792, "dur": 1.664, + "args": { + "External id": 83032, "device": 0, "context": 1, "stream": 7, "correlation": 161141694, "bytes": 7224, "memory bandwidth (GB/s)": 4.341346153846154 + } + }, + { + "ph": "f", "id": 161141694, "pid": 0, "tid": 7, "ts": 6300865909378.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865710100.832, "dur": 13.131, + "args": { + "External id": 83032, "cbid": 41, "correlation": 161141694 + } + }, + { + "ph": "s", "id": 161141694, "pid": 5714, "tid": 5714, "ts": 6300865710100.832, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865710119.832, "dur": 1.920, + "args": { + "External id": 83027, "cbid": 135, "correlation": 161141698 + } + }, + { + "ph": "f", "id": 161141698, "pid": 5714, "tid": 5714, "ts": 6300865710119.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865909382.408, "dur": 99.937, + "args": { + "External id": 83027, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141702, 
"registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141702, "pid": 0, "tid": 7, "ts": 6300865909382.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865710124.712, "dur": 10.830, + "args": { + "External id": 83027, "cbid": 211, "correlation": 161141702 + } + }, + { + "ph": "s", "id": 161141702, "pid": 5714, "tid": 5714, "ts": 6300865710124.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865710231.962, "dur": 1.450, + "args": { + "cbid": 135, "correlation": 161141713 + } + }, + { + "ph": "f", "id": 161141713, "pid": 5714, "tid": 5714, "ts": 6300865710231.962, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865909609.899, "dur": 493.382, + "args": { + "External id": 83039, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141739, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141739, "pid": 0, "tid": 7, "ts": 6300865909609.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865710470.342, "dur": 11.540, + "args": { + "External id": 83039, "cbid": 307, "correlation": 161141739 + } + }, + { + "ph": "s", "id": 161141739, "pid": 5714, "tid": 5714, "ts": 6300865710470.342, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865910170.098, "dur": 265.955, + "args": { + "External id": 83045, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141762, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141762, "pid": 0, "tid": 7, "ts": 6300865910170.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865710631.851, "dur": 10.660, + "args": { + "External id": 83045, "cbid": 211, "correlation": 161141762 + } + }, + { + "ph": "s", "id": 161141762, "pid": 5714, "tid": 5714, "ts": 6300865710631.851, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865910436.757, "dur": 143.906, + "args": { + "External id": 83046, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141785, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141785, "pid": 0, "tid": 7, "ts": 6300865910436.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865710668.581, "dur": 6.140, + "args": { + "External id": 83046, "cbid": 211, "correlation": 161141785 + } + }, + { + "ph": "s", "id": 161141785, "pid": 5714, "tid": 5714, "ts": 6300865710668.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865910581.335, "dur": 143.137, + "args": { + "External id": 83047, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141808, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141808, "pid": 0, "tid": 7, "ts": 6300865910581.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865710697.751, "dur": 5.560, + "args": { + "External id": 83047, "cbid": 211, "correlation": 161141808 + } + }, + { + "ph": "s", "id": 161141808, "pid": 5714, "tid": 5714, "ts": 6300865710697.751, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865910725.208, "dur": 52.641, + "args": { + "External id": 83064, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141828, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141828, "pid": 0, "tid": 7, "ts": 6300865910725.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865710981.701, "dur": 10.229, + "args": { + "External id": 83064, "cbid": 307, "correlation": 161141828 + } + }, + { + "ph": "s", "id": 161141828, "pid": 5714, "tid": 5714, "ts": 6300865710981.701, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865910778.521, "dur": 61.920, + "args": { + "External id": 83080, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141846, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141846, "pid": 0, "tid": 7, "ts": 6300865910778.521, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711194.390, "dur": 9.290, + "args": { + "External id": 83080, "cbid": 307, "correlation": 161141846 + } + }, + { + "ph": "s", "id": 161141846, "pid": 5714, "tid": 5714, "ts": 6300865711194.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865711360.609, "dur": 0.571, + "args": { + "External id": 83086, "cbid": 200, "correlation": 161141853 + } + }, + { + "ph": "f", "id": 161141853, "pid": 5714, "tid": 5714, "ts": 6300865711360.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865711361.309, "dur": 0.231, + "args": { + "External id": 83086, "cbid": 200, "correlation": 161141854 + } + }, + { + "ph": "f", 
"id": 161141854, "pid": 5714, "tid": 5714, "ts": 6300865711361.309, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865711391.949, "dur": 0.540, + "args": { + "External id": 83086, "cbid": 200, "correlation": 161141877 + } + }, + { + "ph": "f", "id": 161141877, "pid": 5714, "tid": 5714, "ts": 6300865711391.949, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865711399.520, "dur": 2.200, + "args": { + "External id": 83086, "cbid": 273, "correlation": 161141886 + } + }, + { + "ph": "f", "id": 161141886, "pid": 5714, "tid": 5714, "ts": 6300865711399.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865910841.017, "dur": 429.029, + "args": { + "External id": 83086, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141887, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141887, "pid": 0, "tid": 7, "ts": 6300865910841.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711402.380, "dur": 11.769, + "args": { + "External id": 83086, "cbid": 211, "correlation": 161141887 + } + }, + { + "ph": "s", "id": 161141887, "pid": 5714, "tid": 5714, "ts": 6300865711402.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865911270.782, "dur": 145.218, + "args": { + "External id": 83092, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141910, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141910, "pid": 0, "tid": 7, "ts": 6300865911270.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711486.369, "dur": 8.270, + "args": { + "External id": 83092, "cbid": 211, "correlation": 161141910 + } + }, + { + "ph": "s", "id": 161141910, "pid": 5714, "tid": 5714, "ts": 6300865711486.369, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865911416.608, "dur": 92.321, + "args": { + "External id": 83096, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141936, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141936, "pid": 0, "tid": 7, "ts": 6300865911416.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711669.249, "dur": 9.910, + "args": { + "External id": 83096, "cbid": 307, "correlation": 161141936 + } + }, + { + "ph": "s", "id": 161141936, "pid": 5714, "tid": 5714, "ts": 6300865711669.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865911509.569, "dur": 343.140, + "args": { + "External id": 83097, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141956, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141956, "pid": 0, "tid": 7, "ts": 6300865911509.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711714.259, "dur": 6.510, + "args": { + "External id": 83097, "cbid": 211, "correlation": 161141956 + } + }, + { + "ph": "s", "id": 161141956, "pid": 5714, "tid": 5714, "ts": 6300865711714.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865911853.349, "dur": 347.972, + "args": { + "External id": 83098, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141979, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161141979, "pid": 0, "tid": 7, "ts": 6300865911853.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711744.219, "dur": 6.030, + "args": { + "External id": 83098, "cbid": 211, "correlation": 161141979 + } + }, + { + "ph": "s", "id": 161141979, "pid": 5714, "tid": 5714, "ts": 6300865711744.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865912202.025, "dur": 321.156, + "args": { + "External id": 83099, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161141991, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161141991, "pid": 0, "tid": 7, "ts": 6300865912202.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711788.688, "dur": 6.451, + "args": { + "External id": 83099, "cbid": 307, "correlation": 161141991 + } + }, + { + "ph": "s", "id": 161141991, "pid": 5714, "tid": 5714, "ts": 6300865711788.688, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865711821.879, "dur": 1.369, + "args": { + "External id": 83100, "cbid": 210, "correlation": 161142011 + } + }, + { + "ph": "f", "id": 161142011, "pid": 5714, "tid": 5714, "ts": 6300865711821.879, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865912527.693, "dur": 655.496, + "args": { + "External id": 83100, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142012, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142012, "pid": 0, "tid": 7, "ts": 6300865912527.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711825.248, "dur": 6.291, + "args": { + "External id": 83100, "cbid": 211, "correlation": 161142012 + } + }, + { + "ph": "s", "id": 161142012, "pid": 5714, "tid": 5714, "ts": 6300865711825.248, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865913183.861, "dur": 175.298, + "args": { + "External id": 83101, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142019, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142019, "pid": 0, "tid": 7, "ts": 6300865913183.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865711864.148, "dur": 5.871, + "args": { + "External id": 83101, "cbid": 307, "correlation": 161142019 + } + }, + { + "ph": "s", "id": 161142019, "pid": 5714, "tid": 5714, "ts": 6300865711864.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865904628.208, "dur": 731.785, + "args": { + "External id": 83117, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161142034, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161142034, "pid": 0, "tid": 17, "ts": 6300865904628.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865712271.698, "dur": 13.029, + "args": { + "External id": 83117, "cbid": 211, "correlation": 161142034 + } + }, + { + "ph": "s", "id": 161142034, "pid": 5714, "tid": 5714, "ts": 6300865712271.698, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865905374.681, "dur": 14.688, + "args": { + "External id": 83133, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161142047, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161142047, "pid": 0, "tid": 17, "ts": 6300865905374.681, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865712411.007, "dur": 10.760, + "args": { + "External id": 83133, "cbid": 211, "correlation": 161142047 + } + }, + { + "ph": "s", "id": 161142047, "pid": 5714, "tid": 5714, "ts": 6300865712411.007, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712448.167, "dur": 1.380, + "args": { + "cbid": 135, "correlation": 161142057 + } + }, + { + "ph": "f", "id": 161142057, "pid": 5714, "tid": 5714, "ts": 6300865712448.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712451.567, "dur": 1.450, + "args": { + "cbid": 147, "correlation": 161142061 + } + }, + { + "ph": "s", "id": 161142061, "pid": 5714, "tid": 5714, "ts": 6300865712451.567, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865712512.187, "dur": 1.060, + "args": { + "External id": 83135, "cbid": 317, "correlation": 161142074 + } + }, + { + "ph": "f", "id": 161142074, "pid": 5714, "tid": 5714, "ts": 6300865712512.187, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712516.197, "dur": 1.290, + "args": { + "External id": 83135, "cbid": 135, "correlation": 161142076 + } + }, + { + "ph": "f", "id": 161142076, "pid": 5714, "tid": 5714, "ts": 6300865712516.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712518.957, "dur": 1.250, + 
"args": { + "External id": 83135, "cbid": 147, "correlation": 161142080 + } + }, + { + "ph": "s", "id": 161142080, "pid": 5714, "tid": 5714, "ts": 6300865712518.957, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865712538.687, "dur": 0.720, + "args": { + "External id": 83135, "cbid": 409, "correlation": 161142083 + } + }, + { + "ph": "f", "id": 161142083, "pid": 5714, "tid": 5714, "ts": 6300865712538.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712543.807, "dur": 0.760, + "args": { + "External id": 83135, "cbid": 135, "correlation": 161142086 + } + }, + { + "ph": "f", "id": 161142086, "pid": 5714, "tid": 5714, "ts": 6300865712543.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712544.757, "dur": 0.930, + "args": { + "External id": 83135, "cbid": 147, "correlation": 161142087 + } + }, + { + "ph": "s", "id": 161142087, "pid": 5714, "tid": 5714, "ts": 6300865712544.757, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865909376.424, "dur": 4762.872, + "args": { + "External id": 83135, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161142089, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161142089, "pid": 0, "tid": 20, "ts": 6300865909376.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865712546.817, "dur": 11.110, + "args": { + "External id": 83135, "cbid": 430, "correlation": 161142089 + } + }, + { + "ph": "s", "id": 161142089, "pid": 5714, "tid": 5714, "ts": 6300865712546.817, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712559.117, "dur": 0.430, + "args": { + "External id": 83135, "cbid": 135, "correlation": 161142091 + } + }, + { + "ph": "f", "id": 161142091, "pid": 5714, "tid": 5714, "ts": 6300865712559.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712559.687, "dur": 0.520, + "args": { + "External id": 83135, "cbid": 147, "correlation": 161142092 + } + }, + { + "ph": "s", "id": 161142092, "pid": 5714, "tid": 5714, "ts": 6300865712559.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712561.877, "dur": 1.100, + "args": { + "External id": 83135, "cbid": 135, "correlation": 161142095 + } + }, + { + "ph": "f", "id": 161142095, "pid": 5714, "tid": 5714, "ts": 6300865712561.877, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712571.927, "dur": 0.520, + "args": { + "External id": 
83135, "cbid": 135, "correlation": 161142102 + } + }, + { + "ph": "f", "id": 161142102, "pid": 5714, "tid": 5714, "ts": 6300865712571.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712603.407, "dur": 1.030, + "args": { + "External id": 83137, "cbid": 147, "correlation": 161142107 + } + }, + { + "ph": "s", "id": 161142107, "pid": 5714, "tid": 5714, "ts": 6300865712603.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712622.297, "dur": 0.940, + "args": { + "cbid": 135, "correlation": 161142122 + } + }, + { + "ph": "f", "id": 161142122, "pid": 5714, "tid": 5714, "ts": 6300865712622.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712666.337, "dur": 1.109, + "args": { + "cbid": 147, "correlation": 161142127 + } + }, + { + "ph": "s", "id": 161142127, "pid": 5714, "tid": 5714, "ts": 6300865712666.337, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712669.526, "dur": 0.700, + "args": { + "cbid": 147, "correlation": 161142131 + } + }, + { + "ph": "s", "id": 161142131, "pid": 5714, "tid": 5714, "ts": 6300865712669.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865712710.626, "dur": 2.271, + "args": { + "cbid": 147, "correlation": 161142137 + } + }, + { + "ph": "s", "id": 161142137, "pid": 5714, "tid": 5714, "ts": 6300865712710.626, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865712832.196, "dur": 1.290, + "args": { + "External id": 83150, 
"cbid": 317, "correlation": 161142178 + } + }, + { + "ph": "f", "id": 161142178, "pid": 5714, "tid": 5714, "ts": 6300865712832.196, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865712844.166, "dur": 2.700, + "args": { + "External id": 83151, "cbid": 138, "correlation": 161142181 + } + }, + { + "ph": "f", "id": 161142181, "pid": 5714, "tid": 5714, "ts": 6300865712844.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865914145.440, "dur": 1.760, + "args": { + "External id": 83155, "device": 0, "context": 1, "stream": 7, "correlation": 161142192, "bytes": 7224, "memory bandwidth (GB/s)": 4.1045454545454545 + } + }, + { + "ph": "f", "id": 161142192, "pid": 0, "tid": 7, "ts": 6300865914145.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865712871.916, "dur": 12.970, + "args": { + "External id": 83155, "cbid": 41, "correlation": 161142192 + } + }, + { + "ph": "s", "id": 161142192, "pid": 5714, "tid": 5714, "ts": 6300865712871.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865712889.626, "dur": 1.890, + "args": { + "External id": 83150, "cbid": 135, "correlation": 161142196 + } + }, + { + "ph": "f", "id": 161142196, "pid": 5714, "tid": 5714, "ts": 6300865712889.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865914224.033, "dur": 668.264, + "args": { + "External id": 83150, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
161142200, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142200, "pid": 0, "tid": 7, "ts": 6300865914224.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865712894.556, "dur": 11.610, + "args": { + "External id": 83150, "cbid": 211, "correlation": 161142200 + } + }, + { + "ph": "s", "id": 161142200, "pid": 5714, "tid": 5714, "ts": 6300865712894.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865713003.796, "dur": 1.480, + "args": { + "cbid": 135, "correlation": 161142211 + } + }, + { + "ph": "f", "id": 161142211, "pid": 5714, "tid": 5714, "ts": 6300865713003.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865914954.730, "dur": 104.673, + "args": { + "External id": 83162, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142237, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142237, "pid": 0, "tid": 7, "ts": 6300865914954.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865713231.645, "dur": 11.760, + "args": { + "External id": 83162, "cbid": 307, "correlation": 161142237 + } + }, + { + "ph": "s", "id": 161142237, "pid": 5714, "tid": 5714, "ts": 6300865713231.645, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865915060.075, "dur": 144.641, + "args": { + "External id": 83168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142260, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142260, "pid": 0, "tid": 7, "ts": 6300865915060.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865713406.035, "dur": 11.880, + "args": { + "External id": 83168, "cbid": 211, "correlation": 161142260 + } + }, + { + "ph": "s", "id": 161142260, "pid": 5714, "tid": 5714, "ts": 6300865713406.035, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865915205.420, "dur": 143.266, + "args": { + "External id": 83169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142283, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142283, "pid": 0, "tid": 7, "ts": 6300865915205.420, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865713443.615, "dur": 5.950, + "args": { + "External id": 83169, "cbid": 211, "correlation": 161142283 + } + }, + { + "ph": "s", "id": 161142283, "pid": 5714, "tid": 5714, "ts": 6300865713443.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865915349.390, "dur": 143.330, + "args": { + "External id": 83170, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142306, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142306, "pid": 0, "tid": 7, "ts": 6300865915349.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865713471.845, "dur": 5.180, + "args": { + "External id": 83170, "cbid": 211, "correlation": 161142306 + } + }, + { + "ph": "s", "id": 161142306, "pid": 5714, "tid": 5714, "ts": 6300865713471.845, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865915493.328, "dur": 52.897, + "args": { + "External id": 83187, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142326, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142326, "pid": 0, "tid": 7, "ts": 6300865915493.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865713758.794, "dur": 10.770, + "args": { + "External id": 83187, "cbid": 307, "correlation": 161142326 + } + }, + { + "ph": "s", "id": 161142326, "pid": 5714, "tid": 5714, "ts": 6300865713758.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865915546.865, "dur": 62.496, + "args": { + "External id": 83203, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142344, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142344, "pid": 0, "tid": 7, "ts": 6300865915546.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865713972.164, "dur": 9.570, + "args": { + "External id": 83203, "cbid": 307, "correlation": 161142344 + } + }, + { + "ph": "s", "id": 161142344, "pid": 5714, "tid": 5714, "ts": 6300865713972.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865714127.893, "dur": 0.560, + "args": { + "External id": 83209, "cbid": 200, "correlation": 161142351 + } + }, + { + "ph": "f", "id": 161142351, "pid": 5714, "tid": 5714, "ts": 6300865714127.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865714128.653, "dur": 0.220, + "args": { + "External id": 83209, "cbid": 200, "correlation": 161142352 + } + }, + { + "ph": "f", 
"id": 161142352, "pid": 5714, "tid": 5714, "ts": 6300865714128.653, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865714160.633, "dur": 0.430, + "args": { + "External id": 83209, "cbid": 200, "correlation": 161142375 + } + }, + { + "ph": "f", "id": 161142375, "pid": 5714, "tid": 5714, "ts": 6300865714160.633, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865714166.573, "dur": 2.180, + "args": { + "External id": 83209, "cbid": 273, "correlation": 161142384 + } + }, + { + "ph": "f", "id": 161142384, "pid": 5714, "tid": 5714, "ts": 6300865714166.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865915609.969, "dur": 430.118, + "args": { + "External id": 83209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142385, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142385, "pid": 0, "tid": 7, "ts": 6300865915609.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714169.403, "dur": 11.270, + "args": { + "External id": 83209, "cbid": 211, "correlation": 161142385 + } + }, + { + "ph": "s", "id": 161142385, "pid": 5714, "tid": 5714, "ts": 6300865714169.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865916040.727, "dur": 144.577, + "args": { + "External id": 83215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142408, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142408, "pid": 0, "tid": 7, "ts": 6300865916040.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714252.493, "dur": 8.420, + "args": { + "External id": 83215, "cbid": 211, "correlation": 161142408 + } + }, + { + "ph": "s", "id": 161142408, "pid": 5714, "tid": 5714, "ts": 6300865714252.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865916185.944, "dur": 90.497, + "args": { + "External id": 83219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142434, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142434, "pid": 0, "tid": 7, "ts": 6300865916185.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714450.493, "dur": 10.489, + "args": { + "External id": 83219, "cbid": 307, "correlation": 161142434 + } + }, + { + "ph": "s", "id": 161142434, "pid": 5714, "tid": 5714, "ts": 6300865714450.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865916277.081, "dur": 343.332, + "args": { + "External id": 83220, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142454, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142454, "pid": 0, "tid": 7, "ts": 6300865916277.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714496.402, "dur": 7.140, + "args": { + "External id": 83220, "cbid": 211, "correlation": 161142454 + } + }, + { + "ph": "s", "id": 161142454, "pid": 5714, "tid": 5714, "ts": 6300865714496.402, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865916621.085, "dur": 338.276, + "args": { + "External id": 83221, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142477, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142477, "pid": 0, "tid": 7, "ts": 6300865916621.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714527.612, "dur": 5.500, + "args": { + "External id": 83221, "cbid": 211, "correlation": 161142477 + } + }, + { + "ph": "s", "id": 161142477, "pid": 5714, "tid": 5714, "ts": 6300865714527.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865916960.097, "dur": 337.412, + "args": { + "External id": 83222, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142489, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142489, "pid": 0, "tid": 7, "ts": 6300865916960.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714572.542, "dur": 6.490, + "args": { + "External id": 83222, "cbid": 307, "correlation": 161142489 + } + }, + { + "ph": "s", "id": 161142489, "pid": 5714, "tid": 5714, "ts": 6300865714572.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865714606.582, "dur": 1.440, + "args": { + "External id": 83223, "cbid": 210, "correlation": 161142509 + } + }, + { + "ph": "f", "id": 161142509, "pid": 5714, "tid": 5714, "ts": 6300865714606.582, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865917298.149, "dur": 577.031, + "args": { + "External id": 83223, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142510, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142510, "pid": 0, "tid": 7, "ts": 6300865917298.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714609.982, "dur": 6.070, + "args": { + "External id": 83223, "cbid": 211, "correlation": 161142510 + } + }, + { + "ph": "s", "id": 161142510, "pid": 5714, "tid": 5714, "ts": 6300865714609.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865917875.852, "dur": 158.882, + "args": { + "External id": 83224, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142517, "pid": 0, "tid": 7, "ts": 6300865917875.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865714647.832, "dur": 5.790, + "args": { + "External id": 83224, "cbid": 307, "correlation": 161142517 + } + }, + { + "ph": "s", "id": 161142517, "pid": 5714, "tid": 5714, "ts": 6300865714647.832, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865909784.717, "dur": 525.990, + "args": { + "External id": 83240, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161142532, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161142532, "pid": 0, "tid": 17, "ts": 6300865909784.717, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865715065.401, "dur": 13.100, + "args": { + "External id": 83240, "cbid": 211, "correlation": 161142532 + } + }, + { + "ph": "s", "id": 161142532, "pid": 5714, "tid": 5714, "ts": 6300865715065.401, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300865910316.691, "dur": 14.977, + "args": { + "External id": 83256, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161142545, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161142545, "pid": 0, "tid": 17, "ts": 6300865910316.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865715196.021, "dur": 10.490, + "args": { + "External id": 83256, "cbid": 211, "correlation": 161142545 + } + }, + { + "ph": "s", "id": 161142545, "pid": 5714, "tid": 5714, "ts": 6300865715196.021, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715232.981, "dur": 1.630, + "args": { + "cbid": 135, "correlation": 161142555 + } + }, + { + "ph": "f", "id": 161142555, "pid": 5714, "tid": 5714, "ts": 6300865715232.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715236.711, "dur": 1.470, + "args": { + "cbid": 147, "correlation": 161142559 + } + }, + { + "ph": "s", "id": 161142559, "pid": 5714, "tid": 5714, "ts": 6300865715236.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865715308.931, "dur": 1.129, + "args": { + "External id": 83258, "cbid": 317, "correlation": 161142572 + } + }, + { + "ph": "f", "id": 161142572, "pid": 5714, "tid": 5714, "ts": 6300865715308.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715312.140, "dur": 1.531, + "args": { + "External id": 83258, "cbid": 135, "correlation": 161142574 + } + }, + { + "ph": "f", "id": 161142574, "pid": 5714, "tid": 5714, "ts": 6300865715312.140, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715315.331, "dur": 1.269, + 
"args": { + "External id": 83258, "cbid": 147, "correlation": 161142578 + } + }, + { + "ph": "s", "id": 161142578, "pid": 5714, "tid": 5714, "ts": 6300865715315.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300865715336.091, "dur": 0.729, + "args": { + "External id": 83258, "cbid": 409, "correlation": 161142581 + } + }, + { + "ph": "f", "id": 161142581, "pid": 5714, "tid": 5714, "ts": 6300865715336.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715341.191, "dur": 0.829, + "args": { + "External id": 83258, "cbid": 135, "correlation": 161142584 + } + }, + { + "ph": "f", "id": 161142584, "pid": 5714, "tid": 5714, "ts": 6300865715341.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715342.211, "dur": 0.960, + "args": { + "External id": 83258, "cbid": 147, "correlation": 161142585 + } + }, + { + "ph": "s", "id": 161142585, "pid": 5714, "tid": 5714, "ts": 6300865715342.211, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300865914140.160, "dur": 4693.303, + "args": { + "External id": 83258, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161142587, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161142587, "pid": 0, "tid": 20, "ts": 6300865914140.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300865715344.391, "dur": 11.080, + "args": { + "External id": 83258, "cbid": 430, "correlation": 161142587 + } + }, + { + "ph": "s", "id": 161142587, "pid": 5714, "tid": 5714, "ts": 6300865715344.391, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715356.511, "dur": 0.569, + "args": { + "External id": 83258, "cbid": 135, "correlation": 161142589 + } + }, + { + "ph": "f", "id": 161142589, "pid": 5714, "tid": 5714, "ts": 6300865715356.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715357.331, "dur": 0.540, + "args": { + "External id": 83258, "cbid": 147, "correlation": 161142590 + } + }, + { + "ph": "s", "id": 161142590, "pid": 5714, "tid": 5714, "ts": 6300865715357.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715359.571, "dur": 1.020, + "args": { + "External id": 83258, "cbid": 135, "correlation": 161142593 + } + }, + { + "ph": "f", "id": 161142593, "pid": 5714, "tid": 5714, "ts": 6300865715359.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715369.320, "dur": 0.460, + "args": { + "External id": 
83258, "cbid": 135, "correlation": 161142600 + } + }, + { + "ph": "f", "id": 161142600, "pid": 5714, "tid": 5714, "ts": 6300865715369.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715401.830, "dur": 1.150, + "args": { + "External id": 83260, "cbid": 147, "correlation": 161142605 + } + }, + { + "ph": "s", "id": 161142605, "pid": 5714, "tid": 5714, "ts": 6300865715401.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715420.940, "dur": 1.020, + "args": { + "cbid": 135, "correlation": 161142620 + } + }, + { + "ph": "f", "id": 161142620, "pid": 5714, "tid": 5714, "ts": 6300865715420.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715465.510, "dur": 1.120, + "args": { + "cbid": 147, "correlation": 161142625 + } + }, + { + "ph": "s", "id": 161142625, "pid": 5714, "tid": 5714, "ts": 6300865715465.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715468.620, "dur": 0.720, + "args": { + "cbid": 147, "correlation": 161142629 + } + }, + { + "ph": "s", "id": 161142629, "pid": 5714, "tid": 5714, "ts": 6300865715468.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300865715511.700, "dur": 2.360, + "args": { + "cbid": 147, "correlation": 161142635 + } + }, + { + "ph": "s", "id": 161142635, "pid": 5714, "tid": 5714, "ts": 6300865715511.700, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300865715634.700, "dur": 1.310, + "args": { + "External id": 83273, 
"cbid": 317, "correlation": 161142676 + } + }, + { + "ph": "f", "id": 161142676, "pid": 5714, "tid": 5714, "ts": 6300865715634.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865715645.890, "dur": 2.730, + "args": { + "External id": 83274, "cbid": 138, "correlation": 161142679 + } + }, + { + "ph": "f", "id": 161142679, "pid": 5714, "tid": 5714, "ts": 6300865715645.890, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300865918837.335, "dur": 1.568, + "args": { + "External id": 83278, "device": 0, "context": 1, "stream": 7, "correlation": 161142690, "bytes": 7224, "memory bandwidth (GB/s)": 4.607142857142857 + } + }, + { + "ph": "f", "id": 161142690, "pid": 0, "tid": 7, "ts": 6300865918837.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865715671.750, "dur": 12.590, + "args": { + "External id": 83278, "cbid": 41, "correlation": 161142690 + } + }, + { + "ph": "s", "id": 161142690, "pid": 5714, "tid": 5714, "ts": 6300865715671.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715689.790, "dur": 1.810, + "args": { + "External id": 83273, "cbid": 135, "correlation": 161142694 + } + }, + { + "ph": "f", "id": 161142694, "pid": 5714, "tid": 5714, "ts": 6300865715689.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300865918840.631, "dur": 14.561, + "args": { + "External id": 83273, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142698, 
"registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142698, "pid": 0, "tid": 7, "ts": 6300865918840.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865715694.160, "dur": 11.090, + "args": { + "External id": 83273, "cbid": 211, "correlation": 161142698 + } + }, + { + "ph": "s", "id": 161142698, "pid": 5714, "tid": 5714, "ts": 6300865715694.160, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300865715804.539, "dur": 1.371, + "args": { + "cbid": 135, "correlation": 161142709 + } + }, + { + "ph": "f", "id": 161142709, "pid": 5714, "tid": 5714, "ts": 6300865715804.539, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865918855.928, "dur": 20.672, + "args": { + "External id": 83285, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142735, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142735, "pid": 0, "tid": 7, "ts": 6300865918855.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716032.229, "dur": 11.320, + "args": { + "External id": 83285, "cbid": 307, "correlation": 161142735 + } + }, + { + "ph": "s", "id": 161142735, "pid": 5714, "tid": 5714, "ts": 6300865716032.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865918877.336, "dur": 124.993, + "args": { + "External id": 83291, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142758, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142758, "pid": 0, "tid": 7, "ts": 6300865918877.336, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716194.698, "dur": 10.860, + "args": { + "External id": 83291, "cbid": 211, "correlation": 161142758 + } + }, + { + "ph": "s", "id": 161142758, "pid": 5714, "tid": 5714, "ts": 6300865716194.698, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865919002.969, "dur": 122.626, + "args": { + "External id": 83292, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142781, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142781, "pid": 0, "tid": 7, "ts": 6300865919002.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716231.029, "dur": 7.829, + "args": { + "External id": 83292, "cbid": 211, "correlation": 161142781 + } + }, + { + "ph": "s", "id": 161142781, "pid": 5714, "tid": 5714, "ts": 6300865716231.029, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865919126.203, "dur": 122.465, + "args": { + "External id": 83293, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142804, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142804, "pid": 0, "tid": 7, "ts": 6300865919126.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716262.538, "dur": 5.800, + "args": { + "External id": 83293, "cbid": 211, "correlation": 161142804 + } + }, + { + "ph": "s", "id": 161142804, "pid": 5714, "tid": 5714, "ts": 6300865716262.538, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865919249.276, "dur": 52.065, + "args": { + "External id": 83310, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142824, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142824, "pid": 0, "tid": 7, "ts": 6300865919249.276, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716561.538, "dur": 11.240, + "args": { + "External id": 83310, "cbid": 307, "correlation": 161142824 + } + }, + { + "ph": "s", "id": 161142824, "pid": 5714, "tid": 5714, "ts": 6300865716561.538, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865919302.045, "dur": 60.321, + "args": { + "External id": 83326, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142842, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142842, "pid": 0, "tid": 7, "ts": 6300865919302.045, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716780.667, "dur": 9.250, + "args": { + "External id": 83326, "cbid": 307, "correlation": 161142842 + } + }, + { + "ph": "s", "id": 161142842, "pid": 5714, "tid": 5714, "ts": 6300865716780.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865716937.547, "dur": 0.570, + "args": { + "External id": 83332, "cbid": 200, "correlation": 161142849 + } + }, + { + "ph": "f", "id": 161142849, "pid": 5714, "tid": 5714, "ts": 6300865716937.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865716938.257, "dur": 0.220, + "args": { + "External id": 83332, "cbid": 200, "correlation": 161142850 + } + }, + { + "ph": "f", 
"id": 161142850, "pid": 5714, "tid": 5714, "ts": 6300865716938.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865716967.727, "dur": 0.420, + "args": { + "External id": 83332, "cbid": 200, "correlation": 161142873 + } + }, + { + "ph": "f", "id": 161142873, "pid": 5714, "tid": 5714, "ts": 6300865716967.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865716974.247, "dur": 2.200, + "args": { + "External id": 83332, "cbid": 273, "correlation": 161142882 + } + }, + { + "ph": "f", "id": 161142882, "pid": 5714, "tid": 5714, "ts": 6300865716974.247, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865919362.974, "dur": 404.228, + "args": { + "External id": 83332, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142883, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142883, "pid": 0, "tid": 7, "ts": 6300865919362.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865716977.087, "dur": 11.370, + "args": { + "External id": 83332, "cbid": 211, "correlation": 161142883 + } + }, + { + "ph": "s", "id": 161142883, "pid": 5714, "tid": 5714, "ts": 6300865716977.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865919767.842, "dur": 124.162, + "args": { + "External id": 83338, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142906, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142906, "pid": 0, "tid": 7, "ts": 6300865919767.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717060.337, "dur": 8.759, + "args": { + "External id": 83338, "cbid": 211, "correlation": 161142906 + } + }, + { + "ph": "s", "id": 161142906, "pid": 5714, "tid": 5714, "ts": 6300865717060.337, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865919892.676, "dur": 88.321, + "args": { + "External id": 83342, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142932, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142932, "pid": 0, "tid": 7, "ts": 6300865919892.676, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717252.076, "dur": 9.990, + "args": { + "External id": 83342, "cbid": 307, "correlation": 161142932 + } + }, + { + "ph": "s", "id": 161142932, "pid": 5714, "tid": 5714, "ts": 6300865717252.076, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865919981.605, "dur": 328.067, + "args": { + "External id": 83343, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142952, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142952, "pid": 0, "tid": 7, "ts": 6300865919981.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717306.656, "dur": 7.310, + "args": { + "External id": 83343, "cbid": 211, "correlation": 161142952 + } + }, + { + "ph": "s", "id": 161142952, "pid": 5714, "tid": 5714, "ts": 6300865717306.656, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865920310.280, "dur": 323.685, + "args": { + "External id": 83344, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142975, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161142975, "pid": 0, "tid": 7, "ts": 6300865920310.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717341.606, "dur": 5.750, + "args": { + "External id": 83344, "cbid": 211, "correlation": 161142975 + } + }, + { + "ph": "s", "id": 161142975, "pid": 5714, "tid": 5714, "ts": 6300865717341.606, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6300865920634.669, "dur": 214.402, + "args": { + "External id": 83345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161142987, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161142987, "pid": 0, "tid": 7, "ts": 6300865920634.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717385.296, "dur": 6.320, + "args": { + "External id": 83345, "cbid": 307, "correlation": 161142987 + } + }, + { + "ph": "s", "id": 161142987, "pid": 5714, "tid": 5714, "ts": 6300865717385.296, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865717418.696, "dur": 1.390, + "args": { + "External id": 83346, "cbid": 210, "correlation": 161143007 + } + }, + { + "ph": "f", "id": 161143007, "pid": 5714, "tid": 5714, "ts": 6300865717418.696, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865920849.679, "dur": 324.196, + "args": { + "External id": 83346, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143008, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143008, "pid": 0, "tid": 7, "ts": 6300865920849.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717422.056, "dur": 6.090, + "args": { + "External id": 83346, "cbid": 211, "correlation": 161143008 + } + }, + { + "ph": "s", "id": 161143008, "pid": 5714, "tid": 5714, "ts": 6300865717422.056, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6300865921174.547, "dur": 42.048, + "args": { + "External id": 83347, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143015, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143015, "pid": 0, "tid": 7, "ts": 6300865921174.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717470.566, "dur": 6.010, + "args": { + "External id": 83347, "cbid": 307, "correlation": 161143015 + } + }, + { + "ph": "s", "id": 161143015, "pid": 5714, "tid": 5714, "ts": 6300865717470.566, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865921217.235, "dur": 32.609, + "args": { + "External id": 83353, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143026, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143026, "pid": 0, "tid": 7, "ts": 6300865921217.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717685.015, "dur": 14.370, + "args": { + "External id": 83353, "cbid": 211, "correlation": 161143026 + } + }, + { + "ph": "s", "id": 161143026, "pid": 5714, "tid": 5714, "ts": 6300865717685.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865921351.061, "dur": 74.689, + "args": { + "External id": 83354, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143037, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143037, "pid": 0, "tid": 7, "ts": 6300865921351.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717724.295, "dur": 7.400, + "args": { + "External id": 83354, "cbid": 211, "correlation": 161143037 + } + }, + { + "ph": "s", "id": 161143037, "pid": 5714, "tid": 5714, "ts": 6300865717724.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865921426.390, "dur": 15.840, + "args": { + "External id": 83357, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143051, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143051, "pid": 0, "tid": 7, "ts": 6300865921426.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717757.125, "dur": 7.660, + "args": { + "External id": 83357, "cbid": 211, "correlation": 161143051 + } + }, + { + "ph": "s", "id": 161143051, "pid": 5714, "tid": 5714, "ts": 6300865717757.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865921442.902, "dur": 1.696, + "args": { + "External id": 83359, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143057, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143057, "pid": 0, "tid": 7, "ts": 6300865921442.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717785.475, "dur": 6.410, + "args": { + "External id": 83359, "cbid": 211, "correlation": 161143057 + } + }, + { + "ph": "s", "id": 161143057, "pid": 5714, "tid": 5714, "ts": 6300865717785.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865921445.302, "dur": 1.024, + "args": { + "External id": 83360, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143067, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143067, "pid": 0, "tid": 7, "ts": 6300865921445.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717813.645, "dur": 7.240, + "args": { + "External id": 83360, "cbid": 211, "correlation": 161143067 + } + }, + { + "ph": "s", "id": 161143067, "pid": 5714, "tid": 5714, "ts": 6300865717813.645, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865921447.030, "dur": 88.481, + "args": { + "External id": 83361, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143077, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143077, "pid": 0, "tid": 7, "ts": 6300865921447.030, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717842.975, "dur": 7.490, + "args": { + "External id": 83361, "cbid": 211, "correlation": 161143077 + } + }, + { + "ph": "s", "id": 161143077, "pid": 5714, "tid": 5714, "ts": 6300865717842.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865921536.247, "dur": 48.673, + "args": { + "External id": 83366, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143090, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143090, "pid": 0, "tid": 7, "ts": 6300865921536.247, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717873.395, "dur": 7.010, + "args": { + "External id": 83366, "cbid": 211, "correlation": 161143090 + } + }, + { + "ph": "s", "id": 161143090, "pid": 5714, "tid": 5714, "ts": 6300865717873.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865921585.656, "dur": 22.688, + "args": { + "External id": 83367, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143101, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143101, "pid": 0, "tid": 7, "ts": 6300865921585.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717896.695, "dur": 6.350, + "args": { + "External id": 83367, "cbid": 211, "correlation": 161143101 + } + }, + { + "ph": "s", "id": 161143101, "pid": 5714, "tid": 5714, "ts": 6300865717896.695, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865921609.080, "dur": 123.617, + "args": { + "External id": 83375, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143124, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143124, "pid": 0, "tid": 7, "ts": 6300865921609.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865717994.525, "dur": 8.440, + "args": { + "External id": 83375, "cbid": 211, "correlation": 161143124 + } + }, + { + "ph": "s", "id": 161143124, "pid": 5714, "tid": 5714, "ts": 6300865717994.525, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865921733.305, "dur": 122.882, + "args": { + "External id": 83384, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143147, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143147, "pid": 0, "tid": 7, "ts": 6300865921733.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718064.474, "dur": 7.950, + "args": { + "External id": 83384, "cbid": 211, "correlation": 161143147 + } + }, + { + "ph": "s", "id": 161143147, "pid": 5714, "tid": 5714, "ts": 6300865718064.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865921856.859, "dur": 122.561, + "args": { + "External id": 83393, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143170, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143170, "pid": 0, "tid": 7, "ts": 6300865921856.859, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718127.444, "dur": 7.470, + "args": { + "External id": 83393, "cbid": 211, "correlation": 161143170 + } + }, + { + "ph": "s", "id": 161143170, "pid": 5714, "tid": 5714, "ts": 6300865718127.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865921980.124, "dur": 50.785, + "args": { + "External id": 83401, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143189, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143189, "pid": 0, "tid": 7, "ts": 6300865921980.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718411.113, "dur": 11.280, + "args": { + "External id": 83401, "cbid": 307, "correlation": 161143189 + } + }, + { + "ph": "s", "id": 161143189, "pid": 5714, "tid": 5714, "ts": 6300865718411.113, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865922031.549, "dur": 62.305, + "args": { + "External id": 83404, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143206, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143206, "pid": 0, "tid": 7, "ts": 6300865922031.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718570.343, "dur": 9.180, + "args": { + "External id": 83404, "cbid": 307, "correlation": 161143206 + } + }, + { + "ph": "s", "id": 161143206, "pid": 5714, "tid": 5714, "ts": 6300865718570.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865718689.483, "dur": 0.540, + "args": { + "External id": 83408, "cbid": 200, "correlation": 161143210 + } + }, + { + "ph": "f", "id": 161143210, "pid": 5714, "tid": 5714, "ts": 6300865718689.483, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865718690.163, "dur": 0.230, + "args": { + "External id": 83408, "cbid": 200, "correlation": 161143211 + } + }, + { + "ph": "f", 
"id": 161143211, "pid": 5714, "tid": 5714, "ts": 6300865718690.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865718719.663, "dur": 0.410, + "args": { + "External id": 83408, "cbid": 200, "correlation": 161143234 + } + }, + { + "ph": "f", "id": 161143234, "pid": 5714, "tid": 5714, "ts": 6300865718719.663, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865718727.443, "dur": 2.220, + "args": { + "External id": 83408, "cbid": 273, "correlation": 161143243 + } + }, + { + "ph": "f", "id": 161143243, "pid": 5714, "tid": 5714, "ts": 6300865718727.443, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865922094.558, "dur": 412.421, + "args": { + "External id": 83408, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143244, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143244, "pid": 0, "tid": 7, "ts": 6300865922094.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718730.293, "dur": 11.040, + "args": { + "External id": 83408, "cbid": 211, "correlation": 161143244 + } + }, + { + "ph": "s", "id": 161143244, "pid": 5714, "tid": 5714, "ts": 6300865718730.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865922507.587, "dur": 124.225, + "args": { + "External id": 83424, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143270, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143270, "pid": 0, "tid": 7, "ts": 6300865922507.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718873.263, "dur": 10.289, + "args": { + "External id": 83424, "cbid": 211, "correlation": 161143270 + } + }, + { + "ph": "s", "id": 161143270, "pid": 5714, "tid": 5714, "ts": 6300865718873.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865922632.420, "dur": 61.441, + "args": { + "External id": 83426, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143280, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143280, "pid": 0, "tid": 7, "ts": 6300865922632.420, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718921.642, "dur": 8.860, + "args": { + "External id": 83426, "cbid": 211, "correlation": 161143280 + } + }, + { + "ph": "s", "id": 161143280, "pid": 5714, "tid": 5714, "ts": 6300865718921.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865922694.597, "dur": 49.856, + "args": { + "External id": 83431, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143293, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143293, "pid": 0, "tid": 7, "ts": 6300865922694.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865718974.542, "dur": 8.240, + "args": { + "External id": 83431, "cbid": 211, "correlation": 161143293 + } + }, + { + "ph": "s", "id": 161143293, "pid": 5714, "tid": 5714, "ts": 6300865718974.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865922745.093, "dur": 68.769, + "args": { + "External id": 83432, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143304, "pid": 0, "tid": 7, "ts": 6300865922745.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719001.052, "dur": 5.780, + "args": { + "External id": 83432, "cbid": 211, "correlation": 161143304 + } + }, + { + "ph": "s", "id": 161143304, "pid": 5714, "tid": 5714, "ts": 6300865719001.052, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865922814.534, "dur": 15.296, + "args": { + "External id": 83435, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143318, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143318, "pid": 0, "tid": 7, "ts": 6300865922814.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719024.272, "dur": 5.780, + "args": { + "External id": 83435, "cbid": 211, "correlation": 161143318 + } + }, + { + "ph": "s", "id": 161143318, "pid": 5714, "tid": 5714, "ts": 6300865719024.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865922830.534, "dur": 1.504, + "args": { + "External id": 83437, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143324, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143324, "pid": 0, "tid": 7, "ts": 6300865922830.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719046.692, "dur": 4.800, + "args": { + "External id": 83437, "cbid": 211, "correlation": 161143324 + } + }, + { + "ph": "s", "id": 161143324, "pid": 5714, "tid": 5714, "ts": 6300865719046.692, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865922832.646, "dur": 1.024, + "args": { + "External id": 83438, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143334, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143334, "pid": 0, "tid": 7, "ts": 6300865922832.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719063.642, "dur": 4.890, + "args": { + "External id": 83438, "cbid": 211, "correlation": 161143334 + } + }, + { + "ph": "s", "id": 161143334, "pid": 5714, "tid": 5714, "ts": 6300865719063.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865922834.374, "dur": 91.713, + "args": { + "External id": 83439, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143344, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143344, "pid": 0, "tid": 7, "ts": 6300865922834.374, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719085.262, "dur": 4.980, + "args": { + "External id": 83439, "cbid": 211, "correlation": 161143344 + } + }, + { + "ph": "s", "id": 161143344, "pid": 5714, "tid": 5714, "ts": 6300865719085.262, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865922926.695, "dur": 47.649, + "args": { + "External id": 83444, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143357, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143357, "pid": 0, "tid": 7, "ts": 6300865922926.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719109.022, "dur": 5.630, + "args": { + "External id": 83444, "cbid": 211, "correlation": 161143357 + } + }, + { + "ph": "s", "id": 161143357, "pid": 5714, "tid": 5714, "ts": 6300865719109.022, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865922974.952, "dur": 22.816, + "args": { + "External id": 83445, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143368, "pid": 0, "tid": 7, "ts": 6300865922974.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719129.332, "dur": 4.850, + "args": { + "External id": 83445, "cbid": 211, "correlation": 161143368 + } + }, + { + "ph": "s", "id": 161143368, "pid": 5714, "tid": 5714, "ts": 6300865719129.332, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865922998.504, "dur": 323.876, + "args": { + "External id": 83453, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143391, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143391, "pid": 0, "tid": 7, "ts": 6300865922998.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719213.212, "dur": 7.920, + "args": { + "External id": 83453, "cbid": 211, "correlation": 161143391 + } + }, + { + "ph": "s", "id": 161143391, "pid": 5714, "tid": 5714, "ts": 6300865719213.212, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865923323.116, "dur": 322.788, + "args": { + "External id": 83462, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143414, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143414, "pid": 0, "tid": 7, "ts": 6300865923323.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719292.022, "dur": 17.380, + "args": { + "External id": 83462, "cbid": 211, "correlation": 161143414 + } + }, + { + "ph": "s", "id": 161143414, "pid": 5714, "tid": 5714, "ts": 6300865719292.022, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300865923646.576, "dur": 213.186, + "args": { + "External id": 83464, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143428, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143428, "pid": 0, "tid": 7, "ts": 6300865923646.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719407.361, "dur": 9.910, + "args": { + "External id": 83464, "cbid": 307, "correlation": 161143428 + } + }, + { + "ph": "s", "id": 161143428, "pid": 5714, "tid": 5714, "ts": 6300865719407.361, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865719467.261, "dur": 1.590, + "args": { + "External id": 83473, "cbid": 210, "correlation": 161143450 + } + }, + { + "ph": "f", "id": 161143450, "pid": 5714, "tid": 5714, "ts": 6300865719467.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865923860.466, "dur": 326.180, + "args": { + "External id": 83473, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143451, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143451, "pid": 0, "tid": 7, "ts": 6300865923860.466, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719470.921, "dur": 7.490, + "args": { + "External id": 83473, "cbid": 211, "correlation": 161143451 + } + }, + { + "ph": "s", "id": 161143451, "pid": 5714, "tid": 5714, "ts": 6300865719470.921, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865924187.286, "dur": 52.641, + "args": { + "External id": 83475, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143461, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143461, "pid": 0, "tid": 7, "ts": 6300865924187.286, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719519.631, "dur": 7.820, + "args": { + "External id": 83475, "cbid": 211, "correlation": 161143461 + } + }, + { + "ph": "s", "id": 161143461, "pid": 5714, "tid": 5714, "ts": 6300865719519.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865924240.631, "dur": 64.192, + "args": { + "External id": 83480, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143474, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143474, "pid": 0, "tid": 7, "ts": 6300865924240.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719582.261, "dur": 8.500, + "args": { + "External id": 83480, "cbid": 211, "correlation": 161143474 + } + }, + { + "ph": "s", "id": 161143474, "pid": 5714, "tid": 5714, "ts": 6300865719582.261, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865924305.495, "dur": 65.986, + "args": { + "External id": 83481, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143485, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143485, "pid": 0, "tid": 7, "ts": 6300865924305.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719608.141, "dur": 5.690, + "args": { + "External id": 83481, "cbid": 211, "correlation": 161143485 + } + }, + { + "ph": "s", "id": 161143485, "pid": 5714, "tid": 5714, "ts": 6300865719608.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865924372.185, "dur": 15.839, + "args": { + "External id": 83484, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143499, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143499, "pid": 0, "tid": 7, "ts": 6300865924372.185, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719629.471, "dur": 5.330, + "args": { + "External id": 83484, "cbid": 211, "correlation": 161143499 + } + }, + { + "ph": "s", "id": 161143499, "pid": 5714, "tid": 5714, "ts": 6300865719629.471, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865924388.664, "dur": 1.728, + "args": { + "External id": 83486, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143505, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143505, "pid": 0, "tid": 7, "ts": 6300865924388.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719649.801, "dur": 4.930, + "args": { + "External id": 83486, "cbid": 211, "correlation": 161143505 + } + }, + { + "ph": "s", "id": 161143505, "pid": 5714, "tid": 5714, "ts": 6300865719649.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865924391.064, "dur": 1.024, + "args": { + "External id": 83487, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143515, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143515, "pid": 0, "tid": 7, "ts": 6300865924391.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719668.831, "dur": 4.880, + "args": { + "External id": 83487, "cbid": 211, "correlation": 161143515 + } + }, + { + "ph": "s", "id": 161143515, "pid": 5714, "tid": 5714, "ts": 6300865719668.831, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865924392.792, "dur": 88.162, + "args": { + "External id": 83488, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143525, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143525, "pid": 0, "tid": 7, "ts": 6300865924392.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719689.581, "dur": 4.800, + "args": { + "External id": 83488, "cbid": 211, "correlation": 161143525 + } + }, + { + "ph": "s", "id": 161143525, "pid": 5714, "tid": 5714, "ts": 6300865719689.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865924481.594, "dur": 48.608, + "args": { + "External id": 83493, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143538, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143538, "pid": 0, "tid": 7, "ts": 6300865924481.594, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719714.410, "dur": 5.291, + "args": { + "External id": 83493, "cbid": 211, "correlation": 161143538 + } + }, + { + "ph": "s", "id": 161143538, "pid": 5714, "tid": 5714, "ts": 6300865719714.410, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865924530.810, "dur": 22.848, + "args": { + "External id": 83494, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143549, "pid": 0, "tid": 7, "ts": 6300865924530.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719732.630, "dur": 4.800, + "args": { + "External id": 83494, "cbid": 211, "correlation": 161143549 + } + }, + { + "ph": "s", "id": 161143549, "pid": 5714, "tid": 5714, "ts": 6300865719732.630, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865924554.842, "dur": 123.458, + "args": { + "External id": 83502, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143572, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143572, "pid": 0, "tid": 7, "ts": 6300865924554.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719809.150, "dur": 8.100, + "args": { + "External id": 83502, "cbid": 211, "correlation": 161143572 + } + }, + { + "ph": "s", "id": 161143572, "pid": 5714, "tid": 5714, "ts": 6300865719809.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865924679.036, "dur": 122.113, + "args": { + "External id": 83511, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143595, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143595, "pid": 0, "tid": 7, "ts": 6300865924679.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719875.950, "dur": 7.480, + "args": { + "External id": 83511, "cbid": 211, "correlation": 161143595 + } + }, + { + "ph": "s", "id": 161143595, "pid": 5714, "tid": 5714, "ts": 6300865719875.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865924801.853, "dur": 122.594, + "args": { + "External id": 83520, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143618, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143618, "pid": 0, "tid": 7, "ts": 6300865924801.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865719938.550, "dur": 7.160, + "args": { + "External id": 83520, "cbid": 211, "correlation": 161143618 + } + }, + { + "ph": "s", "id": 161143618, "pid": 5714, "tid": 5714, "ts": 6300865719938.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865924925.119, "dur": 53.217, + "args": { + "External id": 83528, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143637, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143637, "pid": 0, "tid": 7, "ts": 6300865924925.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720136.430, "dur": 9.979, + "args": { + "External id": 83528, "cbid": 307, "correlation": 161143637 + } + }, + { + "ph": "s", "id": 161143637, "pid": 5714, "tid": 5714, "ts": 6300865720136.430, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865924979.072, "dur": 59.776, + "args": { + "External id": 83531, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143654, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143654, "pid": 0, "tid": 7, "ts": 6300865924979.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720283.899, "dur": 9.010, + "args": { + "External id": 83531, "cbid": 307, "correlation": 161143654 + } + }, + { + "ph": "s", "id": 161143654, "pid": 5714, "tid": 5714, "ts": 6300865720283.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865720401.639, "dur": 0.570, + "args": { + "External id": 83535, "cbid": 200, "correlation": 161143658 + } + }, + { + "ph": "f", "id": 161143658, "pid": 5714, "tid": 5714, "ts": 6300865720401.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865720402.349, "dur": 0.220, + "args": { + "External id": 83535, "cbid": 200, "correlation": 161143659 + } + }, + { + "ph": "f", 
"id": 161143659, "pid": 5714, "tid": 5714, "ts": 6300865720402.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865720431.759, "dur": 0.400, + "args": { + "External id": 83535, "cbid": 200, "correlation": 161143682 + } + }, + { + "ph": "f", "id": 161143682, "pid": 5714, "tid": 5714, "ts": 6300865720431.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865720439.739, "dur": 2.220, + "args": { + "External id": 83535, "cbid": 273, "correlation": 161143691 + } + }, + { + "ph": "f", "id": 161143691, "pid": 5714, "tid": 5714, "ts": 6300865720439.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865925039.456, "dur": 409.221, + "args": { + "External id": 83535, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143692, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143692, "pid": 0, "tid": 7, "ts": 6300865925039.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720442.599, "dur": 17.680, + "args": { + "External id": 83535, "cbid": 211, "correlation": 161143692 + } + }, + { + "ph": "s", "id": 161143692, "pid": 5714, "tid": 5714, "ts": 6300865720442.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865925449.381, "dur": 123.809, + "args": { + "External id": 83551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143718, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143718, "pid": 0, "tid": 7, "ts": 6300865925449.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720592.648, "dur": 10.711, + "args": { + "External id": 83551, "cbid": 211, "correlation": 161143718 + } + }, + { + "ph": "s", "id": 161143718, "pid": 5714, "tid": 5714, "ts": 6300865720592.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865925573.894, "dur": 61.633, + "args": { + "External id": 83553, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143728, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143728, "pid": 0, "tid": 7, "ts": 6300865925573.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720636.608, "dur": 7.260, + "args": { + "External id": 83553, "cbid": 211, "correlation": 161143728 + } + }, + { + "ph": "s", "id": 161143728, "pid": 5714, "tid": 5714, "ts": 6300865720636.608, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865925636.231, "dur": 49.825, + "args": { + "External id": 83558, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143741, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143741, "pid": 0, "tid": 7, "ts": 6300865925636.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720688.608, "dur": 7.920, + "args": { + "External id": 83558, "cbid": 211, "correlation": 161143741 + } + }, + { + "ph": "s", "id": 161143741, "pid": 5714, "tid": 5714, "ts": 6300865720688.608, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865925686.760, "dur": 68.640, + "args": { + "External id": 83559, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143752, "pid": 0, "tid": 7, "ts": 6300865925686.760, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720715.658, "dur": 5.520, + "args": { + "External id": 83559, "cbid": 211, "correlation": 161143752 + } + }, + { + "ph": "s", "id": 161143752, "pid": 5714, "tid": 5714, "ts": 6300865720715.658, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865925756.136, "dur": 14.465, + "args": { + "External id": 83562, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143766, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143766, "pid": 0, "tid": 7, "ts": 6300865925756.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720738.118, "dur": 5.250, + "args": { + "External id": 83562, "cbid": 211, "correlation": 161143766 + } + }, + { + "ph": "s", "id": 161143766, "pid": 5714, "tid": 5714, "ts": 6300865720738.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865925771.305, "dur": 1.600, + "args": { + "External id": 83564, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143772, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143772, "pid": 0, "tid": 7, "ts": 6300865925771.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720757.458, "dur": 4.620, + "args": { + "External id": 83564, "cbid": 211, "correlation": 161143772 + } + }, + { + "ph": "s", "id": 161143772, "pid": 5714, "tid": 5714, "ts": 6300865720757.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865925773.545, "dur": 1.056, + "args": { + "External id": 83565, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143782, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143782, "pid": 0, "tid": 7, "ts": 6300865925773.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720774.098, "dur": 4.750, + "args": { + "External id": 83565, "cbid": 211, "correlation": 161143782 + } + }, + { + "ph": "s", "id": 161143782, "pid": 5714, "tid": 5714, "ts": 6300865720774.098, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865925775.305, "dur": 90.209, + "args": { + "External id": 83566, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143792, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143792, "pid": 0, "tid": 7, "ts": 6300865925775.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720795.348, "dur": 5.020, + "args": { + "External id": 83566, "cbid": 211, "correlation": 161143792 + } + }, + { + "ph": "s", "id": 161143792, "pid": 5714, "tid": 5714, "ts": 6300865720795.348, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865925866.186, "dur": 46.560, + "args": { + "External id": 83571, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143805, "pid": 0, "tid": 7, "ts": 6300865925866.186, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720820.408, "dur": 5.440, + "args": { + "External id": 83571, "cbid": 211, "correlation": 161143805 + } + }, + { + "ph": "s", "id": 161143805, "pid": 5714, "tid": 5714, "ts": 6300865720820.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865925913.386, "dur": 22.305, + "args": { + "External id": 83572, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143816, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143816, "pid": 0, "tid": 7, "ts": 6300865925913.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720840.478, "dur": 4.910, + "args": { + "External id": 83572, "cbid": 211, "correlation": 161143816 + } + }, + { + "ph": "s", "id": 161143816, "pid": 5714, "tid": 5714, "ts": 6300865720840.478, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865925936.395, "dur": 324.867, + "args": { + "External id": 83580, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143839, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143839, "pid": 0, "tid": 7, "ts": 6300865925936.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720910.128, "dur": 7.890, + "args": { + "External id": 83580, "cbid": 211, "correlation": 161143839 + } + }, + { + "ph": "s", "id": 161143839, "pid": 5714, "tid": 5714, "ts": 6300865720910.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865926261.966, "dur": 322.820, + "args": { + "External id": 83589, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143862, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143862, "pid": 0, "tid": 7, "ts": 6300865926261.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865720973.778, "dur": 7.700, + "args": { + "External id": 83589, "cbid": 211, "correlation": 161143862 + } + }, + { + "ph": "s", "id": 161143862, "pid": 5714, "tid": 5714, "ts": 6300865720973.778, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300865926585.426, "dur": 213.027, + "args": { + "External id": 83591, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143876, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143876, "pid": 0, "tid": 7, "ts": 6300865926585.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721059.227, "dur": 8.371, + "args": { + "External id": 83591, "cbid": 307, "correlation": 161143876 + } + }, + { + "ph": "s", "id": 161143876, "pid": 5714, "tid": 5714, "ts": 6300865721059.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865721117.478, "dur": 1.620, + "args": { + "External id": 83600, "cbid": 210, "correlation": 161143898 + } + }, + { + "ph": "f", "id": 161143898, "pid": 5714, "tid": 5714, "ts": 6300865721117.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865926799.189, "dur": 325.827, + "args": { + "External id": 83600, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143899, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161143899, "pid": 0, "tid": 7, "ts": 6300865926799.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721121.187, "dur": 7.360, + "args": { + "External id": 83600, "cbid": 211, "correlation": 161143899 + } + }, + { + "ph": "s", "id": 161143899, "pid": 5714, "tid": 5714, "ts": 6300865721121.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865927125.656, "dur": 50.529, + "args": { + "External id": 83602, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143909, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143909, "pid": 0, "tid": 7, "ts": 6300865927125.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721168.857, "dur": 7.160, + "args": { + "External id": 83602, "cbid": 211, "correlation": 161143909 + } + }, + { + "ph": "s", "id": 161143909, "pid": 5714, "tid": 5714, "ts": 6300865721168.857, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865927176.857, "dur": 64.577, + "args": { + "External id": 83607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143922, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143922, "pid": 0, "tid": 7, "ts": 6300865927176.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721230.167, "dur": 8.040, + "args": { + "External id": 83607, "cbid": 211, "correlation": 161143922 + } + }, + { + "ph": "s", "id": 161143922, "pid": 5714, "tid": 5714, "ts": 6300865721230.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865927242.138, "dur": 68.321, + "args": { + "External id": 83608, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143933, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143933, "pid": 0, "tid": 7, "ts": 6300865927242.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721254.437, "dur": 5.390, + "args": { + "External id": 83608, "cbid": 211, "correlation": 161143933 + } + }, + { + "ph": "s", "id": 161143933, "pid": 5714, "tid": 5714, "ts": 6300865721254.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865927311.131, "dur": 14.624, + "args": { + "External id": 83611, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143947, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143947, "pid": 0, "tid": 7, "ts": 6300865927311.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721275.667, "dur": 5.220, + "args": { + "External id": 83611, "cbid": 211, "correlation": 161143947 + } + }, + { + "ph": "s", "id": 161143947, "pid": 5714, "tid": 5714, "ts": 6300865721275.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865927326.491, "dur": 1.568, + "args": { + "External id": 83613, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143953, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143953, "pid": 0, "tid": 7, "ts": 6300865927326.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721294.577, "dur": 13.410, + "args": { + "External id": 83613, "cbid": 211, "correlation": 161143953 + } + }, + { + "ph": "s", "id": 161143953, "pid": 5714, "tid": 5714, "ts": 6300865721294.577, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865927328.763, "dur": 1.024, + "args": { + "External id": 83614, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143963, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161143963, "pid": 0, "tid": 7, "ts": 6300865927328.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721321.437, "dur": 5.010, + "args": { + "External id": 83614, "cbid": 211, "correlation": 161143963 + } + }, + { + "ph": "s", "id": 161143963, "pid": 5714, "tid": 5714, "ts": 6300865721321.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865927330.491, "dur": 89.345, + "args": { + "External id": 83615, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143973, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143973, "pid": 0, "tid": 7, "ts": 6300865927330.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721343.197, "dur": 4.850, + "args": { + "External id": 83615, "cbid": 211, "correlation": 161143973 + } + }, + { + "ph": "s", "id": 161143973, "pid": 5714, "tid": 5714, "ts": 6300865721343.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865927420.444, "dur": 48.961, + "args": { + "External id": 83620, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143986, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143986, "pid": 0, "tid": 7, "ts": 6300865927420.444, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721369.967, "dur": 5.250, + "args": { + "External id": 83620, "cbid": 211, "correlation": 161143986 + } + }, + { + "ph": "s", "id": 161143986, "pid": 5714, "tid": 5714, "ts": 6300865721369.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865927470.109, "dur": 22.720, + "args": { + "External id": 83621, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161143997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161143997, "pid": 0, "tid": 7, "ts": 6300865927470.109, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721388.507, "dur": 4.660, + "args": { + "External id": 83621, "cbid": 211, "correlation": 161143997 + } + }, + { + "ph": "s", "id": 161143997, "pid": 5714, "tid": 5714, "ts": 6300865721388.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865927493.437, "dur": 123.265, + "args": { + "External id": 83629, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144020, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144020, "pid": 0, "tid": 7, "ts": 6300865927493.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721466.466, "dur": 7.800, + "args": { + "External id": 83629, "cbid": 211, "correlation": 161144020 + } + }, + { + "ph": "s", "id": 161144020, "pid": 5714, "tid": 5714, "ts": 6300865721466.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865927617.374, "dur": 121.986, + "args": { + "External id": 83638, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144043, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144043, "pid": 0, "tid": 7, "ts": 6300865927617.374, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721531.466, "dur": 7.340, + "args": { + "External id": 83638, "cbid": 211, "correlation": 161144043 + } + }, + { + "ph": "s", "id": 161144043, "pid": 5714, "tid": 5714, "ts": 6300865721531.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865927740.032, "dur": 122.721, + "args": { + "External id": 83647, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144066, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144066, "pid": 0, "tid": 7, "ts": 6300865927740.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721591.986, "dur": 7.710, + "args": { + "External id": 83647, "cbid": 211, "correlation": 161144066 + } + }, + { + "ph": "s", "id": 161144066, "pid": 5714, "tid": 5714, "ts": 6300865721591.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865927863.361, "dur": 51.841, + "args": { + "External id": 83655, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144085, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144085, "pid": 0, "tid": 7, "ts": 6300865927863.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721789.936, "dur": 10.020, + "args": { + "External id": 83655, "cbid": 307, "correlation": 161144085 + } + }, + { + "ph": "s", "id": 161144085, "pid": 5714, "tid": 5714, "ts": 6300865721789.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865927915.906, "dur": 60.577, + "args": { + "External id": 83658, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144102, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144102, "pid": 0, "tid": 7, "ts": 6300865927915.906, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865721937.136, "dur": 9.229, + "args": { + "External id": 83658, "cbid": 307, "correlation": 161144102 + } + }, + { + "ph": "s", "id": 161144102, "pid": 5714, "tid": 5714, "ts": 6300865721937.136, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865722043.645, "dur": 0.520, + "args": { + "External id": 83662, "cbid": 200, "correlation": 161144106 + } + }, + { + "ph": "f", "id": 161144106, "pid": 5714, "tid": 5714, "ts": 6300865722043.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865722044.325, "dur": 0.230, + "args": { + "External id": 83662, "cbid": 200, "correlation": 161144107 + } + }, + { + "ph": "f", 
"id": 161144107, "pid": 5714, "tid": 5714, "ts": 6300865722044.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865722073.465, "dur": 0.440, + "args": { + "External id": 83662, "cbid": 200, "correlation": 161144130 + } + }, + { + "ph": "f", "id": 161144130, "pid": 5714, "tid": 5714, "ts": 6300865722073.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865722082.375, "dur": 2.280, + "args": { + "External id": 83662, "cbid": 273, "correlation": 161144139 + } + }, + { + "ph": "f", "id": 161144139, "pid": 5714, "tid": 5714, "ts": 6300865722082.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865927977.219, "dur": 416.164, + "args": { + "External id": 83662, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144140, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144140, "pid": 0, "tid": 7, "ts": 6300865927977.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722085.265, "dur": 10.540, + "args": { + "External id": 83662, "cbid": 211, "correlation": 161144140 + } + }, + { + "ph": "s", "id": 161144140, "pid": 5714, "tid": 5714, "ts": 6300865722085.265, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865928394.119, "dur": 123.714, + "args": { + "External id": 83678, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144166, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144166, "pid": 0, "tid": 7, "ts": 6300865928394.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722227.575, "dur": 10.220, + "args": { + "External id": 83678, "cbid": 211, "correlation": 161144166 + } + }, + { + "ph": "s", "id": 161144166, "pid": 5714, "tid": 5714, "ts": 6300865722227.575, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865928518.473, "dur": 62.625, + "args": { + "External id": 83680, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144176, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144176, "pid": 0, "tid": 7, "ts": 6300865928518.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722272.955, "dur": 7.400, + "args": { + "External id": 83680, "cbid": 211, "correlation": 161144176 + } + }, + { + "ph": "s", "id": 161144176, "pid": 5714, "tid": 5714, "ts": 6300865722272.955, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865928581.770, "dur": 51.200, + "args": { + "External id": 83685, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144189, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144189, "pid": 0, "tid": 7, "ts": 6300865928581.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722335.565, "dur": 9.330, + "args": { + "External id": 83685, "cbid": 211, "correlation": 161144189 + } + }, + { + "ph": "s", "id": 161144189, "pid": 5714, "tid": 5714, "ts": 6300865722335.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865928633.706, "dur": 68.801, + "args": { + "External id": 83686, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144200, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144200, "pid": 0, "tid": 7, "ts": 6300865928633.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722362.544, "dur": 5.691, + "args": { + "External id": 83686, "cbid": 211, "correlation": 161144200 + } + }, + { + "ph": "s", "id": 161144200, "pid": 5714, "tid": 5714, "ts": 6300865722362.544, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865928703.211, "dur": 15.169, + "args": { + "External id": 83689, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144214, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144214, "pid": 0, "tid": 7, "ts": 6300865928703.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722387.524, "dur": 5.551, + "args": { + "External id": 83689, "cbid": 211, "correlation": 161144214 + } + }, + { + "ph": "s", "id": 161144214, "pid": 5714, "tid": 5714, "ts": 6300865722387.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865928718.988, "dur": 1.728, + "args": { + "External id": 83691, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144220, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144220, "pid": 0, "tid": 7, "ts": 6300865928718.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722407.015, "dur": 4.689, + "args": { + "External id": 83691, "cbid": 211, "correlation": 161144220 + } + }, + { + "ph": "s", "id": 161144220, "pid": 5714, "tid": 5714, "ts": 6300865722407.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865928721.388, "dur": 1.024, + "args": { + "External id": 83692, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144230, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144230, "pid": 0, "tid": 7, "ts": 6300865928721.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722423.764, "dur": 4.820, + "args": { + "External id": 83692, "cbid": 211, "correlation": 161144230 + } + }, + { + "ph": "s", "id": 161144230, "pid": 5714, "tid": 5714, "ts": 6300865722423.764, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865928723.116, "dur": 89.057, + "args": { + "External id": 83693, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144240, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144240, "pid": 0, "tid": 7, "ts": 6300865928723.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722443.994, "dur": 5.080, + "args": { + "External id": 83693, "cbid": 211, "correlation": 161144240 + } + }, + { + "ph": "s", "id": 161144240, "pid": 5714, "tid": 5714, "ts": 6300865722443.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865928812.813, "dur": 47.552, + "args": { + "External id": 83698, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144253, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144253, "pid": 0, "tid": 7, "ts": 6300865928812.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722469.684, "dur": 5.430, + "args": { + "External id": 83698, "cbid": 211, "correlation": 161144253 + } + }, + { + "ph": "s", "id": 161144253, "pid": 5714, "tid": 5714, "ts": 6300865722469.684, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865928860.973, "dur": 22.816, + "args": { + "External id": 83699, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144264, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144264, "pid": 0, "tid": 7, "ts": 6300865928860.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722488.474, "dur": 4.650, + "args": { + "External id": 83699, "cbid": 211, "correlation": 161144264 + } + }, + { + "ph": "s", "id": 161144264, "pid": 5714, "tid": 5714, "ts": 6300865722488.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865928884.493, "dur": 325.028, + "args": { + "External id": 83707, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144287, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144287, "pid": 0, "tid": 7, "ts": 6300865928884.493, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722559.824, "dur": 7.960, + "args": { + "External id": 83707, "cbid": 211, "correlation": 161144287 + } + }, + { + "ph": "s", "id": 161144287, "pid": 5714, "tid": 5714, "ts": 6300865722559.824, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865929210.225, "dur": 324.132, + "args": { + "External id": 83716, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144310, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144310, "pid": 0, "tid": 7, "ts": 6300865929210.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722626.044, "dur": 7.620, + "args": { + "External id": 83716, "cbid": 211, "correlation": 161144310 + } + }, + { + "ph": "s", "id": 161144310, "pid": 5714, "tid": 5714, "ts": 6300865722626.044, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300865929535.029, "dur": 212.962, + "args": { + "External id": 83718, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144324, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144324, "pid": 0, "tid": 7, "ts": 6300865929535.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722710.094, "dur": 8.430, + "args": { + "External id": 83718, "cbid": 307, "correlation": 161144324 + } + }, + { + "ph": "s", "id": 161144324, "pid": 5714, "tid": 5714, "ts": 6300865722710.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865722768.624, "dur": 1.580, + "args": { + "External id": 83727, "cbid": 210, "correlation": 161144346 + } + }, + { + "ph": "f", "id": 161144346, "pid": 5714, "tid": 5714, "ts": 6300865722768.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865929748.631, "dur": 326.308, + "args": { + "External id": 83727, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144347, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144347, "pid": 0, "tid": 7, "ts": 6300865929748.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722772.194, "dur": 7.330, + "args": { + "External id": 83727, "cbid": 211, "correlation": 161144347 + } + }, + { + "ph": "s", "id": 161144347, "pid": 5714, "tid": 5714, "ts": 6300865722772.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865930075.547, "dur": 51.681, + "args": { + "External id": 83729, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144357, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144357, "pid": 0, "tid": 7, "ts": 6300865930075.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722819.263, "dur": 7.871, + "args": { + "External id": 83729, "cbid": 211, "correlation": 161144357 + } + }, + { + "ph": "s", "id": 161144357, "pid": 5714, "tid": 5714, "ts": 6300865722819.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865930127.836, "dur": 64.129, + "args": { + "External id": 83734, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144370, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144370, "pid": 0, "tid": 7, "ts": 6300865930127.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722880.583, "dur": 8.140, + "args": { + "External id": 83734, "cbid": 211, "correlation": 161144370 + } + }, + { + "ph": "s", "id": 161144370, "pid": 5714, "tid": 5714, "ts": 6300865722880.583, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865930192.573, "dur": 63.872, + "args": { + "External id": 83735, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144381, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144381, "pid": 0, "tid": 7, "ts": 6300865930192.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722904.953, "dur": 5.550, + "args": { + "External id": 83735, "cbid": 211, "correlation": 161144381 + } + }, + { + "ph": "s", "id": 161144381, "pid": 5714, "tid": 5714, "ts": 6300865722904.953, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865930257.085, "dur": 15.969, + "args": { + "External id": 83738, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144395, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144395, "pid": 0, "tid": 7, "ts": 6300865930257.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722926.433, "dur": 4.850, + "args": { + "External id": 83738, "cbid": 211, "correlation": 161144395 + } + }, + { + "ph": "s", "id": 161144395, "pid": 5714, "tid": 5714, "ts": 6300865722926.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865930273.662, "dur": 1.696, + "args": { + "External id": 83740, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144401, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144401, "pid": 0, "tid": 7, "ts": 6300865930273.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722943.583, "dur": 4.680, + "args": { + "External id": 83740, "cbid": 211, "correlation": 161144401 + } + }, + { + "ph": "s", "id": 161144401, "pid": 5714, "tid": 5714, "ts": 6300865722943.583, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865930276.062, "dur": 1.024, + "args": { + "External id": 83741, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144411, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144411, "pid": 0, "tid": 7, "ts": 6300865930276.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722962.123, "dur": 4.620, + "args": { + "External id": 83741, "cbid": 211, "correlation": 161144411 + } + }, + { + "ph": "s", "id": 161144411, "pid": 5714, "tid": 5714, "ts": 6300865722962.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865930277.758, "dur": 89.537, + "args": { + "External id": 83742, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144421, "pid": 0, "tid": 7, "ts": 6300865930277.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865722983.483, "dur": 4.910, + "args": { + "External id": 83742, "cbid": 211, "correlation": 161144421 + } + }, + { + "ph": "s", "id": 161144421, "pid": 5714, "tid": 5714, "ts": 6300865722983.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865930367.903, "dur": 48.192, + "args": { + "External id": 83747, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144434, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144434, "pid": 0, "tid": 7, "ts": 6300865930367.903, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723010.133, "dur": 5.230, + "args": { + "External id": 83747, "cbid": 211, "correlation": 161144434 + } + }, + { + "ph": "s", "id": 161144434, "pid": 5714, "tid": 5714, "ts": 6300865723010.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865930416.703, "dur": 22.721, + "args": { + "External id": 83748, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144445, "pid": 0, "tid": 7, "ts": 6300865930416.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723028.383, "dur": 4.830, + "args": { + "External id": 83748, "cbid": 211, "correlation": 161144445 + } + }, + { + "ph": "s", "id": 161144445, "pid": 5714, "tid": 5714, "ts": 6300865723028.383, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865930440.064, "dur": 123.969, + "args": { + "External id": 83756, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144468, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144468, "pid": 0, "tid": 7, "ts": 6300865930440.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723104.073, "dur": 7.850, + "args": { + "External id": 83756, "cbid": 211, "correlation": 161144468 + } + }, + { + "ph": "s", "id": 161144468, "pid": 5714, "tid": 5714, "ts": 6300865723104.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865930564.737, "dur": 122.434, + "args": { + "External id": 83765, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144491, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144491, "pid": 0, "tid": 7, "ts": 6300865930564.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723166.683, "dur": 7.630, + "args": { + "External id": 83765, "cbid": 211, "correlation": 161144491 + } + }, + { + "ph": "s", "id": 161144491, "pid": 5714, "tid": 5714, "ts": 6300865723166.683, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865930687.779, "dur": 122.433, + "args": { + "External id": 83774, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144514, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144514, "pid": 0, "tid": 7, "ts": 6300865930687.779, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723229.333, "dur": 7.200, + "args": { + "External id": 83774, "cbid": 211, "correlation": 161144514 + } + }, + { + "ph": "s", "id": 161144514, "pid": 5714, "tid": 5714, "ts": 6300865723229.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865930810.852, "dur": 52.992, + "args": { + "External id": 83782, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144533, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144533, "pid": 0, "tid": 7, "ts": 6300865930810.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723439.622, "dur": 10.100, + "args": { + "External id": 83782, "cbid": 307, "correlation": 161144533 + } + }, + { + "ph": "s", "id": 161144533, "pid": 5714, "tid": 5714, "ts": 6300865723439.622, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300865930864.548, "dur": 59.777, + "args": { + "External id": 83785, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144550, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144550, "pid": 0, "tid": 7, "ts": 6300865930864.548, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723585.692, "dur": 8.790, + "args": { + "External id": 83785, "cbid": 307, "correlation": 161144550 + } + }, + { + "ph": "s", "id": 161144550, "pid": 5714, "tid": 5714, "ts": 6300865723585.692, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865723693.341, "dur": 0.580, + "args": { + "External id": 83789, "cbid": 200, "correlation": 161144554 + } + }, + { + "ph": "f", "id": 161144554, "pid": 5714, "tid": 5714, "ts": 6300865723693.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865723694.072, "dur": 0.229, + "args": { + "External id": 83789, "cbid": 200, "correlation": 161144555 + } + }, + { + "ph": "f", 
"id": 161144555, "pid": 5714, "tid": 5714, "ts": 6300865723694.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865723723.801, "dur": 0.391, + "args": { + "External id": 83789, "cbid": 200, "correlation": 161144578 + } + }, + { + "ph": "f", "id": 161144578, "pid": 5714, "tid": 5714, "ts": 6300865723723.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865723732.221, "dur": 2.211, + "args": { + "External id": 83789, "cbid": 273, "correlation": 161144587 + } + }, + { + "ph": "f", "id": 161144587, "pid": 5714, "tid": 5714, "ts": 6300865723732.221, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6300865930925.029, "dur": 427.173, + "args": { + "External id": 83789, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144588, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144588, "pid": 0, "tid": 7, "ts": 6300865930925.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723735.061, "dur": 10.751, + "args": { + "External id": 83789, "cbid": 211, "correlation": 161144588 + } + }, + { + "ph": "s", "id": 161144588, "pid": 5714, "tid": 5714, "ts": 6300865723735.061, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6300865931352.938, "dur": 124.578, + "args": { + "External id": 83805, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144614, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144614, "pid": 0, "tid": 7, "ts": 6300865931352.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723874.131, "dur": 10.450, + "args": { + "External id": 83805, "cbid": 211, "correlation": 161144614 + } + }, + { + "ph": "s", "id": 161144614, "pid": 5714, "tid": 5714, "ts": 6300865723874.131, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865931478.124, "dur": 62.369, + "args": { + "External id": 83807, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144624, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144624, "pid": 0, "tid": 7, "ts": 6300865931478.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723926.471, "dur": 7.240, + "args": { + "External id": 83807, "cbid": 211, "correlation": 161144624 + } + }, + { + "ph": "s", "id": 161144624, "pid": 5714, "tid": 5714, "ts": 6300865723926.471, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865931541.197, "dur": 48.544, + "args": { + "External id": 83812, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144637, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144637, "pid": 0, "tid": 7, "ts": 6300865931541.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865723977.311, "dur": 7.770, + "args": { + "External id": 83812, "cbid": 211, "correlation": 161144637 + } + }, + { + "ph": "s", "id": 161144637, "pid": 5714, "tid": 5714, "ts": 6300865723977.311, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865931590.413, "dur": 69.281, + "args": { + "External id": 83813, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144648, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144648, "pid": 0, "tid": 7, "ts": 6300865931590.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724002.681, "dur": 5.580, + "args": { + "External id": 83813, "cbid": 211, "correlation": 161144648 + } + }, + { + "ph": "s", "id": 161144648, "pid": 5714, "tid": 5714, "ts": 6300865724002.681, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865931660.270, "dur": 14.560, + "args": { + "External id": 83816, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144662, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144662, "pid": 0, "tid": 7, "ts": 6300865931660.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724027.561, "dur": 5.440, + "args": { + "External id": 83816, "cbid": 211, "correlation": 161144662 + } + }, + { + "ph": "s", "id": 161144662, "pid": 5714, "tid": 5714, "ts": 6300865724027.561, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865931675.566, "dur": 1.696, + "args": { + "External id": 83818, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144668, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144668, "pid": 0, "tid": 7, "ts": 6300865931675.566, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724047.151, "dur": 4.880, + "args": { + "External id": 83818, "cbid": 211, "correlation": 161144668 + } + }, + { + "ph": "s", "id": 161144668, "pid": 5714, "tid": 5714, "ts": 6300865724047.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865931677.934, "dur": 1.056, + "args": { + "External id": 83819, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144678, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144678, "pid": 0, "tid": 7, "ts": 6300865931677.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724066.311, "dur": 4.750, + "args": { + "External id": 83819, "cbid": 211, "correlation": 161144678 + } + }, + { + "ph": "s", "id": 161144678, "pid": 5714, "tid": 5714, "ts": 6300865724066.311, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865931679.662, "dur": 91.297, + "args": { + "External id": 83820, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144688, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144688, "pid": 0, "tid": 7, "ts": 6300865931679.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724088.301, "dur": 4.910, + "args": { + "External id": 83820, "cbid": 211, "correlation": 161144688 + } + }, + { + "ph": "s", "id": 161144688, "pid": 5714, "tid": 5714, "ts": 6300865724088.301, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865931771.631, "dur": 49.633, + "args": { + "External id": 83825, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144701, "pid": 0, "tid": 7, "ts": 6300865931771.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724113.020, "dur": 5.360, + "args": { + "External id": 83825, "cbid": 211, "correlation": 161144701 + } + }, + { + "ph": "s", "id": 161144701, "pid": 5714, "tid": 5714, "ts": 6300865724113.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865931821.840, "dur": 22.688, + "args": { + "External id": 83826, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144712, "pid": 0, "tid": 7, "ts": 6300865931821.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724132.991, "dur": 4.880, + "args": { + "External id": 83826, "cbid": 211, "correlation": 161144712 + } + }, + { + "ph": "s", "id": 161144712, "pid": 5714, "tid": 5714, "ts": 6300865724132.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865931845.168, "dur": 324.836, + "args": { + "External id": 83834, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144735, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144735, "pid": 0, "tid": 7, "ts": 6300865931845.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724204.230, "dur": 8.060, + "args": { + "External id": 83834, "cbid": 211, "correlation": 161144735 + } + }, + { + "ph": "s", "id": 161144735, "pid": 5714, "tid": 5714, "ts": 6300865724204.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865932170.708, "dur": 323.908, + "args": { + "External id": 83843, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144758, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144758, "pid": 0, "tid": 7, "ts": 6300865932170.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724268.720, "dur": 7.420, + "args": { + "External id": 83843, "cbid": 211, "correlation": 161144758 + } + }, + { + "ph": "s", "id": 161144758, "pid": 5714, "tid": 5714, "ts": 6300865724268.720, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300865932495.320, "dur": 211.330, + "args": { + "External id": 83845, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144772, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144772, "pid": 0, "tid": 7, "ts": 6300865932495.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724365.850, "dur": 9.170, + "args": { + "External id": 83845, "cbid": 307, "correlation": 161144772 + } + }, + { + "ph": "s", "id": 161144772, "pid": 5714, "tid": 5714, "ts": 6300865724365.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865724424.120, "dur": 1.540, + "args": { + "External id": 83854, "cbid": 210, "correlation": 161144794 + } + }, + { + "ph": "f", "id": 161144794, "pid": 5714, "tid": 5714, "ts": 6300865724424.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865932707.258, "dur": 326.692, + "args": { + "External id": 83854, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144795, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161144795, "pid": 0, "tid": 7, "ts": 6300865932707.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724427.730, "dur": 7.010, + "args": { + "External id": 83854, "cbid": 211, "correlation": 161144795 + } + }, + { + "ph": "s", "id": 161144795, "pid": 5714, "tid": 5714, "ts": 6300865724427.730, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933034.686, "dur": 52.864, + "args": { + "External id": 83856, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144805, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144805, "pid": 0, "tid": 7, "ts": 6300865933034.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724475.110, "dur": 7.660, + "args": { + "External id": 83856, "cbid": 211, "correlation": 161144805 + } + }, + { + "ph": "s", "id": 161144805, "pid": 5714, "tid": 5714, "ts": 6300865724475.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy_contig, unsigned int, 3, 128, 1>(at::native::(anonymous namespace)::OpaqueType<2u>*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, unsigned int, 128, 1>, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 0, "tid": 7, + "ts": 6300865933088.190, "dur": 224.100, + "args": { + "External id": 83858, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144816, "registers per thread": 20, "shared memory": 0, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [256, 4, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144816, "pid": 0, "tid": 7, "ts": 6300865933088.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724525.040, "dur": 8.990, + "args": { + "External id": 83858, "cbid": 211, "correlation": 161144816 + } + }, + { + "ph": "s", "id": 161144816, "pid": 5714, "tid": 5714, "ts": 6300865724525.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6300865933312.994, "dur": 194.722, + "args": { + "External id": 83863, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144835, "registers per thread": 22, "shared memory": 32, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144835, "pid": 0, "tid": 7, "ts": 6300865933312.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724750.969, "dur": 11.480, + "args": { + "External id": 83863, "cbid": 307, "correlation": 161144835 + } + }, + { + "ph": "s", "id": 161144835, "pid": 5714, "tid": 5714, "ts": 6300865724750.969, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933508.420, "dur": 1.760, + "args": { + "External id": 83867, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144847, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161144847, "pid": 0, "tid": 7, "ts": 6300865933508.420, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724906.589, "dur": 11.320, + "args": { + "External id": 83867, "cbid": 211, "correlation": 161144847 + } + }, + { + "ph": "s", "id": 161144847, "pid": 5714, "tid": 5714, "ts": 6300865724906.589, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933510.852, "dur": 1.088, + "args": { + "External id": 83871, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144863, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161144863, "pid": 0, "tid": 7, "ts": 6300865933510.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724938.049, "dur": 5.370, + "args": { + "External id": 83871, "cbid": 211, "correlation": 161144863 + } + }, + { + "ph": "s", "id": 161144863, "pid": 5714, "tid": 5714, "ts": 6300865724938.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933512.548, "dur": 0.864, + "args": { + "External id": 83875, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144879, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161144879, "pid": 0, "tid": 7, "ts": 6300865933512.548, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865724959.229, "dur": 4.850, + "args": { + "External id": 83875, "cbid": 211, "correlation": 161144879 + } + }, + { + "ph": "s", "id": 161144879, "pid": 5714, "tid": 5714, "ts": 6300865724959.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865933514.020, "dur": 2.208, + "args": { + "External id": 83911, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144907, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 161144907, "pid": 0, "tid": 7, "ts": 6300865933514.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725115.518, "dur": 9.760, + "args": { + "External id": 83911, "cbid": 211, "correlation": 161144907 + } + }, + { + "ph": "s", "id": 161144907, "pid": 5714, "tid": 5714, "ts": 6300865725115.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865933516.868, "dur": 49.760, + "args": { + "External id": 83919, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144925, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144925, "pid": 0, "tid": 7, "ts": 6300865933516.868, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725250.318, "dur": 13.510, + "args": { + "External id": 83919, "cbid": 211, "correlation": 161144925 + } + }, + { + "ph": "s", "id": 161144925, "pid": 5714, "tid": 5714, "ts": 6300865725250.318, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933567.332, "dur": 18.240, + "args": { + "External id": 83924, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144942, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144942, "pid": 0, "tid": 7, "ts": 6300865933567.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725317.068, "dur": 8.490, + "args": { + "External id": 83924, "cbid": 211, "correlation": 161144942 + } + }, + { + "ph": "s", "id": 161144942, "pid": 5714, "tid": 5714, "ts": 6300865725317.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933586.212, "dur": 100.642, + "args": { + "External id": 83929, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161144958, "pid": 0, "tid": 7, "ts": 6300865933586.212, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725345.308, "dur": 5.260, + "args": { + "External id": 83929, "cbid": 211, "correlation": 161144958 + } + }, + { + "ph": "s", "id": 161144958, "pid": 5714, "tid": 5714, "ts": 6300865725345.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933687.558, "dur": 2.016, + "args": { + "External id": 83933, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144974, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144974, "pid": 0, "tid": 7, "ts": 6300865933687.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725370.488, "dur": 5.030, + "args": { + "External id": 83933, "cbid": 211, "correlation": 161144974 + } + }, + { + "ph": "s", "id": 161144974, "pid": 5714, "tid": 5714, "ts": 6300865725370.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865933690.278, "dur": 1.664, + "args": { + "External id": 83934, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161144986, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161144986, "pid": 0, "tid": 7, "ts": 6300865933690.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725401.258, "dur": 7.060, + "args": { + "External id": 83934, "cbid": 211, "correlation": 161144986 + } + }, + { + "ph": "s", "id": 161144986, "pid": 5714, "tid": 5714, "ts": 6300865725401.258, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865933692.550, "dur": 2.080, + "args": { + "External id": 83941, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145004, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161145004, "pid": 0, "tid": 7, "ts": 6300865933692.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725442.708, "dur": 7.349, + "args": { + "External id": 83941, "cbid": 211, "correlation": 161145004 + } + }, + { + "ph": "s", "id": 161145004, "pid": 5714, "tid": 5714, "ts": 6300865725442.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6300865933695.302, "dur": 3.840, + "args": { + "External id": 83936, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145013, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145013, "pid": 0, "tid": 7, "ts": 6300865933695.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865725459.308, "dur": 6.869, + "args": { + "External id": 83936, "cbid": 211, "correlation": 161145013 + } + }, + { + "ph": "s", "id": 161145013, "pid": 5714, "tid": 5714, "ts": 6300865725459.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865725482.357, "dur": 2.451, + "args": { + "External id": 83943, "cbid": 138, "correlation": 161145018 + } + }, + { + "ph": "f", "id": 161145018, "pid": 5714, "tid": 5714, "ts": 6300865725482.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300865933705.990, "dur": 0.992, + "args": { + "External id": 83943, "device": 0, "context": 1, "stream": 7, "correlation": 161145021, "bytes": 8, "memory bandwidth (GB/s)": 0.008064516129032258 + } + }, + { + "ph": "f", "id": 161145021, "pid": 0, "tid": 7, "ts": 6300865933705.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865725487.357, "dur": 11.300, + "args": { + "External id": 83943, "cbid": 41, "correlation": 161145021 + } + }, + { + "ph": "s", "id": 161145021, "pid": 5714, "tid": 5714, "ts": 6300865725487.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791271.428, "dur": 9.200, + "args": { + "cbid": 138, "correlation": 161145023 + } + }, + { + "ph": "f", "id": 161145023, "pid": 5714, "tid": 1822426688, "ts": 6300865791271.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791284.518, "dur": 2.050, + "args": { + "cbid": 138, "correlation": 161145025 + } + }, + { + "ph": "f", "id": 161145025, "pid": 5714, "tid": 1822426688, "ts": 6300865791284.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791288.928, "dur": 2.030, + "args": { + "cbid": 138, "correlation": 161145027 + } + }, + { + "ph": "f", "id": 161145027, "pid": 5714, "tid": 1822426688, "ts": 6300865791288.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791293.388, "dur": 1.290, + "args": { + "cbid": 138, "correlation": 161145029 + } + }, + { + "ph": "f", "id": 161145029, "pid": 5714, "tid": 1822426688, "ts": 6300865791293.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791306.738, "dur": 1.880, + "args": { + "cbid": 138, "correlation": 161145031 + } + }, + { + "ph": "f", "id": 161145031, "pid": 5714, "tid": 1822426688, "ts": 6300865791306.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791311.418, "dur": 1.920, + "args": { + "cbid": 138, "correlation": 161145033 + } + }, + { + "ph": "f", "id": 161145033, "pid": 5714, "tid": 1822426688, "ts": 6300865791311.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791315.318, "dur": 2.240, + "args": { + "cbid": 138, "correlation": 161145035 + } + }, + { + "ph": "f", "id": 161145035, "pid": 5714, "tid": 1822426688, "ts": 6300865791315.318, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791319.568, "dur": 1.530, + "args": { + "cbid": 138, "correlation": 161145037 + } + }, + { + "ph": "f", "id": 161145037, "pid": 5714, "tid": 1822426688, "ts": 6300865791319.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791322.938, "dur": 1.200, + "args": { + "cbid": 138, "correlation": 161145039 + } + }, + { + "ph": "f", "id": 161145039, "pid": 5714, "tid": 1822426688, "ts": 6300865791322.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791326.228, "dur": 1.180, + "args": { + "cbid": 138, "correlation": 161145041 + } + }, + { + "ph": "f", "id": 161145041, "pid": 5714, "tid": 1822426688, "ts": 6300865791326.228, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865791329.568, "dur": 1.410, + "args": { + "cbid": 138, "correlation": 161145043 + } + }, + { + "ph": "f", "id": 161145043, "pid": 5714, "tid": 1822426688, "ts": 6300865791329.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891427.270, "dur": 11.060, + "args": { + "cbid": 138, "correlation": 161145045 + } + }, + { + "ph": "f", "id": 161145045, "pid": 5714, "tid": 1822426688, "ts": 6300865891427.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891438.840, "dur": 1.170, + "args": { + "cbid": 138, "correlation": 161145046 + } + }, + { + "ph": "f", "id": 161145046, "pid": 5714, "tid": 1822426688, "ts": 
6300865891438.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891447.400, "dur": 0.770, + "args": { + "cbid": 138, "correlation": 161145047 + } + }, + { + "ph": "f", "id": 161145047, "pid": 5714, "tid": 1822426688, "ts": 6300865891447.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891455.270, "dur": 1.610, + "args": { + "cbid": 138, "correlation": 161145048 + } + }, + { + "ph": "f", "id": 161145048, "pid": 5714, "tid": 1822426688, "ts": 6300865891455.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891457.080, "dur": 0.580, + "args": { + "cbid": 138, "correlation": 161145049 + } + }, + { + "ph": "f", "id": 161145049, "pid": 5714, "tid": 1822426688, "ts": 6300865891457.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891458.650, "dur": 0.620, + "args": { + "cbid": 138, "correlation": 161145050 + } + }, + { + "ph": "f", "id": 161145050, "pid": 5714, "tid": 1822426688, "ts": 6300865891458.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891461.600, "dur": 1.780, + "args": { + "cbid": 138, "correlation": 161145051 + } + }, + { + "ph": "f", "id": 161145051, "pid": 5714, "tid": 1822426688, "ts": 6300865891461.600, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891463.570, "dur": 0.590, + "args": { + "cbid": 138, "correlation": 161145052 + } + }, + { + "ph": "f", "id": 161145052, 
"pid": 5714, "tid": 1822426688, "ts": 6300865891463.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891465.300, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 161145053 + } + }, + { + "ph": "f", "id": 161145053, "pid": 5714, "tid": 1822426688, "ts": 6300865891465.300, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891468.700, "dur": 1.480, + "args": { + "cbid": 138, "correlation": 161145054 + } + }, + { + "ph": "f", "id": 161145054, "pid": 5714, "tid": 1822426688, "ts": 6300865891468.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891470.460, "dur": 0.570, + "args": { + "cbid": 138, "correlation": 161145055 + } + }, + { + "ph": "f", "id": 161145055, "pid": 5714, "tid": 1822426688, "ts": 6300865891470.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891472.120, "dur": 0.590, + "args": { + "cbid": 138, "correlation": 161145056 + } + }, + { + "ph": "f", "id": 161145056, "pid": 5714, "tid": 1822426688, "ts": 6300865891472.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891474.990, "dur": 1.790, + "args": { + "cbid": 138, "correlation": 161145057 + } + }, + { + "ph": "f", "id": 161145057, "pid": 5714, "tid": 1822426688, "ts": 6300865891474.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891477.060, "dur": 0.560, + "args": { + "cbid": 138, "correlation": 161145058 + } + }, 
+ { + "ph": "f", "id": 161145058, "pid": 5714, "tid": 1822426688, "ts": 6300865891477.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891478.680, "dur": 0.590, + "args": { + "cbid": 138, "correlation": 161145059 + } + }, + { + "ph": "f", "id": 161145059, "pid": 5714, "tid": 1822426688, "ts": 6300865891478.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891481.490, "dur": 2.140, + "args": { + "cbid": 138, "correlation": 161145060 + } + }, + { + "ph": "f", "id": 161145060, "pid": 5714, "tid": 1822426688, "ts": 6300865891481.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891486.120, "dur": 1.750, + "args": { + "cbid": 138, "correlation": 161145062 + } + }, + { + "ph": "f", "id": 161145062, "pid": 5714, "tid": 1822426688, "ts": 6300865891486.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891489.750, "dur": 1.670, + "args": { + "cbid": 138, "correlation": 161145064 + } + }, + { + "ph": "f", "id": 161145064, "pid": 5714, "tid": 1822426688, "ts": 6300865891489.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891492.860, "dur": 1.100, + "args": { + "cbid": 138, "correlation": 161145066 + } + }, + { + "ph": "f", "id": 161145066, "pid": 5714, "tid": 1822426688, "ts": 6300865891492.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891495.720, "dur": 0.980, + "args": { + "cbid": 138, 
"correlation": 161145068 + } + }, + { + "ph": "f", "id": 161145068, "pid": 5714, "tid": 1822426688, "ts": 6300865891495.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865891498.320, "dur": 1.230, + "args": { + "cbid": 138, "correlation": 161145070 + } + }, + { + "ph": "f", "id": 161145070, "pid": 5714, "tid": 1822426688, "ts": 6300865891498.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300865725499.108, "dur": 208214.366, + "args": { + "External id": 83943, "cbid": 131, "correlation": 161145022 + } + }, + { + "ph": "s", "id": 161145022, "pid": 5714, "tid": 5714, "ts": 6300865725499.108, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865933809.704, "dur": 2.270, + "args": { + "External id": 83951, "cbid": 210, "correlation": 161145096 + } + }, + { + "ph": "f", "id": 161145096, "pid": 5714, "tid": 5714, "ts": 6300865933809.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865933831.751, "dur": 640.776, + "args": { + "External id": 83951, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145097, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145097, "pid": 0, "tid": 7, "ts": 6300865933831.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865933816.104, "dur": 16.090, + "args": { + "External id": 83951, "cbid": 211, "correlation": 161145097 + } + }, + { + "ph": "s", "id": 161145097, "pid": 5714, "tid": 5714, "ts": 6300865933816.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865934473.231, "dur": 170.882, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145116, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161145116, "pid": 0, "tid": 7, "ts": 6300865934473.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865933980.004, "dur": 11.480, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145116 + } + }, + { + "ph": "s", "id": 161145116, "pid": 5714, "tid": 5714, "ts": 6300865933980.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865934644.785, "dur": 4.032, + "args": { + "External id": 83961, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145133, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145133, "pid": 0, "tid": 7, "ts": 6300865934644.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934041.554, "dur": 10.669, + "args": { + "External id": 83961, "cbid": 211, "correlation": 161145133 + } + }, + { + "ph": "s", "id": 161145133, "pid": 5714, "tid": 5714, "ts": 6300865934041.554, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934649.553, "dur": 1.216, + "args": { + "External id": 83966, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145150, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145150, "pid": 0, "tid": 7, "ts": 6300865934649.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934086.523, "dur": 7.840, + "args": { + "External id": 83966, "cbid": 211, "correlation": 161145150 + } + }, + { + "ph": "s", "id": 161145150, "pid": 5714, "tid": 5714, "ts": 6300865934086.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934651.409, "dur": 0.992, + "args": { + "External id": 83968, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145160, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145160, "pid": 0, "tid": 7, "ts": 6300865934651.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934114.893, "dur": 7.520, + "args": { + "External id": 83968, "cbid": 211, "correlation": 161145160 + } + }, + { + "ph": "s", "id": 161145160, "pid": 5714, "tid": 5714, "ts": 6300865934114.893, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934653.105, "dur": 1.024, + "args": { + "External id": 83969, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145166, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145166, "pid": 0, "tid": 7, "ts": 6300865934653.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934147.883, "dur": 8.370, + "args": { + "External id": 83969, "cbid": 211, "correlation": 161145166 + } + }, + { + "ph": "s", "id": 161145166, "pid": 5714, "tid": 5714, "ts": 6300865934147.883, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934654.801, "dur": 1.024, + "args": { + "External id": 83970, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145176, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145176, "pid": 0, "tid": 7, "ts": 6300865934654.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934172.053, "dur": 6.240, + "args": { + "External id": 83970, "cbid": 211, "correlation": 161145176 + } + }, + { + "ph": "s", "id": 161145176, "pid": 5714, "tid": 5714, "ts": 6300865934172.053, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934656.529, "dur": 0.992, + "args": { + "External id": 83971, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145182, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145182, "pid": 0, "tid": 7, "ts": 6300865934656.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934188.153, "dur": 5.330, + "args": { + "External id": 83971, "cbid": 211, "correlation": 161145182 + } + }, + { + "ph": "s", "id": 161145182, "pid": 5714, "tid": 5714, "ts": 6300865934188.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865934658.257, "dur": 3.296, + "args": { + "External id": 83972, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145195, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145195, "pid": 0, "tid": 7, "ts": 6300865934658.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934217.303, "dur": 7.680, + "args": { + "External id": 83972, "cbid": 211, "correlation": 161145195 + } + }, + { + "ph": "s", "id": 161145195, "pid": 5714, "tid": 5714, "ts": 6300865934217.303, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934662.225, "dur": 1.056, + "args": { + "External id": 83975, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145201, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145201, "pid": 0, "tid": 7, "ts": 6300865934662.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934236.593, "dur": 6.720, + "args": { + "External id": 83975, "cbid": 211, "correlation": 161145201 + } + }, + { + "ph": "s", "id": 161145201, "pid": 5714, "tid": 5714, "ts": 6300865934236.593, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865934663.921, "dur": 1.024, + "args": { + "External id": 83976, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145207, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145207, "pid": 0, "tid": 7, "ts": 6300865934663.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934248.753, "dur": 3.880, + "args": { + "External id": 83976, "cbid": 211, "correlation": 161145207 + } + }, + { + "ph": "s", "id": 161145207, "pid": 5714, "tid": 5714, "ts": 6300865934248.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865934665.649, "dur": 233.507, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145221, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161145221, "pid": 0, "tid": 7, "ts": 6300865934665.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934360.713, "dur": 10.750, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145221 + } + }, + { + "ph": "s", "id": 161145221, "pid": 5714, "tid": 5714, "ts": 6300865934360.713, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865934418.093, "dur": 1.070, + "args": { + "External id": 83980, "cbid": 200, "correlation": 161145244 + } + }, + { + "ph": "f", "id": 161145244, "pid": 5714, "tid": 5714, "ts": 6300865934418.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865934900.052, "dur": 0.832, + "args": { + "External id": 83980, "device": 0, "context": 1, "stream": 7, "correlation": 161145247, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161145247, "pid": 0, "tid": 7, "ts": 6300865934900.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865934422.093, "dur": 13.550, + "args": { + "External id": 83980, "cbid": 51, "correlation": 161145247 + } + }, + { + "ph": "s", "id": 161145247, "pid": 5714, "tid": 5714, "ts": 6300865934422.093, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865934902.036, "dur": 689.512, + "args": { + "External id": 83980, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145248, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145248, "pid": 0, "tid": 7, "ts": 6300865934902.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934435.913, "dur": 7.360, + "args": { + "External id": 83980, "cbid": 307, "correlation": 161145248 + } + }, + { + "ph": "s", "id": 161145248, "pid": 5714, "tid": 5714, "ts": 6300865934435.913, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865935592.220, "dur": 2.976, + "args": { + "External id": 83983, "device": 0, "context": 1, "stream": 7, "correlation": 161145253, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161145253, "pid": 0, "tid": 7, "ts": 6300865935592.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865934470.062, "dur": 18.260, + "args": { + "External id": 83983, "cbid": 41, "correlation": 161145253 + } + }, + { + "ph": "s", "id": 161145253, "pid": 5714, "tid": 5714, "ts": 6300865934470.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865934534.522, "dur": 0.560, + "args": { + "External id": 83988, "cbid": 200, "correlation": 161145281 + } + }, + { + "ph": "f", "id": 161145281, "pid": 5714, "tid": 5714, "ts": 6300865934534.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865935595.900, "dur": 692.808, + "args": { + "External id": 83988, "queued": 0, 
"device": 0, "context": 1, "stream": 7, "correlation": 161145284, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145284, "pid": 0, "tid": 7, "ts": 6300865935595.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934536.933, "dur": 7.929, + "args": { + "External id": 83988, "cbid": 307, "correlation": 161145284 + } + }, + { + "ph": "s", "id": 161145284, "pid": 5714, "tid": 5714, "ts": 6300865934536.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865936289.380, "dur": 221.603, + "args": { + "External id": 83989, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145289, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161145289, "pid": 0, "tid": 7, "ts": 6300865936289.380, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934563.872, "dur": 7.890, + "args": { + "External id": 83989, "cbid": 211, "correlation": 161145289 + } + }, + { + "ph": "s", "id": 161145289, "pid": 5714, "tid": 5714, "ts": 6300865934563.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865934619.002, "dur": 1.270, + "args": { + "External id": 83997, "cbid": 210, "correlation": 161145315 + } + }, + { + "ph": "f", "id": 161145315, "pid": 5714, "tid": 5714, "ts": 6300865934619.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865936511.719, "dur": 636.104, + "args": { + "External id": 83997, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145316, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145316, "pid": 0, "tid": 7, "ts": 6300865936511.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934623.922, "dur": 7.530, + "args": { + "External id": 83997, "cbid": 211, "correlation": 161145316 + } + }, + { + "ph": "s", "id": 161145316, "pid": 5714, "tid": 5714, "ts": 6300865934623.922, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865937148.559, "dur": 170.753, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145335, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161145335, "pid": 0, "tid": 7, "ts": 6300865937148.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934741.682, "dur": 9.240, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145335 + } + }, + { + "ph": "s", "id": 161145335, "pid": 5714, "tid": 5714, "ts": 6300865934741.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865937320.016, "dur": 4.032, + "args": { + "External id": 84007, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145352, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145352, "pid": 0, "tid": 7, "ts": 6300865937320.016, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934785.342, "dur": 7.510, + "args": { + "External id": 84007, "cbid": 211, "correlation": 161145352 + } + }, + { + "ph": "s", "id": 161145352, "pid": 5714, "tid": 5714, "ts": 6300865934785.342, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937324.688, "dur": 1.216, + "args": { + "External id": 84012, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145369, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145369, "pid": 0, "tid": 7, "ts": 6300865937324.688, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934817.162, "dur": 5.570, + "args": { + "External id": 84012, "cbid": 211, "correlation": 161145369 + } + }, + { + "ph": "s", "id": 161145369, "pid": 5714, "tid": 5714, "ts": 6300865934817.162, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937326.512, "dur": 1.024, + "args": { + "External id": 84014, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145379, "pid": 0, "tid": 7, "ts": 6300865937326.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934838.822, "dur": 5.250, + "args": { + "External id": 84014, "cbid": 211, "correlation": 161145379 + } + }, + { + "ph": "s", "id": 161145379, "pid": 5714, "tid": 5714, "ts": 6300865934838.822, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937328.240, "dur": 1.024, + "args": { + "External id": 84015, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145385, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145385, "pid": 0, "tid": 7, "ts": 6300865937328.240, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934851.502, "dur": 4.480, + "args": { + "External id": 84015, "cbid": 211, "correlation": 161145385 + } + }, + { + "ph": "s", "id": 161145385, "pid": 5714, "tid": 5714, "ts": 6300865934851.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937329.904, "dur": 1.024, + "args": { + "External id": 84016, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145395, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145395, "pid": 0, "tid": 7, "ts": 6300865937329.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934866.292, "dur": 4.300, + "args": { + "External id": 84016, "cbid": 211, "correlation": 161145395 + } + }, + { + "ph": "s", "id": 161145395, "pid": 5714, "tid": 5714, "ts": 6300865934866.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937331.632, "dur": 1.024, + "args": { + "External id": 84017, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145401, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145401, "pid": 0, "tid": 7, "ts": 6300865937331.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934876.432, "dur": 4.010, + "args": { + "External id": 84017, "cbid": 211, "correlation": 161145401 + } + }, + { + "ph": "s", "id": 161145401, "pid": 5714, "tid": 5714, "ts": 6300865934876.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865937333.392, "dur": 3.296, + "args": { + "External id": 84018, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145414, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145414, "pid": 0, "tid": 7, "ts": 6300865937333.392, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934896.921, "dur": 5.051, + "args": { + "External id": 84018, "cbid": 211, "correlation": 161145414 + } + }, + { + "ph": "s", "id": 161145414, "pid": 5714, "tid": 5714, "ts": 6300865934896.921, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937337.329, "dur": 1.088, + "args": { + "External id": 84021, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145420, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145420, "pid": 0, "tid": 7, "ts": 6300865937337.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934908.172, "dur": 4.440, + "args": { + "External id": 84021, "cbid": 211, "correlation": 161145420 + } + }, + { + "ph": "s", "id": 161145420, "pid": 5714, "tid": 5714, "ts": 6300865934908.172, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865937339.121, "dur": 1.024, + "args": { + "External id": 84022, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145426, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145426, "pid": 0, "tid": 7, "ts": 6300865937339.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865934917.601, "dur": 4.000, + "args": { + "External id": 84022, "cbid": 211, "correlation": 161145426 + } + }, + { + "ph": "s", "id": 161145426, "pid": 5714, "tid": 5714, "ts": 6300865934917.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865937340.849, "dur": 233.378, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145440, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161145440, "pid": 0, "tid": 7, "ts": 6300865937340.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935009.011, "dur": 8.180, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145440 + } + }, + { + "ph": "s", "id": 161145440, "pid": 5714, "tid": 5714, "ts": 6300865935009.011, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865935052.341, "dur": 0.600, + "args": { + "External id": 84026, "cbid": 200, "correlation": 161145463 + } + }, + { + "ph": "f", "id": 161145463, "pid": 5714, "tid": 5714, "ts": 6300865935052.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865937575.251, "dur": 0.800, + "args": { + "External id": 84026, "device": 0, "context": 1, "stream": 7, "correlation": 161145466, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161145466, "pid": 0, "tid": 7, "ts": 6300865937575.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865935054.801, "dur": 6.930, + "args": { + "External id": 84026, "cbid": 51, "correlation": 161145466 + } + }, + { + "ph": "s", "id": 161145466, "pid": 5714, "tid": 5714, "ts": 6300865935054.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865937577.267, "dur": 689.929, + "args": { + "External id": 84026, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145467, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145467, "pid": 0, "tid": 7, "ts": 6300865937577.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935061.991, "dur": 6.340, + "args": { + "External id": 84026, "cbid": 307, "correlation": 161145467 + } + }, + { + "ph": "s", "id": 161145467, "pid": 5714, "tid": 5714, "ts": 6300865935061.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865938267.804, "dur": 3.040, + "args": { + "External id": 84029, "device": 0, "context": 1, "stream": 7, "correlation": 161145472, "bytes": 3145728, "memory bandwidth (GB/s)": 1034.778947368421 + } + }, + { + "ph": "f", "id": 161145472, "pid": 0, "tid": 7, "ts": 6300865938267.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865935093.241, "dur": 12.480, + "args": { + "External id": 84029, "cbid": 41, "correlation": 161145472 + } + }, + { + "ph": "s", "id": 161145472, "pid": 5714, "tid": 5714, "ts": 6300865935093.241, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865935147.731, "dur": 0.470, + "args": { + "External id": 84034, "cbid": 200, "correlation": 161145500 + } + }, + { + "ph": "f", "id": 161145500, "pid": 5714, "tid": 5714, "ts": 6300865935147.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865938271.484, "dur": 687.112, + "args": { + "External id": 84034, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161145503, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145503, "pid": 0, "tid": 7, "ts": 6300865938271.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935149.741, "dur": 7.630, + "args": { + "External id": 84034, "cbid": 307, "correlation": 161145503 + } + }, + { + "ph": "s", "id": 161145503, "pid": 5714, "tid": 5714, "ts": 6300865935149.741, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865938959.268, "dur": 221.218, + "args": { + "External id": 84035, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145508, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161145508, "pid": 0, "tid": 7, "ts": 6300865938959.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935171.741, "dur": 6.150, + "args": { + "External id": 84035, "cbid": 211, "correlation": 161145508 + } + }, + { + "ph": "s", "id": 161145508, "pid": 5714, "tid": 5714, "ts": 6300865935171.741, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865935222.551, "dur": 1.320, + "args": { + "External id": 84043, "cbid": 210, "correlation": 161145534 + } + }, + { + "ph": "f", "id": 161145534, "pid": 5714, "tid": 5714, "ts": 6300865935222.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865939181.126, "dur": 635.560, + "args": { + "External id": 84043, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145535, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145535, "pid": 0, "tid": 7, "ts": 6300865939181.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935227.531, "dur": 7.700, + "args": { + "External id": 84043, "cbid": 211, "correlation": 161145535 + } + }, + { + "ph": "s", "id": 161145535, "pid": 5714, "tid": 5714, "ts": 6300865935227.531, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865939817.422, "dur": 171.042, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145554, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161145554, "pid": 0, "tid": 7, "ts": 6300865939817.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935351.600, "dur": 9.360, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145554 + } + }, + { + "ph": "s", "id": 161145554, "pid": 5714, "tid": 5714, "ts": 6300865935351.600, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865939989.136, "dur": 4.064, + "args": { + "External id": 84053, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145571, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145571, "pid": 0, "tid": 7, "ts": 6300865939989.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935394.911, "dur": 7.249, + "args": { + "External id": 84053, "cbid": 211, "correlation": 161145571 + } + }, + { + "ph": "s", "id": 161145571, "pid": 5714, "tid": 5714, "ts": 6300865935394.911, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865939993.776, "dur": 1.216, + "args": { + "External id": 84058, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145588, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145588, "pid": 0, "tid": 7, "ts": 6300865939993.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935427.760, "dur": 5.410, + "args": { + "External id": 84058, "cbid": 211, "correlation": 161145588 + } + }, + { + "ph": "s", "id": 161145588, "pid": 5714, "tid": 5714, "ts": 6300865935427.760, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865939995.632, "dur": 1.024, + "args": { + "External id": 84060, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145598, "pid": 0, "tid": 7, "ts": 6300865939995.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935446.840, "dur": 5.000, + "args": { + "External id": 84060, "cbid": 211, "correlation": 161145598 + } + }, + { + "ph": "s", "id": 161145598, "pid": 5714, "tid": 5714, "ts": 6300865935446.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865939997.328, "dur": 1.056, + "args": { + "External id": 84061, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145604, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145604, "pid": 0, "tid": 7, "ts": 6300865939997.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935458.910, "dur": 4.530, + "args": { + "External id": 84061, "cbid": 211, "correlation": 161145604 + } + }, + { + "ph": "s", "id": 161145604, "pid": 5714, "tid": 5714, "ts": 6300865935458.910, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865939999.024, "dur": 1.056, + "args": { + "External id": 84062, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145614, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145614, "pid": 0, "tid": 7, "ts": 6300865939999.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935474.270, "dur": 4.400, + "args": { + "External id": 84062, "cbid": 211, "correlation": 161145614 + } + }, + { + "ph": "s", "id": 161145614, "pid": 5714, "tid": 5714, "ts": 6300865935474.270, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865940000.752, "dur": 1.024, + "args": { + "External id": 84063, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145620, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145620, "pid": 0, "tid": 7, "ts": 6300865940000.752, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935484.380, "dur": 4.270, + "args": { + "External id": 84063, "cbid": 211, "correlation": 161145620 + } + }, + { + "ph": "s", "id": 161145620, "pid": 5714, "tid": 5714, "ts": 6300865935484.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865940002.512, "dur": 3.296, + "args": { + "External id": 84064, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145633, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145633, "pid": 0, "tid": 7, "ts": 6300865940002.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935504.250, "dur": 5.170, + "args": { + "External id": 84064, "cbid": 211, "correlation": 161145633 + } + }, + { + "ph": "s", "id": 161145633, "pid": 5714, "tid": 5714, "ts": 6300865935504.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865940006.448, "dur": 1.088, + "args": { + "External id": 84067, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145639, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145639, "pid": 0, "tid": 7, "ts": 6300865940006.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935515.520, "dur": 4.230, + "args": { + "External id": 84067, "cbid": 211, "correlation": 161145639 + } + }, + { + "ph": "s", "id": 161145639, "pid": 5714, "tid": 5714, "ts": 6300865935515.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865940008.176, "dur": 0.992, + "args": { + "External id": 84068, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145645, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145645, "pid": 0, "tid": 7, "ts": 6300865940008.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935524.890, "dur": 3.940, + "args": { + "External id": 84068, "cbid": 211, "correlation": 161145645 + } + }, + { + "ph": "s", "id": 161145645, "pid": 5714, "tid": 5714, "ts": 6300865935524.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865940009.872, "dur": 233.571, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145659, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161145659, "pid": 0, "tid": 7, "ts": 6300865940009.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935615.910, "dur": 8.260, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145659 + } + }, + { + "ph": "s", "id": 161145659, "pid": 5714, "tid": 5714, "ts": 6300865935615.910, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865935657.860, "dur": 0.570, + "args": { + "External id": 84072, "cbid": 200, "correlation": 161145682 + } + }, + { + "ph": "f", "id": 161145682, "pid": 5714, "tid": 5714, "ts": 6300865935657.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865940244.371, "dur": 0.864, + "args": { + "External id": 84072, "device": 0, "context": 1, "stream": 7, "correlation": 161145685, "bytes": 1536, "memory bandwidth (GB/s)": 1.7777777777777777 + } + }, + { + "ph": "f", "id": 161145685, "pid": 0, "tid": 7, "ts": 6300865940244.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865935660.280, "dur": 6.970, + "args": { + "External id": 84072, "cbid": 51, "correlation": 161145685 + } + }, + { + "ph": "s", "id": 161145685, "pid": 5714, "tid": 5714, "ts": 6300865935660.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865940246.387, "dur": 686.120, + "args": { + "External id": 84072, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145686, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145686, "pid": 0, "tid": 7, "ts": 6300865940246.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935667.510, "dur": 5.860, + "args": { + "External id": 84072, "cbid": 307, "correlation": 161145686 + } + }, + { + "ph": "s", "id": 161145686, "pid": 5714, "tid": 5714, "ts": 6300865935667.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865940933.147, "dur": 2.944, + "args": { + "External id": 84075, "device": 0, "context": 1, "stream": 7, "correlation": 161145691, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161145691, "pid": 0, "tid": 7, "ts": 6300865940933.147, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865935696.870, "dur": 12.100, + "args": { + "External id": 84075, "cbid": 41, "correlation": 161145691 + } + }, + { + "ph": "s", "id": 161145691, "pid": 5714, "tid": 5714, "ts": 6300865935696.870, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865935751.090, "dur": 0.500, + "args": { + "External id": 84080, "cbid": 200, "correlation": 161145719 + } + }, + { + "ph": "f", "id": 161145719, "pid": 5714, "tid": 5714, "ts": 6300865935751.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865940936.827, "dur": 687.752, + "args": { + "External id": 84080, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161145722, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145722, "pid": 0, "tid": 7, "ts": 6300865940936.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935753.130, "dur": 6.860, + "args": { + "External id": 84080, "cbid": 307, "correlation": 161145722 + } + }, + { + "ph": "s", "id": 161145722, "pid": 5714, "tid": 5714, "ts": 6300865935753.130, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865941625.219, "dur": 221.187, + "args": { + "External id": 84081, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145727, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161145727, "pid": 0, "tid": 7, "ts": 6300865941625.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935774.379, "dur": 6.071, + "args": { + "External id": 84081, "cbid": 211, "correlation": 161145727 + } + }, + { + "ph": "s", "id": 161145727, "pid": 5714, "tid": 5714, "ts": 6300865935774.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865935826.499, "dur": 1.160, + "args": { + "External id": 84089, "cbid": 210, "correlation": 161145753 + } + }, + { + "ph": "f", "id": 161145753, "pid": 5714, "tid": 5714, "ts": 6300865935826.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865941847.014, "dur": 637.127, + "args": { + "External id": 84089, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145754, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145754, "pid": 0, "tid": 7, "ts": 6300865941847.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935831.270, "dur": 7.440, + "args": { + "External id": 84089, "cbid": 211, "correlation": 161145754 + } + }, + { + "ph": "s", "id": 161145754, "pid": 5714, "tid": 5714, "ts": 6300865935831.270, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865942484.813, "dur": 170.338, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145773, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161145773, "pid": 0, "tid": 7, "ts": 6300865942484.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935943.959, "dur": 8.730, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145773 + } + }, + { + "ph": "s", "id": 161145773, "pid": 5714, "tid": 5714, "ts": 6300865935943.959, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865942655.791, "dur": 4.064, + "args": { + "External id": 84099, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145790, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145790, "pid": 0, "tid": 7, "ts": 6300865942655.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865935986.439, "dur": 7.080, + "args": { + "External id": 84099, "cbid": 211, "correlation": 161145790 + } + }, + { + "ph": "s", "id": 161145790, "pid": 5714, "tid": 5714, "ts": 6300865935986.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942660.463, "dur": 1.184, + "args": { + "External id": 84104, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145807, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145807, "pid": 0, "tid": 7, "ts": 6300865942660.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936021.459, "dur": 5.380, + "args": { + "External id": 84104, "cbid": 211, "correlation": 161145807 + } + }, + { + "ph": "s", "id": 161145807, "pid": 5714, "tid": 5714, "ts": 6300865936021.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942662.287, "dur": 1.024, + "args": { + "External id": 84106, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145817, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145817, "pid": 0, "tid": 7, "ts": 6300865942662.287, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936041.759, "dur": 4.800, + "args": { + "External id": 84106, "cbid": 211, "correlation": 161145817 + } + }, + { + "ph": "s", "id": 161145817, "pid": 5714, "tid": 5714, "ts": 6300865936041.759, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942664.015, "dur": 1.024, + "args": { + "External id": 84107, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145823, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145823, "pid": 0, "tid": 7, "ts": 6300865942664.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936053.469, "dur": 4.420, + "args": { + "External id": 84107, "cbid": 211, "correlation": 161145823 + } + }, + { + "ph": "s", "id": 161145823, "pid": 5714, "tid": 5714, "ts": 6300865936053.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942665.679, "dur": 1.056, + "args": { + "External id": 84108, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145833, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145833, "pid": 0, "tid": 7, "ts": 6300865942665.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936068.549, "dur": 4.490, + "args": { + "External id": 84108, "cbid": 211, "correlation": 161145833 + } + }, + { + "ph": "s", "id": 161145833, "pid": 5714, "tid": 5714, "ts": 6300865936068.549, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942667.407, "dur": 1.024, + "args": { + "External id": 84109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145839, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145839, "pid": 0, "tid": 7, "ts": 6300865942667.407, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936078.609, "dur": 4.320, + "args": { + "External id": 84109, "cbid": 211, "correlation": 161145839 + } + }, + { + "ph": "s", "id": 161145839, "pid": 5714, "tid": 5714, "ts": 6300865936078.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865942669.167, "dur": 3.296, + "args": { + "External id": 84110, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145852, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145852, "pid": 0, "tid": 7, "ts": 6300865942669.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936100.979, "dur": 4.980, + "args": { + "External id": 84110, "cbid": 211, "correlation": 161145852 + } + }, + { + "ph": "s", "id": 161145852, "pid": 5714, "tid": 5714, "ts": 6300865936100.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942673.103, "dur": 1.088, + "args": { + "External id": 84113, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145858, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145858, "pid": 0, "tid": 7, "ts": 6300865942673.103, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936111.999, "dur": 4.120, + "args": { + "External id": 84113, "cbid": 211, "correlation": 161145858 + } + }, + { + "ph": "s", "id": 161145858, "pid": 5714, "tid": 5714, "ts": 6300865936111.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865942674.831, "dur": 0.992, + "args": { + "External id": 84114, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145864, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161145864, "pid": 0, "tid": 7, "ts": 6300865942674.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936121.029, "dur": 3.760, + "args": { + "External id": 84114, "cbid": 211, "correlation": 161145864 + } + }, + { + "ph": "s", "id": 161145864, "pid": 5714, "tid": 5714, "ts": 6300865936121.029, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865942676.527, "dur": 233.955, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145878, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161145878, "pid": 0, "tid": 7, "ts": 6300865942676.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936210.309, "dur": 7.909, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145878 + } + }, + { + "ph": "s", "id": 161145878, "pid": 5714, "tid": 5714, "ts": 6300865936210.309, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865936251.789, "dur": 0.569, + "args": { + "External id": 84118, "cbid": 200, "correlation": 161145901 + } + }, + { + "ph": "f", "id": 161145901, "pid": 5714, "tid": 5714, "ts": 6300865936251.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865942911.346, "dur": 0.800, + "args": { + "External id": 84118, "device": 0, "context": 1, "stream": 7, "correlation": 161145904, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161145904, "pid": 0, "tid": 7, "ts": 6300865942911.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865936254.198, "dur": 6.711, + "args": { + "External id": 84118, "cbid": 51, "correlation": 161145904 + } + }, + { + "ph": "s", "id": 161145904, "pid": 5714, "tid": 5714, "ts": 6300865936254.198, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865942912.914, "dur": 689.160, + "args": { + "External id": 84118, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145905, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145905, "pid": 0, "tid": 7, "ts": 6300865942912.914, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936261.189, "dur": 5.829, + "args": { + "External id": 84118, "cbid": 307, "correlation": 161145905 + } + }, + { + "ph": "s", "id": 161145905, "pid": 5714, "tid": 5714, "ts": 6300865936261.189, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865943602.746, "dur": 2.976, + "args": { + "External id": 84121, "device": 0, "context": 1, "stream": 7, "correlation": 161145910, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161145910, "pid": 0, "tid": 7, "ts": 6300865943602.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865936291.729, "dur": 22.649, + "args": { + "External id": 84121, "cbid": 41, "correlation": 161145910 + } + }, + { + "ph": "s", "id": 161145910, "pid": 5714, "tid": 5714, "ts": 6300865936291.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865936358.908, "dur": 0.490, + "args": { + "External id": 84126, "cbid": 200, "correlation": 161145938 + } + }, + { + "ph": "f", "id": 161145938, "pid": 5714, "tid": 5714, "ts": 6300865936358.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865943606.298, "dur": 688.712, + "args": { + "External id": 84126, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161145941, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145941, "pid": 0, "tid": 7, "ts": 6300865943606.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936360.928, "dur": 7.180, + "args": { + "External id": 84126, "cbid": 307, "correlation": 161145941 + } + }, + { + "ph": "s", "id": 161145941, "pid": 5714, "tid": 5714, "ts": 6300865936360.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865944295.682, "dur": 221.091, + "args": { + "External id": 84127, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145946, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161145946, "pid": 0, "tid": 7, "ts": 6300865944295.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936384.038, "dur": 6.060, + "args": { + "External id": 84127, "cbid": 211, "correlation": 161145946 + } + }, + { + "ph": "s", "id": 161145946, "pid": 5714, "tid": 5714, "ts": 6300865936384.038, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865936434.858, "dur": 1.090, + "args": { + "External id": 84135, "cbid": 210, "correlation": 161145972 + } + }, + { + "ph": "f", "id": 161145972, "pid": 5714, "tid": 5714, "ts": 6300865936434.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865944517.381, "dur": 632.615, + "args": { + "External id": 84135, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145973, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161145973, "pid": 0, "tid": 7, "ts": 6300865944517.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936439.508, "dur": 7.720, + "args": { + "External id": 84135, "cbid": 211, "correlation": 161145973 + } + }, + { + "ph": "s", "id": 161145973, "pid": 5714, "tid": 5714, "ts": 6300865936439.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865945150.700, "dur": 171.298, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161145992, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161145992, "pid": 0, "tid": 7, "ts": 6300865945150.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936552.948, "dur": 8.870, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161145992 + } + }, + { + "ph": "s", "id": 161145992, "pid": 5714, "tid": 5714, "ts": 6300865936552.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865945322.638, "dur": 4.065, + "args": { + "External id": 84145, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146009, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146009, "pid": 0, "tid": 7, "ts": 6300865945322.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936595.688, "dur": 7.140, + "args": { + "External id": 84145, "cbid": 211, "correlation": 161146009 + } + }, + { + "ph": "s", "id": 161146009, "pid": 5714, "tid": 5714, "ts": 6300865936595.688, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945327.439, "dur": 1.184, + "args": { + "External id": 84150, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146026, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146026, "pid": 0, "tid": 7, "ts": 6300865945327.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936627.308, "dur": 5.500, + "args": { + "External id": 84150, "cbid": 211, "correlation": 161146026 + } + }, + { + "ph": "s", "id": 161146026, "pid": 5714, "tid": 5714, "ts": 6300865936627.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945329.263, "dur": 1.024, + "args": { + "External id": 84152, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146036, "pid": 0, "tid": 7, "ts": 6300865945329.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936646.788, "dur": 4.929, + "args": { + "External id": 84152, "cbid": 211, "correlation": 161146036 + } + }, + { + "ph": "s", "id": 161146036, "pid": 5714, "tid": 5714, "ts": 6300865936646.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945330.959, "dur": 1.056, + "args": { + "External id": 84153, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146042, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146042, "pid": 0, "tid": 7, "ts": 6300865945330.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936659.908, "dur": 4.389, + "args": { + "External id": 84153, "cbid": 211, "correlation": 161146042 + } + }, + { + "ph": "s", "id": 161146042, "pid": 5714, "tid": 5714, "ts": 6300865936659.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945332.655, "dur": 1.056, + "args": { + "External id": 84154, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146052, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146052, "pid": 0, "tid": 7, "ts": 6300865945332.655, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936675.268, "dur": 4.360, + "args": { + "External id": 84154, "cbid": 211, "correlation": 161146052 + } + }, + { + "ph": "s", "id": 161146052, "pid": 5714, "tid": 5714, "ts": 6300865936675.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945334.383, "dur": 1.024, + "args": { + "External id": 84155, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146058, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146058, "pid": 0, "tid": 7, "ts": 6300865945334.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936685.417, "dur": 4.140, + "args": { + "External id": 84155, "cbid": 211, "correlation": 161146058 + } + }, + { + "ph": "s", "id": 161146058, "pid": 5714, "tid": 5714, "ts": 6300865936685.417, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865945336.143, "dur": 3.296, + "args": { + "External id": 84156, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146071, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146071, "pid": 0, "tid": 7, "ts": 6300865945336.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936705.348, "dur": 4.940, + "args": { + "External id": 84156, "cbid": 211, "correlation": 161146071 + } + }, + { + "ph": "s", "id": 161146071, "pid": 5714, "tid": 5714, "ts": 6300865936705.348, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945340.079, "dur": 1.088, + "args": { + "External id": 84159, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146077, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146077, "pid": 0, "tid": 7, "ts": 6300865945340.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936717.617, "dur": 4.531, + "args": { + "External id": 84159, "cbid": 211, "correlation": 161146077 + } + }, + { + "ph": "s", "id": 161146077, "pid": 5714, "tid": 5714, "ts": 6300865936717.617, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865945341.775, "dur": 1.024, + "args": { + "External id": 84160, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146083, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146083, "pid": 0, "tid": 7, "ts": 6300865945341.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936727.388, "dur": 3.989, + "args": { + "External id": 84160, "cbid": 211, "correlation": 161146083 + } + }, + { + "ph": "s", "id": 161146083, "pid": 5714, "tid": 5714, "ts": 6300865936727.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865945343.503, "dur": 233.986, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146097, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161146097, "pid": 0, "tid": 7, "ts": 6300865945343.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936817.227, "dur": 8.070, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146097 + } + }, + { + "ph": "s", "id": 161146097, "pid": 5714, "tid": 5714, "ts": 6300865936817.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865936858.837, "dur": 0.580, + "args": { + "External id": 84164, "cbid": 200, "correlation": 161146120 + } + }, + { + "ph": "f", "id": 161146120, "pid": 5714, "tid": 5714, "ts": 6300865936858.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865945578.289, "dur": 0.832, + "args": { + "External id": 84164, "device": 0, "context": 1, "stream": 7, "correlation": 161146123, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161146123, "pid": 0, "tid": 7, "ts": 6300865945578.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865936861.267, "dur": 6.640, + "args": { + "External id": 84164, "cbid": 51, "correlation": 161146123 + } + }, + { + "ph": "s", "id": 161146123, "pid": 5714, "tid": 5714, "ts": 6300865936861.267, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865945579.889, "dur": 685.800, + "args": { + "External id": 84164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146124, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146124, "pid": 0, "tid": 7, "ts": 6300865945579.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936868.147, "dur": 5.970, + "args": { + "External id": 84164, "cbid": 307, "correlation": 161146124 + } + }, + { + "ph": "s", "id": 161146124, "pid": 5714, "tid": 5714, "ts": 6300865936868.147, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865946266.297, "dur": 2.976, + "args": { + "External id": 84167, "device": 0, "context": 1, "stream": 7, "correlation": 161146129, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161146129, "pid": 0, "tid": 7, "ts": 6300865946266.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865936899.367, "dur": 12.320, + "args": { + "External id": 84167, "cbid": 41, "correlation": 161146129 + } + }, + { + "ph": "s", "id": 161146129, "pid": 5714, "tid": 5714, "ts": 6300865936899.367, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865936956.697, "dur": 0.610, + "args": { + "External id": 84172, "cbid": 200, "correlation": 161146157 + } + }, + { + "ph": "f", "id": 161146157, "pid": 5714, "tid": 5714, "ts": 6300865936956.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865946269.977, "dur": 688.457, + "args": { + "External id": 84172, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161146160, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146160, "pid": 0, "tid": 7, "ts": 6300865946269.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936965.077, "dur": 9.120, + "args": { + "External id": 84172, "cbid": 307, "correlation": 161146160 + } + }, + { + "ph": "s", "id": 161146160, "pid": 5714, "tid": 5714, "ts": 6300865936965.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865946959.138, "dur": 220.610, + "args": { + "External id": 84173, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146165, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146165, "pid": 0, "tid": 7, "ts": 6300865946959.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865936989.747, "dur": 6.220, + "args": { + "External id": 84173, "cbid": 211, "correlation": 161146165 + } + }, + { + "ph": "s", "id": 161146165, "pid": 5714, "tid": 5714, "ts": 6300865936989.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865937041.047, "dur": 1.190, + "args": { + "External id": 84181, "cbid": 210, "correlation": 161146191 + } + }, + { + "ph": "f", "id": 161146191, "pid": 5714, "tid": 5714, "ts": 6300865937041.047, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865947180.452, "dur": 633.416, + "args": { + "External id": 84181, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146192, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146192, "pid": 0, "tid": 7, "ts": 6300865947180.452, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937045.727, "dur": 7.650, + "args": { + "External id": 84181, "cbid": 211, "correlation": 161146192 + } + }, + { + "ph": "s", "id": 161146192, "pid": 5714, "tid": 5714, "ts": 6300865937045.727, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865947814.604, "dur": 170.882, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146211, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161146211, "pid": 0, "tid": 7, "ts": 6300865947814.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937157.416, "dur": 8.971, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146211 + } + }, + { + "ph": "s", "id": 161146211, "pid": 5714, "tid": 5714, "ts": 6300865937157.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865947986.158, "dur": 4.192, + "args": { + "External id": 84191, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146228, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146228, "pid": 0, "tid": 7, "ts": 6300865947986.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937199.466, "dur": 7.250, + "args": { + "External id": 84191, "cbid": 211, "correlation": 161146228 + } + }, + { + "ph": "s", "id": 161146228, "pid": 5714, "tid": 5714, "ts": 6300865937199.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865947990.958, "dur": 1.344, + "args": { + "External id": 84196, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146245, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146245, "pid": 0, "tid": 7, "ts": 6300865947990.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937232.046, "dur": 5.570, + "args": { + "External id": 84196, "cbid": 211, "correlation": 161146245 + } + }, + { + "ph": "s", "id": 161146245, "pid": 5714, "tid": 5714, "ts": 6300865937232.046, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865947992.942, "dur": 0.992, + "args": { + "External id": 84198, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146255, "pid": 0, "tid": 7, "ts": 6300865947992.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937252.106, "dur": 4.770, + "args": { + "External id": 84198, "cbid": 211, "correlation": 161146255 + } + }, + { + "ph": "s", "id": 161146255, "pid": 5714, "tid": 5714, "ts": 6300865937252.106, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865947994.638, "dur": 1.024, + "args": { + "External id": 84199, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146261, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146261, "pid": 0, "tid": 7, "ts": 6300865947994.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937264.246, "dur": 4.310, + "args": { + "External id": 84199, "cbid": 211, "correlation": 161146261 + } + }, + { + "ph": "s", "id": 161146261, "pid": 5714, "tid": 5714, "ts": 6300865937264.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865947996.334, "dur": 1.024, + "args": { + "External id": 84200, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146271, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146271, "pid": 0, "tid": 7, "ts": 6300865947996.334, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937279.076, "dur": 4.280, + "args": { + "External id": 84200, "cbid": 211, "correlation": 161146271 + } + }, + { + "ph": "s", "id": 161146271, "pid": 5714, "tid": 5714, "ts": 6300865937279.076, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865947998.030, "dur": 1.024, + "args": { + "External id": 84201, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146277, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146277, "pid": 0, "tid": 7, "ts": 6300865947998.030, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937288.816, "dur": 4.270, + "args": { + "External id": 84201, "cbid": 211, "correlation": 161146277 + } + }, + { + "ph": "s", "id": 161146277, "pid": 5714, "tid": 5714, "ts": 6300865937288.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865947999.790, "dur": 3.296, + "args": { + "External id": 84202, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146290, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146290, "pid": 0, "tid": 7, "ts": 6300865947999.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937321.266, "dur": 6.190, + "args": { + "External id": 84202, "cbid": 211, "correlation": 161146290 + } + }, + { + "ph": "s", "id": 161146290, "pid": 5714, "tid": 5714, "ts": 6300865937321.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865948003.758, "dur": 1.056, + "args": { + "External id": 84205, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146296, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146296, "pid": 0, "tid": 7, "ts": 6300865948003.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937334.136, "dur": 4.310, + "args": { + "External id": 84205, "cbid": 211, "correlation": 161146296 + } + }, + { + "ph": "s", "id": 161146296, "pid": 5714, "tid": 5714, "ts": 6300865937334.136, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865948005.454, "dur": 0.992, + "args": { + "External id": 84206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146302, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146302, "pid": 0, "tid": 7, "ts": 6300865948005.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937343.496, "dur": 3.990, + "args": { + "External id": 84206, "cbid": 211, "correlation": 161146302 + } + }, + { + "ph": "s", "id": 161146302, "pid": 5714, "tid": 5714, "ts": 6300865937343.496, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865948007.182, "dur": 233.506, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146316, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161146316, "pid": 0, "tid": 7, "ts": 6300865948007.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937433.696, "dur": 8.150, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146316 + } + }, + { + "ph": "s", "id": 161146316, "pid": 5714, "tid": 5714, "ts": 6300865937433.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865937477.356, "dur": 0.570, + "args": { + "External id": 84210, "cbid": 200, "correlation": 161146339 + } + }, + { + "ph": "f", "id": 161146339, "pid": 5714, "tid": 5714, "ts": 6300865937477.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865948241.552, "dur": 0.800, + "args": { + "External id": 84210, "device": 0, "context": 1, "stream": 7, "correlation": 161146342, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161146342, "pid": 0, "tid": 7, "ts": 6300865948241.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865937479.706, "dur": 6.810, + "args": { + "External id": 84210, "cbid": 51, "correlation": 161146342 + } + }, + { + "ph": "s", "id": 161146342, "pid": 5714, "tid": 5714, "ts": 6300865937479.706, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865948243.120, "dur": 689.961, + "args": { + "External id": 84210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146343, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146343, "pid": 0, "tid": 7, "ts": 6300865948243.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937486.756, "dur": 5.790, + "args": { + "External id": 84210, "cbid": 307, "correlation": 161146343 + } + }, + { + "ph": "s", "id": 161146343, "pid": 5714, "tid": 5714, "ts": 6300865937486.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865948933.689, "dur": 2.976, + "args": { + "External id": 84213, "device": 0, "context": 1, "stream": 7, "correlation": 161146348, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161146348, "pid": 0, "tid": 7, "ts": 6300865948933.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865937516.416, "dur": 11.799, + "args": { + "External id": 84213, "cbid": 41, "correlation": 161146348 + } + }, + { + "ph": "s", "id": 161146348, "pid": 5714, "tid": 5714, "ts": 6300865937516.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865937570.406, "dur": 0.509, + "args": { + "External id": 84218, "cbid": 200, "correlation": 161146376 + } + }, + { + "ph": "f", "id": 161146376, "pid": 5714, "tid": 5714, "ts": 6300865937570.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865948937.369, "dur": 688.200, + "args": { + "External id": 84218, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161146379, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146379, "pid": 0, "tid": 7, "ts": 6300865948937.369, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937572.386, "dur": 7.080, + "args": { + "External id": 84218, "cbid": 307, "correlation": 161146379 + } + }, + { + "ph": "s", "id": 161146379, "pid": 5714, "tid": 5714, "ts": 6300865937572.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865949626.241, "dur": 220.994, + "args": { + "External id": 84219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146384, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146384, "pid": 0, "tid": 7, "ts": 6300865949626.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937606.066, "dur": 5.920, + "args": { + "External id": 84219, "cbid": 211, "correlation": 161146384 + } + }, + { + "ph": "s", "id": 161146384, "pid": 5714, "tid": 5714, "ts": 6300865937606.066, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865937656.115, "dur": 1.160, + "args": { + "External id": 84227, "cbid": 210, "correlation": 161146410 + } + }, + { + "ph": "f", "id": 161146410, "pid": 5714, "tid": 5714, "ts": 6300865937656.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865949847.971, "dur": 636.360, + "args": { + "External id": 84227, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146411, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146411, "pid": 0, "tid": 7, "ts": 6300865949847.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937660.855, "dur": 7.270, + "args": { + "External id": 84227, "cbid": 211, "correlation": 161146411 + } + }, + { + "ph": "s", "id": 161146411, "pid": 5714, "tid": 5714, "ts": 6300865937660.855, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865950485.067, "dur": 171.074, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146430, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161146430, "pid": 0, "tid": 7, "ts": 6300865950485.067, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937772.315, "dur": 8.910, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146430 + } + }, + { + "ph": "s", "id": 161146430, "pid": 5714, "tid": 5714, "ts": 6300865937772.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865950656.845, "dur": 4.128, + "args": { + "External id": 84237, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146447, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146447, "pid": 0, "tid": 7, "ts": 6300865950656.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937816.685, "dur": 7.240, + "args": { + "External id": 84237, "cbid": 211, "correlation": 161146447 + } + }, + { + "ph": "s", "id": 161146447, "pid": 5714, "tid": 5714, "ts": 6300865937816.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950661.613, "dur": 1.216, + "args": { + "External id": 84242, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146464, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146464, "pid": 0, "tid": 7, "ts": 6300865950661.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937846.375, "dur": 5.480, + "args": { + "External id": 84242, "cbid": 211, "correlation": 161146464 + } + }, + { + "ph": "s", "id": 161146464, "pid": 5714, "tid": 5714, "ts": 6300865937846.375, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950663.469, "dur": 1.024, + "args": { + "External id": 84244, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146474, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146474, "pid": 0, "tid": 7, "ts": 6300865950663.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937867.005, "dur": 4.850, + "args": { + "External id": 84244, "cbid": 211, "correlation": 161146474 + } + }, + { + "ph": "s", "id": 161146474, "pid": 5714, "tid": 5714, "ts": 6300865937867.005, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950665.165, "dur": 1.024, + "args": { + "External id": 84245, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146480, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146480, "pid": 0, "tid": 7, "ts": 6300865950665.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937878.975, "dur": 4.380, + "args": { + "External id": 84245, "cbid": 211, "correlation": 161146480 + } + }, + { + "ph": "s", "id": 161146480, "pid": 5714, "tid": 5714, "ts": 6300865937878.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950666.861, "dur": 1.024, + "args": { + "External id": 84246, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146490, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146490, "pid": 0, "tid": 7, "ts": 6300865950666.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937893.515, "dur": 4.170, + "args": { + "External id": 84246, "cbid": 211, "correlation": 161146490 + } + }, + { + "ph": "s", "id": 161146490, "pid": 5714, "tid": 5714, "ts": 6300865937893.515, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950668.557, "dur": 1.056, + "args": { + "External id": 84247, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146496, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146496, "pid": 0, "tid": 7, "ts": 6300865950668.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937903.355, "dur": 4.430, + "args": { + "External id": 84247, "cbid": 211, "correlation": 161146496 + } + }, + { + "ph": "s", "id": 161146496, "pid": 5714, "tid": 5714, "ts": 6300865937903.355, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865950670.317, "dur": 3.360, + "args": { + "External id": 84248, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146509, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146509, "pid": 0, "tid": 7, "ts": 6300865950670.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937925.015, "dur": 5.210, + "args": { + "External id": 84248, "cbid": 211, "correlation": 161146509 + } + }, + { + "ph": "s", "id": 161146509, "pid": 5714, "tid": 5714, "ts": 6300865937925.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950674.285, "dur": 1.056, + "args": { + "External id": 84251, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146515, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146515, "pid": 0, "tid": 7, "ts": 6300865950674.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937936.305, "dur": 4.290, + "args": { + "External id": 84251, "cbid": 211, "correlation": 161146515 + } + }, + { + "ph": "s", "id": 161146515, "pid": 5714, "tid": 5714, "ts": 6300865937936.305, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865950675.981, "dur": 1.024, + "args": { + "External id": 84252, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146521, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146521, "pid": 0, "tid": 7, "ts": 6300865950675.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865937945.595, "dur": 4.040, + "args": { + "External id": 84252, "cbid": 211, "correlation": 161146521 + } + }, + { + "ph": "s", "id": 161146521, "pid": 5714, "tid": 5714, "ts": 6300865937945.595, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865950677.709, "dur": 233.795, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146535, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161146535, "pid": 0, "tid": 7, "ts": 6300865950677.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938036.374, "dur": 7.951, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146535 + } + }, + { + "ph": "s", "id": 161146535, "pid": 5714, "tid": 5714, "ts": 6300865938036.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865938077.684, "dur": 0.570, + "args": { + "External id": 84256, "cbid": 200, "correlation": 161146558 + } + }, + { + "ph": "f", "id": 161146558, "pid": 5714, "tid": 5714, "ts": 6300865938077.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865950912.400, "dur": 0.800, + "args": { + "External id": 84256, "device": 0, "context": 1, "stream": 7, "correlation": 161146561, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161146561, "pid": 0, "tid": 7, "ts": 6300865950912.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865938080.064, "dur": 6.860, + "args": { + "External id": 84256, "cbid": 51, "correlation": 161146561 + } + }, + { + "ph": "s", "id": 161146561, "pid": 5714, "tid": 5714, "ts": 6300865938080.064, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865950914.384, "dur": 687.944, + "args": { + "External id": 84256, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146562, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146562, "pid": 0, "tid": 7, "ts": 6300865950914.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938087.144, "dur": 5.870, + "args": { + "External id": 84256, "cbid": 307, "correlation": 161146562 + } + }, + { + "ph": "s", "id": 161146562, "pid": 5714, "tid": 5714, "ts": 6300865938087.144, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865951602.968, "dur": 2.976, + "args": { + "External id": 84259, "device": 0, "context": 1, "stream": 7, "correlation": 161146567, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161146567, "pid": 0, "tid": 7, "ts": 6300865951602.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865938116.894, "dur": 12.110, + "args": { + "External id": 84259, "cbid": 41, "correlation": 161146567 + } + }, + { + "ph": "s", "id": 161146567, "pid": 5714, "tid": 5714, "ts": 6300865938116.894, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865938169.594, "dur": 0.490, + "args": { + "External id": 84264, "cbid": 200, "correlation": 161146595 + } + }, + { + "ph": "f", "id": 161146595, "pid": 5714, "tid": 5714, "ts": 6300865938169.594, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865951606.648, "dur": 687.880, + "args": { + "External id": 84264, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161146598, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146598, "pid": 0, "tid": 7, "ts": 6300865951606.648, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938171.564, "dur": 7.140, + "args": { + "External id": 84264, "cbid": 307, "correlation": 161146598 + } + }, + { + "ph": "s", "id": 161146598, "pid": 5714, "tid": 5714, "ts": 6300865938171.564, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865952295.168, "dur": 221.219, + "args": { + "External id": 84265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146603, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146603, "pid": 0, "tid": 7, "ts": 6300865952295.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938193.024, "dur": 6.020, + "args": { + "External id": 84265, "cbid": 211, "correlation": 161146603 + } + }, + { + "ph": "s", "id": 161146603, "pid": 5714, "tid": 5714, "ts": 6300865938193.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865938245.024, "dur": 1.210, + "args": { + "External id": 84273, "cbid": 210, "correlation": 161146629 + } + }, + { + "ph": "f", "id": 161146629, "pid": 5714, "tid": 5714, "ts": 6300865938245.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865952517.091, "dur": 634.631, + "args": { + "External id": 84273, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146630, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146630, "pid": 0, "tid": 7, "ts": 6300865952517.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938249.814, "dur": 7.530, + "args": { + "External id": 84273, "cbid": 211, "correlation": 161146630 + } + }, + { + "ph": "s", "id": 161146630, "pid": 5714, "tid": 5714, "ts": 6300865938249.814, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865953152.426, "dur": 171.010, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146649, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161146649, "pid": 0, "tid": 7, "ts": 6300865953152.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938371.354, "dur": 9.490, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146649 + } + }, + { + "ph": "s", "id": 161146649, "pid": 5714, "tid": 5714, "ts": 6300865938371.354, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865953324.108, "dur": 4.000, + "args": { + "External id": 84283, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146666, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146666, "pid": 0, "tid": 7, "ts": 6300865953324.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938417.444, "dur": 7.549, + "args": { + "External id": 84283, "cbid": 211, "correlation": 161146666 + } + }, + { + "ph": "s", "id": 161146666, "pid": 5714, "tid": 5714, "ts": 6300865938417.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953328.748, "dur": 1.184, + "args": { + "External id": 84288, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146683, "pid": 0, "tid": 7, "ts": 6300865953328.748, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938449.473, "dur": 5.351, + "args": { + "External id": 84288, "cbid": 211, "correlation": 161146683 + } + }, + { + "ph": "s", "id": 161146683, "pid": 5714, "tid": 5714, "ts": 6300865938449.473, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953330.604, "dur": 1.024, + "args": { + "External id": 84290, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146693, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146693, "pid": 0, "tid": 7, "ts": 6300865953330.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938468.444, "dur": 4.889, + "args": { + "External id": 84290, "cbid": 211, "correlation": 161146693 + } + }, + { + "ph": "s", "id": 161146693, "pid": 5714, "tid": 5714, "ts": 6300865938468.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953332.300, "dur": 1.056, + "args": { + "External id": 84291, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146699, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146699, "pid": 0, "tid": 7, "ts": 6300865953332.300, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938480.293, "dur": 4.311, + "args": { + "External id": 84291, "cbid": 211, "correlation": 161146699 + } + }, + { + "ph": "s", "id": 161146699, "pid": 5714, "tid": 5714, "ts": 6300865938480.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953333.996, "dur": 1.024, + "args": { + "External id": 84292, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146709, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146709, "pid": 0, "tid": 7, "ts": 6300865953333.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938494.784, "dur": 4.320, + "args": { + "External id": 84292, "cbid": 211, "correlation": 161146709 + } + }, + { + "ph": "s", "id": 161146709, "pid": 5714, "tid": 5714, "ts": 6300865938494.784, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953335.724, "dur": 1.024, + "args": { + "External id": 84293, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146715, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146715, "pid": 0, "tid": 7, "ts": 6300865953335.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938504.683, "dur": 4.120, + "args": { + "External id": 84293, "cbid": 211, "correlation": 161146715 + } + }, + { + "ph": "s", "id": 161146715, "pid": 5714, "tid": 5714, "ts": 6300865938504.683, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865953337.484, "dur": 3.328, + "args": { + "External id": 84294, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146728, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146728, "pid": 0, "tid": 7, "ts": 6300865953337.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938523.523, "dur": 5.000, + "args": { + "External id": 84294, "cbid": 211, "correlation": 161146728 + } + }, + { + "ph": "s", "id": 161146728, "pid": 5714, "tid": 5714, "ts": 6300865938523.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953341.452, "dur": 1.056, + "args": { + "External id": 84297, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146734, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146734, "pid": 0, "tid": 7, "ts": 6300865953341.452, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938535.663, "dur": 3.990, + "args": { + "External id": 84297, "cbid": 211, "correlation": 161146734 + } + }, + { + "ph": "s", "id": 161146734, "pid": 5714, "tid": 5714, "ts": 6300865938535.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865953343.116, "dur": 1.024, + "args": { + "External id": 84298, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146740, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146740, "pid": 0, "tid": 7, "ts": 6300865953343.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938544.463, "dur": 3.830, + "args": { + "External id": 84298, "cbid": 211, "correlation": 161146740 + } + }, + { + "ph": "s", "id": 161146740, "pid": 5714, "tid": 5714, "ts": 6300865938544.463, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865953344.844, "dur": 234.947, + "args": { + "External id": 83914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146754, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161146754, "pid": 0, "tid": 7, "ts": 6300865953344.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938633.693, "dur": 8.260, + "args": { + "External id": 83914, "cbid": 307, "correlation": 161146754 + } + }, + { + "ph": "s", "id": 161146754, "pid": 5714, "tid": 5714, "ts": 6300865938633.693, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865938675.033, "dur": 0.560, + "args": { + "External id": 84302, "cbid": 200, "correlation": 161146777 + } + }, + { + "ph": "f", "id": 161146777, "pid": 5714, "tid": 5714, "ts": 6300865938675.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865953580.687, "dur": 0.800, + "args": { + "External id": 84302, "device": 0, "context": 1, "stream": 7, "correlation": 161146780, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161146780, "pid": 0, "tid": 7, "ts": 6300865953580.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865938677.433, "dur": 6.830, + "args": { + "External id": 84302, "cbid": 51, "correlation": 161146780 + } + }, + { + "ph": "s", "id": 161146780, "pid": 5714, "tid": 5714, "ts": 6300865938677.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865953582.255, "dur": 691.049, + "args": { + "External id": 84302, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146781, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146781, "pid": 0, "tid": 7, "ts": 6300865953582.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938684.493, "dur": 5.930, + "args": { + "External id": 84302, "cbid": 307, "correlation": 161146781 + } + }, + { + "ph": "s", "id": 161146781, "pid": 5714, "tid": 5714, "ts": 6300865938684.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865954273.976, "dur": 2.944, + "args": { + "External id": 84305, "device": 0, "context": 1, "stream": 7, "correlation": 161146786, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161146786, "pid": 0, "tid": 7, "ts": 6300865954273.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865938716.323, "dur": 13.310, + "args": { + "External id": 84305, "cbid": 41, "correlation": 161146786 + } + }, + { + "ph": "s", "id": 161146786, "pid": 5714, "tid": 5714, "ts": 6300865938716.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865938769.673, "dur": 0.520, + "args": { + "External id": 84310, "cbid": 200, "correlation": 161146814 + } + }, + { + "ph": "f", "id": 161146814, "pid": 5714, "tid": 5714, "ts": 6300865938769.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865954277.688, "dur": 687.112, + "args": { + "External id": 84310, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161146817, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161146817, "pid": 0, "tid": 7, "ts": 6300865954277.688, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938771.653, "dur": 7.160, + "args": { + "External id": 84310, "cbid": 307, "correlation": 161146817 + } + }, + { + "ph": "s", "id": 161146817, "pid": 5714, "tid": 5714, "ts": 6300865938771.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865954965.504, "dur": 220.418, + "args": { + "External id": 84311, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146822, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146822, "pid": 0, "tid": 7, "ts": 6300865954965.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938793.003, "dur": 5.990, + "args": { + "External id": 84311, "cbid": 211, "correlation": 161146822 + } + }, + { + "ph": "s", "id": 161146822, "pid": 5714, "tid": 5714, "ts": 6300865938793.003, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865955186.530, "dur": 5.568, + "args": { + "External id": 84313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146835, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146835, "pid": 0, "tid": 7, "ts": 6300865955186.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938824.423, "dur": 7.360, + "args": { + "External id": 84313, "cbid": 211, "correlation": 161146835 + } + }, + { + "ph": "s", "id": 161146835, "pid": 5714, "tid": 5714, "ts": 6300865938824.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865955192.706, "dur": 157.730, + "args": { + "External id": 84318, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146848, "pid": 0, "tid": 7, "ts": 6300865955192.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938857.632, "dur": 6.111, + "args": { + "External id": 84318, "cbid": 211, "correlation": 161146848 + } + }, + { + "ph": "s", "id": 161146848, "pid": 5714, "tid": 5714, "ts": 6300865938857.632, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865955351.108, "dur": 1.504, + "args": { + "External id": 84323, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146856, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146856, "pid": 0, "tid": 7, "ts": 6300865955351.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865938928.032, "dur": 7.760, + "args": { + "External id": 84323, "cbid": 211, "correlation": 161146856 + } + }, + { + "ph": "s", "id": 161146856, "pid": 5714, "tid": 5714, "ts": 6300865938928.032, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865955353.252, "dur": 2.368, + "args": { + "External id": 84342, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146876, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 161146876, "pid": 0, "tid": 7, "ts": 6300865955353.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939023.552, "dur": 13.350, + "args": { + "External id": 84342, "cbid": 211, "correlation": 161146876 + } + }, + { + "ph": "s", "id": 161146876, "pid": 5714, "tid": 5714, "ts": 6300865939023.552, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865955356.260, "dur": 59.265, + "args": { + "External id": 84350, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146894, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146894, "pid": 0, "tid": 7, "ts": 6300865955356.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939144.122, "dur": 10.250, + "args": { + "External id": 84350, "cbid": 211, "correlation": 161146894 + } + }, + { + "ph": "s", "id": 161146894, "pid": 5714, "tid": 5714, "ts": 6300865939144.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865955416.197, "dur": 14.688, + "args": { + "External id": 84355, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146911, "pid": 0, "tid": 7, "ts": 6300865955416.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939189.372, "dur": 7.840, + "args": { + "External id": 84355, "cbid": 211, "correlation": 161146911 + } + }, + { + "ph": "s", "id": 161146911, "pid": 5714, "tid": 5714, "ts": 6300865939189.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865955431.557, "dur": 101.217, + "args": { + "External id": 84360, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146927, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161146927, "pid": 0, "tid": 7, "ts": 6300865955431.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939216.862, "dur": 5.690, + "args": { + "External id": 84360, "cbid": 211, "correlation": 161146927 + } + }, + { + "ph": "s", "id": 161146927, "pid": 5714, "tid": 5714, "ts": 6300865939216.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865955533.478, "dur": 1.824, + "args": { + "External id": 84364, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146943, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161146943, "pid": 0, "tid": 7, "ts": 6300865955533.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939243.252, "dur": 4.950, + "args": { + "External id": 84364, "cbid": 211, "correlation": 161146943 + } + }, + { + "ph": "s", "id": 161146943, "pid": 5714, "tid": 5714, "ts": 6300865939243.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865955535.911, "dur": 1.856, + "args": { + "External id": 84365, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146955, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161146955, "pid": 0, "tid": 7, "ts": 6300865955535.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939268.522, "dur": 6.330, + "args": { + "External id": 84365, "cbid": 211, "correlation": 161146955 + } + }, + { + "ph": "s", "id": 161146955, "pid": 5714, "tid": 5714, "ts": 6300865939268.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865955538.439, "dur": 2.080, + "args": { + "External id": 84372, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146973, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161146973, "pid": 0, "tid": 7, "ts": 6300865955538.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939314.842, "dur": 8.149, + "args": { + "External id": 84372, "cbid": 211, "correlation": 161146973 + } + }, + { + "ph": "s", "id": 161146973, "pid": 5714, "tid": 5714, "ts": 6300865939314.842, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6300865955541.191, "dur": 3.872, + "args": { + "External id": 84367, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161146982, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161146982, "pid": 0, "tid": 7, "ts": 6300865955541.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865939330.511, "dur": 5.480, + "args": { + "External id": 84367, "cbid": 211, "correlation": 161146982 + } + }, + { + "ph": "s", "id": 161146982, "pid": 5714, "tid": 5714, "ts": 6300865939330.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939350.062, "dur": 2.440, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146987 + } + }, + { + "ph": "f", "id": 161146987, "pid": 5714, "tid": 5714, "ts": 6300865939350.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939353.691, "dur": 0.851, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146988 + } + }, + { + "ph": "f", "id": 161146988, "pid": 5714, "tid": 5714, "ts": 6300865939353.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939355.022, "dur": 0.929, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146989 + } + }, + { + "ph": "f", "id": 161146989, "pid": 5714, "tid": 5714, "ts": 6300865939355.022, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939356.411, "dur": 0.880, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146990 + } + }, + { + "ph": "f", "id": 161146990, "pid": 5714, "tid": 5714, "ts": 6300865939356.411, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 
6300865939357.491, "dur": 0.631, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146991 + } + }, + { + "ph": "f", "id": 161146991, "pid": 5714, "tid": 5714, "ts": 6300865939357.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939358.462, "dur": 0.660, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146992 + } + }, + { + "ph": "f", "id": 161146992, "pid": 5714, "tid": 5714, "ts": 6300865939358.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939359.551, "dur": 0.840, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146993 + } + }, + { + "ph": "f", "id": 161146993, "pid": 5714, "tid": 5714, "ts": 6300865939359.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939360.731, "dur": 0.591, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146994 + } + }, + { + "ph": "f", "id": 161146994, "pid": 5714, "tid": 5714, "ts": 6300865939360.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939361.742, "dur": 0.600, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146995 + } + }, + { + "ph": "f", "id": 161146995, "pid": 5714, "tid": 5714, "ts": 6300865939361.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939362.951, "dur": 0.931, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146996 + } + }, + { + "ph": "f", "id": 161146996, "pid": 5714, "tid": 5714, "ts": 6300865939362.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + 
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300865939364.431, "dur": 0.491, + "args": { + "External id": 84374, "cbid": 138, "correlation": 161146997 + } + }, + { + "ph": "f", "id": 161146997, "pid": 5714, "tid": 5714, "ts": 6300865939364.431, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300865955551.239, "dur": 1.056, + "args": { + "External id": 84374, "device": 0, "context": 1, "stream": 7, "correlation": 161146999, "bytes": 8, "memory bandwidth (GB/s)": 0.007575757575757576 + } + }, + { + "ph": "f", "id": 161146999, "pid": 0, "tid": 7, "ts": 6300865955551.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865939366.511, "dur": 11.211, + "args": { + "External id": 84374, "cbid": 41, "correlation": 161146999 + } + }, + { + "ph": "s", "id": 161146999, "pid": 5714, "tid": 5714, "ts": 6300865939366.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300865939378.042, "dur": 16178.923, + "args": { + "External id": 84374, "cbid": 131, "correlation": 161147000 + } + }, + { + "ph": "s", "id": 161147000, "pid": 5714, "tid": 5714, "ts": 6300865939378.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865955622.174, "dur": 1.780, + "args": { + "External id": 84382, "cbid": 210, "correlation": 161147025 + } + }, + { + "ph": "f", "id": 161147025, "pid": 5714, "tid": 5714, "ts": 6300865955622.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865955639.592, "dur": 
637.959, + "args": { + "External id": 84382, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147026, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147026, "pid": 0, "tid": 7, "ts": 6300865955639.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955627.794, "dur": 11.591, + "args": { + "External id": 84382, "cbid": 211, "correlation": 161147026 + } + }, + { + "ph": "s", "id": 161147026, "pid": 5714, "tid": 5714, "ts": 6300865955627.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865956278.287, "dur": 170.850, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147045, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161147045, "pid": 0, "tid": 7, "ts": 6300865956278.287, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955749.934, "dur": 9.210, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147045 + } + }, + { + "ph": "s", "id": 161147045, "pid": 5714, "tid": 5714, "ts": 6300865955749.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865956449.873, "dur": 4.192, + "args": { + "External id": 84392, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147062, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147062, "pid": 0, "tid": 7, "ts": 6300865956449.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955798.444, "dur": 7.650, + "args": { + "External id": 84392, "cbid": 211, "correlation": 161147062 + } + }, + { + "ph": "s", "id": 161147062, "pid": 5714, "tid": 5714, "ts": 6300865955798.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956454.769, "dur": 1.216, + "args": { + "External id": 84397, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147079, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147079, "pid": 0, "tid": 7, "ts": 6300865956454.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955829.374, "dur": 5.530, + "args": { + "External id": 84397, "cbid": 211, "correlation": 161147079 + } + }, + { + "ph": "s", "id": 161147079, "pid": 5714, "tid": 5714, "ts": 6300865955829.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956456.657, "dur": 0.992, + "args": { + "External id": 84399, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147089, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147089, "pid": 0, "tid": 7, "ts": 6300865956456.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955850.364, "dur": 5.270, + "args": { + "External id": 84399, "cbid": 211, "correlation": 161147089 + } + }, + { + "ph": "s", "id": 161147089, "pid": 5714, "tid": 5714, "ts": 6300865955850.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956458.353, "dur": 1.056, + "args": { + "External id": 84400, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147095, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147095, "pid": 0, "tid": 7, "ts": 6300865956458.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955862.924, "dur": 4.500, + "args": { + "External id": 84400, "cbid": 211, "correlation": 161147095 + } + }, + { + "ph": "s", "id": 161147095, "pid": 5714, "tid": 5714, "ts": 6300865955862.924, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956460.081, "dur": 1.024, + "args": { + "External id": 84401, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147105, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147105, "pid": 0, "tid": 7, "ts": 6300865956460.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955878.394, "dur": 4.900, + "args": { + "External id": 84401, "cbid": 211, "correlation": 161147105 + } + }, + { + "ph": "s", "id": 161147105, "pid": 5714, "tid": 5714, "ts": 6300865955878.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956461.777, "dur": 1.024, + "args": { + "External id": 84402, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147111, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147111, "pid": 0, "tid": 7, "ts": 6300865956461.777, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955888.994, "dur": 4.220, + "args": { + "External id": 84402, "cbid": 211, "correlation": 161147111 + } + }, + { + "ph": "s", "id": 161147111, "pid": 5714, "tid": 5714, "ts": 6300865955888.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865956463.505, "dur": 3.456, + "args": { + "External id": 84403, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147124, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147124, "pid": 0, "tid": 7, "ts": 6300865956463.505, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955910.164, "dur": 5.010, + "args": { + "External id": 84403, "cbid": 211, "correlation": 161147124 + } + }, + { + "ph": "s", "id": 161147124, "pid": 5714, "tid": 5714, "ts": 6300865955910.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956467.601, "dur": 1.088, + "args": { + "External id": 84406, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147130, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147130, "pid": 0, "tid": 7, "ts": 6300865956467.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955922.524, "dur": 4.650, + "args": { + "External id": 84406, "cbid": 211, "correlation": 161147130 + } + }, + { + "ph": "s", "id": 161147130, "pid": 5714, "tid": 5714, "ts": 6300865955922.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865956469.297, "dur": 1.024, + "args": { + "External id": 84407, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147136, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147136, "pid": 0, "tid": 7, "ts": 6300865956469.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865955932.244, "dur": 4.150, + "args": { + "External id": 84407, "cbid": 211, "correlation": 161147136 + } + }, + { + "ph": "s", "id": 161147136, "pid": 5714, "tid": 5714, "ts": 6300865955932.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865956471.025, "dur": 234.147, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147150, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161147150, "pid": 0, "tid": 7, "ts": 6300865956471.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956022.133, "dur": 8.260, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147150 + } + }, + { + "ph": "s", "id": 161147150, "pid": 5714, "tid": 5714, "ts": 6300865956022.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865956065.093, "dur": 0.620, + "args": { + "External id": 84411, "cbid": 200, "correlation": 161147173 + } + }, + { + "ph": "f", "id": 161147173, "pid": 5714, "tid": 5714, "ts": 6300865956065.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865956705.972, "dur": 0.832, + "args": { + "External id": 84411, "device": 0, "context": 1, "stream": 7, "correlation": 161147176, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161147176, "pid": 0, "tid": 7, "ts": 6300865956705.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865956067.553, "dur": 7.200, + "args": { + "External id": 84411, "cbid": 51, "correlation": 161147176 + } + }, + { + "ph": "s", "id": 161147176, "pid": 5714, "tid": 5714, "ts": 6300865956067.553, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865956707.988, "dur": 687.432, + "args": { + "External id": 84411, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147177, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147177, "pid": 0, "tid": 7, "ts": 6300865956707.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956075.024, "dur": 5.929, + "args": { + "External id": 84411, "cbid": 307, "correlation": 161147177 + } + }, + { + "ph": "s", "id": 161147177, "pid": 5714, "tid": 5714, "ts": 6300865956075.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865957396.028, "dur": 3.008, + "args": { + "External id": 84414, "device": 0, "context": 1, "stream": 7, "correlation": 161147182, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161147182, "pid": 0, "tid": 7, "ts": 6300865957396.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865956106.793, "dur": 14.430, + "args": { + "External id": 84414, "cbid": 41, "correlation": 161147182 + } + }, + { + "ph": "s", "id": 161147182, "pid": 5714, "tid": 5714, "ts": 6300865956106.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865956162.893, "dur": 0.500, + "args": { + "External id": 84419, "cbid": 200, "correlation": 161147210 + } + }, + { + "ph": "f", "id": 161147210, "pid": 5714, "tid": 5714, "ts": 6300865956162.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865957399.740, "dur": 692.904, + "args": { + "External id": 84419, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161147213, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147213, "pid": 0, "tid": 7, "ts": 6300865957399.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956164.873, "dur": 7.170, + "args": { + "External id": 84419, "cbid": 307, "correlation": 161147213 + } + }, + { + "ph": "s", "id": 161147213, "pid": 5714, "tid": 5714, "ts": 6300865956164.873, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865958093.284, "dur": 220.803, + "args": { + "External id": 84420, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147218, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161147218, "pid": 0, "tid": 7, "ts": 6300865958093.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956186.563, "dur": 6.250, + "args": { + "External id": 84420, "cbid": 211, "correlation": 161147218 + } + }, + { + "ph": "s", "id": 161147218, "pid": 5714, "tid": 5714, "ts": 6300865956186.563, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865956239.653, "dur": 1.190, + "args": { + "External id": 84428, "cbid": 210, "correlation": 161147244 + } + }, + { + "ph": "f", "id": 161147244, "pid": 5714, "tid": 5714, "ts": 6300865956239.653, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865958314.823, "dur": 635.335, + "args": { + "External id": 84428, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147245, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147245, "pid": 0, "tid": 7, "ts": 6300865958314.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956244.483, "dur": 7.600, + "args": { + "External id": 84428, "cbid": 211, "correlation": 161147245 + } + }, + { + "ph": "s", "id": 161147245, "pid": 5714, "tid": 5714, "ts": 6300865956244.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865958950.766, "dur": 171.075, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147264, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161147264, "pid": 0, "tid": 7, "ts": 6300865958950.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956366.313, "dur": 9.630, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147264 + } + }, + { + "ph": "s", "id": 161147264, "pid": 5714, "tid": 5714, "ts": 6300865956366.313, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865959122.417, "dur": 4.160, + "args": { + "External id": 84438, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147281, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147281, "pid": 0, "tid": 7, "ts": 6300865959122.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956410.703, "dur": 7.240, + "args": { + "External id": 84438, "cbid": 211, "correlation": 161147281 + } + }, + { + "ph": "s", "id": 161147281, "pid": 5714, "tid": 5714, "ts": 6300865956410.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959127.249, "dur": 1.184, + "args": { + "External id": 84443, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147298, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147298, "pid": 0, "tid": 7, "ts": 6300865959127.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956442.393, "dur": 5.350, + "args": { + "External id": 84443, "cbid": 211, "correlation": 161147298 + } + }, + { + "ph": "s", "id": 161147298, "pid": 5714, "tid": 5714, "ts": 6300865956442.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959129.105, "dur": 1.024, + "args": { + "External id": 84445, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147308, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147308, "pid": 0, "tid": 7, "ts": 6300865959129.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956463.672, "dur": 4.991, + "args": { + "External id": 84445, "cbid": 211, "correlation": 161147308 + } + }, + { + "ph": "s", "id": 161147308, "pid": 5714, "tid": 5714, "ts": 6300865956463.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959130.801, "dur": 1.056, + "args": { + "External id": 84446, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147314, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147314, "pid": 0, "tid": 7, "ts": 6300865959130.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956476.183, "dur": 4.520, + "args": { + "External id": 84446, "cbid": 211, "correlation": 161147314 + } + }, + { + "ph": "s", "id": 161147314, "pid": 5714, "tid": 5714, "ts": 6300865956476.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959132.529, "dur": 1.024, + "args": { + "External id": 84447, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147324, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147324, "pid": 0, "tid": 7, "ts": 6300865959132.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956492.112, "dur": 4.551, + "args": { + "External id": 84447, "cbid": 211, "correlation": 161147324 + } + }, + { + "ph": "s", "id": 161147324, "pid": 5714, "tid": 5714, "ts": 6300865956492.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959134.257, "dur": 1.024, + "args": { + "External id": 84448, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147330, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147330, "pid": 0, "tid": 7, "ts": 6300865959134.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956502.412, "dur": 4.320, + "args": { + "External id": 84448, "cbid": 211, "correlation": 161147330 + } + }, + { + "ph": "s", "id": 161147330, "pid": 5714, "tid": 5714, "ts": 6300865956502.412, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865959136.017, "dur": 3.360, + "args": { + "External id": 84449, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147343, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147343, "pid": 0, "tid": 7, "ts": 6300865959136.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956522.983, "dur": 5.100, + "args": { + "External id": 84449, "cbid": 211, "correlation": 161147343 + } + }, + { + "ph": "s", "id": 161147343, "pid": 5714, "tid": 5714, "ts": 6300865956522.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959139.985, "dur": 1.120, + "args": { + "External id": 84452, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147349, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147349, "pid": 0, "tid": 7, "ts": 6300865959139.985, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956534.223, "dur": 4.220, + "args": { + "External id": 84452, "cbid": 211, "correlation": 161147349 + } + }, + { + "ph": "s", "id": 161147349, "pid": 5714, "tid": 5714, "ts": 6300865956534.223, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865959141.713, "dur": 0.992, + "args": { + "External id": 84453, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147355, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147355, "pid": 0, "tid": 7, "ts": 6300865959141.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956543.403, "dur": 3.779, + "args": { + "External id": 84453, "cbid": 211, "correlation": 161147355 + } + }, + { + "ph": "s", "id": 161147355, "pid": 5714, "tid": 5714, "ts": 6300865956543.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865959143.441, "dur": 234.211, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147369, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161147369, "pid": 0, "tid": 7, "ts": 6300865959143.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956634.352, "dur": 7.940, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147369 + } + }, + { + "ph": "s", "id": 161147369, "pid": 5714, "tid": 5714, "ts": 6300865956634.352, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865956676.372, "dur": 0.570, + "args": { + "External id": 84457, "cbid": 200, "correlation": 161147392 + } + }, + { + "ph": "f", "id": 161147392, "pid": 5714, "tid": 5714, "ts": 6300865956676.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865959378.452, "dur": 0.832, + "args": { + "External id": 84457, "device": 0, "context": 1, "stream": 7, "correlation": 161147395, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161147395, "pid": 0, "tid": 7, "ts": 6300865959378.452, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865956678.792, "dur": 7.910, + "args": { + "External id": 84457, "cbid": 51, "correlation": 161147395 + } + }, + { + "ph": "s", "id": 161147395, "pid": 5714, "tid": 5714, "ts": 6300865956678.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865959380.468, "dur": 688.872, + "args": { + "External id": 84457, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147396, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147396, "pid": 0, "tid": 7, "ts": 6300865959380.468, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956686.952, "dur": 5.950, + "args": { + "External id": 84457, "cbid": 307, "correlation": 161147396 + } + }, + { + "ph": "s", "id": 161147396, "pid": 5714, "tid": 5714, "ts": 6300865956686.952, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865960070.044, "dur": 2.976, + "args": { + "External id": 84460, "device": 0, "context": 1, "stream": 7, "correlation": 161147401, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161147401, "pid": 0, "tid": 7, "ts": 6300865960070.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865956717.852, "dur": 12.110, + "args": { + "External id": 84460, "cbid": 41, "correlation": 161147401 + } + }, + { + "ph": "s", "id": 161147401, "pid": 5714, "tid": 5714, "ts": 6300865956717.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865956770.882, "dur": 0.520, + "args": { + "External id": 84465, "cbid": 200, "correlation": 161147429 + } + }, + { + "ph": "f", "id": 161147429, "pid": 5714, "tid": 5714, "ts": 6300865956770.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865960073.724, "dur": 688.712, + "args": { + "External id": 84465, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161147432, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147432, "pid": 0, "tid": 7, "ts": 6300865960073.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956773.002, "dur": 6.980, + "args": { + "External id": 84465, "cbid": 307, "correlation": 161147432 + } + }, + { + "ph": "s", "id": 161147432, "pid": 5714, "tid": 5714, "ts": 6300865956773.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865960763.108, "dur": 220.354, + "args": { + "External id": 84466, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147437, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161147437, "pid": 0, "tid": 7, "ts": 6300865960763.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956794.072, "dur": 5.870, + "args": { + "External id": 84466, "cbid": 211, "correlation": 161147437 + } + }, + { + "ph": "s", "id": 161147437, "pid": 5714, "tid": 5714, "ts": 6300865956794.072, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865956844.592, "dur": 1.160, + "args": { + "External id": 84474, "cbid": 210, "correlation": 161147463 + } + }, + { + "ph": "f", "id": 161147463, "pid": 5714, "tid": 5714, "ts": 6300865956844.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865960984.102, "dur": 637.992, + "args": { + "External id": 84474, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147464, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147464, "pid": 0, "tid": 7, "ts": 6300865960984.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956849.282, "dur": 7.360, + "args": { + "External id": 84474, "cbid": 211, "correlation": 161147464 + } + }, + { + "ph": "s", "id": 161147464, "pid": 5714, "tid": 5714, "ts": 6300865956849.282, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865961622.798, "dur": 170.946, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147483, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161147483, "pid": 0, "tid": 7, "ts": 6300865961622.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865956961.071, "dur": 8.991, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147483 + } + }, + { + "ph": "s", "id": 161147483, "pid": 5714, "tid": 5714, "ts": 6300865956961.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865961794.384, "dur": 4.032, + "args": { + "External id": 84484, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147500, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147500, "pid": 0, "tid": 7, "ts": 6300865961794.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957019.691, "dur": 7.430, + "args": { + "External id": 84484, "cbid": 211, "correlation": 161147500 + } + }, + { + "ph": "s", "id": 161147500, "pid": 5714, "tid": 5714, "ts": 6300865957019.691, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961799.056, "dur": 1.184, + "args": { + "External id": 84489, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147517, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147517, "pid": 0, "tid": 7, "ts": 6300865961799.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957050.281, "dur": 5.260, + "args": { + "External id": 84489, "cbid": 211, "correlation": 161147517 + } + }, + { + "ph": "s", "id": 161147517, "pid": 5714, "tid": 5714, "ts": 6300865957050.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961800.912, "dur": 1.024, + "args": { + "External id": 84491, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147527, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147527, "pid": 0, "tid": 7, "ts": 6300865961800.912, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957070.861, "dur": 4.780, + "args": { + "External id": 84491, "cbid": 211, "correlation": 161147527 + } + }, + { + "ph": "s", "id": 161147527, "pid": 5714, "tid": 5714, "ts": 6300865957070.861, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961802.640, "dur": 1.056, + "args": { + "External id": 84492, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147533, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147533, "pid": 0, "tid": 7, "ts": 6300865961802.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957082.811, "dur": 4.270, + "args": { + "External id": 84492, "cbid": 211, "correlation": 161147533 + } + }, + { + "ph": "s", "id": 161147533, "pid": 5714, "tid": 5714, "ts": 6300865957082.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961804.368, "dur": 1.024, + "args": { + "External id": 84493, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147543, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147543, "pid": 0, "tid": 7, "ts": 6300865961804.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957097.691, "dur": 4.390, + "args": { + "External id": 84493, "cbid": 211, "correlation": 161147543 + } + }, + { + "ph": "s", "id": 161147543, "pid": 5714, "tid": 5714, "ts": 6300865957097.691, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961806.096, "dur": 1.024, + "args": { + "External id": 84494, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147549, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147549, "pid": 0, "tid": 7, "ts": 6300865961806.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957107.751, "dur": 4.170, + "args": { + "External id": 84494, "cbid": 211, "correlation": 161147549 + } + }, + { + "ph": "s", "id": 161147549, "pid": 5714, "tid": 5714, "ts": 6300865957107.751, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865961807.856, "dur": 3.360, + "args": { + "External id": 84495, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147562, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147562, "pid": 0, "tid": 7, "ts": 6300865961807.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957127.631, "dur": 4.820, + "args": { + "External id": 84495, "cbid": 211, "correlation": 161147562 + } + }, + { + "ph": "s", "id": 161147562, "pid": 5714, "tid": 5714, "ts": 6300865957127.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961811.824, "dur": 1.088, + "args": { + "External id": 84498, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147568, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147568, "pid": 0, "tid": 7, "ts": 6300865961811.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957138.381, "dur": 4.220, + "args": { + "External id": 84498, "cbid": 211, "correlation": 161147568 + } + }, + { + "ph": "s", "id": 161147568, "pid": 5714, "tid": 5714, "ts": 6300865957138.381, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865961813.552, "dur": 1.024, + "args": { + "External id": 84499, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147574, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147574, "pid": 0, "tid": 7, "ts": 6300865961813.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957147.411, "dur": 3.840, + "args": { + "External id": 84499, "cbid": 211, "correlation": 161147574 + } + }, + { + "ph": "s", "id": 161147574, "pid": 5714, "tid": 5714, "ts": 6300865957147.411, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865961815.280, "dur": 233.635, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147588, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161147588, "pid": 0, "tid": 7, "ts": 6300865961815.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957238.901, "dur": 8.000, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147588 + } + }, + { + "ph": "s", "id": 161147588, "pid": 5714, "tid": 5714, "ts": 6300865957238.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865957281.341, "dur": 0.540, + "args": { + "External id": 84503, "cbid": 200, "correlation": 161147611 + } + }, + { + "ph": "f", "id": 161147611, "pid": 5714, "tid": 5714, "ts": 6300865957281.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865962049.811, "dur": 0.800, + "args": { + "External id": 84503, "device": 0, "context": 1, "stream": 7, "correlation": 161147614, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161147614, "pid": 0, "tid": 7, "ts": 6300865962049.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865957283.731, "dur": 6.700, + "args": { + "External id": 84503, "cbid": 51, "correlation": 161147614 + } + }, + { + "ph": "s", "id": 161147614, "pid": 5714, "tid": 5714, "ts": 6300865957283.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865962051.827, "dur": 687.464, + "args": { + "External id": 84503, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147615, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147615, "pid": 0, "tid": 7, "ts": 6300865962051.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957290.671, "dur": 15.630, + "args": { + "External id": 84503, "cbid": 307, "correlation": 161147615 + } + }, + { + "ph": "s", "id": 161147615, "pid": 5714, "tid": 5714, "ts": 6300865957290.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865962740.027, "dur": 3.136, + "args": { + "External id": 84506, "device": 0, "context": 1, "stream": 7, "correlation": 161147620, "bytes": 3145728, "memory bandwidth (GB/s)": 1003.1020408163265 + } + }, + { + "ph": "f", "id": 161147620, "pid": 0, "tid": 7, "ts": 6300865962740.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865957332.501, "dur": 12.969, + "args": { + "External id": 84506, "cbid": 41, "correlation": 161147620 + } + }, + { + "ph": "s", "id": 161147620, "pid": 5714, "tid": 5714, "ts": 6300865957332.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865957387.301, "dur": 0.489, + "args": { + "External id": 84511, "cbid": 200, "correlation": 161147648 + } + }, + { + "ph": "f", "id": 161147648, "pid": 5714, "tid": 5714, "ts": 6300865957387.301, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865962743.803, "dur": 688.136, + "args": { + "External id": 84511, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161147651, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147651, "pid": 0, "tid": 7, "ts": 6300865962743.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957389.341, "dur": 7.269, + "args": { + "External id": 84511, "cbid": 307, "correlation": 161147651 + } + }, + { + "ph": "s", "id": 161147651, "pid": 5714, "tid": 5714, "ts": 6300865957389.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865963432.643, "dur": 220.259, + "args": { + "External id": 84512, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147656, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161147656, "pid": 0, "tid": 7, "ts": 6300865963432.643, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957411.101, "dur": 5.900, + "args": { + "External id": 84512, "cbid": 211, "correlation": 161147656 + } + }, + { + "ph": "s", "id": 161147656, "pid": 5714, "tid": 5714, "ts": 6300865957411.101, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865957460.280, "dur": 1.340, + "args": { + "External id": 84520, "cbid": 210, "correlation": 161147682 + } + }, + { + "ph": "f", "id": 161147682, "pid": 5714, "tid": 5714, "ts": 6300865957460.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865963653.670, "dur": 635.463, + "args": { + "External id": 84520, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147683, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147683, "pid": 0, "tid": 7, "ts": 6300865963653.670, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957465.150, "dur": 7.490, + "args": { + "External id": 84520, "cbid": 211, "correlation": 161147683 + } + }, + { + "ph": "s", "id": 161147683, "pid": 5714, "tid": 5714, "ts": 6300865957465.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865964289.805, "dur": 171.010, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147702, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161147702, "pid": 0, "tid": 7, "ts": 6300865964289.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957577.710, "dur": 8.970, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147702 + } + }, + { + "ph": "s", "id": 161147702, "pid": 5714, "tid": 5714, "ts": 6300865957577.710, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865964461.551, "dur": 4.064, + "args": { + "External id": 84530, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147719, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147719, "pid": 0, "tid": 7, "ts": 6300865964461.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957621.210, "dur": 7.190, + "args": { + "External id": 84530, "cbid": 211, "correlation": 161147719 + } + }, + { + "ph": "s", "id": 161147719, "pid": 5714, "tid": 5714, "ts": 6300865957621.210, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964466.223, "dur": 1.216, + "args": { + "External id": 84535, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147736, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147736, "pid": 0, "tid": 7, "ts": 6300865964466.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957651.860, "dur": 5.450, + "args": { + "External id": 84535, "cbid": 211, "correlation": 161147736 + } + }, + { + "ph": "s", "id": 161147736, "pid": 5714, "tid": 5714, "ts": 6300865957651.860, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964468.111, "dur": 0.992, + "args": { + "External id": 84537, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147746, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147746, "pid": 0, "tid": 7, "ts": 6300865964468.111, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957672.350, "dur": 8.510, + "args": { + "External id": 84537, "cbid": 211, "correlation": 161147746 + } + }, + { + "ph": "s", "id": 161147746, "pid": 5714, "tid": 5714, "ts": 6300865957672.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964469.839, "dur": 1.056, + "args": { + "External id": 84538, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147752, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147752, "pid": 0, "tid": 7, "ts": 6300865964469.839, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957688.120, "dur": 4.490, + "args": { + "External id": 84538, "cbid": 211, "correlation": 161147752 + } + }, + { + "ph": "s", "id": 161147752, "pid": 5714, "tid": 5714, "ts": 6300865957688.120, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964471.535, "dur": 1.056, + "args": { + "External id": 84539, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147762, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147762, "pid": 0, "tid": 7, "ts": 6300865964471.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957703.410, "dur": 4.250, + "args": { + "External id": 84539, "cbid": 211, "correlation": 161147762 + } + }, + { + "ph": "s", "id": 161147762, "pid": 5714, "tid": 5714, "ts": 6300865957703.410, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964473.295, "dur": 1.024, + "args": { + "External id": 84540, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147768, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147768, "pid": 0, "tid": 7, "ts": 6300865964473.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957713.190, "dur": 4.050, + "args": { + "External id": 84540, "cbid": 211, "correlation": 161147768 + } + }, + { + "ph": "s", "id": 161147768, "pid": 5714, "tid": 5714, "ts": 6300865957713.190, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865964475.055, "dur": 3.361, + "args": { + "External id": 84541, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147781, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147781, "pid": 0, "tid": 7, "ts": 6300865964475.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957731.920, "dur": 5.010, + "args": { + "External id": 84541, "cbid": 211, "correlation": 161147781 + } + }, + { + "ph": "s", "id": 161147781, "pid": 5714, "tid": 5714, "ts": 6300865957731.920, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964479.024, "dur": 1.088, + "args": { + "External id": 84544, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147787, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147787, "pid": 0, "tid": 7, "ts": 6300865964479.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957744.280, "dur": 4.250, + "args": { + "External id": 84544, "cbid": 211, "correlation": 161147787 + } + }, + { + "ph": "s", "id": 161147787, "pid": 5714, "tid": 5714, "ts": 6300865957744.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865964480.720, "dur": 1.024, + "args": { + "External id": 84545, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147793, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147793, "pid": 0, "tid": 7, "ts": 6300865964480.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957753.510, "dur": 4.250, + "args": { + "External id": 84545, "cbid": 211, "correlation": 161147793 + } + }, + { + "ph": "s", "id": 161147793, "pid": 5714, "tid": 5714, "ts": 6300865957753.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865964482.448, "dur": 233.154, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147807, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161147807, "pid": 0, "tid": 7, "ts": 6300865964482.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957843.380, "dur": 8.260, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147807 + } + }, + { + "ph": "s", "id": 161147807, "pid": 5714, "tid": 5714, "ts": 6300865957843.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865957885.389, "dur": 0.570, + "args": { + "External id": 84549, "cbid": 200, "correlation": 161147830 + } + }, + { + "ph": "f", "id": 161147830, "pid": 5714, "tid": 5714, "ts": 6300865957885.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865964716.466, "dur": 1.056, + "args": { + "External id": 84549, "device": 0, "context": 1, "stream": 7, "correlation": 161147833, "bytes": 1536, "memory bandwidth (GB/s)": 1.4545454545454546 + } + }, + { + "ph": "f", "id": 161147833, "pid": 0, "tid": 7, "ts": 6300865964716.466, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865957887.789, "dur": 6.890, + "args": { + "External id": 84549, "cbid": 51, "correlation": 161147833 + } + }, + { + "ph": "s", "id": 161147833, "pid": 5714, "tid": 5714, "ts": 6300865957887.789, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865964718.738, "dur": 687.144, + "args": { + "External id": 84549, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147834, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147834, "pid": 0, "tid": 7, "ts": 6300865964718.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957894.929, "dur": 5.930, + "args": { + "External id": 84549, "cbid": 307, "correlation": 161147834 + } + }, + { + "ph": "s", "id": 161147834, "pid": 5714, "tid": 5714, "ts": 6300865957894.929, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865965406.490, "dur": 2.976, + "args": { + "External id": 84552, "device": 0, "context": 1, "stream": 7, "correlation": 161147839, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161147839, "pid": 0, "tid": 7, "ts": 6300865965406.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865957925.599, "dur": 11.960, + "args": { + "External id": 84552, "cbid": 41, "correlation": 161147839 + } + }, + { + "ph": "s", "id": 161147839, "pid": 5714, "tid": 5714, "ts": 6300865957925.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865957977.909, "dur": 0.480, + "args": { + "External id": 84557, "cbid": 200, "correlation": 161147867 + } + }, + { + "ph": "f", "id": 161147867, "pid": 5714, "tid": 5714, "ts": 6300865957977.909, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865965410.074, "dur": 687.209, + "args": { + "External id": 84557, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161147870, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147870, "pid": 0, "tid": 7, "ts": 6300865965410.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865957979.899, "dur": 7.170, + "args": { + "External id": 84557, "cbid": 307, "correlation": 161147870 + } + }, + { + "ph": "s", "id": 161147870, "pid": 5714, "tid": 5714, "ts": 6300865957979.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865966097.987, "dur": 220.866, + "args": { + "External id": 84558, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147875, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161147875, "pid": 0, "tid": 7, "ts": 6300865966097.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958001.009, "dur": 6.080, + "args": { + "External id": 84558, "cbid": 211, "correlation": 161147875 + } + }, + { + "ph": "s", "id": 161147875, "pid": 5714, "tid": 5714, "ts": 6300865958001.009, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865958052.139, "dur": 1.200, + "args": { + "External id": 84566, "cbid": 210, "correlation": 161147901 + } + }, + { + "ph": "f", "id": 161147901, "pid": 5714, "tid": 5714, "ts": 6300865958052.139, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865966319.525, "dur": 634.312, + "args": { + "External id": 84566, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147902, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161147902, "pid": 0, "tid": 7, "ts": 6300865966319.525, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958056.869, "dur": 7.320, + "args": { + "External id": 84566, "cbid": 211, "correlation": 161147902 + } + }, + { + "ph": "s", "id": 161147902, "pid": 5714, "tid": 5714, "ts": 6300865958056.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865966955.085, "dur": 170.657, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147921, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161147921, "pid": 0, "tid": 7, "ts": 6300865966955.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958168.799, "dur": 8.830, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161147921 + } + }, + { + "ph": "s", "id": 161147921, "pid": 5714, "tid": 5714, "ts": 6300865958168.799, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865967126.446, "dur": 4.064, + "args": { + "External id": 84576, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147938, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147938, "pid": 0, "tid": 7, "ts": 6300865967126.446, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958210.599, "dur": 6.960, + "args": { + "External id": 84576, "cbid": 211, "correlation": 161147938 + } + }, + { + "ph": "s", "id": 161147938, "pid": 5714, "tid": 5714, "ts": 6300865958210.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967131.150, "dur": 1.184, + "args": { + "External id": 84581, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147955, "pid": 0, "tid": 7, "ts": 6300865967131.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958241.348, "dur": 5.220, + "args": { + "External id": 84581, "cbid": 211, "correlation": 161147955 + } + }, + { + "ph": "s", "id": 161147955, "pid": 5714, "tid": 5714, "ts": 6300865958241.348, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967133.006, "dur": 1.024, + "args": { + "External id": 84583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147965, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147965, "pid": 0, "tid": 7, "ts": 6300865967133.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958262.379, "dur": 4.780, + "args": { + "External id": 84583, "cbid": 211, "correlation": 161147965 + } + }, + { + "ph": "s", "id": 161147965, "pid": 5714, "tid": 5714, "ts": 6300865958262.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967134.734, "dur": 1.056, + "args": { + "External id": 84584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147971, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147971, "pid": 0, "tid": 7, "ts": 6300865967134.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958274.119, "dur": 4.360, + "args": { + "External id": 84584, "cbid": 211, "correlation": 161147971 + } + }, + { + "ph": "s", "id": 161147971, "pid": 5714, "tid": 5714, "ts": 6300865958274.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967136.462, "dur": 1.024, + "args": { + "External id": 84585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147981, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147981, "pid": 0, "tid": 7, "ts": 6300865967136.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958288.588, "dur": 4.380, + "args": { + "External id": 84585, "cbid": 211, "correlation": 161147981 + } + }, + { + "ph": "s", "id": 161147981, "pid": 5714, "tid": 5714, "ts": 6300865958288.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967138.158, "dur": 1.056, + "args": { + "External id": 84586, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161147987, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161147987, "pid": 0, "tid": 7, "ts": 6300865967138.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958308.388, "dur": 5.010, + "args": { + "External id": 84586, "cbid": 211, "correlation": 161147987 + } + }, + { + "ph": "s", "id": 161147987, "pid": 5714, "tid": 5714, "ts": 6300865958308.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865967139.951, "dur": 3.360, + "args": { + "External id": 84587, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148000, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148000, "pid": 0, "tid": 7, "ts": 6300865967139.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958330.038, "dur": 5.260, + "args": { + "External id": 84587, "cbid": 211, "correlation": 161148000 + } + }, + { + "ph": "s", "id": 161148000, "pid": 5714, "tid": 5714, "ts": 6300865958330.038, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967143.919, "dur": 1.088, + "args": { + "External id": 84590, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148006, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148006, "pid": 0, "tid": 7, "ts": 6300865967143.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958342.748, "dur": 4.380, + "args": { + "External id": 84590, "cbid": 211, "correlation": 161148006 + } + }, + { + "ph": "s", "id": 161148006, "pid": 5714, "tid": 5714, "ts": 6300865958342.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865967145.647, "dur": 1.024, + "args": { + "External id": 84591, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148012, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148012, "pid": 0, "tid": 7, "ts": 6300865967145.647, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958352.148, "dur": 4.090, + "args": { + "External id": 84591, "cbid": 211, "correlation": 161148012 + } + }, + { + "ph": "s", "id": 161148012, "pid": 5714, "tid": 5714, "ts": 6300865958352.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865967147.375, "dur": 233.890, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148026, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161148026, "pid": 0, "tid": 7, "ts": 6300865967147.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958443.158, "dur": 8.150, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148026 + } + }, + { + "ph": "s", "id": 161148026, "pid": 5714, "tid": 5714, "ts": 6300865958443.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865958485.178, "dur": 0.560, + "args": { + "External id": 84595, "cbid": 200, "correlation": 161148049 + } + }, + { + "ph": "f", "id": 161148049, "pid": 5714, "tid": 5714, "ts": 6300865958485.178, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865967382.065, "dur": 0.800, + "args": { + "External id": 84595, "device": 0, "context": 1, "stream": 7, "correlation": 161148052, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161148052, "pid": 0, "tid": 7, "ts": 6300865967382.065, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865958487.568, "dur": 6.650, + "args": { + "External id": 84595, "cbid": 51, "correlation": 161148052 + } + }, + { + "ph": "s", "id": 161148052, "pid": 5714, "tid": 5714, "ts": 6300865958487.568, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865967384.081, "dur": 687.593, + "args": { + "External id": 84595, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148053, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148053, "pid": 0, "tid": 7, "ts": 6300865967384.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958494.438, "dur": 5.800, + "args": { + "External id": 84595, "cbid": 307, "correlation": 161148053 + } + }, + { + "ph": "s", "id": 161148053, "pid": 5714, "tid": 5714, "ts": 6300865958494.438, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865968072.378, "dur": 2.976, + "args": { + "External id": 84598, "device": 0, "context": 1, "stream": 7, "correlation": 161148058, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161148058, "pid": 0, "tid": 7, "ts": 6300865968072.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865958524.858, "dur": 13.120, + "args": { + "External id": 84598, "cbid": 41, "correlation": 161148058 + } + }, + { + "ph": "s", "id": 161148058, "pid": 5714, "tid": 5714, "ts": 6300865958524.858, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865958579.808, "dur": 0.500, + "args": { + "External id": 84603, "cbid": 200, "correlation": 161148086 + } + }, + { + "ph": "f", "id": 161148086, "pid": 5714, "tid": 5714, "ts": 6300865958579.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865968075.994, "dur": 692.200, + "args": { + "External id": 84603, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161148089, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148089, "pid": 0, "tid": 7, "ts": 6300865968075.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958581.788, "dur": 6.950, + "args": { + "External id": 84603, "cbid": 307, "correlation": 161148089 + } + }, + { + "ph": "s", "id": 161148089, "pid": 5714, "tid": 5714, "ts": 6300865958581.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865968768.866, "dur": 220.930, + "args": { + "External id": 84604, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148094, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148094, "pid": 0, "tid": 7, "ts": 6300865968768.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958604.118, "dur": 5.810, + "args": { + "External id": 84604, "cbid": 211, "correlation": 161148094 + } + }, + { + "ph": "s", "id": 161148094, "pid": 5714, "tid": 5714, "ts": 6300865958604.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865958654.378, "dur": 1.169, + "args": { + "External id": 84612, "cbid": 210, "correlation": 161148120 + } + }, + { + "ph": "f", "id": 161148120, "pid": 5714, "tid": 5714, "ts": 6300865958654.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865968990.468, "dur": 635.336, + "args": { + "External id": 84612, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148121, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148121, "pid": 0, "tid": 7, "ts": 6300865968990.468, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958659.067, "dur": 7.431, + "args": { + "External id": 84612, "cbid": 211, "correlation": 161148121 + } + }, + { + "ph": "s", "id": 161148121, "pid": 5714, "tid": 5714, "ts": 6300865958659.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865969626.540, "dur": 170.850, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148140, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161148140, "pid": 0, "tid": 7, "ts": 6300865969626.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958771.907, "dur": 8.930, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148140 + } + }, + { + "ph": "s", "id": 161148140, "pid": 5714, "tid": 5714, "ts": 6300865958771.907, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865969797.998, "dur": 4.096, + "args": { + "External id": 84622, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148157, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148157, "pid": 0, "tid": 7, "ts": 6300865969797.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958815.557, "dur": 7.030, + "args": { + "External id": 84622, "cbid": 211, "correlation": 161148157 + } + }, + { + "ph": "s", "id": 161148157, "pid": 5714, "tid": 5714, "ts": 6300865958815.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969802.830, "dur": 1.184, + "args": { + "External id": 84627, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148174, "pid": 0, "tid": 7, "ts": 6300865969802.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958847.997, "dur": 5.340, + "args": { + "External id": 84627, "cbid": 211, "correlation": 161148174 + } + }, + { + "ph": "s", "id": 161148174, "pid": 5714, "tid": 5714, "ts": 6300865958847.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969804.686, "dur": 1.024, + "args": { + "External id": 84629, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148184, "pid": 0, "tid": 7, "ts": 6300865969804.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958867.257, "dur": 4.900, + "args": { + "External id": 84629, "cbid": 211, "correlation": 161148184 + } + }, + { + "ph": "s", "id": 161148184, "pid": 5714, "tid": 5714, "ts": 6300865958867.257, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969806.414, "dur": 1.056, + "args": { + "External id": 84630, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148190, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148190, "pid": 0, "tid": 7, "ts": 6300865969806.414, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958879.277, "dur": 4.610, + "args": { + "External id": 84630, "cbid": 211, "correlation": 161148190 + } + }, + { + "ph": "s", "id": 161148190, "pid": 5714, "tid": 5714, "ts": 6300865958879.277, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969808.142, "dur": 1.024, + "args": { + "External id": 84631, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148200, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148200, "pid": 0, "tid": 7, "ts": 6300865969808.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958894.767, "dur": 4.610, + "args": { + "External id": 84631, "cbid": 211, "correlation": 161148200 + } + }, + { + "ph": "s", "id": 161148200, "pid": 5714, "tid": 5714, "ts": 6300865958894.767, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969809.870, "dur": 1.024, + "args": { + "External id": 84632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148206, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148206, "pid": 0, "tid": 7, "ts": 6300865969809.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958904.837, "dur": 4.310, + "args": { + "External id": 84632, "cbid": 211, "correlation": 161148206 + } + }, + { + "ph": "s", "id": 161148206, "pid": 5714, "tid": 5714, "ts": 6300865958904.837, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865969811.630, "dur": 3.360, + "args": { + "External id": 84633, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148219, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148219, "pid": 0, "tid": 7, "ts": 6300865969811.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958927.127, "dur": 5.010, + "args": { + "External id": 84633, "cbid": 211, "correlation": 161148219 + } + }, + { + "ph": "s", "id": 161148219, "pid": 5714, "tid": 5714, "ts": 6300865958927.127, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969815.598, "dur": 1.056, + "args": { + "External id": 84636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148225, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148225, "pid": 0, "tid": 7, "ts": 6300865969815.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958938.557, "dur": 4.010, + "args": { + "External id": 84636, "cbid": 211, "correlation": 161148225 + } + }, + { + "ph": "s", "id": 161148225, "pid": 5714, "tid": 5714, "ts": 6300865958938.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865969817.294, "dur": 1.024, + "args": { + "External id": 84637, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148231, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148231, "pid": 0, "tid": 7, "ts": 6300865969817.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865958947.597, "dur": 3.920, + "args": { + "External id": 84637, "cbid": 211, "correlation": 161148231 + } + }, + { + "ph": "s", "id": 161148231, "pid": 5714, "tid": 5714, "ts": 6300865958947.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865969819.054, "dur": 233.059, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148245, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161148245, "pid": 0, "tid": 7, "ts": 6300865969819.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959035.847, "dur": 8.200, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148245 + } + }, + { + "ph": "s", "id": 161148245, "pid": 5714, "tid": 5714, "ts": 6300865959035.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865959078.187, "dur": 0.570, + "args": { + "External id": 84641, "cbid": 200, "correlation": 161148268 + } + }, + { + "ph": "f", "id": 161148268, "pid": 5714, "tid": 5714, "ts": 6300865959078.187, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865970052.977, "dur": 0.832, + "args": { + "External id": 84641, "device": 0, "context": 1, "stream": 7, "correlation": 161148271, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161148271, "pid": 0, "tid": 7, "ts": 6300865970052.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865959080.597, "dur": 7.329, + "args": { + "External id": 84641, "cbid": 51, "correlation": 161148271 + } + }, + { + "ph": "s", "id": 161148271, "pid": 5714, "tid": 5714, "ts": 6300865959080.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865970054.993, "dur": 686.696, + "args": { + "External id": 84641, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148272, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148272, "pid": 0, "tid": 7, "ts": 6300865970054.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959088.166, "dur": 5.800, + "args": { + "External id": 84641, "cbid": 307, "correlation": 161148272 + } + }, + { + "ph": "s", "id": 161148272, "pid": 5714, "tid": 5714, "ts": 6300865959088.166, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865970742.425, "dur": 2.912, + "args": { + "External id": 84644, "device": 0, "context": 1, "stream": 7, "correlation": 161148277, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 161148277, "pid": 0, "tid": 7, "ts": 6300865970742.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865959117.786, "dur": 12.211, + "args": { + "External id": 84644, "cbid": 41, "correlation": 161148277 + } + }, + { + "ph": "s", "id": 161148277, "pid": 5714, "tid": 5714, "ts": 6300865959117.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865959170.746, "dur": 0.491, + "args": { + "External id": 84649, "cbid": 200, "correlation": 161148305 + } + }, + { + "ph": "f", "id": 161148305, "pid": 5714, "tid": 5714, "ts": 6300865959170.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865970746.009, "dur": 688.328, + "args": { + "External id": 84649, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161148308, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148308, "pid": 0, "tid": 7, "ts": 6300865970746.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959172.877, "dur": 7.029, + "args": { + "External id": 84649, "cbid": 307, "correlation": 161148308 + } + }, + { + "ph": "s", "id": 161148308, "pid": 5714, "tid": 5714, "ts": 6300865959172.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865971435.009, "dur": 220.387, + "args": { + "External id": 84650, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148313, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148313, "pid": 0, "tid": 7, "ts": 6300865971435.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959194.316, "dur": 5.890, + "args": { + "External id": 84650, "cbid": 211, "correlation": 161148313 + } + }, + { + "ph": "s", "id": 161148313, "pid": 5714, "tid": 5714, "ts": 6300865959194.316, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865959245.746, "dur": 1.180, + "args": { + "External id": 84658, "cbid": 210, "correlation": 161148339 + } + }, + { + "ph": "f", "id": 161148339, "pid": 5714, "tid": 5714, "ts": 6300865959245.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865971656.132, "dur": 635.239, + "args": { + "External id": 84658, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148340, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148340, "pid": 0, "tid": 7, "ts": 6300865971656.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959250.386, "dur": 7.610, + "args": { + "External id": 84658, "cbid": 211, "correlation": 161148340 + } + }, + { + "ph": "s", "id": 161148340, "pid": 5714, "tid": 5714, "ts": 6300865959250.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865972292.427, "dur": 170.850, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148359, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161148359, "pid": 0, "tid": 7, "ts": 6300865972292.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959371.266, "dur": 9.470, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148359 + } + }, + { + "ph": "s", "id": 161148359, "pid": 5714, "tid": 5714, "ts": 6300865959371.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865972463.949, "dur": 4.032, + "args": { + "External id": 84668, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148376, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148376, "pid": 0, "tid": 7, "ts": 6300865972463.949, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959417.376, "dur": 7.450, + "args": { + "External id": 84668, "cbid": 211, "correlation": 161148376 + } + }, + { + "ph": "s", "id": 161148376, "pid": 5714, "tid": 5714, "ts": 6300865959417.376, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972468.621, "dur": 1.184, + "args": { + "External id": 84673, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148393, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148393, "pid": 0, "tid": 7, "ts": 6300865972468.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959447.716, "dur": 5.740, + "args": { + "External id": 84673, "cbid": 211, "correlation": 161148393 + } + }, + { + "ph": "s", "id": 161148393, "pid": 5714, "tid": 5714, "ts": 6300865959447.716, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972470.509, "dur": 1.024, + "args": { + "External id": 84675, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148403, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148403, "pid": 0, "tid": 7, "ts": 6300865972470.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959467.166, "dur": 4.850, + "args": { + "External id": 84675, "cbid": 211, "correlation": 161148403 + } + }, + { + "ph": "s", "id": 161148403, "pid": 5714, "tid": 5714, "ts": 6300865959467.166, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972472.205, "dur": 1.056, + "args": { + "External id": 84676, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148409, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148409, "pid": 0, "tid": 7, "ts": 6300865972472.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959479.536, "dur": 4.310, + "args": { + "External id": 84676, "cbid": 211, "correlation": 161148409 + } + }, + { + "ph": "s", "id": 161148409, "pid": 5714, "tid": 5714, "ts": 6300865959479.536, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972473.965, "dur": 1.024, + "args": { + "External id": 84677, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148419, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148419, "pid": 0, "tid": 7, "ts": 6300865972473.965, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959494.116, "dur": 4.220, + "args": { + "External id": 84677, "cbid": 211, "correlation": 161148419 + } + }, + { + "ph": "s", "id": 161148419, "pid": 5714, "tid": 5714, "ts": 6300865959494.116, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972475.661, "dur": 1.056, + "args": { + "External id": 84678, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148425, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148425, "pid": 0, "tid": 7, "ts": 6300865972475.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959504.086, "dur": 4.040, + "args": { + "External id": 84678, "cbid": 211, "correlation": 161148425 + } + }, + { + "ph": "s", "id": 161148425, "pid": 5714, "tid": 5714, "ts": 6300865959504.086, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865972477.453, "dur": 3.584, + "args": { + "External id": 84679, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148438, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148438, "pid": 0, "tid": 7, "ts": 6300865972477.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959524.176, "dur": 5.140, + "args": { + "External id": 84679, "cbid": 211, "correlation": 161148438 + } + }, + { + "ph": "s", "id": 161148438, "pid": 5714, "tid": 5714, "ts": 6300865959524.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972481.677, "dur": 1.088, + "args": { + "External id": 84682, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148444, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148444, "pid": 0, "tid": 7, "ts": 6300865972481.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959535.285, "dur": 4.511, + "args": { + "External id": 84682, "cbid": 211, "correlation": 161148444 + } + }, + { + "ph": "s", "id": 161148444, "pid": 5714, "tid": 5714, "ts": 6300865959535.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865972483.405, "dur": 0.992, + "args": { + "External id": 84683, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148450, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148450, "pid": 0, "tid": 7, "ts": 6300865972483.405, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959544.696, "dur": 3.889, + "args": { + "External id": 84683, "cbid": 211, "correlation": 161148450 + } + }, + { + "ph": "s", "id": 161148450, "pid": 5714, "tid": 5714, "ts": 6300865959544.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865972485.133, "dur": 233.891, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148464, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161148464, "pid": 0, "tid": 7, "ts": 6300865972485.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959635.465, "dur": 7.920, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148464 + } + }, + { + "ph": "s", "id": 161148464, "pid": 5714, "tid": 5714, "ts": 6300865959635.465, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865959676.735, "dur": 0.580, + "args": { + "External id": 84687, "cbid": 200, "correlation": 161148487 + } + }, + { + "ph": "f", "id": 161148487, "pid": 5714, "tid": 5714, "ts": 6300865959676.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865972719.888, "dur": 0.800, + "args": { + "External id": 84687, "device": 0, "context": 1, "stream": 7, "correlation": 161148490, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161148490, "pid": 0, "tid": 7, "ts": 6300865972719.888, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865959679.175, "dur": 6.680, + "args": { + "External id": 84687, "cbid": 51, "correlation": 161148490 + } + }, + { + "ph": "s", "id": 161148490, "pid": 5714, "tid": 5714, "ts": 6300865959679.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865972722.160, "dur": 688.424, + "args": { + "External id": 84687, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148491, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148491, "pid": 0, "tid": 7, "ts": 6300865972722.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959686.095, "dur": 6.110, + "args": { + "External id": 84687, "cbid": 307, "correlation": 161148491 + } + }, + { + "ph": "s", "id": 161148491, "pid": 5714, "tid": 5714, "ts": 6300865959686.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865973411.256, "dur": 2.944, + "args": { + "External id": 84690, "device": 0, "context": 1, "stream": 7, "correlation": 161148496, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161148496, "pid": 0, "tid": 7, "ts": 6300865973411.256, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865959716.935, "dur": 12.300, + "args": { + "External id": 84690, "cbid": 41, "correlation": 161148496 + } + }, + { + "ph": "s", "id": 161148496, "pid": 5714, "tid": 5714, "ts": 6300865959716.935, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865959770.965, "dur": 0.480, + "args": { + "External id": 84695, "cbid": 200, "correlation": 161148524 + } + }, + { + "ph": "f", "id": 161148524, "pid": 5714, "tid": 5714, "ts": 6300865959770.965, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865973414.840, "dur": 686.856, + "args": { + "External id": 84695, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161148527, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148527, "pid": 0, "tid": 7, "ts": 6300865973414.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959773.025, "dur": 6.970, + "args": { + "External id": 84695, "cbid": 307, "correlation": 161148527 + } + }, + { + "ph": "s", "id": 161148527, "pid": 5714, "tid": 5714, "ts": 6300865959773.025, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865974102.400, "dur": 221.027, + "args": { + "External id": 84696, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148532, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148532, "pid": 0, "tid": 7, "ts": 6300865974102.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959794.125, "dur": 5.920, + "args": { + "External id": 84696, "cbid": 211, "correlation": 161148532 + } + }, + { + "ph": "s", "id": 161148532, "pid": 5714, "tid": 5714, "ts": 6300865959794.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865959844.075, "dur": 1.210, + "args": { + "External id": 84704, "cbid": 210, "correlation": 161148558 + } + }, + { + "ph": "f", "id": 161148558, "pid": 5714, "tid": 5714, "ts": 6300865959844.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865974324.067, "dur": 636.647, + "args": { + "External id": 84704, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148559, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148559, "pid": 0, "tid": 7, "ts": 6300865974324.067, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959848.805, "dur": 7.260, + "args": { + "External id": 84704, "cbid": 211, "correlation": 161148559 + } + }, + { + "ph": "s", "id": 161148559, "pid": 5714, "tid": 5714, "ts": 6300865959848.805, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865974961.418, "dur": 170.979, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148578, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161148578, "pid": 0, "tid": 7, "ts": 6300865974961.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865959961.395, "dur": 8.829, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148578 + } + }, + { + "ph": "s", "id": 161148578, "pid": 5714, "tid": 5714, "ts": 6300865959961.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865975133.133, "dur": 4.256, + "args": { + "External id": 84714, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148595, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148595, "pid": 0, "tid": 7, "ts": 6300865975133.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960002.624, "dur": 7.071, + "args": { + "External id": 84714, "cbid": 211, "correlation": 161148595 + } + }, + { + "ph": "s", "id": 161148595, "pid": 5714, "tid": 5714, "ts": 6300865960002.624, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975138.093, "dur": 1.184, + "args": { + "External id": 84719, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148612, "pid": 0, "tid": 7, "ts": 6300865975138.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960034.855, "dur": 5.460, + "args": { + "External id": 84719, "cbid": 211, "correlation": 161148612 + } + }, + { + "ph": "s", "id": 161148612, "pid": 5714, "tid": 5714, "ts": 6300865960034.855, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975139.949, "dur": 1.024, + "args": { + "External id": 84721, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148622, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148622, "pid": 0, "tid": 7, "ts": 6300865975139.949, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960053.975, "dur": 5.020, + "args": { + "External id": 84721, "cbid": 211, "correlation": 161148622 + } + }, + { + "ph": "s", "id": 161148622, "pid": 5714, "tid": 5714, "ts": 6300865960053.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975141.677, "dur": 1.056, + "args": { + "External id": 84722, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148628, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148628, "pid": 0, "tid": 7, "ts": 6300865975141.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960066.114, "dur": 4.340, + "args": { + "External id": 84722, "cbid": 211, "correlation": 161148628 + } + }, + { + "ph": "s", "id": 161148628, "pid": 5714, "tid": 5714, "ts": 6300865960066.114, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975143.405, "dur": 1.024, + "args": { + "External id": 84723, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148638, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148638, "pid": 0, "tid": 7, "ts": 6300865975143.405, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960081.114, "dur": 4.400, + "args": { + "External id": 84723, "cbid": 211, "correlation": 161148638 + } + }, + { + "ph": "s", "id": 161148638, "pid": 5714, "tid": 5714, "ts": 6300865960081.114, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975145.133, "dur": 1.024, + "args": { + "External id": 84724, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148644, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148644, "pid": 0, "tid": 7, "ts": 6300865975145.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960091.244, "dur": 4.120, + "args": { + "External id": 84724, "cbid": 211, "correlation": 161148644 + } + }, + { + "ph": "s", "id": 161148644, "pid": 5714, "tid": 5714, "ts": 6300865960091.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865975146.893, "dur": 3.392, + "args": { + "External id": 84725, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148657, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148657, "pid": 0, "tid": 7, "ts": 6300865975146.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960109.934, "dur": 4.980, + "args": { + "External id": 84725, "cbid": 211, "correlation": 161148657 + } + }, + { + "ph": "s", "id": 161148657, "pid": 5714, "tid": 5714, "ts": 6300865960109.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975151.021, "dur": 1.088, + "args": { + "External id": 84728, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148663, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148663, "pid": 0, "tid": 7, "ts": 6300865975151.021, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960122.424, "dur": 4.370, + "args": { + "External id": 84728, "cbid": 211, "correlation": 161148663 + } + }, + { + "ph": "s", "id": 161148663, "pid": 5714, "tid": 5714, "ts": 6300865960122.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865975152.717, "dur": 1.024, + "args": { + "External id": 84729, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148669, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148669, "pid": 0, "tid": 7, "ts": 6300865975152.717, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960131.524, "dur": 3.860, + "args": { + "External id": 84729, "cbid": 211, "correlation": 161148669 + } + }, + { + "ph": "s", "id": 161148669, "pid": 5714, "tid": 5714, "ts": 6300865960131.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865975154.477, "dur": 233.539, + "args": { + "External id": 84345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148683, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161148683, "pid": 0, "tid": 7, "ts": 6300865975154.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960220.334, "dur": 7.860, + "args": { + "External id": 84345, "cbid": 307, "correlation": 161148683 + } + }, + { + "ph": "s", "id": 161148683, "pid": 5714, "tid": 5714, "ts": 6300865960220.334, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865960261.554, "dur": 0.540, + "args": { + "External id": 84733, "cbid": 200, "correlation": 161148706 + } + }, + { + "ph": "f", "id": 161148706, "pid": 5714, "tid": 5714, "ts": 6300865960261.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865975389.008, "dur": 0.960, + "args": { + "External id": 84733, "device": 0, "context": 1, "stream": 7, "correlation": 161148709, "bytes": 1536, "memory bandwidth (GB/s)": 1.6 + } + }, + { + "ph": "f", "id": 161148709, "pid": 0, "tid": 7, "ts": 6300865975389.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865960263.904, "dur": 7.160, + "args": { + "External id": 84733, "cbid": 51, "correlation": 161148709 + } + }, + { + "ph": "s", "id": 161148709, "pid": 5714, "tid": 5714, "ts": 6300865960263.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865975390.736, "dur": 689.223, + "args": { + "External id": 84733, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148710, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148710, "pid": 0, "tid": 7, "ts": 6300865975390.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960271.304, "dur": 5.880, + "args": { + "External id": 84733, "cbid": 307, "correlation": 161148710 + } + }, + { + "ph": "s", "id": 161148710, "pid": 5714, "tid": 5714, "ts": 6300865960271.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865976080.632, "dur": 3.008, + "args": { + "External id": 84736, "device": 0, "context": 1, "stream": 7, "correlation": 161148715, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161148715, "pid": 0, "tid": 7, "ts": 6300865976080.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865960309.944, "dur": 13.870, + "args": { + "External id": 84736, "cbid": 41, "correlation": 161148715 + } + }, + { + "ph": "s", "id": 161148715, "pid": 5714, "tid": 5714, "ts": 6300865960309.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865960367.134, "dur": 0.480, + "args": { + "External id": 84741, "cbid": 200, "correlation": 161148743 + } + }, + { + "ph": "f", "id": 161148743, "pid": 5714, "tid": 5714, "ts": 6300865960367.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865976084.248, "dur": 689.032, + "args": { + "External id": 84741, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161148746, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148746, "pid": 0, "tid": 7, "ts": 6300865976084.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960369.124, "dur": 7.170, + "args": { + "External id": 84741, "cbid": 307, "correlation": 161148746 + } + }, + { + "ph": "s", "id": 161148746, "pid": 5714, "tid": 5714, "ts": 6300865960369.124, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865976773.984, "dur": 220.866, + "args": { + "External id": 84742, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148751, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148751, "pid": 0, "tid": 7, "ts": 6300865976773.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960390.594, "dur": 6.060, + "args": { + "External id": 84742, "cbid": 211, "correlation": 161148751 + } + }, + { + "ph": "s", "id": 161148751, "pid": 5714, "tid": 5714, "ts": 6300865960390.594, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865976995.490, "dur": 5.473, + "args": { + "External id": 84744, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148764, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148764, "pid": 0, "tid": 7, "ts": 6300865976995.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960419.434, "dur": 6.160, + "args": { + "External id": 84744, "cbid": 211, "correlation": 161148764 + } + }, + { + "ph": "s", "id": 161148764, "pid": 5714, "tid": 5714, "ts": 6300865960419.434, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977001.571, "dur": 159.266, + "args": { + "External id": 84749, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148777, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148777, "pid": 0, "tid": 7, "ts": 6300865977001.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960448.863, "dur": 6.071, + "args": { + "External id": 84749, "cbid": 211, "correlation": 161148777 + } + }, + { + "ph": "s", "id": 161148777, "pid": 5714, "tid": 5714, "ts": 6300865960448.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977161.445, "dur": 1.536, + "args": { + "External id": 84754, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148785, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148785, "pid": 0, "tid": 7, "ts": 6300865977161.445, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960512.503, "dur": 7.360, + "args": { + "External id": 84754, "cbid": 211, "correlation": 161148785 + } + }, + { + "ph": "s", "id": 161148785, "pid": 5714, "tid": 5714, "ts": 6300865960512.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977163.589, "dur": 1.312, + "args": { + "External id": 84755, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148791, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148791, "pid": 0, "tid": 7, "ts": 6300865977163.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960531.803, "dur": 4.920, + "args": { + "External id": 84755, "cbid": 211, "correlation": 161148791 + } + }, + { + "ph": "s", "id": 161148791, "pid": 5714, "tid": 5714, "ts": 6300865960531.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865977165.637, "dur": 2.176, + "args": { + "External id": 84774, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148811, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 161148811, "pid": 0, "tid": 7, "ts": 6300865977165.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960616.403, "dur": 8.590, + "args": { + "External id": 84774, "cbid": 211, "correlation": 161148811 + } + }, + { + "ph": "s", "id": 161148811, "pid": 5714, "tid": 5714, "ts": 6300865960616.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865977168.517, "dur": 59.264, + "args": { + "External id": 84782, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148829, "pid": 0, "tid": 7, "ts": 6300865977168.517, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960717.243, "dur": 9.570, + "args": { + "External id": 84782, "cbid": 211, "correlation": 161148829 + } + }, + { + "ph": "s", "id": 161148829, "pid": 5714, "tid": 5714, "ts": 6300865960717.243, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977228.421, "dur": 15.136, + "args": { + "External id": 84787, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148846, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148846, "pid": 0, "tid": 7, "ts": 6300865977228.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960760.023, "dur": 6.520, + "args": { + "External id": 84787, "cbid": 211, "correlation": 161148846 + } + }, + { + "ph": "s", "id": 161148846, "pid": 5714, "tid": 5714, "ts": 6300865960760.023, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977244.197, "dur": 99.905, + "args": { + "External id": 84792, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148862, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161148862, "pid": 0, "tid": 7, "ts": 6300865977244.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960785.863, "dur": 5.070, + "args": { + "External id": 84792, "cbid": 211, "correlation": 161148862 + } + }, + { + "ph": "s", "id": 161148862, "pid": 5714, "tid": 5714, "ts": 6300865960785.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977344.870, "dur": 1.920, + "args": { + "External id": 84796, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148878, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161148878, "pid": 0, "tid": 7, "ts": 6300865977344.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960809.363, "dur": 4.960, + "args": { + "External id": 84796, "cbid": 211, "correlation": 161148878 + } + }, + { + "ph": "s", "id": 161148878, "pid": 5714, "tid": 5714, "ts": 6300865960809.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865977347.462, "dur": 1.697, + "args": { + "External id": 84797, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148890, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161148890, "pid": 0, "tid": 7, "ts": 6300865977347.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960833.473, "dur": 5.390, + "args": { + "External id": 84797, "cbid": 211, "correlation": 161148890 + } + }, + { + "ph": "s", "id": 161148890, "pid": 5714, "tid": 5714, "ts": 6300865960833.473, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865977349.767, "dur": 2.080, + "args": { + "External id": 84804, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148908, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161148908, "pid": 0, "tid": 7, "ts": 6300865977349.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960868.253, "dur": 6.349, + "args": { + "External id": 84804, "cbid": 211, "correlation": 161148908 + } + }, + { + "ph": "s", "id": 161148908, "pid": 5714, "tid": 5714, "ts": 6300865960868.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6300865977352.519, "dur": 3.872, + "args": { + "External id": 84799, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148917, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148917, "pid": 0, "tid": 7, "ts": 6300865977352.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865960881.513, "dur": 4.400, + "args": { + "External id": 84799, "cbid": 211, "correlation": 161148917 + } + }, + { + "ph": "s", "id": 161148917, "pid": 5714, "tid": 5714, "ts": 6300865960881.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300865977360.647, "dur": 0.992, + "args": { + "External id": 84806, "device": 0, "context": 1, "stream": 7, "correlation": 161148923, "bytes": 8, "memory bandwidth (GB/s)": 0.008064516129032258 + } + }, + { + "ph": "f", "id": 161148923, "pid": 0, "tid": 7, "ts": 6300865977360.647, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865960899.033, "dur": 10.289, + "args": { + "External id": 84806, "cbid": 41, "correlation": 161148923 + } + }, + { + "ph": "s", "id": 161148923, "pid": 5714, "tid": 5714, "ts": 6300865960899.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300865960909.673, "dur": 16455.252, + "args": { + "External id": 84806, "cbid": 131, "correlation": 161148924 + } + }, + { + "ph": "s", "id": 161148924, "pid": 5714, "tid": 5714, "ts": 6300865960909.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865977419.925, "dur": 1.600, + "args": { + "External id": 84814, "cbid": 210, "correlation": 161148949 + } + }, + { + "ph": "f", "id": 161148949, "pid": 5714, "tid": 5714, "ts": 6300865977419.925, + "cat": "ac2g", "name": "ac2g", "bp": 
"e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865977435.975, "dur": 636.552, + "args": { + "External id": 84814, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148950, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161148950, "pid": 0, "tid": 7, "ts": 6300865977435.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977425.205, "dur": 10.130, + "args": { + "External id": 84814, "cbid": 211, "correlation": 161148950 + } + }, + { + "ph": "s", "id": 161148950, "pid": 5714, "tid": 5714, "ts": 6300865977425.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865978073.167, "dur": 171.426, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148969, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161148969, "pid": 0, "tid": 7, "ts": 6300865978073.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977545.755, "dur": 8.930, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161148969 + } + }, + { + "ph": "s", "id": 161148969, "pid": 5714, "tid": 5714, "ts": 6300865977545.755, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865978245.297, "dur": 4.096, + "args": { + "External id": 84824, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161148986, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161148986, "pid": 0, "tid": 7, "ts": 6300865978245.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977591.175, "dur": 7.360, + "args": { + "External id": 84824, "cbid": 211, "correlation": 161148986 + } + }, + { + "ph": "s", "id": 161148986, "pid": 5714, "tid": 5714, "ts": 6300865977591.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978250.001, "dur": 1.216, + "args": { + "External id": 84829, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149003, "pid": 0, "tid": 7, "ts": 6300865978250.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977626.915, "dur": 5.980, + "args": { + "External id": 84829, "cbid": 211, "correlation": 161149003 + } + }, + { + "ph": "s", "id": 161149003, "pid": 5714, "tid": 5714, "ts": 6300865977626.915, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978251.889, "dur": 1.024, + "args": { + "External id": 84831, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149013, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149013, "pid": 0, "tid": 7, "ts": 6300865978251.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977646.895, "dur": 5.100, + "args": { + "External id": 84831, "cbid": 211, "correlation": 161149013 + } + }, + { + "ph": "s", "id": 161149013, "pid": 5714, "tid": 5714, "ts": 6300865977646.895, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978253.585, "dur": 1.056, + "args": { + "External id": 84832, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149019, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149019, "pid": 0, "tid": 7, "ts": 6300865978253.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977659.275, "dur": 4.340, + "args": { + "External id": 84832, "cbid": 211, "correlation": 161149019 + } + }, + { + "ph": "s", "id": 161149019, "pid": 5714, "tid": 5714, "ts": 6300865977659.275, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978255.345, "dur": 1.056, + "args": { + "External id": 84833, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149029, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149029, "pid": 0, "tid": 7, "ts": 6300865978255.345, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977674.104, "dur": 4.600, + "args": { + "External id": 84833, "cbid": 211, "correlation": 161149029 + } + }, + { + "ph": "s", "id": 161149029, "pid": 5714, "tid": 5714, "ts": 6300865977674.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978257.105, "dur": 1.056, + "args": { + "External id": 84834, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149035, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149035, "pid": 0, "tid": 7, "ts": 6300865978257.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977685.894, "dur": 4.180, + "args": { + "External id": 84834, "cbid": 211, "correlation": 161149035 + } + }, + { + "ph": "s", "id": 161149035, "pid": 5714, "tid": 5714, "ts": 6300865977685.894, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865978258.769, "dur": 3.360, + "args": { + "External id": 84835, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149048, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149048, "pid": 0, "tid": 7, "ts": 6300865978258.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977706.314, "dur": 4.910, + "args": { + "External id": 84835, "cbid": 211, "correlation": 161149048 + } + }, + { + "ph": "s", "id": 161149048, "pid": 5714, "tid": 5714, "ts": 6300865977706.314, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978262.801, "dur": 1.120, + "args": { + "External id": 84838, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149054, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149054, "pid": 0, "tid": 7, "ts": 6300865978262.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977717.694, "dur": 4.590, + "args": { + "External id": 84838, "cbid": 211, "correlation": 161149054 + } + }, + { + "ph": "s", "id": 161149054, "pid": 5714, "tid": 5714, "ts": 6300865977717.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865978264.529, "dur": 1.024, + "args": { + "External id": 84839, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149060, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149060, "pid": 0, "tid": 7, "ts": 6300865978264.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977727.244, "dur": 3.860, + "args": { + "External id": 84839, "cbid": 211, "correlation": 161149060 + } + }, + { + "ph": "s", "id": 161149060, "pid": 5714, "tid": 5714, "ts": 6300865977727.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865978266.225, "dur": 235.971, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149074, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161149074, "pid": 0, "tid": 7, "ts": 6300865978266.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977817.874, "dur": 8.110, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149074 + } + }, + { + "ph": "s", "id": 161149074, "pid": 5714, "tid": 5714, "ts": 6300865977817.874, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865977859.814, "dur": 0.540, + "args": { + "External id": 84843, "cbid": 200, "correlation": 161149097 + } + }, + { + "ph": "f", "id": 161149097, "pid": 5714, "tid": 5714, "ts": 6300865977859.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865978503.060, "dur": 0.832, + "args": { + "External id": 84843, "device": 0, "context": 1, "stream": 7, "correlation": 161149100, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161149100, "pid": 0, "tid": 7, "ts": 6300865978503.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865977862.204, "dur": 6.760, + "args": { + "External id": 84843, "cbid": 51, "correlation": 161149100 + } + }, + { + "ph": "s", "id": 161149100, "pid": 5714, "tid": 5714, "ts": 6300865977862.204, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865978505.076, "dur": 689.256, + "args": { + "External id": 84843, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149101, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149101, "pid": 0, "tid": 7, "ts": 6300865978505.076, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977869.234, "dur": 5.770, + "args": { + "External id": 84843, "cbid": 307, "correlation": 161149101 + } + }, + { + "ph": "s", "id": 161149101, "pid": 5714, "tid": 5714, "ts": 6300865977869.234, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865979195.004, "dur": 3.040, + "args": { + "External id": 84846, "device": 0, "context": 1, "stream": 7, "correlation": 161149106, "bytes": 3145728, "memory bandwidth (GB/s)": 1034.778947368421 + } + }, + { + "ph": "f", "id": 161149106, "pid": 0, "tid": 7, "ts": 6300865979195.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865977900.424, "dur": 13.310, + "args": { + "External id": 84846, "cbid": 41, "correlation": 161149106 + } + }, + { + "ph": "s", "id": 161149106, "pid": 5714, "tid": 5714, "ts": 6300865977900.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865977955.304, "dur": 0.490, + "args": { + "External id": 84851, "cbid": 200, "correlation": 161149134 + } + }, + { + "ph": "f", "id": 161149134, "pid": 5714, "tid": 5714, "ts": 6300865977955.304, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865979198.652, "dur": 698.153, + "args": { + "External id": 84851, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161149137, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149137, "pid": 0, "tid": 7, "ts": 6300865979198.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977957.294, "dur": 7.280, + "args": { + "External id": 84851, "cbid": 307, "correlation": 161149137 + } + }, + { + "ph": "s", "id": 161149137, "pid": 5714, "tid": 5714, "ts": 6300865977957.294, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865979897.477, "dur": 221.090, + "args": { + "External id": 84852, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149142, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161149142, "pid": 0, "tid": 7, "ts": 6300865979897.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865977978.894, "dur": 5.980, + "args": { + "External id": 84852, "cbid": 211, "correlation": 161149142 + } + }, + { + "ph": "s", "id": 161149142, "pid": 5714, "tid": 5714, "ts": 6300865977978.894, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865978032.474, "dur": 1.249, + "args": { + "External id": 84860, "cbid": 210, "correlation": 161149168 + } + }, + { + "ph": "f", "id": 161149168, "pid": 5714, "tid": 5714, "ts": 6300865978032.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865980119.239, "dur": 643.784, + "args": { + "External id": 84860, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149169, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149169, "pid": 0, "tid": 7, "ts": 6300865980119.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978037.294, "dur": 7.669, + "args": { + "External id": 84860, "cbid": 211, "correlation": 161149169 + } + }, + { + "ph": "s", "id": 161149169, "pid": 5714, "tid": 5714, "ts": 6300865978037.294, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865980763.631, "dur": 170.754, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149188, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161149188, "pid": 0, "tid": 7, "ts": 6300865980763.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978153.063, "dur": 8.670, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149188 + } + }, + { + "ph": "s", "id": 161149188, "pid": 5714, "tid": 5714, "ts": 6300865978153.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865980935.089, "dur": 4.096, + "args": { + "External id": 84870, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149205, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149205, "pid": 0, "tid": 7, "ts": 6300865980935.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978195.893, "dur": 7.140, + "args": { + "External id": 84870, "cbid": 211, "correlation": 161149205 + } + }, + { + "ph": "s", "id": 161149205, "pid": 5714, "tid": 5714, "ts": 6300865978195.893, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980939.793, "dur": 1.216, + "args": { + "External id": 84875, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149222, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149222, "pid": 0, "tid": 7, "ts": 6300865980939.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978227.853, "dur": 5.460, + "args": { + "External id": 84875, "cbid": 211, "correlation": 161149222 + } + }, + { + "ph": "s", "id": 161149222, "pid": 5714, "tid": 5714, "ts": 6300865978227.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980941.681, "dur": 1.024, + "args": { + "External id": 84877, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149232, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149232, "pid": 0, "tid": 7, "ts": 6300865980941.681, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978247.893, "dur": 5.380, + "args": { + "External id": 84877, "cbid": 211, "correlation": 161149232 + } + }, + { + "ph": "s", "id": 161149232, "pid": 5714, "tid": 5714, "ts": 6300865978247.893, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980943.409, "dur": 1.088, + "args": { + "External id": 84878, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149238, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149238, "pid": 0, "tid": 7, "ts": 6300865980943.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978260.503, "dur": 4.610, + "args": { + "External id": 84878, "cbid": 211, "correlation": 161149238 + } + }, + { + "ph": "s", "id": 161149238, "pid": 5714, "tid": 5714, "ts": 6300865978260.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980945.169, "dur": 1.056, + "args": { + "External id": 84879, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149248, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149248, "pid": 0, "tid": 7, "ts": 6300865980945.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978276.033, "dur": 4.520, + "args": { + "External id": 84879, "cbid": 211, "correlation": 161149248 + } + }, + { + "ph": "s", "id": 161149248, "pid": 5714, "tid": 5714, "ts": 6300865978276.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980946.929, "dur": 1.024, + "args": { + "External id": 84880, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149254, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149254, "pid": 0, "tid": 7, "ts": 6300865980946.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978286.193, "dur": 4.300, + "args": { + "External id": 84880, "cbid": 211, "correlation": 161149254 + } + }, + { + "ph": "s", "id": 161149254, "pid": 5714, "tid": 5714, "ts": 6300865978286.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865980948.593, "dur": 3.392, + "args": { + "External id": 84881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149267, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149267, "pid": 0, "tid": 7, "ts": 6300865980948.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978316.043, "dur": 5.960, + "args": { + "External id": 84881, "cbid": 211, "correlation": 161149267 + } + }, + { + "ph": "s", "id": 161149267, "pid": 5714, "tid": 5714, "ts": 6300865978316.043, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980952.625, "dur": 1.088, + "args": { + "External id": 84884, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149273, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149273, "pid": 0, "tid": 7, "ts": 6300865980952.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978329.093, "dur": 4.400, + "args": { + "External id": 84884, "cbid": 211, "correlation": 161149273 + } + }, + { + "ph": "s", "id": 161149273, "pid": 5714, "tid": 5714, "ts": 6300865978329.093, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865980954.385, "dur": 0.992, + "args": { + "External id": 84885, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149279, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149279, "pid": 0, "tid": 7, "ts": 6300865980954.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978338.553, "dur": 4.040, + "args": { + "External id": 84885, "cbid": 211, "correlation": 161149279 + } + }, + { + "ph": "s", "id": 161149279, "pid": 5714, "tid": 5714, "ts": 6300865978338.553, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865980956.113, "dur": 235.555, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149293, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161149293, "pid": 0, "tid": 7, "ts": 6300865980956.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978430.873, "dur": 8.220, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149293 + } + }, + { + "ph": "s", "id": 161149293, "pid": 5714, "tid": 5714, "ts": 6300865978430.873, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865978472.982, "dur": 0.540, + "args": { + "External id": 84889, "cbid": 200, "correlation": 161149316 + } + }, + { + "ph": "f", "id": 161149316, "pid": 5714, "tid": 5714, "ts": 6300865978472.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865981192.596, "dur": 1.184, + "args": { + "External id": 84889, "device": 0, "context": 1, "stream": 7, "correlation": 161149319, "bytes": 1536, "memory bandwidth (GB/s)": 1.2972972972972974 + } + }, + { + "ph": "f", "id": 161149319, "pid": 0, "tid": 7, "ts": 6300865981192.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865978475.293, "dur": 7.880, + "args": { + "External id": 84889, "cbid": 51, "correlation": 161149319 + } + }, + { + "ph": "s", "id": 161149319, "pid": 5714, "tid": 5714, "ts": 6300865978475.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865981194.964, "dur": 688.840, + "args": { + "External id": 84889, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149320, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149320, "pid": 0, "tid": 7, "ts": 6300865981194.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978483.433, "dur": 5.809, + "args": { + "External id": 84889, "cbid": 307, "correlation": 161149320 + } + }, + { + "ph": "s", "id": 161149320, "pid": 5714, "tid": 5714, "ts": 6300865978483.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865981884.540, "dur": 2.944, + "args": { + "External id": 84892, "device": 0, "context": 1, "stream": 7, "correlation": 161149325, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161149325, "pid": 0, "tid": 7, "ts": 6300865981884.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865978513.882, "dur": 11.960, + "args": { + "External id": 84892, "cbid": 41, "correlation": 161149325 + } + }, + { + "ph": "s", "id": 161149325, "pid": 5714, "tid": 5714, "ts": 6300865978513.882, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865978567.202, "dur": 0.470, + "args": { + "External id": 84897, "cbid": 200, "correlation": 161149353 + } + }, + { + "ph": "f", "id": 161149353, "pid": 5714, "tid": 5714, "ts": 6300865978567.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865981888.156, "dur": 688.168, + "args": { + "External id": 84897, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161149356, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149356, "pid": 0, "tid": 7, "ts": 6300865981888.156, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978569.182, "dur": 7.050, + "args": { + "External id": 84897, "cbid": 307, "correlation": 161149356 + } + }, + { + "ph": "s", "id": 161149356, "pid": 5714, "tid": 5714, "ts": 6300865978569.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865982577.028, "dur": 222.594, + "args": { + "External id": 84898, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149361, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161149361, "pid": 0, "tid": 7, "ts": 6300865982577.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978590.482, "dur": 5.900, + "args": { + "External id": 84898, "cbid": 211, "correlation": 161149361 + } + }, + { + "ph": "s", "id": 161149361, "pid": 5714, "tid": 5714, "ts": 6300865978590.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865978641.952, "dur": 1.200, + "args": { + "External id": 84906, "cbid": 210, "correlation": 161149387 + } + }, + { + "ph": "f", "id": 161149387, "pid": 5714, "tid": 5714, "ts": 6300865978641.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865982800.358, "dur": 641.576, + "args": { + "External id": 84906, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149388, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149388, "pid": 0, "tid": 7, "ts": 6300865982800.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978646.672, "dur": 7.580, + "args": { + "External id": 84906, "cbid": 211, "correlation": 161149388 + } + }, + { + "ph": "s", "id": 161149388, "pid": 5714, "tid": 5714, "ts": 6300865978646.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865983442.670, "dur": 171.362, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149407, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161149407, "pid": 0, "tid": 7, "ts": 6300865983442.670, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978757.912, "dur": 8.740, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149407 + } + }, + { + "ph": "s", "id": 161149407, "pid": 5714, "tid": 5714, "ts": 6300865978757.912, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865983614.736, "dur": 4.128, + "args": { + "External id": 84916, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149424, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149424, "pid": 0, "tid": 7, "ts": 6300865983614.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978801.612, "dur": 7.200, + "args": { + "External id": 84916, "cbid": 211, "correlation": 161149424 + } + }, + { + "ph": "s", "id": 161149424, "pid": 5714, "tid": 5714, "ts": 6300865978801.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983619.504, "dur": 1.216, + "args": { + "External id": 84921, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149441, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149441, "pid": 0, "tid": 7, "ts": 6300865983619.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978832.762, "dur": 5.360, + "args": { + "External id": 84921, "cbid": 211, "correlation": 161149441 + } + }, + { + "ph": "s", "id": 161149441, "pid": 5714, "tid": 5714, "ts": 6300865978832.762, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983621.392, "dur": 1.024, + "args": { + "External id": 84923, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149451, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149451, "pid": 0, "tid": 7, "ts": 6300865983621.392, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978852.432, "dur": 5.040, + "args": { + "External id": 84923, "cbid": 211, "correlation": 161149451 + } + }, + { + "ph": "s", "id": 161149451, "pid": 5714, "tid": 5714, "ts": 6300865978852.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983623.120, "dur": 1.088, + "args": { + "External id": 84924, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149457, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149457, "pid": 0, "tid": 7, "ts": 6300865983623.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978864.802, "dur": 4.410, + "args": { + "External id": 84924, "cbid": 211, "correlation": 161149457 + } + }, + { + "ph": "s", "id": 161149457, "pid": 5714, "tid": 5714, "ts": 6300865978864.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983624.880, "dur": 1.025, + "args": { + "External id": 84925, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149467, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149467, "pid": 0, "tid": 7, "ts": 6300865983624.880, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978879.322, "dur": 4.250, + "args": { + "External id": 84925, "cbid": 211, "correlation": 161149467 + } + }, + { + "ph": "s", "id": 161149467, "pid": 5714, "tid": 5714, "ts": 6300865978879.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983626.641, "dur": 1.056, + "args": { + "External id": 84926, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149473, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149473, "pid": 0, "tid": 7, "ts": 6300865983626.641, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978889.282, "dur": 4.530, + "args": { + "External id": 84926, "cbid": 211, "correlation": 161149473 + } + }, + { + "ph": "s", "id": 161149473, "pid": 5714, "tid": 5714, "ts": 6300865978889.282, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865983628.433, "dur": 3.392, + "args": { + "External id": 84927, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149486, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149486, "pid": 0, "tid": 7, "ts": 6300865983628.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978909.892, "dur": 5.020, + "args": { + "External id": 84927, "cbid": 211, "correlation": 161149486 + } + }, + { + "ph": "s", "id": 161149486, "pid": 5714, "tid": 5714, "ts": 6300865978909.892, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983632.464, "dur": 1.120, + "args": { + "External id": 84930, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149492, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149492, "pid": 0, "tid": 7, "ts": 6300865983632.464, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978921.052, "dur": 4.309, + "args": { + "External id": 84930, "cbid": 211, "correlation": 161149492 + } + }, + { + "ph": "s", "id": 161149492, "pid": 5714, "tid": 5714, "ts": 6300865978921.052, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865983634.224, "dur": 1.024, + "args": { + "External id": 84931, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149498, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149498, "pid": 0, "tid": 7, "ts": 6300865983634.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865978930.312, "dur": 4.009, + "args": { + "External id": 84931, "cbid": 211, "correlation": 161149498 + } + }, + { + "ph": "s", "id": 161149498, "pid": 5714, "tid": 5714, "ts": 6300865978930.312, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865983635.952, "dur": 236.515, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149512, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161149512, "pid": 0, "tid": 7, "ts": 6300865983635.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979018.811, "dur": 8.100, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149512 + } + }, + { + "ph": "s", "id": 161149512, "pid": 5714, "tid": 5714, "ts": 6300865979018.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865979060.631, "dur": 0.560, + "args": { + "External id": 84935, "cbid": 200, "correlation": 161149535 + } + }, + { + "ph": "f", "id": 161149535, "pid": 5714, "tid": 5714, "ts": 6300865979060.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865983873.363, "dur": 0.800, + "args": { + "External id": 84935, "device": 0, "context": 1, "stream": 7, "correlation": 161149538, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161149538, "pid": 0, "tid": 7, "ts": 6300865983873.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865979063.061, "dur": 7.080, + "args": { + "External id": 84935, "cbid": 51, "correlation": 161149538 + } + }, + { + "ph": "s", "id": 161149538, "pid": 5714, "tid": 5714, "ts": 6300865979063.061, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865983875.379, "dur": 693.000, + "args": { + "External id": 84935, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149539, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149539, "pid": 0, "tid": 7, "ts": 6300865983875.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979070.361, "dur": 6.220, + "args": { + "External id": 84935, "cbid": 307, "correlation": 161149539 + } + }, + { + "ph": "s", "id": 161149539, "pid": 5714, "tid": 5714, "ts": 6300865979070.361, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865984568.987, "dur": 3.008, + "args": { + "External id": 84938, "device": 0, "context": 1, "stream": 7, "correlation": 161149544, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161149544, "pid": 0, "tid": 7, "ts": 6300865984568.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865979102.621, "dur": 13.120, + "args": { + "External id": 84938, "cbid": 41, "correlation": 161149544 + } + }, + { + "ph": "s", "id": 161149544, "pid": 5714, "tid": 5714, "ts": 6300865979102.621, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865979157.521, "dur": 0.500, + "args": { + "External id": 84943, "cbid": 200, "correlation": 161149572 + } + }, + { + "ph": "f", "id": 161149572, "pid": 5714, "tid": 5714, "ts": 6300865979157.521, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865984572.635, "dur": 688.360, + "args": { + "External id": 84943, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161149575, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149575, "pid": 0, "tid": 7, "ts": 6300865984572.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979159.551, "dur": 7.180, + "args": { + "External id": 84943, "cbid": 307, "correlation": 161149575 + } + }, + { + "ph": "s", "id": 161149575, "pid": 5714, "tid": 5714, "ts": 6300865979159.551, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865985261.763, "dur": 221.187, + "args": { + "External id": 84944, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149580, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161149580, "pid": 0, "tid": 7, "ts": 6300865985261.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979181.491, "dur": 6.250, + "args": { + "External id": 84944, "cbid": 211, "correlation": 161149580 + } + }, + { + "ph": "s", "id": 161149580, "pid": 5714, "tid": 5714, "ts": 6300865979181.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865979231.421, "dur": 1.170, + "args": { + "External id": 84952, "cbid": 210, "correlation": 161149606 + } + }, + { + "ph": "f", "id": 161149606, "pid": 5714, "tid": 5714, "ts": 6300865979231.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865985483.686, "dur": 643.655, + "args": { + "External id": 84952, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149607, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149607, "pid": 0, "tid": 7, "ts": 6300865985483.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979236.101, "dur": 7.200, + "args": { + "External id": 84952, "cbid": 211, "correlation": 161149607 + } + }, + { + "ph": "s", "id": 161149607, "pid": 5714, "tid": 5714, "ts": 6300865979236.101, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865986128.077, "dur": 170.627, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149626, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161149626, "pid": 0, "tid": 7, "ts": 6300865986128.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979355.971, "dur": 9.160, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149626 + } + }, + { + "ph": "s", "id": 161149626, "pid": 5714, "tid": 5714, "ts": 6300865979355.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865986299.344, "dur": 4.096, + "args": { + "External id": 84962, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149643, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149643, "pid": 0, "tid": 7, "ts": 6300865986299.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979400.680, "dur": 7.080, + "args": { + "External id": 84962, "cbid": 211, "correlation": 161149643 + } + }, + { + "ph": "s", "id": 161149643, "pid": 5714, "tid": 5714, "ts": 6300865979400.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986304.080, "dur": 1.408, + "args": { + "External id": 84967, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149660, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149660, "pid": 0, "tid": 7, "ts": 6300865986304.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979431.610, "dur": 5.550, + "args": { + "External id": 84967, "cbid": 211, "correlation": 161149660 + } + }, + { + "ph": "s", "id": 161149660, "pid": 5714, "tid": 5714, "ts": 6300865979431.610, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986306.128, "dur": 1.024, + "args": { + "External id": 84969, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149670, "pid": 0, "tid": 7, "ts": 6300865986306.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979450.810, "dur": 5.060, + "args": { + "External id": 84969, "cbid": 211, "correlation": 161149670 + } + }, + { + "ph": "s", "id": 161149670, "pid": 5714, "tid": 5714, "ts": 6300865979450.810, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986307.856, "dur": 1.056, + "args": { + "External id": 84970, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149676, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149676, "pid": 0, "tid": 7, "ts": 6300865986307.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979463.160, "dur": 4.380, + "args": { + "External id": 84970, "cbid": 211, "correlation": 161149676 + } + }, + { + "ph": "s", "id": 161149676, "pid": 5714, "tid": 5714, "ts": 6300865979463.160, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986309.616, "dur": 1.056, + "args": { + "External id": 84971, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149686, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149686, "pid": 0, "tid": 7, "ts": 6300865986309.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979477.840, "dur": 4.310, + "args": { + "External id": 84971, "cbid": 211, "correlation": 161149686 + } + }, + { + "ph": "s", "id": 161149686, "pid": 5714, "tid": 5714, "ts": 6300865979477.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986311.344, "dur": 1.056, + "args": { + "External id": 84972, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149692, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149692, "pid": 0, "tid": 7, "ts": 6300865986311.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979489.200, "dur": 4.250, + "args": { + "External id": 84972, "cbid": 211, "correlation": 161149692 + } + }, + { + "ph": "s", "id": 161149692, "pid": 5714, "tid": 5714, "ts": 6300865979489.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865986313.040, "dur": 3.392, + "args": { + "External id": 84973, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149705, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149705, "pid": 0, "tid": 7, "ts": 6300865986313.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979508.470, "dur": 5.110, + "args": { + "External id": 84973, "cbid": 211, "correlation": 161149705 + } + }, + { + "ph": "s", "id": 161149705, "pid": 5714, "tid": 5714, "ts": 6300865979508.470, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986317.072, "dur": 1.088, + "args": { + "External id": 84976, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149711, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149711, "pid": 0, "tid": 7, "ts": 6300865986317.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979519.700, "dur": 4.680, + "args": { + "External id": 84976, "cbid": 211, "correlation": 161149711 + } + }, + { + "ph": "s", "id": 161149711, "pid": 5714, "tid": 5714, "ts": 6300865979519.700, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865986318.800, "dur": 1.024, + "args": { + "External id": 84977, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149717, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149717, "pid": 0, "tid": 7, "ts": 6300865986318.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979529.220, "dur": 3.800, + "args": { + "External id": 84977, "cbid": 211, "correlation": 161149717 + } + }, + { + "ph": "s", "id": 161149717, "pid": 5714, "tid": 5714, "ts": 6300865979529.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865986320.560, "dur": 234.787, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149731, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161149731, "pid": 0, "tid": 7, "ts": 6300865986320.560, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979619.700, "dur": 8.030, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149731 + } + }, + { + "ph": "s", "id": 161149731, "pid": 5714, "tid": 5714, "ts": 6300865979619.700, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865979660.670, "dur": 0.570, + "args": { + "External id": 84981, "cbid": 200, "correlation": 161149754 + } + }, + { + "ph": "f", "id": 161149754, "pid": 5714, "tid": 5714, "ts": 6300865979660.670, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865986556.243, "dur": 0.832, + "args": { + "External id": 84981, "device": 0, "context": 1, "stream": 7, "correlation": 161149757, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161149757, "pid": 0, "tid": 7, "ts": 6300865986556.243, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865979663.030, "dur": 6.930, + "args": { + "External id": 84981, "cbid": 51, "correlation": 161149757 + } + }, + { + "ph": "s", "id": 161149757, "pid": 5714, "tid": 5714, "ts": 6300865979663.030, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865986558.259, "dur": 690.216, + "args": { + "External id": 84981, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149758, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149758, "pid": 0, "tid": 7, "ts": 6300865986558.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979670.210, "dur": 5.710, + "args": { + "External id": 84981, "cbid": 307, "correlation": 161149758 + } + }, + { + "ph": "s", "id": 161149758, "pid": 5714, "tid": 5714, "ts": 6300865979670.210, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865987249.083, "dur": 2.944, + "args": { + "External id": 84984, "device": 0, "context": 1, "stream": 7, "correlation": 161149763, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161149763, "pid": 0, "tid": 7, "ts": 6300865987249.083, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865979699.400, "dur": 12.040, + "args": { + "External id": 84984, "cbid": 41, "correlation": 161149763 + } + }, + { + "ph": "s", "id": 161149763, "pid": 5714, "tid": 5714, "ts": 6300865979699.400, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865979754.680, "dur": 0.480, + "args": { + "External id": 84989, "cbid": 200, "correlation": 161149791 + } + }, + { + "ph": "f", "id": 161149791, "pid": 5714, "tid": 5714, "ts": 6300865979754.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865987252.731, "dur": 687.528, + "args": { + "External id": 84989, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161149794, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149794, "pid": 0, "tid": 7, "ts": 6300865987252.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979756.640, "dur": 7.250, + "args": { + "External id": 84989, "cbid": 307, "correlation": 161149794 + } + }, + { + "ph": "s", "id": 161149794, "pid": 5714, "tid": 5714, "ts": 6300865979756.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865987940.899, "dur": 221.091, + "args": { + "External id": 84990, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149799, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161149799, "pid": 0, "tid": 7, "ts": 6300865987940.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979777.659, "dur": 6.020, + "args": { + "External id": 84990, "cbid": 211, "correlation": 161149799 + } + }, + { + "ph": "s", "id": 161149799, "pid": 5714, "tid": 5714, "ts": 6300865979777.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865979839.970, "dur": 1.249, + "args": { + "External id": 84998, "cbid": 210, "correlation": 161149825 + } + }, + { + "ph": "f", "id": 161149825, "pid": 5714, "tid": 5714, "ts": 6300865979839.970, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865988162.630, "dur": 642.919, + "args": { + "External id": 84998, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149826, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149826, "pid": 0, "tid": 7, "ts": 6300865988162.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979844.679, "dur": 7.831, + "args": { + "External id": 84998, "cbid": 211, "correlation": 161149826 + } + }, + { + "ph": "s", "id": 161149826, "pid": 5714, "tid": 5714, "ts": 6300865979844.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865988806.253, "dur": 171.362, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149845, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161149845, "pid": 0, "tid": 7, "ts": 6300865988806.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865979956.439, "dur": 8.950, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149845 + } + }, + { + "ph": "s", "id": 161149845, "pid": 5714, "tid": 5714, "ts": 6300865979956.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865988978.351, "dur": 4.032, + "args": { + "External id": 85008, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149862, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149862, "pid": 0, "tid": 7, "ts": 6300865988978.351, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980000.459, "dur": 7.160, + "args": { + "External id": 85008, "cbid": 211, "correlation": 161149862 + } + }, + { + "ph": "s", "id": 161149862, "pid": 5714, "tid": 5714, "ts": 6300865980000.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988983.087, "dur": 1.216, + "args": { + "External id": 85013, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149879, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149879, "pid": 0, "tid": 7, "ts": 6300865988983.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980030.299, "dur": 5.320, + "args": { + "External id": 85013, "cbid": 211, "correlation": 161149879 + } + }, + { + "ph": "s", "id": 161149879, "pid": 5714, "tid": 5714, "ts": 6300865980030.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988984.975, "dur": 1.024, + "args": { + "External id": 85015, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149889, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149889, "pid": 0, "tid": 7, "ts": 6300865988984.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980050.559, "dur": 4.780, + "args": { + "External id": 85015, "cbid": 211, "correlation": 161149889 + } + }, + { + "ph": "s", "id": 161149889, "pid": 5714, "tid": 5714, "ts": 6300865980050.559, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988986.703, "dur": 1.088, + "args": { + "External id": 85016, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149895, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149895, "pid": 0, "tid": 7, "ts": 6300865988986.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980062.429, "dur": 4.440, + "args": { + "External id": 85016, "cbid": 211, "correlation": 161149895 + } + }, + { + "ph": "s", "id": 161149895, "pid": 5714, "tid": 5714, "ts": 6300865980062.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988988.463, "dur": 1.056, + "args": { + "External id": 85017, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149905, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149905, "pid": 0, "tid": 7, "ts": 6300865988988.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980077.209, "dur": 4.360, + "args": { + "External id": 85017, "cbid": 211, "correlation": 161149905 + } + }, + { + "ph": "s", "id": 161149905, "pid": 5714, "tid": 5714, "ts": 6300865980077.209, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988990.223, "dur": 1.056, + "args": { + "External id": 85018, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149911, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149911, "pid": 0, "tid": 7, "ts": 6300865988990.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980088.479, "dur": 4.100, + "args": { + "External id": 85018, "cbid": 211, "correlation": 161149911 + } + }, + { + "ph": "s", "id": 161149911, "pid": 5714, "tid": 5714, "ts": 6300865980088.479, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865988992.015, "dur": 3.392, + "args": { + "External id": 85019, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149924, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149924, "pid": 0, "tid": 7, "ts": 6300865988992.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980107.689, "dur": 4.970, + "args": { + "External id": 85019, "cbid": 211, "correlation": 161149924 + } + }, + { + "ph": "s", "id": 161149924, "pid": 5714, "tid": 5714, "ts": 6300865980107.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988996.047, "dur": 1.120, + "args": { + "External id": 85022, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149930, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149930, "pid": 0, "tid": 7, "ts": 6300865988996.047, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980118.829, "dur": 4.180, + "args": { + "External id": 85022, "cbid": 211, "correlation": 161149930 + } + }, + { + "ph": "s", "id": 161149930, "pid": 5714, "tid": 5714, "ts": 6300865980118.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865988997.807, "dur": 1.024, + "args": { + "External id": 85023, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149936, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161149936, "pid": 0, "tid": 7, "ts": 6300865988997.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980127.959, "dur": 3.930, + "args": { + "External id": 85023, "cbid": 211, "correlation": 161149936 + } + }, + { + "ph": "s", "id": 161149936, "pid": 5714, "tid": 5714, "ts": 6300865980127.959, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865988999.535, "dur": 235.587, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149950, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161149950, "pid": 0, "tid": 7, "ts": 6300865988999.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980217.758, "dur": 8.060, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161149950 + } + }, + { + "ph": "s", "id": 161149950, "pid": 5714, "tid": 5714, "ts": 6300865980217.758, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865980259.058, "dur": 0.560, + "args": { + "External id": 85027, "cbid": 200, "correlation": 161149973 + } + }, + { + "ph": "f", "id": 161149973, "pid": 5714, "tid": 5714, "ts": 6300865980259.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865989235.986, "dur": 0.928, + "args": { + "External id": 85027, "device": 0, "context": 1, "stream": 7, "correlation": 161149976, "bytes": 1536, "memory bandwidth (GB/s)": 1.6551724137931034 + } + }, + { + "ph": "f", "id": 161149976, "pid": 0, "tid": 7, "ts": 6300865989235.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865980261.478, "dur": 7.011, + "args": { + "External id": 85027, "cbid": 51, "correlation": 161149976 + } + }, + { + "ph": "s", "id": 161149976, "pid": 5714, "tid": 5714, "ts": 6300865980261.478, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865989238.098, "dur": 689.704, + "args": { + "External id": 85027, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161149977, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161149977, "pid": 0, "tid": 7, "ts": 6300865989238.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980268.718, "dur": 5.731, + "args": { + "External id": 85027, "cbid": 307, "correlation": 161149977 + } + }, + { + "ph": "s", "id": 161149977, "pid": 5714, "tid": 5714, "ts": 6300865980268.718, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865989928.538, "dur": 3.008, + "args": { + "External id": 85030, "device": 0, "context": 1, "stream": 7, "correlation": 161149982, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161149982, "pid": 0, "tid": 7, "ts": 6300865989928.538, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865980307.018, "dur": 12.330, + "args": { + "External id": 85030, "cbid": 41, "correlation": 161149982 + } + }, + { + "ph": "s", "id": 161149982, "pid": 5714, "tid": 5714, "ts": 6300865980307.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865980363.128, "dur": 0.500, + "args": { + "External id": 85035, "cbid": 200, "correlation": 161150010 + } + }, + { + "ph": "f", "id": 161150010, "pid": 5714, "tid": 5714, "ts": 6300865980363.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865989932.154, "dur": 691.752, + "args": { + "External id": 85035, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161150013, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150013, "pid": 0, "tid": 7, "ts": 6300865989932.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980365.138, "dur": 7.210, + "args": { + "External id": 85035, "cbid": 307, "correlation": 161150013 + } + }, + { + "ph": "s", "id": 161150013, "pid": 5714, "tid": 5714, "ts": 6300865980365.138, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865990624.642, "dur": 220.579, + "args": { + "External id": 85036, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150018, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150018, "pid": 0, "tid": 7, "ts": 6300865990624.642, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980386.918, "dur": 6.030, + "args": { + "External id": 85036, "cbid": 211, "correlation": 161150018 + } + }, + { + "ph": "s", "id": 161150018, "pid": 5714, "tid": 5714, "ts": 6300865980386.918, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865980437.858, "dur": 1.140, + "args": { + "External id": 85044, "cbid": 210, "correlation": 161150044 + } + }, + { + "ph": "f", "id": 161150044, "pid": 5714, "tid": 5714, "ts": 6300865980437.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865990845.925, "dur": 642.375, + "args": { + "External id": 85044, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150045, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150045, "pid": 0, "tid": 7, "ts": 6300865990845.925, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980442.488, "dur": 7.150, + "args": { + "External id": 85044, "cbid": 211, "correlation": 161150045 + } + }, + { + "ph": "s", "id": 161150045, "pid": 5714, "tid": 5714, "ts": 6300865980442.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865991489.037, "dur": 170.882, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150064, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161150064, "pid": 0, "tid": 7, "ts": 6300865991489.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980554.768, "dur": 8.790, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161150064 + } + }, + { + "ph": "s", "id": 161150064, "pid": 5714, "tid": 5714, "ts": 6300865980554.768, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865991660.623, "dur": 4.096, + "args": { + "External id": 85054, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150081, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150081, "pid": 0, "tid": 7, "ts": 6300865991660.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980597.068, "dur": 7.110, + "args": { + "External id": 85054, "cbid": 211, "correlation": 161150081 + } + }, + { + "ph": "s", "id": 161150081, "pid": 5714, "tid": 5714, "ts": 6300865980597.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991665.359, "dur": 1.184, + "args": { + "External id": 85059, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150098, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150098, "pid": 0, "tid": 7, "ts": 6300865991665.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980629.538, "dur": 5.770, + "args": { + "External id": 85059, "cbid": 211, "correlation": 161150098 + } + }, + { + "ph": "s", "id": 161150098, "pid": 5714, "tid": 5714, "ts": 6300865980629.538, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991667.247, "dur": 1.024, + "args": { + "External id": 85061, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150108, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150108, "pid": 0, "tid": 7, "ts": 6300865991667.247, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980648.948, "dur": 4.820, + "args": { + "External id": 85061, "cbid": 211, "correlation": 161150108 + } + }, + { + "ph": "s", "id": 161150108, "pid": 5714, "tid": 5714, "ts": 6300865980648.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991668.975, "dur": 1.056, + "args": { + "External id": 85062, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150114, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150114, "pid": 0, "tid": 7, "ts": 6300865991668.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980660.828, "dur": 4.269, + "args": { + "External id": 85062, "cbid": 211, "correlation": 161150114 + } + }, + { + "ph": "s", "id": 161150114, "pid": 5714, "tid": 5714, "ts": 6300865980660.828, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991670.735, "dur": 1.056, + "args": { + "External id": 85063, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150124, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150124, "pid": 0, "tid": 7, "ts": 6300865991670.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980675.197, "dur": 4.220, + "args": { + "External id": 85063, "cbid": 211, "correlation": 161150124 + } + }, + { + "ph": "s", "id": 161150124, "pid": 5714, "tid": 5714, "ts": 6300865980675.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991672.463, "dur": 1.056, + "args": { + "External id": 85064, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150130, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150130, "pid": 0, "tid": 7, "ts": 6300865991672.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980684.848, "dur": 4.220, + "args": { + "External id": 85064, "cbid": 211, "correlation": 161150130 + } + }, + { + "ph": "s", "id": 161150130, "pid": 5714, "tid": 5714, "ts": 6300865980684.848, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865991674.159, "dur": 3.328, + "args": { + "External id": 85065, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150143, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150143, "pid": 0, "tid": 7, "ts": 6300865991674.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980705.537, "dur": 5.191, + "args": { + "External id": 85065, "cbid": 211, "correlation": 161150143 + } + }, + { + "ph": "s", "id": 161150143, "pid": 5714, "tid": 5714, "ts": 6300865980705.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991678.191, "dur": 1.088, + "args": { + "External id": 85068, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150149, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150149, "pid": 0, "tid": 7, "ts": 6300865991678.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980716.797, "dur": 4.151, + "args": { + "External id": 85068, "cbid": 211, "correlation": 161150149 + } + }, + { + "ph": "s", "id": 161150149, "pid": 5714, "tid": 5714, "ts": 6300865980716.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865991679.919, "dur": 1.024, + "args": { + "External id": 85069, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150155, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150155, "pid": 0, "tid": 7, "ts": 6300865991679.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980725.717, "dur": 3.960, + "args": { + "External id": 85069, "cbid": 211, "correlation": 161150155 + } + }, + { + "ph": "s", "id": 161150155, "pid": 5714, "tid": 5714, "ts": 6300865980725.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865991681.679, "dur": 234.627, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150169, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161150169, "pid": 0, "tid": 7, "ts": 6300865991681.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980814.677, "dur": 8.080, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161150169 + } + }, + { + "ph": "s", "id": 161150169, "pid": 5714, "tid": 5714, "ts": 6300865980814.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865980856.147, "dur": 0.560, + "args": { + "External id": 85073, "cbid": 200, "correlation": 161150192 + } + }, + { + "ph": "f", "id": 161150192, "pid": 5714, "tid": 5714, "ts": 6300865980856.147, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865991917.202, "dur": 0.800, + "args": { + "External id": 85073, "device": 0, "context": 1, "stream": 7, "correlation": 161150195, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161150195, "pid": 0, "tid": 7, "ts": 6300865991917.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865980858.507, "dur": 6.910, + "args": { + "External id": 85073, "cbid": 51, "correlation": 161150195 + } + }, + { + "ph": "s", "id": 161150195, "pid": 5714, "tid": 5714, "ts": 6300865980858.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865991919.218, "dur": 691.816, + "args": { + "External id": 85073, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150196, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150196, "pid": 0, "tid": 7, "ts": 6300865991919.218, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980865.637, "dur": 5.980, + "args": { + "External id": 85073, "cbid": 307, "correlation": 161150196 + } + }, + { + "ph": "s", "id": 161150196, "pid": 5714, "tid": 5714, "ts": 6300865980865.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865992611.738, "dur": 2.944, + "args": { + "External id": 85076, "device": 0, "context": 1, "stream": 7, "correlation": 161150201, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161150201, "pid": 0, "tid": 7, "ts": 6300865992611.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865980896.127, "dur": 13.510, + "args": { + "External id": 85076, "cbid": 41, "correlation": 161150201 + } + }, + { + "ph": "s", "id": 161150201, "pid": 5714, "tid": 5714, "ts": 6300865980896.127, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865980949.087, "dur": 0.520, + "args": { + "External id": 85081, "cbid": 200, "correlation": 161150229 + } + }, + { + "ph": "f", "id": 161150229, "pid": 5714, "tid": 5714, "ts": 6300865980949.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865992615.578, "dur": 688.520, + "args": { + "External id": 85081, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161150232, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150232, "pid": 0, "tid": 7, "ts": 6300865992615.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980951.077, "dur": 7.240, + "args": { + "External id": 85081, "cbid": 307, "correlation": 161150232 + } + }, + { + "ph": "s", "id": 161150232, "pid": 5714, "tid": 5714, "ts": 6300865980951.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865993304.802, "dur": 221.731, + "args": { + "External id": 85082, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150237, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150237, "pid": 0, "tid": 7, "ts": 6300865993304.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865980973.347, "dur": 6.510, + "args": { + "External id": 85082, "cbid": 211, "correlation": 161150237 + } + }, + { + "ph": "s", "id": 161150237, "pid": 5714, "tid": 5714, "ts": 6300865980973.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865981023.637, "dur": 1.140, + "args": { + "External id": 85090, "cbid": 210, "correlation": 161150263 + } + }, + { + "ph": "f", "id": 161150263, "pid": 5714, "tid": 5714, "ts": 6300865981023.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865993527.237, "dur": 641.927, + "args": { + "External id": 85090, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150264, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150264, "pid": 0, "tid": 7, "ts": 6300865993527.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981028.297, "dur": 7.260, + "args": { + "External id": 85090, "cbid": 211, "correlation": 161150264 + } + }, + { + "ph": "s", "id": 161150264, "pid": 5714, "tid": 5714, "ts": 6300865981028.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865994169.932, "dur": 171.202, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150283, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161150283, "pid": 0, "tid": 7, "ts": 6300865994169.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981138.136, "dur": 9.011, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161150283 + } + }, + { + "ph": "s", "id": 161150283, "pid": 5714, "tid": 5714, "ts": 6300865981138.136, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865994341.870, "dur": 4.160, + "args": { + "External id": 85100, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150300, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150300, "pid": 0, "tid": 7, "ts": 6300865994341.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981181.307, "dur": 7.269, + "args": { + "External id": 85100, "cbid": 211, "correlation": 161150300 + } + }, + { + "ph": "s", "id": 161150300, "pid": 5714, "tid": 5714, "ts": 6300865981181.307, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994346.766, "dur": 1.216, + "args": { + "External id": 85105, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150317, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150317, "pid": 0, "tid": 7, "ts": 6300865994346.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981215.136, "dur": 5.430, + "args": { + "External id": 85105, "cbid": 211, "correlation": 161150317 + } + }, + { + "ph": "s", "id": 161150317, "pid": 5714, "tid": 5714, "ts": 6300865981215.136, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994348.654, "dur": 1.024, + "args": { + "External id": 85107, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150327, "pid": 0, "tid": 7, "ts": 6300865994348.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981235.226, "dur": 4.790, + "args": { + "External id": 85107, "cbid": 211, "correlation": 161150327 + } + }, + { + "ph": "s", "id": 161150327, "pid": 5714, "tid": 5714, "ts": 6300865981235.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994350.382, "dur": 1.088, + "args": { + "External id": 85108, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150333, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150333, "pid": 0, "tid": 7, "ts": 6300865994350.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981247.006, "dur": 4.510, + "args": { + "External id": 85108, "cbid": 211, "correlation": 161150333 + } + }, + { + "ph": "s", "id": 161150333, "pid": 5714, "tid": 5714, "ts": 6300865981247.006, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994352.142, "dur": 1.056, + "args": { + "External id": 85109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150343, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150343, "pid": 0, "tid": 7, "ts": 6300865994352.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981261.426, "dur": 4.290, + "args": { + "External id": 85109, "cbid": 211, "correlation": 161150343 + } + }, + { + "ph": "s", "id": 161150343, "pid": 5714, "tid": 5714, "ts": 6300865981261.426, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994353.902, "dur": 1.056, + "args": { + "External id": 85110, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150349, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150349, "pid": 0, "tid": 7, "ts": 6300865994353.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981271.586, "dur": 4.450, + "args": { + "External id": 85110, "cbid": 211, "correlation": 161150349 + } + }, + { + "ph": "s", "id": 161150349, "pid": 5714, "tid": 5714, "ts": 6300865981271.586, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865994355.694, "dur": 3.424, + "args": { + "External id": 85111, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150362, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150362, "pid": 0, "tid": 7, "ts": 6300865994355.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981294.396, "dur": 13.540, + "args": { + "External id": 85111, "cbid": 211, "correlation": 161150362 + } + }, + { + "ph": "s", "id": 161150362, "pid": 5714, "tid": 5714, "ts": 6300865981294.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994359.758, "dur": 1.088, + "args": { + "External id": 85114, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150368, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150368, "pid": 0, "tid": 7, "ts": 6300865994359.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981315.256, "dur": 4.690, + "args": { + "External id": 85114, "cbid": 211, "correlation": 161150368 + } + }, + { + "ph": "s", "id": 161150368, "pid": 5714, "tid": 5714, "ts": 6300865981315.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865994361.486, "dur": 1.024, + "args": { + "External id": 85115, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150374, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150374, "pid": 0, "tid": 7, "ts": 6300865994361.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981325.386, "dur": 3.880, + "args": { + "External id": 85115, "cbid": 211, "correlation": 161150374 + } + }, + { + "ph": "s", "id": 161150374, "pid": 5714, "tid": 5714, "ts": 6300865981325.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865994363.246, "dur": 235.107, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150388, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161150388, "pid": 0, "tid": 7, "ts": 6300865994363.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981416.026, "dur": 8.130, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161150388 + } + }, + { + "ph": "s", "id": 161150388, "pid": 5714, "tid": 5714, "ts": 6300865981416.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865981457.516, "dur": 0.570, + "args": { + "External id": 85119, "cbid": 200, "correlation": 161150411 + } + }, + { + "ph": "f", "id": 161150411, "pid": 5714, "tid": 5714, "ts": 6300865981457.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865994599.249, "dur": 0.832, + "args": { + "External id": 85119, "device": 0, "context": 1, "stream": 7, "correlation": 161150414, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161150414, "pid": 0, "tid": 7, "ts": 6300865994599.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865981459.916, "dur": 7.010, + "args": { + "External id": 85119, "cbid": 51, "correlation": 161150414 + } + }, + { + "ph": "s", "id": 161150414, "pid": 5714, "tid": 5714, "ts": 6300865981459.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865994600.881, "dur": 690.824, + "args": { + "External id": 85119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150415, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150415, "pid": 0, "tid": 7, "ts": 6300865994600.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981467.176, "dur": 5.830, + "args": { + "External id": 85119, "cbid": 307, "correlation": 161150415 + } + }, + { + "ph": "s", "id": 161150415, "pid": 5714, "tid": 5714, "ts": 6300865981467.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865995292.409, "dur": 2.977, + "args": { + "External id": 85122, "device": 0, "context": 1, "stream": 7, "correlation": 161150420, "bytes": 3145728, "memory bandwidth (GB/s)": 1056.6771918038294 + } + }, + { + "ph": "f", "id": 161150420, "pid": 0, "tid": 7, "ts": 6300865995292.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865981498.346, "dur": 12.040, + "args": { + "External id": 85122, "cbid": 41, "correlation": 161150420 + } + }, + { + "ph": "s", "id": 161150420, "pid": 5714, "tid": 5714, "ts": 6300865981498.346, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865981552.155, "dur": 0.480, + "args": { + "External id": 85127, "cbid": 200, "correlation": 161150448 + } + }, + { + "ph": "f", "id": 161150448, "pid": 5714, "tid": 5714, "ts": 6300865981552.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865995296.026, "dur": 688.071, + "args": { + "External id": 85127, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161150451, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150451, "pid": 0, "tid": 7, "ts": 6300865995296.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981554.226, "dur": 6.980, + "args": { + "External id": 85127, "cbid": 307, "correlation": 161150451 + } + }, + { + "ph": "s", "id": 161150451, "pid": 5714, "tid": 5714, "ts": 6300865981554.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865995984.737, "dur": 220.579, + "args": { + "External id": 85128, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150456, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150456, "pid": 0, "tid": 7, "ts": 6300865995984.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981575.526, "dur": 5.960, + "args": { + "External id": 85128, "cbid": 211, "correlation": 161150456 + } + }, + { + "ph": "s", "id": 161150456, "pid": 5714, "tid": 5714, "ts": 6300865981575.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865981625.226, "dur": 1.249, + "args": { + "External id": 85136, "cbid": 210, "correlation": 161150482 + } + }, + { + "ph": "f", "id": 161150482, "pid": 5714, "tid": 5714, "ts": 6300865981625.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865996206.020, "dur": 644.232, + "args": { + "External id": 85136, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150483, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150483, "pid": 0, "tid": 7, "ts": 6300865996206.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981629.985, "dur": 7.450, + "args": { + "External id": 85136, "cbid": 211, "correlation": 161150483 + } + }, + { + "ph": "s", "id": 161150483, "pid": 5714, "tid": 5714, "ts": 6300865981629.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865996850.956, "dur": 170.754, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150502, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161150502, "pid": 0, "tid": 7, "ts": 6300865996850.956, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981744.265, "dur": 8.810, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161150502 + } + }, + { + "ph": "s", "id": 161150502, "pid": 5714, "tid": 5714, "ts": 6300865981744.265, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865997022.382, "dur": 4.096, + "args": { + "External id": 85146, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150519, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150519, "pid": 0, "tid": 7, "ts": 6300865997022.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981787.935, "dur": 7.170, + "args": { + "External id": 85146, "cbid": 211, "correlation": 161150519 + } + }, + { + "ph": "s", "id": 161150519, "pid": 5714, "tid": 5714, "ts": 6300865981787.935, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997027.150, "dur": 1.216, + "args": { + "External id": 85151, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150536, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150536, "pid": 0, "tid": 7, "ts": 6300865997027.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981819.225, "dur": 5.310, + "args": { + "External id": 85151, "cbid": 211, "correlation": 161150536 + } + }, + { + "ph": "s", "id": 161150536, "pid": 5714, "tid": 5714, "ts": 6300865981819.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997029.038, "dur": 1.024, + "args": { + "External id": 85153, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150546, "pid": 0, "tid": 7, "ts": 6300865997029.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981838.225, "dur": 5.080, + "args": { + "External id": 85153, "cbid": 211, "correlation": 161150546 + } + }, + { + "ph": "s", "id": 161150546, "pid": 5714, "tid": 5714, "ts": 6300865981838.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997030.798, "dur": 1.056, + "args": { + "External id": 85154, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150552, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150552, "pid": 0, "tid": 7, "ts": 6300865997030.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981850.475, "dur": 4.390, + "args": { + "External id": 85154, "cbid": 211, "correlation": 161150552 + } + }, + { + "ph": "s", "id": 161150552, "pid": 5714, "tid": 5714, "ts": 6300865981850.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997032.526, "dur": 1.056, + "args": { + "External id": 85155, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150562, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150562, "pid": 0, "tid": 7, "ts": 6300865997032.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981866.485, "dur": 4.310, + "args": { + "External id": 85155, "cbid": 211, "correlation": 161150562 + } + }, + { + "ph": "s", "id": 161150562, "pid": 5714, "tid": 5714, "ts": 6300865981866.485, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997034.286, "dur": 1.056, + "args": { + "External id": 85156, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150568, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150568, "pid": 0, "tid": 7, "ts": 6300865997034.286, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981876.345, "dur": 4.120, + "args": { + "External id": 85156, "cbid": 211, "correlation": 161150568 + } + }, + { + "ph": "s", "id": 161150568, "pid": 5714, "tid": 5714, "ts": 6300865981876.345, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865997035.950, "dur": 3.392, + "args": { + "External id": 85157, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150581, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150581, "pid": 0, "tid": 7, "ts": 6300865997035.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981894.965, "dur": 5.030, + "args": { + "External id": 85157, "cbid": 211, "correlation": 161150581 + } + }, + { + "ph": "s", "id": 161150581, "pid": 5714, "tid": 5714, "ts": 6300865981894.965, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997039.982, "dur": 1.088, + "args": { + "External id": 85160, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150587, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150587, "pid": 0, "tid": 7, "ts": 6300865997039.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981906.185, "dur": 4.090, + "args": { + "External id": 85160, "cbid": 211, "correlation": 161150587 + } + }, + { + "ph": "s", "id": 161150587, "pid": 5714, "tid": 5714, "ts": 6300865981906.185, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865997041.710, "dur": 1.024, + "args": { + "External id": 85161, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150593, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150593, "pid": 0, "tid": 7, "ts": 6300865997041.710, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865981916.295, "dur": 3.910, + "args": { + "External id": 85161, "cbid": 211, "correlation": 161150593 + } + }, + { + "ph": "s", "id": 161150593, "pid": 5714, "tid": 5714, "ts": 6300865981916.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300865997043.470, "dur": 236.259, + "args": { + "External id": 84777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150607, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161150607, "pid": 0, "tid": 7, "ts": 6300865997043.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982004.245, "dur": 7.989, + "args": { + "External id": 84777, "cbid": 307, "correlation": 161150607 + } + }, + { + "ph": "s", "id": 161150607, "pid": 5714, "tid": 5714, "ts": 6300865982004.245, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865982045.334, "dur": 0.560, + "args": { + "External id": 85165, "cbid": 200, "correlation": 161150630 + } + }, + { + "ph": "f", "id": 161150630, "pid": 5714, "tid": 5714, "ts": 6300865982045.334, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300865997280.625, "dur": 0.832, + "args": { + "External id": 85165, "device": 0, "context": 1, "stream": 7, "correlation": 161150633, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161150633, "pid": 0, "tid": 7, "ts": 6300865997280.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865982047.725, "dur": 7.000, + "args": { + "External id": 85165, "cbid": 51, "correlation": 161150633 + } + }, + { + "ph": "s", "id": 161150633, "pid": 5714, "tid": 5714, "ts": 6300865982047.725, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865997282.257, "dur": 690.376, + "args": { + "External id": 85165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150634, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150634, "pid": 0, "tid": 7, "ts": 6300865997282.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982054.965, "dur": 5.700, + "args": { + "External id": 85165, "cbid": 307, "correlation": 161150634 + } + }, + { + "ph": "s", "id": 161150634, "pid": 5714, "tid": 5714, "ts": 6300865982054.965, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300865997973.305, "dur": 2.944, + "args": { + "External id": 85168, "device": 0, "context": 1, "stream": 7, "correlation": 161150639, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161150639, "pid": 0, "tid": 7, "ts": 6300865997973.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865982084.304, "dur": 13.140, + "args": { + "External id": 85168, "cbid": 41, "correlation": 161150639 + } + }, + { + "ph": "s", "id": 161150639, "pid": 5714, "tid": 5714, "ts": 6300865982084.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865982139.414, "dur": 0.450, + "args": { + "External id": 85173, "cbid": 200, "correlation": 161150667 + } + }, + { + "ph": "f", "id": 161150667, "pid": 5714, "tid": 5714, "ts": 6300865982139.414, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300865997976.953, "dur": 688.776, + "args": { + "External id": 85173, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161150670, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150670, "pid": 0, "tid": 7, "ts": 6300865997976.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982141.394, "dur": 7.110, + "args": { + "External id": 85173, "cbid": 307, "correlation": 161150670 + } + }, + { + "ph": "s", "id": 161150670, "pid": 5714, "tid": 5714, "ts": 6300865982141.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865998666.337, "dur": 221.347, + "args": { + "External id": 85174, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150675, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150675, "pid": 0, "tid": 7, "ts": 6300865998666.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982162.604, "dur": 5.920, + "args": { + "External id": 85174, "cbid": 211, "correlation": 161150675 + } + }, + { + "ph": "s", "id": 161150675, "pid": 5714, "tid": 5714, "ts": 6300865982162.604, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300865998888.420, "dur": 5.120, + "args": { + "External id": 85176, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150688, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150688, "pid": 0, "tid": 7, "ts": 6300865998888.420, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982191.004, "dur": 6.280, + "args": { + "External id": 85176, "cbid": 211, "correlation": 161150688 + } + }, + { + "ph": "s", "id": 161150688, "pid": 5714, "tid": 5714, "ts": 6300865982191.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300865998894.180, "dur": 157.281, + "args": { + "External id": 85181, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150701, "pid": 0, "tid": 7, "ts": 6300865998894.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982221.934, "dur": 6.090, + "args": { + "External id": 85181, "cbid": 211, "correlation": 161150701 + } + }, + { + "ph": "s", "id": 161150701, "pid": 5714, "tid": 5714, "ts": 6300865982221.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865999052.133, "dur": 1.792, + "args": { + "External id": 85186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150709, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150709, "pid": 0, "tid": 7, "ts": 6300865999052.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982286.454, "dur": 7.930, + "args": { + "External id": 85186, "cbid": 211, "correlation": 161150709 + } + }, + { + "ph": "s", "id": 161150709, "pid": 5714, "tid": 5714, "ts": 6300865982286.454, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300865999054.565, "dur": 1.376, + "args": { + "External id": 85187, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150715, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150715, "pid": 0, "tid": 7, "ts": 6300865999054.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982315.774, "dur": 5.810, + "args": { + "External id": 85187, "cbid": 211, "correlation": 161150715 + } + }, + { + "ph": "s", "id": 161150715, "pid": 5714, "tid": 5714, "ts": 6300865982315.774, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865999056.645, "dur": 2.304, + "args": { + "External id": 85206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150735, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 161150735, "pid": 0, "tid": 7, "ts": 6300865999056.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982402.224, "dur": 8.909, + "args": { + "External id": 85206, "cbid": 211, "correlation": 161150735 + } + }, + { + "ph": "s", "id": 161150735, "pid": 5714, "tid": 5714, "ts": 6300865982402.224, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300865999059.557, "dur": 59.841, + "args": { + "External id": 85214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150753, "pid": 0, "tid": 7, "ts": 6300865999059.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982503.824, "dur": 9.539, + "args": { + "External id": 85214, "cbid": 211, "correlation": 161150753 + } + }, + { + "ph": "s", "id": 161150753, "pid": 5714, "tid": 5714, "ts": 6300865982503.824, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865999119.974, "dur": 15.393, + "args": { + "External id": 85219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150770, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150770, "pid": 0, "tid": 7, "ts": 6300865999119.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982546.843, "dur": 6.670, + "args": { + "External id": 85219, "cbid": 211, "correlation": 161150770 + } + }, + { + "ph": "s", "id": 161150770, "pid": 5714, "tid": 5714, "ts": 6300865982546.843, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865999136.071, "dur": 100.353, + "args": { + "External id": 85224, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161150786, "pid": 0, "tid": 7, "ts": 6300865999136.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982571.683, "dur": 5.010, + "args": { + "External id": 85224, "cbid": 211, "correlation": 161150786 + } + }, + { + "ph": "s", "id": 161150786, "pid": 5714, "tid": 5714, "ts": 6300865982571.683, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300865999237.064, "dur": 1.952, + "args": { + "External id": 85228, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161150802, "pid": 0, "tid": 7, "ts": 6300865999237.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982595.233, "dur": 4.830, + "args": { + "External id": 85228, "cbid": 211, "correlation": 161150802 + } + }, + { + "ph": "s", "id": 161150802, "pid": 5714, "tid": 5714, "ts": 6300865982595.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300865999239.688, "dur": 1.696, + "args": { + "External id": 85229, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150814, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161150814, "pid": 0, "tid": 7, "ts": 6300865999239.688, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982618.933, "dur": 5.350, + "args": { + "External id": 85229, "cbid": 211, "correlation": 161150814 + } + }, + { + "ph": "s", "id": 161150814, "pid": 5714, "tid": 5714, "ts": 6300865982618.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300865999242.024, "dur": 2.144, + "args": { + "External id": 85236, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150832, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161150832, "pid": 0, "tid": 7, "ts": 6300865999242.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982653.523, "dur": 6.230, + "args": { + "External id": 85236, "cbid": 211, "correlation": 161150832 + } + }, + { + "ph": "s", "id": 161150832, "pid": 5714, "tid": 5714, "ts": 6300865982653.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6300865999244.840, "dur": 3.904, + "args": { + "External id": 85231, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150841, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150841, "pid": 0, "tid": 7, "ts": 6300865999244.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865982666.533, "dur": 4.480, + "args": { + "External id": 85231, "cbid": 211, "correlation": 161150841 + } + }, + { + "ph": "s", "id": 161150841, "pid": 5714, "tid": 5714, "ts": 6300865982666.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300865999249.928, "dur": 0.992, + "args": { + "External id": 85238, "device": 0, "context": 1, "stream": 7, "correlation": 161150847, "bytes": 8, "memory bandwidth (GB/s)": 0.008064516129032258 + } + }, + { + "ph": "f", "id": 161150847, "pid": 0, "tid": 7, "ts": 6300865999249.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865982684.123, "dur": 10.160, + "args": { + "External id": 85238, "cbid": 41, "correlation": 161150847 + } + }, + { + "ph": "s", "id": 161150847, "pid": 5714, "tid": 5714, "ts": 6300865982684.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991587.233, "dur": 7.050, + "args": { + "cbid": 138, "correlation": 161150849 + } + }, + { + "ph": "f", "id": 161150849, "pid": 5714, "tid": 1822426688, "ts": 6300865991587.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991594.783, "dur": 0.900, + "args": { + "cbid": 138, "correlation": 161150850 + } + }, + { + "ph": "f", "id": 161150850, "pid": 5714, "tid": 1822426688, "ts": 6300865991594.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", 
"name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991599.953, "dur": 0.530, + "args": { + "cbid": 138, "correlation": 161150851 + } + }, + { + "ph": "f", "id": 161150851, "pid": 5714, "tid": 1822426688, "ts": 6300865991599.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991604.413, "dur": 1.260, + "args": { + "cbid": 138, "correlation": 161150852 + } + }, + { + "ph": "f", "id": 161150852, "pid": 5714, "tid": 1822426688, "ts": 6300865991604.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991605.823, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 161150853 + } + }, + { + "ph": "f", "id": 161150853, "pid": 5714, "tid": 1822426688, "ts": 6300865991605.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991607.423, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 161150854 + } + }, + { + "ph": "f", "id": 161150854, "pid": 5714, "tid": 1822426688, "ts": 6300865991607.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991609.593, "dur": 0.730, + "args": { + "cbid": 138, "correlation": 161150855 + } + }, + { + "ph": "f", "id": 161150855, "pid": 5714, "tid": 1822426688, "ts": 6300865991609.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991610.453, "dur": 0.400, + "args": { + "cbid": 138, "correlation": 161150856 + } + }, + { + "ph": "f", "id": 161150856, "pid": 5714, "tid": 1822426688, "ts": 6300865991610.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + 
"ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991611.823, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 161150857 + } + }, + { + "ph": "f", "id": 161150857, "pid": 5714, "tid": 1822426688, "ts": 6300865991611.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991614.223, "dur": 0.650, + "args": { + "cbid": 138, "correlation": 161150858 + } + }, + { + "ph": "f", "id": 161150858, "pid": 5714, "tid": 1822426688, "ts": 6300865991614.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991615.063, "dur": 0.400, + "args": { + "cbid": 138, "correlation": 161150859 + } + }, + { + "ph": "f", "id": 161150859, "pid": 5714, "tid": 1822426688, "ts": 6300865991615.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991616.613, "dur": 0.520, + "args": { + "cbid": 138, "correlation": 161150860 + } + }, + { + "ph": "f", "id": 161150860, "pid": 5714, "tid": 1822426688, "ts": 6300865991616.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991618.663, "dur": 0.740, + "args": { + "cbid": 138, "correlation": 161150861 + } + }, + { + "ph": "f", "id": 161150861, "pid": 5714, "tid": 1822426688, "ts": 6300865991618.663, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991619.643, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 161150862 + } + }, + { + "ph": "f", "id": 161150862, "pid": 5714, "tid": 1822426688, "ts": 6300865991619.643, + "cat": "ac2g", 
"name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991620.853, "dur": 0.430, + "args": { + "cbid": 138, "correlation": 161150863 + } + }, + { + "ph": "f", "id": 161150863, "pid": 5714, "tid": 1822426688, "ts": 6300865991620.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991622.863, "dur": 0.830, + "args": { + "cbid": 138, "correlation": 161150864 + } + }, + { + "ph": "f", "id": 161150864, "pid": 5714, "tid": 1822426688, "ts": 6300865991622.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991623.833, "dur": 0.410, + "args": { + "cbid": 138, "correlation": 161150865 + } + }, + { + "ph": "f", "id": 161150865, "pid": 5714, "tid": 1822426688, "ts": 6300865991623.833, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300865991625.043, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 161150866 + } + }, + { + "ph": "f", "id": 161150866, "pid": 5714, "tid": 1822426688, "ts": 6300865991625.043, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300865982694.633, "dur": 16559.542, + "args": { + "External id": 85238, "cbid": 131, "correlation": 161150848 + } + }, + { + "ph": "s", "id": 161150848, "pid": 5714, "tid": 5714, "ts": 6300865982694.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865999321.255, "dur": 1.650, + "args": { + "External id": 85246, "cbid": 210, "correlation": 161150891 + } + }, + { + "ph": "f", 
"id": 161150891, "pid": 5714, "tid": 5714, "ts": 6300865999321.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300865999337.321, "dur": 643.623, + "args": { + "External id": 85246, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150892, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161150892, "pid": 0, "tid": 7, "ts": 6300865999337.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999326.505, "dur": 10.250, + "args": { + "External id": 85246, "cbid": 211, "correlation": 161150892 + } + }, + { + "ph": "s", "id": 161150892, "pid": 5714, "tid": 5714, "ts": 6300865999326.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300865999981.616, "dur": 171.043, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150911, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161150911, "pid": 0, "tid": 7, "ts": 6300865999981.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999446.825, "dur": 9.020, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161150911 + } + }, + { + "ph": "s", "id": 161150911, "pid": 5714, "tid": 5714, "ts": 6300865999446.825, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866000153.427, "dur": 4.096, + "args": { + "External id": 85256, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150928, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150928, "pid": 0, "tid": 7, "ts": 6300866000153.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999492.665, "dur": 7.400, + "args": { + "External id": 85256, "cbid": 211, "correlation": 161150928 + } + }, + { + "ph": "s", "id": 161150928, "pid": 5714, "tid": 5714, "ts": 6300865999492.665, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000158.131, "dur": 1.215, + "args": { + "External id": 85261, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150945, "pid": 0, "tid": 7, "ts": 6300866000158.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999524.885, "dur": 5.790, + "args": { + "External id": 85261, "cbid": 211, "correlation": 161150945 + } + }, + { + "ph": "s", "id": 161150945, "pid": 5714, "tid": 5714, "ts": 6300865999524.885, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000159.986, "dur": 1.024, + "args": { + "External id": 85263, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150955, "pid": 0, "tid": 7, "ts": 6300866000159.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999544.175, "dur": 4.790, + "args": { + "External id": 85263, "cbid": 211, "correlation": 161150955 + } + }, + { + "ph": "s", "id": 161150955, "pid": 5714, "tid": 5714, "ts": 6300865999544.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000161.746, "dur": 1.056, + "args": { + "External id": 85264, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150961, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150961, "pid": 0, "tid": 7, "ts": 6300866000161.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999556.265, "dur": 4.270, + "args": { + "External id": 85264, "cbid": 211, "correlation": 161150961 + } + }, + { + "ph": "s", "id": 161150961, "pid": 5714, "tid": 5714, "ts": 6300865999556.265, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000163.506, "dur": 1.024, + "args": { + "External id": 85265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150971, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150971, "pid": 0, "tid": 7, "ts": 6300866000163.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999570.915, "dur": 4.199, + "args": { + "External id": 85265, "cbid": 211, "correlation": 161150971 + } + }, + { + "ph": "s", "id": 161150971, "pid": 5714, "tid": 5714, "ts": 6300865999570.915, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000165.234, "dur": 1.056, + "args": { + "External id": 85266, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150977, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150977, "pid": 0, "tid": 7, "ts": 6300866000165.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999580.965, "dur": 4.160, + "args": { + "External id": 85266, "cbid": 211, "correlation": 161150977 + } + }, + { + "ph": "s", "id": 161150977, "pid": 5714, "tid": 5714, "ts": 6300865999580.965, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866000166.898, "dur": 3.520, + "args": { + "External id": 85267, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150990, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150990, "pid": 0, "tid": 7, "ts": 6300866000166.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999602.885, "dur": 5.109, + "args": { + "External id": 85267, "cbid": 211, "correlation": 161150990 + } + }, + { + "ph": "s", "id": 161150990, "pid": 5714, "tid": 5714, "ts": 6300865999602.885, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000171.058, "dur": 1.120, + "args": { + "External id": 85270, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161150996, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161150996, "pid": 0, "tid": 7, "ts": 6300866000171.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999614.365, "dur": 4.380, + "args": { + "External id": 85270, "cbid": 211, "correlation": 161150996 + } + }, + { + "ph": "s", "id": 161150996, "pid": 5714, "tid": 5714, "ts": 6300865999614.365, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866000172.818, "dur": 1.024, + "args": { + "External id": 85271, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151002, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151002, "pid": 0, "tid": 7, "ts": 6300866000172.818, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999623.794, "dur": 3.871, + "args": { + "External id": 85271, "cbid": 211, "correlation": 161151002 + } + }, + { + "ph": "s", "id": 161151002, "pid": 5714, "tid": 5714, "ts": 6300865999623.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866000174.546, "dur": 234.435, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151016, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161151016, "pid": 0, "tid": 7, "ts": 6300866000174.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999712.034, "dur": 8.190, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151016 + } + }, + { + "ph": "s", "id": 161151016, "pid": 5714, "tid": 5714, "ts": 6300865999712.034, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865999754.024, "dur": 0.560, + "args": { + "External id": 85275, "cbid": 200, "correlation": 161151039 + } + }, + { + "ph": "f", "id": 161151039, "pid": 5714, "tid": 5714, "ts": 6300865999754.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866000409.845, "dur": 0.800, + "args": { + "External id": 85275, "device": 0, "context": 1, "stream": 7, "correlation": 161151042, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161151042, "pid": 0, "tid": 7, "ts": 6300866000409.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300865999756.374, "dur": 6.790, + "args": { + "External id": 85275, "cbid": 51, "correlation": 161151042 + } + }, + { + "ph": "s", "id": 161151042, "pid": 5714, "tid": 5714, "ts": 6300865999756.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866000411.861, "dur": 691.913, + "args": { + "External id": 85275, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151043, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151043, "pid": 0, "tid": 7, "ts": 6300866000411.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999763.434, "dur": 5.870, + "args": { + "External id": 85275, "cbid": 307, "correlation": 161151043 + } + }, + { + "ph": "s", "id": 161151043, "pid": 5714, "tid": 5714, "ts": 6300865999763.434, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866001104.478, "dur": 3.008, + "args": { + "External id": 85278, "device": 0, "context": 1, "stream": 7, "correlation": 161151048, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161151048, "pid": 0, "tid": 7, "ts": 6300866001104.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300865999795.954, "dur": 14.990, + "args": { + "External id": 85278, "cbid": 41, "correlation": 161151048 + } + }, + { + "ph": "s", "id": 161151048, "pid": 5714, "tid": 5714, "ts": 6300865999795.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300865999854.074, "dur": 0.530, + "args": { + "External id": 85283, "cbid": 200, "correlation": 161151076 + } + }, + { + "ph": "f", "id": 161151076, "pid": 5714, "tid": 5714, "ts": 6300865999854.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866001108.126, "dur": 695.208, + "args": { + "External id": 85283, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161151079, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151079, "pid": 0, "tid": 7, "ts": 6300866001108.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999856.084, "dur": 7.280, + "args": { + "External id": 85283, "cbid": 307, "correlation": 161151079 + } + }, + { + "ph": "s", "id": 161151079, "pid": 5714, "tid": 5714, "ts": 6300865999856.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866001803.974, "dur": 221.506, + "args": { + "External id": 85284, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151084, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161151084, "pid": 0, "tid": 7, "ts": 6300866001803.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999877.964, "dur": 6.200, + "args": { + "External id": 85284, "cbid": 211, "correlation": 161151084 + } + }, + { + "ph": "s", "id": 161151084, "pid": 5714, "tid": 5714, "ts": 6300865999877.964, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300865999928.864, "dur": 1.190, + "args": { + "External id": 85292, "cbid": 210, "correlation": 161151110 + } + }, + { + "ph": "f", "id": 161151110, "pid": 5714, "tid": 5714, "ts": 6300865999928.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866002026.152, "dur": 642.056, + "args": { + "External id": 85292, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151111, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151111, "pid": 0, "tid": 7, "ts": 6300866002026.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300865999933.534, "dur": 7.080, + "args": { + "External id": 85292, "cbid": 211, "correlation": 161151111 + } + }, + { + "ph": "s", "id": 161151111, "pid": 5714, "tid": 5714, "ts": 6300865999933.534, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866002668.912, "dur": 171.586, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151130, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161151130, "pid": 0, "tid": 7, "ts": 6300866002668.912, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000048.073, "dur": 8.991, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151130 + } + }, + { + "ph": "s", "id": 161151130, "pid": 5714, "tid": 5714, "ts": 6300866000048.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866002841.170, "dur": 4.288, + "args": { + "External id": 85302, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151147, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151147, "pid": 0, "tid": 7, "ts": 6300866002841.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000091.793, "dur": 7.271, + "args": { + "External id": 85302, "cbid": 211, "correlation": 161151147 + } + }, + { + "ph": "s", "id": 161151147, "pid": 5714, "tid": 5714, "ts": 6300866000091.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002846.066, "dur": 1.216, + "args": { + "External id": 85307, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151164, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151164, "pid": 0, "tid": 7, "ts": 6300866002846.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000122.083, "dur": 5.640, + "args": { + "External id": 85307, "cbid": 211, "correlation": 161151164 + } + }, + { + "ph": "s", "id": 161151164, "pid": 5714, "tid": 5714, "ts": 6300866000122.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002847.954, "dur": 1.024, + "args": { + "External id": 85309, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151174, "pid": 0, "tid": 7, "ts": 6300866002847.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000142.633, "dur": 5.000, + "args": { + "External id": 85309, "cbid": 211, "correlation": 161151174 + } + }, + { + "ph": "s", "id": 161151174, "pid": 5714, "tid": 5714, "ts": 6300866000142.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002849.714, "dur": 1.056, + "args": { + "External id": 85310, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151180, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151180, "pid": 0, "tid": 7, "ts": 6300866002849.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000155.083, "dur": 4.460, + "args": { + "External id": 85310, "cbid": 211, "correlation": 161151180 + } + }, + { + "ph": "s", "id": 161151180, "pid": 5714, "tid": 5714, "ts": 6300866000155.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002851.442, "dur": 1.056, + "args": { + "External id": 85311, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151190, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151190, "pid": 0, "tid": 7, "ts": 6300866002851.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000170.153, "dur": 4.560, + "args": { + "External id": 85311, "cbid": 211, "correlation": 161151190 + } + }, + { + "ph": "s", "id": 161151190, "pid": 5714, "tid": 5714, "ts": 6300866000170.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002853.202, "dur": 1.056, + "args": { + "External id": 85312, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151196, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151196, "pid": 0, "tid": 7, "ts": 6300866002853.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000181.513, "dur": 4.300, + "args": { + "External id": 85312, "cbid": 211, "correlation": 161151196 + } + }, + { + "ph": "s", "id": 161151196, "pid": 5714, "tid": 5714, "ts": 6300866000181.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866002854.994, "dur": 3.392, + "args": { + "External id": 85313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151209, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151209, "pid": 0, "tid": 7, "ts": 6300866002854.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000202.163, "dur": 5.010, + "args": { + "External id": 85313, "cbid": 211, "correlation": 161151209 + } + }, + { + "ph": "s", "id": 161151209, "pid": 5714, "tid": 5714, "ts": 6300866000202.163, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002859.026, "dur": 1.088, + "args": { + "External id": 85316, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151215, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151215, "pid": 0, "tid": 7, "ts": 6300866002859.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000213.343, "dur": 4.230, + "args": { + "External id": 85316, "cbid": 211, "correlation": 161151215 + } + }, + { + "ph": "s", "id": 161151215, "pid": 5714, "tid": 5714, "ts": 6300866000213.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866002860.754, "dur": 1.024, + "args": { + "External id": 85317, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151221, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151221, "pid": 0, "tid": 7, "ts": 6300866002860.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000222.493, "dur": 3.850, + "args": { + "External id": 85317, "cbid": 211, "correlation": 161151221 + } + }, + { + "ph": "s", "id": 161151221, "pid": 5714, "tid": 5714, "ts": 6300866000222.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866002862.514, "dur": 234.147, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151235, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161151235, "pid": 0, "tid": 7, "ts": 6300866002862.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000322.543, "dur": 8.980, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151235 + } + }, + { + "ph": "s", "id": 161151235, "pid": 5714, "tid": 5714, "ts": 6300866000322.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866000366.593, "dur": 0.570, + "args": { + "External id": 85321, "cbid": 200, "correlation": 161151258 + } + }, + { + "ph": "f", "id": 161151258, "pid": 5714, "tid": 5714, "ts": 6300866000366.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866003097.557, "dur": 0.832, + "args": { + "External id": 85321, "device": 0, "context": 1, "stream": 7, "correlation": 161151261, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161151261, "pid": 0, "tid": 7, "ts": 6300866003097.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866000368.973, "dur": 6.740, + "args": { + "External id": 85321, "cbid": 51, "correlation": 161151261 + } + }, + { + "ph": "s", "id": 161151261, "pid": 5714, "tid": 5714, "ts": 6300866000368.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866003099.605, "dur": 694.600, + "args": { + "External id": 85321, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151262, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151262, "pid": 0, "tid": 7, "ts": 6300866003099.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000375.963, "dur": 6.010, + "args": { + "External id": 85321, "cbid": 307, "correlation": 161151262 + } + }, + { + "ph": "s", "id": 161151262, "pid": 5714, "tid": 5714, "ts": 6300866000375.963, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866003794.909, "dur": 2.944, + "args": { + "External id": 85324, "device": 0, "context": 1, "stream": 7, "correlation": 161151267, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161151267, "pid": 0, "tid": 7, "ts": 6300866003794.909, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866000406.213, "dur": 12.500, + "args": { + "External id": 85324, "cbid": 41, "correlation": 161151267 + } + }, + { + "ph": "s", "id": 161151267, "pid": 5714, "tid": 5714, "ts": 6300866000406.213, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866000460.223, "dur": 0.500, + "args": { + "External id": 85329, "cbid": 200, "correlation": 161151295 + } + }, + { + "ph": "f", "id": 161151295, "pid": 5714, "tid": 5714, "ts": 6300866000460.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866003798.557, "dur": 687.816, + "args": { + "External id": 85329, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161151298, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151298, "pid": 0, "tid": 7, "ts": 6300866003798.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000462.263, "dur": 7.209, + "args": { + "External id": 85329, "cbid": 307, "correlation": 161151298 + } + }, + { + "ph": "s", "id": 161151298, "pid": 5714, "tid": 5714, "ts": 6300866000462.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866004487.045, "dur": 221.731, + "args": { + "External id": 85330, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151303, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161151303, "pid": 0, "tid": 7, "ts": 6300866004487.045, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000483.712, "dur": 6.451, + "args": { + "External id": 85330, "cbid": 211, "correlation": 161151303 + } + }, + { + "ph": "s", "id": 161151303, "pid": 5714, "tid": 5714, "ts": 6300866000483.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300866000535.623, "dur": 1.169, + "args": { + "External id": 85338, "cbid": 210, "correlation": 161151329 + } + }, + { + "ph": "f", "id": 161151329, "pid": 5714, "tid": 5714, "ts": 6300866000535.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866004709.512, "dur": 643.656, + "args": { + "External id": 85338, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151330, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151330, "pid": 0, "tid": 7, "ts": 6300866004709.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000540.312, "dur": 7.480, + "args": { + "External id": 85338, "cbid": 211, "correlation": 161151330 + } + }, + { + "ph": "s", "id": 161151330, "pid": 5714, "tid": 5714, "ts": 6300866000540.312, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866005353.840, "dur": 170.913, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151349, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161151349, "pid": 0, "tid": 7, "ts": 6300866005353.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000651.852, "dur": 8.800, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151349 + } + }, + { + "ph": "s", "id": 161151349, "pid": 5714, "tid": 5714, "ts": 6300866000651.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866005525.361, "dur": 4.160, + "args": { + "External id": 85348, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151366, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151366, "pid": 0, "tid": 7, "ts": 6300866005525.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000698.122, "dur": 7.200, + "args": { + "External id": 85348, "cbid": 211, "correlation": 161151366 + } + }, + { + "ph": "s", "id": 161151366, "pid": 5714, "tid": 5714, "ts": 6300866000698.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005530.161, "dur": 1.216, + "args": { + "External id": 85353, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151383, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151383, "pid": 0, "tid": 7, "ts": 6300866005530.161, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000728.112, "dur": 5.390, + "args": { + "External id": 85353, "cbid": 211, "correlation": 161151383 + } + }, + { + "ph": "s", "id": 161151383, "pid": 5714, "tid": 5714, "ts": 6300866000728.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005532.049, "dur": 1.024, + "args": { + "External id": 85355, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151393, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151393, "pid": 0, "tid": 7, "ts": 6300866005532.049, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000748.192, "dur": 5.020, + "args": { + "External id": 85355, "cbid": 211, "correlation": 161151393 + } + }, + { + "ph": "s", "id": 161151393, "pid": 5714, "tid": 5714, "ts": 6300866000748.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005533.810, "dur": 1.056, + "args": { + "External id": 85356, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151399, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151399, "pid": 0, "tid": 7, "ts": 6300866005533.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000760.452, "dur": 4.290, + "args": { + "External id": 85356, "cbid": 211, "correlation": 161151399 + } + }, + { + "ph": "s", "id": 161151399, "pid": 5714, "tid": 5714, "ts": 6300866000760.452, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005535.570, "dur": 1.024, + "args": { + "External id": 85357, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151409, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151409, "pid": 0, "tid": 7, "ts": 6300866005535.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000776.182, "dur": 4.480, + "args": { + "External id": 85357, "cbid": 211, "correlation": 161151409 + } + }, + { + "ph": "s", "id": 161151409, "pid": 5714, "tid": 5714, "ts": 6300866000776.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005537.298, "dur": 1.056, + "args": { + "External id": 85358, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151415, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151415, "pid": 0, "tid": 7, "ts": 6300866005537.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000787.542, "dur": 4.080, + "args": { + "External id": 85358, "cbid": 211, "correlation": 161151415 + } + }, + { + "ph": "s", "id": 161151415, "pid": 5714, "tid": 5714, "ts": 6300866000787.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866005539.090, "dur": 3.424, + "args": { + "External id": 85359, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151428, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151428, "pid": 0, "tid": 7, "ts": 6300866005539.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000806.302, "dur": 4.940, + "args": { + "External id": 85359, "cbid": 211, "correlation": 161151428 + } + }, + { + "ph": "s", "id": 161151428, "pid": 5714, "tid": 5714, "ts": 6300866000806.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005543.122, "dur": 1.120, + "args": { + "External id": 85362, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151434, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151434, "pid": 0, "tid": 7, "ts": 6300866005543.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000817.382, "dur": 4.290, + "args": { + "External id": 85362, "cbid": 211, "correlation": 161151434 + } + }, + { + "ph": "s", "id": 161151434, "pid": 5714, "tid": 5714, "ts": 6300866000817.382, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866005544.882, "dur": 1.024, + "args": { + "External id": 85363, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151440, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151440, "pid": 0, "tid": 7, "ts": 6300866005544.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000826.612, "dur": 4.040, + "args": { + "External id": 85363, "cbid": 211, "correlation": 161151440 + } + }, + { + "ph": "s", "id": 161151440, "pid": 5714, "tid": 5714, "ts": 6300866000826.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866005546.610, "dur": 232.866, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151454, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161151454, "pid": 0, "tid": 7, "ts": 6300866005546.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000917.422, "dur": 8.180, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151454 + } + }, + { + "ph": "s", "id": 161151454, "pid": 5714, "tid": 5714, "ts": 6300866000917.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866000959.271, "dur": 0.591, + "args": { + "External id": 85367, "cbid": 200, "correlation": 161151477 + } + }, + { + "ph": "f", "id": 161151477, "pid": 5714, "tid": 5714, "ts": 6300866000959.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866005780.340, "dur": 0.832, + "args": { + "External id": 85367, "device": 0, "context": 1, "stream": 7, "correlation": 161151480, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161151480, "pid": 0, "tid": 7, "ts": 6300866005780.340, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866000961.682, "dur": 8.189, + "args": { + "External id": 85367, "cbid": 51, "correlation": 161151480 + } + }, + { + "ph": "s", "id": 161151480, "pid": 5714, "tid": 5714, "ts": 6300866000961.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866005782.388, "dur": 693.705, + "args": { + "External id": 85367, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151481, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151481, "pid": 0, "tid": 7, "ts": 6300866005782.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866000970.122, "dur": 5.780, + "args": { + "External id": 85367, "cbid": 307, "correlation": 161151481 + } + }, + { + "ph": "s", "id": 161151481, "pid": 5714, "tid": 5714, "ts": 6300866000970.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866006476.733, "dur": 3.008, + "args": { + "External id": 85370, "device": 0, "context": 1, "stream": 7, "correlation": 161151486, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161151486, "pid": 0, "tid": 7, "ts": 6300866006476.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866000999.261, "dur": 11.990, + "args": { + "External id": 85370, "cbid": 41, "correlation": 161151486 + } + }, + { + "ph": "s", "id": 161151486, "pid": 5714, "tid": 5714, "ts": 6300866000999.261, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866001052.341, "dur": 0.490, + "args": { + "External id": 85375, "cbid": 200, "correlation": 161151514 + } + }, + { + "ph": "f", "id": 161151514, "pid": 5714, "tid": 5714, "ts": 6300866001052.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866006480.381, "dur": 688.263, + "args": { + "External id": 85375, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161151517, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151517, "pid": 0, "tid": 7, "ts": 6300866006480.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001054.311, "dur": 7.210, + "args": { + "External id": 85375, "cbid": 307, "correlation": 161151517 + } + }, + { + "ph": "s", "id": 161151517, "pid": 5714, "tid": 5714, "ts": 6300866001054.311, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866007169.348, "dur": 221.027, + "args": { + "External id": 85376, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151522, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161151522, "pid": 0, "tid": 7, "ts": 6300866007169.348, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001075.711, "dur": 5.840, + "args": { + "External id": 85376, "cbid": 211, "correlation": 161151522 + } + }, + { + "ph": "s", "id": 161151522, "pid": 5714, "tid": 5714, "ts": 6300866001075.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300866001126.861, "dur": 1.170, + "args": { + "External id": 85384, "cbid": 210, "correlation": 161151548 + } + }, + { + "ph": "f", "id": 161151548, "pid": 5714, "tid": 5714, "ts": 6300866001126.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866007391.015, "dur": 642.951, + "args": { + "External id": 85384, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151549, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151549, "pid": 0, "tid": 7, "ts": 6300866007391.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001131.611, "dur": 7.600, + "args": { + "External id": 85384, "cbid": 211, "correlation": 161151549 + } + }, + { + "ph": "s", "id": 161151549, "pid": 5714, "tid": 5714, "ts": 6300866001131.611, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866008034.574, "dur": 171.139, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151568, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161151568, "pid": 0, "tid": 7, "ts": 6300866008034.574, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001255.201, "dur": 8.800, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151568 + } + }, + { + "ph": "s", "id": 161151568, "pid": 5714, "tid": 5714, "ts": 6300866001255.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866008206.417, "dur": 4.224, + "args": { + "External id": 85394, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151585, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151585, "pid": 0, "tid": 7, "ts": 6300866008206.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001306.651, "dur": 7.700, + "args": { + "External id": 85394, "cbid": 211, "correlation": 161151585 + } + }, + { + "ph": "s", "id": 161151585, "pid": 5714, "tid": 5714, "ts": 6300866001306.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008211.281, "dur": 1.216, + "args": { + "External id": 85399, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151602, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151602, "pid": 0, "tid": 7, "ts": 6300866008211.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001340.390, "dur": 5.320, + "args": { + "External id": 85399, "cbid": 211, "correlation": 161151602 + } + }, + { + "ph": "s", "id": 161151602, "pid": 5714, "tid": 5714, "ts": 6300866001340.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008213.201, "dur": 1.024, + "args": { + "External id": 85401, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151612, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151612, "pid": 0, "tid": 7, "ts": 6300866008213.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001360.550, "dur": 4.840, + "args": { + "External id": 85401, "cbid": 211, "correlation": 161151612 + } + }, + { + "ph": "s", "id": 161151612, "pid": 5714, "tid": 5714, "ts": 6300866001360.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008214.929, "dur": 1.088, + "args": { + "External id": 85402, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151618, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151618, "pid": 0, "tid": 7, "ts": 6300866008214.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001372.741, "dur": 4.480, + "args": { + "External id": 85402, "cbid": 211, "correlation": 161151618 + } + }, + { + "ph": "s", "id": 161151618, "pid": 5714, "tid": 5714, "ts": 6300866001372.741, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008216.689, "dur": 1.056, + "args": { + "External id": 85403, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151628, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151628, "pid": 0, "tid": 7, "ts": 6300866008216.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001387.690, "dur": 4.280, + "args": { + "External id": 85403, "cbid": 211, "correlation": 161151628 + } + }, + { + "ph": "s", "id": 161151628, "pid": 5714, "tid": 5714, "ts": 6300866001387.690, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008218.449, "dur": 1.056, + "args": { + "External id": 85404, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151634, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151634, "pid": 0, "tid": 7, "ts": 6300866008218.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001397.550, "dur": 4.360, + "args": { + "External id": 85404, "cbid": 211, "correlation": 161151634 + } + }, + { + "ph": "s", "id": 161151634, "pid": 5714, "tid": 5714, "ts": 6300866001397.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866008220.209, "dur": 3.392, + "args": { + "External id": 85405, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151647, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151647, "pid": 0, "tid": 7, "ts": 6300866008220.209, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001418.101, "dur": 4.960, + "args": { + "External id": 85405, "cbid": 211, "correlation": 161151647 + } + }, + { + "ph": "s", "id": 161151647, "pid": 5714, "tid": 5714, "ts": 6300866001418.101, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008224.273, "dur": 1.088, + "args": { + "External id": 85408, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151653, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151653, "pid": 0, "tid": 7, "ts": 6300866008224.273, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001429.170, "dur": 4.350, + "args": { + "External id": 85408, "cbid": 211, "correlation": 161151653 + } + }, + { + "ph": "s", "id": 161151653, "pid": 5714, "tid": 5714, "ts": 6300866001429.170, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866008226.001, "dur": 1.024, + "args": { + "External id": 85409, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151659, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151659, "pid": 0, "tid": 7, "ts": 6300866008226.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001438.250, "dur": 3.920, + "args": { + "External id": 85409, "cbid": 211, "correlation": 161151659 + } + }, + { + "ph": "s", "id": 161151659, "pid": 5714, "tid": 5714, "ts": 6300866001438.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866008227.761, "dur": 234.147, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151673, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161151673, "pid": 0, "tid": 7, "ts": 6300866008227.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001528.250, "dur": 8.380, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151673 + } + }, + { + "ph": "s", "id": 161151673, "pid": 5714, "tid": 5714, "ts": 6300866001528.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866001570.520, "dur": 0.580, + "args": { + "External id": 85413, "cbid": 200, "correlation": 161151696 + } + }, + { + "ph": "f", "id": 161151696, "pid": 5714, "tid": 5714, "ts": 6300866001570.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866008462.804, "dur": 0.800, + "args": { + "External id": 85413, "device": 0, "context": 1, "stream": 7, "correlation": 161151699, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161151699, "pid": 0, "tid": 7, "ts": 6300866008462.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866001572.940, "dur": 6.670, + "args": { + "External id": 85413, "cbid": 51, "correlation": 161151699 + } + }, + { + "ph": "s", "id": 161151699, "pid": 5714, "tid": 5714, "ts": 6300866001572.940, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866008464.820, "dur": 694.056, + "args": { + "External id": 85413, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151700, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151700, "pid": 0, "tid": 7, "ts": 6300866008464.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001579.850, "dur": 5.870, + "args": { + "External id": 85413, "cbid": 307, "correlation": 161151700 + } + }, + { + "ph": "s", "id": 161151700, "pid": 5714, "tid": 5714, "ts": 6300866001579.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866009159.516, "dur": 3.008, + "args": { + "External id": 85416, "device": 0, "context": 1, "stream": 7, "correlation": 161151705, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 161151705, "pid": 0, "tid": 7, "ts": 6300866009159.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866001612.370, "dur": 12.030, + "args": { + "External id": 85416, "cbid": 41, "correlation": 161151705 + } + }, + { + "ph": "s", "id": 161151705, "pid": 5714, "tid": 5714, "ts": 6300866001612.370, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866001664.180, "dur": 0.520, + "args": { + "External id": 85421, "cbid": 200, "correlation": 161151733 + } + }, + { + "ph": "f", "id": 161151733, "pid": 5714, "tid": 5714, "ts": 6300866001664.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866009163.164, "dur": 686.760, + "args": { + "External id": 85421, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161151736, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151736, "pid": 0, "tid": 7, "ts": 6300866009163.164, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001666.250, "dur": 6.890, + "args": { + "External id": 85421, "cbid": 307, "correlation": 161151736 + } + }, + { + "ph": "s", "id": 161151736, "pid": 5714, "tid": 5714, "ts": 6300866001666.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866009850.660, "dur": 220.451, + "args": { + "External id": 85422, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151741, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161151741, "pid": 0, "tid": 7, "ts": 6300866009850.660, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001688.370, "dur": 5.900, + "args": { + "External id": 85422, "cbid": 211, "correlation": 161151741 + } + }, + { + "ph": "s", "id": 161151741, "pid": 5714, "tid": 5714, "ts": 6300866001688.370, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300866001737.160, "dur": 1.140, + "args": { + "External id": 85430, "cbid": 210, "correlation": 161151767 + } + }, + { + "ph": "f", "id": 161151767, "pid": 5714, "tid": 5714, "ts": 6300866001737.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866010071.815, "dur": 644.327, + "args": { + "External id": 85430, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151768, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151768, "pid": 0, "tid": 7, "ts": 6300866010071.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001741.860, "dur": 7.240, + "args": { + "External id": 85430, "cbid": 211, "correlation": 161151768 + } + }, + { + "ph": "s", "id": 161151768, "pid": 5714, "tid": 5714, "ts": 6300866001741.860, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866010716.782, "dur": 171.362, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151787, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161151787, "pid": 0, "tid": 7, "ts": 6300866010716.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001852.889, "dur": 8.920, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151787 + } + }, + { + "ph": "s", "id": 161151787, "pid": 5714, "tid": 5714, "ts": 6300866001852.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866010888.816, "dur": 4.096, + "args": { + "External id": 85440, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151804, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151804, "pid": 0, "tid": 7, "ts": 6300866010888.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001895.809, "dur": 7.250, + "args": { + "External id": 85440, "cbid": 211, "correlation": 161151804 + } + }, + { + "ph": "s", "id": 161151804, "pid": 5714, "tid": 5714, "ts": 6300866001895.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010893.520, "dur": 1.216, + "args": { + "External id": 85445, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151821, "pid": 0, "tid": 7, "ts": 6300866010893.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001927.019, "dur": 5.590, + "args": { + "External id": 85445, "cbid": 211, "correlation": 161151821 + } + }, + { + "ph": "s", "id": 161151821, "pid": 5714, "tid": 5714, "ts": 6300866001927.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010895.440, "dur": 1.024, + "args": { + "External id": 85447, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151831, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151831, "pid": 0, "tid": 7, "ts": 6300866010895.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001946.679, "dur": 5.000, + "args": { + "External id": 85447, "cbid": 211, "correlation": 161151831 + } + }, + { + "ph": "s", "id": 161151831, "pid": 5714, "tid": 5714, "ts": 6300866001946.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010897.168, "dur": 1.088, + "args": { + "External id": 85448, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151837, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151837, "pid": 0, "tid": 7, "ts": 6300866010897.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001958.909, "dur": 4.430, + "args": { + "External id": 85448, "cbid": 211, "correlation": 161151837 + } + }, + { + "ph": "s", "id": 161151837, "pid": 5714, "tid": 5714, "ts": 6300866001958.909, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010898.928, "dur": 1.056, + "args": { + "External id": 85449, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151847, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151847, "pid": 0, "tid": 7, "ts": 6300866010898.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001973.279, "dur": 4.280, + "args": { + "External id": 85449, "cbid": 211, "correlation": 161151847 + } + }, + { + "ph": "s", "id": 161151847, "pid": 5714, "tid": 5714, "ts": 6300866001973.279, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010900.656, "dur": 1.056, + "args": { + "External id": 85450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151853, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151853, "pid": 0, "tid": 7, "ts": 6300866010900.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866001983.219, "dur": 4.250, + "args": { + "External id": 85450, "cbid": 211, "correlation": 161151853 + } + }, + { + "ph": "s", "id": 161151853, "pid": 5714, "tid": 5714, "ts": 6300866001983.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866010902.480, "dur": 3.328, + "args": { + "External id": 85451, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151866, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151866, "pid": 0, "tid": 7, "ts": 6300866010902.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002003.489, "dur": 5.070, + "args": { + "External id": 85451, "cbid": 211, "correlation": 161151866 + } + }, + { + "ph": "s", "id": 161151866, "pid": 5714, "tid": 5714, "ts": 6300866002003.489, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010906.480, "dur": 1.120, + "args": { + "External id": 85454, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151872, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151872, "pid": 0, "tid": 7, "ts": 6300866010906.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002014.639, "dur": 4.960, + "args": { + "External id": 85454, "cbid": 211, "correlation": 161151872 + } + }, + { + "ph": "s", "id": 161151872, "pid": 5714, "tid": 5714, "ts": 6300866002014.639, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866010908.240, "dur": 1.024, + "args": { + "External id": 85455, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151878, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161151878, "pid": 0, "tid": 7, "ts": 6300866010908.240, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002024.389, "dur": 4.310, + "args": { + "External id": 85455, "cbid": 211, "correlation": 161151878 + } + }, + { + "ph": "s", "id": 161151878, "pid": 5714, "tid": 5714, "ts": 6300866002024.389, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866010909.968, "dur": 232.995, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151892, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161151892, "pid": 0, "tid": 7, "ts": 6300866010909.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002112.829, "dur": 8.010, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161151892 + } + }, + { + "ph": "s", "id": 161151892, "pid": 5714, "tid": 5714, "ts": 6300866002112.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866002154.249, "dur": 0.550, + "args": { + "External id": 85459, "cbid": 200, "correlation": 161151915 + } + }, + { + "ph": "f", "id": 161151915, "pid": 5714, "tid": 5714, "ts": 6300866002154.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866011143.795, "dur": 0.832, + "args": { + "External id": 85459, "device": 0, "context": 1, "stream": 7, "correlation": 161151918, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161151918, "pid": 0, "tid": 7, "ts": 6300866011143.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866002156.629, "dur": 6.910, + "args": { + "External id": 85459, "cbid": 51, "correlation": 161151918 + } + }, + { + "ph": "s", "id": 161151918, "pid": 5714, "tid": 5714, "ts": 6300866002156.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866011145.843, "dur": 693.160, + "args": { + "External id": 85459, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151919, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151919, "pid": 0, "tid": 7, "ts": 6300866011145.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002163.779, "dur": 6.030, + "args": { + "External id": 85459, "cbid": 307, "correlation": 161151919 + } + }, + { + "ph": "s", "id": 161151919, "pid": 5714, "tid": 5714, "ts": 6300866002163.779, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866011839.675, "dur": 2.944, + "args": { + "External id": 85462, "device": 0, "context": 1, "stream": 7, "correlation": 161151924, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161151924, "pid": 0, "tid": 7, "ts": 6300866011839.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866002194.499, "dur": 12.880, + "args": { + "External id": 85462, "cbid": 41, "correlation": 161151924 + } + }, + { + "ph": "s", "id": 161151924, "pid": 5714, "tid": 5714, "ts": 6300866002194.499, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866002248.428, "dur": 0.471, + "args": { + "External id": 85467, "cbid": 200, "correlation": 161151952 + } + }, + { + "ph": "f", "id": 161151952, "pid": 5714, "tid": 5714, "ts": 6300866002248.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866011843.292, "dur": 692.199, + "args": { + "External id": 85467, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161151955, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151955, "pid": 0, "tid": 7, "ts": 6300866011843.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002250.379, "dur": 7.080, + "args": { + "External id": 85467, "cbid": 307, "correlation": 161151955 + } + }, + { + "ph": "s", "id": 161151955, "pid": 5714, "tid": 5714, "ts": 6300866002250.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866012536.131, "dur": 220.771, + "args": { + "External id": 85468, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151960, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161151960, "pid": 0, "tid": 7, "ts": 6300866012536.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002271.948, "dur": 6.000, + "args": { + "External id": 85468, "cbid": 211, "correlation": 161151960 + } + }, + { + "ph": "s", "id": 161151960, "pid": 5714, "tid": 5714, "ts": 6300866002271.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300866002332.248, "dur": 1.300, + "args": { + "External id": 85476, "cbid": 210, "correlation": 161151986 + } + }, + { + "ph": "f", "id": 161151986, "pid": 5714, "tid": 5714, "ts": 6300866002332.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866012757.510, "dur": 644.200, + "args": { + "External id": 85476, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161151987, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161151987, "pid": 0, "tid": 7, "ts": 6300866012757.510, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002336.998, "dur": 8.160, + "args": { + "External id": 85476, "cbid": 211, "correlation": 161151987 + } + }, + { + "ph": "s", "id": 161151987, "pid": 5714, "tid": 5714, "ts": 6300866002336.998, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866013402.382, "dur": 171.170, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152006, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161152006, "pid": 0, "tid": 7, "ts": 6300866013402.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002447.888, "dur": 8.700, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161152006 + } + }, + { + "ph": "s", "id": 161152006, "pid": 5714, "tid": 5714, "ts": 6300866002447.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866013574.224, "dur": 4.064, + "args": { + "External id": 85486, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152023, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152023, "pid": 0, "tid": 7, "ts": 6300866013574.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002491.868, "dur": 7.160, + "args": { + "External id": 85486, "cbid": 211, "correlation": 161152023 + } + }, + { + "ph": "s", "id": 161152023, "pid": 5714, "tid": 5714, "ts": 6300866002491.868, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013578.960, "dur": 1.184, + "args": { + "External id": 85491, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152040, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152040, "pid": 0, "tid": 7, "ts": 6300866013578.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002521.518, "dur": 5.510, + "args": { + "External id": 85491, "cbid": 211, "correlation": 161152040 + } + }, + { + "ph": "s", "id": 161152040, "pid": 5714, "tid": 5714, "ts": 6300866002521.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013580.848, "dur": 1.024, + "args": { + "External id": 85493, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152050, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152050, "pid": 0, "tid": 7, "ts": 6300866013580.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002540.518, "dur": 5.010, + "args": { + "External id": 85493, "cbid": 211, "correlation": 161152050 + } + }, + { + "ph": "s", "id": 161152050, "pid": 5714, "tid": 5714, "ts": 6300866002540.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013582.608, "dur": 1.056, + "args": { + "External id": 85494, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152056, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152056, "pid": 0, "tid": 7, "ts": 6300866013582.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002552.568, "dur": 4.310, + "args": { + "External id": 85494, "cbid": 211, "correlation": 161152056 + } + }, + { + "ph": "s", "id": 161152056, "pid": 5714, "tid": 5714, "ts": 6300866002552.568, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013584.336, "dur": 1.056, + "args": { + "External id": 85495, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152066, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152066, "pid": 0, "tid": 7, "ts": 6300866013584.336, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002567.108, "dur": 4.230, + "args": { + "External id": 85495, "cbid": 211, "correlation": 161152066 + } + }, + { + "ph": "s", "id": 161152066, "pid": 5714, "tid": 5714, "ts": 6300866002567.108, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013586.096, "dur": 1.024, + "args": { + "External id": 85496, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152072, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152072, "pid": 0, "tid": 7, "ts": 6300866013586.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002577.028, "dur": 4.120, + "args": { + "External id": 85496, "cbid": 211, "correlation": 161152072 + } + }, + { + "ph": "s", "id": 161152072, "pid": 5714, "tid": 5714, "ts": 6300866002577.028, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866013587.888, "dur": 3.488, + "args": { + "External id": 85497, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152085, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152085, "pid": 0, "tid": 7, "ts": 6300866013587.888, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002598.428, "dur": 5.040, + "args": { + "External id": 85497, "cbid": 211, "correlation": 161152085 + } + }, + { + "ph": "s", "id": 161152085, "pid": 5714, "tid": 5714, "ts": 6300866002598.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013592.048, "dur": 1.120, + "args": { + "External id": 85500, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152091, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152091, "pid": 0, "tid": 7, "ts": 6300866013592.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002609.648, "dur": 4.200, + "args": { + "External id": 85500, "cbid": 211, "correlation": 161152091 + } + }, + { + "ph": "s", "id": 161152091, "pid": 5714, "tid": 5714, "ts": 6300866002609.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866013593.808, "dur": 1.024, + "args": { + "External id": 85501, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152097, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152097, "pid": 0, "tid": 7, "ts": 6300866013593.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002618.818, "dur": 3.820, + "args": { + "External id": 85501, "cbid": 211, "correlation": 161152097 + } + }, + { + "ph": "s", "id": 161152097, "pid": 5714, "tid": 5714, "ts": 6300866002618.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866013595.568, "dur": 233.411, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152111, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161152111, "pid": 0, "tid": 7, "ts": 6300866013595.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002707.747, "dur": 8.080, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161152111 + } + }, + { + "ph": "s", "id": 161152111, "pid": 5714, "tid": 5714, "ts": 6300866002707.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866002763.297, "dur": 0.580, + "args": { + "External id": 85505, "cbid": 200, "correlation": 161152134 + } + }, + { + "ph": "f", "id": 161152134, "pid": 5714, "tid": 5714, "ts": 6300866002763.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866013829.843, "dur": 0.832, + "args": { + "External id": 85505, "device": 0, "context": 1, "stream": 7, "correlation": 161152137, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161152137, "pid": 0, "tid": 7, "ts": 6300866013829.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866002765.687, "dur": 6.930, + "args": { + "External id": 85505, "cbid": 51, "correlation": 161152137 + } + }, + { + "ph": "s", "id": 161152137, "pid": 5714, "tid": 5714, "ts": 6300866002765.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866013831.475, "dur": 691.944, + "args": { + "External id": 85505, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152138, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152138, "pid": 0, "tid": 7, "ts": 6300866013831.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002772.837, "dur": 5.670, + "args": { + "External id": 85505, "cbid": 307, "correlation": 161152138 + } + }, + { + "ph": "s", "id": 161152138, "pid": 5714, "tid": 5714, "ts": 6300866002772.837, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866014524.123, "dur": 2.976, + "args": { + "External id": 85508, "device": 0, "context": 1, "stream": 7, "correlation": 161152143, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 161152143, "pid": 0, "tid": 7, "ts": 6300866014524.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866002803.317, "dur": 12.410, + "args": { + "External id": 85508, "cbid": 41, "correlation": 161152143 + } + }, + { + "ph": "s", "id": 161152143, "pid": 5714, "tid": 5714, "ts": 6300866002803.317, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866002856.807, "dur": 0.480, + "args": { + "External id": 85513, "cbid": 200, "correlation": 161152171 + } + }, + { + "ph": "f", "id": 161152171, "pid": 5714, "tid": 5714, "ts": 6300866002856.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866014527.739, "dur": 688.840, + "args": { + "External id": 85513, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161152174, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152174, "pid": 0, "tid": 7, "ts": 6300866014527.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002858.767, "dur": 7.110, + "args": { + "External id": 85513, "cbid": 307, "correlation": 161152174 + } + }, + { + "ph": "s", "id": 161152174, "pid": 5714, "tid": 5714, "ts": 6300866002858.767, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866015217.251, "dur": 220.835, + "args": { + "External id": 85514, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152179, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152179, "pid": 0, "tid": 7, "ts": 6300866015217.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002880.097, "dur": 6.010, + "args": { + "External id": 85514, "cbid": 211, "correlation": 161152179 + } + }, + { + "ph": "s", "id": 161152179, "pid": 5714, "tid": 5714, "ts": 6300866002880.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300866002929.077, "dur": 1.200, + "args": { + "External id": 85522, "cbid": 210, "correlation": 161152205 + } + }, + { + "ph": "f", "id": 161152205, "pid": 5714, "tid": 5714, "ts": 6300866002929.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866015438.758, "dur": 642.567, + "args": { + "External id": 85522, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152206, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152206, "pid": 0, "tid": 7, "ts": 6300866015438.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866002933.727, "dur": 7.480, + "args": { + "External id": 85522, "cbid": 211, "correlation": 161152206 + } + }, + { + "ph": "s", "id": 161152206, "pid": 5714, "tid": 5714, "ts": 6300866002933.727, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866016082.029, "dur": 170.978, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152225, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161152225, "pid": 0, "tid": 7, "ts": 6300866016082.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003044.997, "dur": 8.840, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161152225 + } + }, + { + "ph": "s", "id": 161152225, "pid": 5714, "tid": 5714, "ts": 6300866003044.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866016253.711, "dur": 4.064, + "args": { + "External id": 85532, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152242, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152242, "pid": 0, "tid": 7, "ts": 6300866016253.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003098.217, "dur": 7.160, + "args": { + "External id": 85532, "cbid": 211, "correlation": 161152242 + } + }, + { + "ph": "s", "id": 161152242, "pid": 5714, "tid": 5714, "ts": 6300866003098.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016258.415, "dur": 1.216, + "args": { + "External id": 85537, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152259, "pid": 0, "tid": 7, "ts": 6300866016258.415, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003127.657, "dur": 5.409, + "args": { + "External id": 85537, "cbid": 211, "correlation": 161152259 + } + }, + { + "ph": "s", "id": 161152259, "pid": 5714, "tid": 5714, "ts": 6300866003127.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016260.335, "dur": 1.024, + "args": { + "External id": 85539, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152269, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152269, "pid": 0, "tid": 7, "ts": 6300866016260.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003147.877, "dur": 4.800, + "args": { + "External id": 85539, "cbid": 211, "correlation": 161152269 + } + }, + { + "ph": "s", "id": 161152269, "pid": 5714, "tid": 5714, "ts": 6300866003147.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016262.063, "dur": 1.088, + "args": { + "External id": 85540, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152275, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152275, "pid": 0, "tid": 7, "ts": 6300866016262.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003159.817, "dur": 4.309, + "args": { + "External id": 85540, "cbid": 211, "correlation": 161152275 + } + }, + { + "ph": "s", "id": 161152275, "pid": 5714, "tid": 5714, "ts": 6300866003159.817, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016263.823, "dur": 1.024, + "args": { + "External id": 85541, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152285, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152285, "pid": 0, "tid": 7, "ts": 6300866016263.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003174.346, "dur": 4.300, + "args": { + "External id": 85541, "cbid": 211, "correlation": 161152285 + } + }, + { + "ph": "s", "id": 161152285, "pid": 5714, "tid": 5714, "ts": 6300866003174.346, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016265.551, "dur": 1.056, + "args": { + "External id": 85542, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152291, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152291, "pid": 0, "tid": 7, "ts": 6300866016265.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003184.166, "dur": 4.171, + "args": { + "External id": 85542, "cbid": 211, "correlation": 161152291 + } + }, + { + "ph": "s", "id": 161152291, "pid": 5714, "tid": 5714, "ts": 6300866003184.166, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866016267.343, "dur": 3.392, + "args": { + "External id": 85543, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152304, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152304, "pid": 0, "tid": 7, "ts": 6300866016267.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003204.296, "dur": 4.920, + "args": { + "External id": 85543, "cbid": 211, "correlation": 161152304 + } + }, + { + "ph": "s", "id": 161152304, "pid": 5714, "tid": 5714, "ts": 6300866003204.296, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016271.375, "dur": 1.120, + "args": { + "External id": 85546, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152310, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152310, "pid": 0, "tid": 7, "ts": 6300866016271.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003215.216, "dur": 4.180, + "args": { + "External id": 85546, "cbid": 211, "correlation": 161152310 + } + }, + { + "ph": "s", "id": 161152310, "pid": 5714, "tid": 5714, "ts": 6300866003215.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866016273.135, "dur": 1.024, + "args": { + "External id": 85547, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152316, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152316, "pid": 0, "tid": 7, "ts": 6300866016273.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003224.206, "dur": 3.990, + "args": { + "External id": 85547, "cbid": 211, "correlation": 161152316 + } + }, + { + "ph": "s", "id": 161152316, "pid": 5714, "tid": 5714, "ts": 6300866003224.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866016274.895, "dur": 233.539, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152330, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161152330, "pid": 0, "tid": 7, "ts": 6300866016274.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003322.736, "dur": 8.570, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161152330 + } + }, + { + "ph": "s", "id": 161152330, "pid": 5714, "tid": 5714, "ts": 6300866003322.736, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866003365.696, "dur": 0.590, + "args": { + "External id": 85551, "cbid": 200, "correlation": 161152353 + } + }, + { + "ph": "f", "id": 161152353, "pid": 5714, "tid": 5714, "ts": 6300866003365.696, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866016509.394, "dur": 0.832, + "args": { + "External id": 85551, "device": 0, "context": 1, "stream": 7, "correlation": 161152356, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161152356, "pid": 0, "tid": 7, "ts": 6300866016509.394, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866003368.026, "dur": 6.610, + "args": { + "External id": 85551, "cbid": 51, "correlation": 161152356 + } + }, + { + "ph": "s", "id": 161152356, "pid": 5714, "tid": 5714, "ts": 6300866003368.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866016511.442, "dur": 695.624, + "args": { + "External id": 85551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152357, "registers per thread": 94, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152357, "pid": 0, "tid": 7, "ts": 6300866016511.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003374.866, "dur": 5.710, + "args": { + "External id": 85551, "cbid": 307, "correlation": 161152357 + } + }, + { + "ph": "s", "id": 161152357, "pid": 5714, "tid": 5714, "ts": 6300866003374.866, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866017207.770, "dur": 2.944, + "args": { + "External id": 85554, "device": 0, "context": 1, "stream": 7, "correlation": 161152362, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161152362, "pid": 0, "tid": 7, "ts": 6300866017207.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866003406.186, "dur": 11.650, + "args": { + "External id": 85554, "cbid": 41, "correlation": 161152362 + } + }, + { + "ph": "s", "id": 161152362, "pid": 5714, "tid": 5714, "ts": 6300866003406.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866003458.226, "dur": 0.500, + "args": { + "External id": 85559, "cbid": 200, "correlation": 161152390 + } + }, + { + "ph": "f", "id": 161152390, "pid": 5714, "tid": 5714, "ts": 6300866003458.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866017211.386, "dur": 689.193, + "args": { + "External id": 85559, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 161152393, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152393, "pid": 0, "tid": 7, "ts": 6300866017211.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003460.246, "dur": 6.970, + "args": { + "External id": 85559, "cbid": 307, "correlation": 161152393 + } + }, + { + "ph": "s", "id": 161152393, "pid": 5714, "tid": 5714, "ts": 6300866003460.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866017901.283, "dur": 221.058, + "args": { + "External id": 85560, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152398, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152398, "pid": 0, "tid": 7, "ts": 6300866017901.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003481.276, "dur": 6.020, + "args": { + "External id": 85560, "cbid": 211, "correlation": 161152398 + } + }, + { + "ph": "s", "id": 161152398, "pid": 5714, "tid": 5714, "ts": 6300866003481.276, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6300866003530.626, "dur": 1.160, + "args": { + "External id": 85568, "cbid": 210, "correlation": 161152424 + } + }, + { + "ph": "f", "id": 161152424, "pid": 5714, "tid": 5714, "ts": 6300866003530.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6300866018123.109, "dur": 637.576, + "args": { + "External id": 85568, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152425, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152425, "pid": 0, "tid": 7, "ts": 6300866018123.109, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003535.285, "dur": 7.311, + "args": { + "External id": 85568, "cbid": 211, "correlation": 161152425 + } + }, + { + "ph": "s", "id": 161152425, "pid": 5714, "tid": 5714, "ts": 6300866003535.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6300866018761.357, "dur": 171.522, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152444, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161152444, "pid": 0, "tid": 7, "ts": 6300866018761.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003646.615, "dur": 9.130, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161152444 + } + }, + { + "ph": "s", "id": 161152444, "pid": 5714, "tid": 5714, "ts": 6300866003646.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866018933.487, "dur": 4.095, + "args": { + "External id": 85578, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152461, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152461, "pid": 0, "tid": 7, "ts": 6300866018933.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003689.145, "dur": 7.330, + "args": { + "External id": 85578, "cbid": 211, "correlation": 161152461 + } + }, + { + "ph": "s", "id": 161152461, "pid": 5714, "tid": 5714, "ts": 6300866003689.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018938.318, "dur": 1.216, + "args": { + "External id": 85583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152478, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152478, "pid": 0, "tid": 7, "ts": 6300866018938.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003721.825, "dur": 5.440, + "args": { + "External id": 85583, "cbid": 211, "correlation": 161152478 + } + }, + { + "ph": "s", "id": 161152478, "pid": 5714, "tid": 5714, "ts": 6300866003721.825, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018940.206, "dur": 1.024, + "args": { + "External id": 85585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], 
"est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152488, "pid": 0, "tid": 7, "ts": 6300866018940.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003740.985, "dur": 4.720, + "args": { + "External id": 85585, "cbid": 211, "correlation": 161152488 + } + }, + { + "ph": "s", "id": 161152488, "pid": 5714, "tid": 5714, "ts": 6300866003740.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018941.934, "dur": 1.056, + "args": { + "External id": 85586, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152494, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152494, "pid": 0, "tid": 7, "ts": 6300866018941.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003752.625, "dur": 4.370, + "args": { + "External id": 85586, "cbid": 211, "correlation": 161152494 + } + }, + { + "ph": "s", "id": 161152494, "pid": 5714, "tid": 5714, "ts": 6300866003752.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018943.694, "dur": 1.025, + "args": { + "External id": 85587, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152504, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152504, "pid": 0, "tid": 7, "ts": 6300866018943.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003767.625, "dur": 4.430, + "args": { + "External id": 85587, "cbid": 211, "correlation": 161152504 + } + }, + { + "ph": "s", "id": 161152504, "pid": 5714, "tid": 5714, "ts": 6300866003767.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018945.423, "dur": 1.056, + "args": { + "External id": 85588, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152510, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152510, "pid": 0, "tid": 7, "ts": 6300866018945.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003777.695, "dur": 3.960, + "args": { + "External id": 85588, "cbid": 211, "correlation": 161152510 + } + }, + { + "ph": "s", "id": 161152510, "pid": 5714, "tid": 5714, "ts": 6300866003777.695, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866018947.183, "dur": 3.424, + "args": { + "External id": 85589, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152523, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152523, "pid": 0, "tid": 7, "ts": 6300866018947.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003808.335, "dur": 5.270, + "args": { + "External id": 85589, "cbid": 211, "correlation": 161152523 + } + }, + { + "ph": "s", "id": 161152523, "pid": 5714, "tid": 5714, "ts": 6300866003808.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018951.215, "dur": 1.088, + "args": { + "External id": 85592, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152529, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152529, "pid": 0, "tid": 7, "ts": 6300866018951.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003821.295, "dur": 4.260, + "args": { + "External id": 85592, "cbid": 211, "correlation": 161152529 + } + }, + { + "ph": "s", "id": 161152529, "pid": 5714, "tid": 5714, "ts": 6300866003821.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866018952.943, "dur": 0.992, + "args": { + "External id": 85593, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152535, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152535, "pid": 0, "tid": 7, "ts": 6300866018952.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003830.505, "dur": 4.090, + "args": { + "External id": 85593, "cbid": 211, "correlation": 161152535 + } + }, + { + "ph": "s", "id": 161152535, "pid": 5714, "tid": 5714, "ts": 6300866003830.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6300866018954.671, "dur": 233.762, + "args": { + "External id": 85209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152549, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 161152549, "pid": 0, "tid": 7, "ts": 6300866018954.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866003938.395, "dur": 22.580, + "args": { + "External id": 85209, "cbid": 307, "correlation": 161152549 + } + }, + { + "ph": "s", "id": 161152549, "pid": 5714, "tid": 5714, "ts": 6300866003938.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866003997.904, "dur": 0.540, + "args": { + "External id": 85597, "cbid": 200, "correlation": 161152572 + } + }, + { + "ph": "f", "id": 161152572, "pid": 5714, "tid": 5714, "ts": 6300866003997.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866019189.297, "dur": 0.800, + "args": { + "External id": 85597, "device": 0, "context": 1, "stream": 7, "correlation": 161152575, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161152575, "pid": 0, "tid": 7, "ts": 6300866019189.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6300866004000.244, "dur": 6.800, + "args": { + "External id": 85597, "cbid": 51, "correlation": 161152575 + } + }, + { + "ph": "s", "id": 161152575, "pid": 5714, "tid": 5714, "ts": 6300866004000.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866019191.281, "dur": 691.945, + "args": { + "External id": 85597, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152576, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152576, "pid": 0, "tid": 7, "ts": 6300866019191.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004007.275, "dur": 5.800, + "args": { + "External id": 85597, "cbid": 307, "correlation": 161152576 + } + }, + { + "ph": "s", "id": 161152576, "pid": 5714, "tid": 5714, "ts": 6300866004007.275, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866019883.898, "dur": 2.944, + "args": { + "External id": 85600, "device": 0, "context": 1, "stream": 7, "correlation": 161152581, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 161152581, "pid": 0, "tid": 7, "ts": 6300866019883.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866004039.795, "dur": 13.220, + "args": { + "External id": 85600, "cbid": 41, "correlation": 161152581 + } + }, + { + "ph": "s", "id": 161152581, "pid": 5714, "tid": 5714, "ts": 6300866004039.795, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6300866004094.564, "dur": 0.490, + "args": { + "External id": 85605, "cbid": 200, "correlation": 161152609 + } + }, + { + "ph": "f", "id": 161152609, "pid": 5714, "tid": 5714, "ts": 6300866004094.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866019887.514, "dur": 689.576, + "args": { + "External id": 85605, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161152612, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161152612, "pid": 0, "tid": 7, "ts": 6300866019887.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004096.534, "dur": 7.320, + "args": { + "External id": 85605, "cbid": 307, "correlation": 161152612 + } + }, + { + "ph": "s", "id": 161152612, "pid": 5714, "tid": 5714, "ts": 6300866004096.534, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866020577.730, "dur": 220.610, + "args": { + "External id": 85606, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152617, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152617, "pid": 0, "tid": 7, "ts": 6300866020577.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004119.264, "dur": 6.080, + "args": { + "External id": 85606, "cbid": 211, "correlation": 161152617 + } + }, + { + "ph": "s", "id": 161152617, "pid": 5714, "tid": 5714, "ts": 6300866004119.264, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866020799.012, "dur": 5.312, + "args": { + "External id": 85608, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152630, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152630, "pid": 0, "tid": 7, "ts": 6300866020799.012, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004148.524, "dur": 6.220, + "args": { + "External id": 85608, "cbid": 211, "correlation": 161152630 + } + }, + { + "ph": "s", "id": 161152630, "pid": 5714, "tid": 5714, "ts": 6300866004148.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866020805.028, "dur": 159.682, + "args": { + "External id": 85613, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152643, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152643, "pid": 0, "tid": 7, "ts": 6300866020805.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004177.904, "dur": 6.000, + "args": { + "External id": 85613, "cbid": 211, "correlation": 161152643 + } + }, + { + "ph": "s", "id": 161152643, "pid": 5714, "tid": 5714, "ts": 6300866004177.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866020965.318, "dur": 1.504, + "args": { + "External id": 85618, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152651, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152651, "pid": 0, "tid": 7, "ts": 6300866020965.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004242.064, "dur": 7.730, + "args": { + "External id": 85618, "cbid": 211, "correlation": 161152651 + } + }, + { + "ph": "s", "id": 161152651, "pid": 5714, "tid": 5714, "ts": 6300866004242.064, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866020967.462, "dur": 1.312, + "args": { + "External id": 85619, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152657, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152657, "pid": 0, "tid": 7, "ts": 6300866020967.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004261.224, "dur": 4.680, + "args": { + "External id": 85619, "cbid": 211, "correlation": 161152657 + } + }, + { + "ph": "s", "id": 161152657, "pid": 5714, "tid": 5714, "ts": 6300866004261.224, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300866004523.603, "dur": 3.280, + "args": { + "cbid": 147, "correlation": 161152662 + } + }, + { + "ph": "s", "id": 161152662, "pid": 5714, "tid": 5714, "ts": 6300866004523.603, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300866004530.283, "dur": 1.280, + "args": { + "cbid": 147, "correlation": 161152666 + } + }, + { + "ph": "s", "id": 161152666, "pid": 5714, "tid": 5714, "ts": 6300866004530.283, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866020969.478, "dur": 1.024, + "args": { + "External id": 85621, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152683, "pid": 0, "tid": 7, "ts": 6300866020969.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004605.423, "dur": 14.470, + "args": { + "External id": 85621, "cbid": 211, "correlation": 161152683 + } + }, + { + "ph": "s", "id": 161152683, "pid": 5714, "tid": 5714, "ts": 6300866004605.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866020971.238, "dur": 0.896, + "args": { + "External id": 85625, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152696, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152696, "pid": 0, "tid": 7, "ts": 6300866020971.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866004698.783, "dur": 8.670, + "args": { + "External id": 85625, "cbid": 211, "correlation": 161152696 + } + }, + { + "ph": "s", "id": 161152696, "pid": 5714, "tid": 5714, "ts": 6300866004698.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866020972.838, "dur": 1.089, + "args": { + "External id": 86019, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152712, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152712, "pid": 0, "tid": 7, "ts": 6300866020972.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866005268.252, "dur": 41.660, + "args": { + "External id": 86019, "cbid": 211, "correlation": 161152712 + } + }, + { + "ph": "s", "id": 161152712, "pid": 5714, "tid": 6744, "ts": 6300866005268.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866020974.599, "dur": 1.952, + "args": { + "External id": 86025, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152730, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152730, "pid": 0, "tid": 7, "ts": 6300866020974.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866005587.881, "dur": 17.960, + "args": { + "External id": 86025, "cbid": 211, "correlation": 161152730 + } + }, + { + "ph": "s", "id": 161152730, "pid": 5714, "tid": 6744, "ts": 6300866005587.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6300866020977.383, "dur": 0.352, + "args": { + "External id": 86033, "device": 0, "context": 1, "stream": 7, "correlation": 161152748, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 161152748, "pid": 0, "tid": 7, "ts": 6300866020977.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866005744.120, "dur": 25.191, + "args": { + "External id": 86033, "cbid": 41, "correlation": 161152748 + } + }, + { + "ph": "s", "id": 161152748, "pid": 5714, "tid": 6744, "ts": 6300866005744.120, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866005770.180, "dur": 15210.726, + "args": { + "External id": 86033, "cbid": 131, "correlation": 161152749 + } + }, + { + "ph": "s", "id": 161152749, "pid": 5714, "tid": 6744, "ts": 6300866005770.180, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866021029.799, "dur": 1.152, + "args": { + "External id": 86037, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152758, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152758, "pid": 0, "tid": 7, "ts": 6300866021029.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021017.386, "dur": 14.110, + "args": { + "External id": 86037, "cbid": 211, "correlation": 161152758 + } + }, + { + "ph": "s", "id": 161152758, "pid": 5714, "tid": 6744, "ts": 6300866021017.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300866021066.983, "dur": 1.024, + "args": { + "External id": 86040, "device": 0, "context": 1, "stream": 7, "correlation": 161152764, "bytes": 1, "memory bandwidth (GB/s)": 0.0009765625 + } + }, + { + "ph": "f", "id": 161152764, "pid": 0, "tid": 7, "ts": 6300866021066.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866021055.036, "dur": 11.670, + "args": { + "External id": 86040, "cbid": 41, "correlation": 161152764 + } + }, + { + "ph": "s", "id": 161152764, "pid": 5714, "tid": 6744, "ts": 6300866021055.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866021066.956, "dur": 3.580, + "args": { + "External id": 86040, "cbid": 131, "correlation": 161152765 + } + }, + { + "ph": "s", "id": 161152765, "pid": 5714, "tid": 6744, "ts": 6300866021066.956, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866021179.785, "dur": 
16.224, + "args": { + "External id": 86051, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152787, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152787, "pid": 0, "tid": 7, "ts": 6300866021179.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021167.516, "dur": 12.769, + "args": { + "External id": 86051, "cbid": 211, "correlation": 161152787 + } + }, + { + "ph": "s", "id": 161152787, "pid": 5714, "tid": 6744, "ts": 6300866021167.516, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866021219.177, "dur": 44.385, + "args": { + "External id": 86054, "device": 0, "context": 1, "stream": 7, "correlation": 161152794, "bytes": 25165824, "memory bandwidth (GB/s)": 566.9893883068604 + } + }, + { + "ph": "f", "id": 161152794, "pid": 0, "tid": 7, "ts": 6300866021219.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866021195.645, "dur": 22.891, + "args": { + "External id": 86054, "cbid": 41, "correlation": 161152794 + } + }, + { + "ph": "s", "id": 161152794, "pid": 5714, "tid": 6744, "ts": 6300866021195.645, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866021271.690, "dur": 85.345, + "args": { + "External id": 86061, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152812, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, 
"warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152812, "pid": 0, "tid": 7, "ts": 6300866021271.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021263.265, "dur": 8.050, + "args": { + "External id": 86061, "cbid": 211, "correlation": 161152812 + } + }, + { + "ph": "s", "id": 161152812, "pid": 5714, "tid": 6744, "ts": 6300866021263.265, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866021357.739, "dur": 69.537, + "args": { + "External id": 86064, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152820, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152820, "pid": 0, "tid": 7, "ts": 6300866021357.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021293.625, "dur": 16.910, + "args": { + "External id": 86064, "cbid": 211, "correlation": 161152820 + } + }, + { + "ph": "s", "id": 161152820, "pid": 5714, "tid": 6744, "ts": 6300866021293.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866021427.916, "dur": 87.137, + "args": { + "External id": 86071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152839, "pid": 0, "tid": 7, "ts": 6300866021427.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021350.955, "dur": 7.160, + "args": { + "External id": 86071, "cbid": 211, "correlation": 161152839 + } + }, + { + "ph": "s", "id": 161152839, "pid": 5714, "tid": 6744, "ts": 6300866021350.955, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866021515.789, "dur": 227.011, + "args": { + "External id": 86074, "device": 0, "context": 1, "stream": 7, "correlation": 161152846, "bytes": 100663296, "memory bandwidth (GB/s)": 443.42915541537633 + } + }, + { + "ph": "f", "id": 161152846, "pid": 0, "tid": 7, "ts": 6300866021515.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866021368.735, "dur": 22.580, + "args": { + "External id": 86074, "cbid": 41, "correlation": 161152846 + } + }, + { + "ph": "s", "id": 161152846, "pid": 5714, "tid": 6744, "ts": 6300866021368.735, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866021743.472, "dur": 98.913, + "args": { + "External id": 86081, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152864, "pid": 0, "tid": 7, "ts": 6300866021743.472, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021425.445, "dur": 7.090, + "args": { + "External id": 86081, "cbid": 211, "correlation": 161152864 + } + }, + { + "ph": "s", "id": 161152864, "pid": 5714, "tid": 6744, "ts": 6300866021425.445, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866021842.993, "dur": 226.434, + "args": { + "External id": 86084, "device": 0, "context": 1, "stream": 7, "correlation": 161152871, "bytes": 100663296, "memory bandwidth (GB/s)": 444.5591033148732 + } + }, + { + "ph": "f", "id": 161152871, "pid": 0, "tid": 7, "ts": 6300866021842.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866021444.725, "dur": 10.170, + "args": { + "External id": 86084, "cbid": 41, "correlation": 161152871 + } + }, + { + "ph": "s", "id": 161152871, "pid": 5714, "tid": 6744, "ts": 6300866021444.725, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866022070.163, "dur": 2.496, + "args": { + "External id": 86088, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152889, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152889, "pid": 0, "tid": 7, "ts": 6300866022070.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866021492.315, "dur": 8.350, + "args": { + "External id": 86088, "cbid": 211, "correlation": 161152889 + } + }, + { + "ph": "s", "id": 161152889, "pid": 5714, "tid": 6744, "ts": 6300866021492.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6300866022073.491, "dur": 0.352, + "args": { + "External id": 86096, "device": 0, "context": 1, "stream": 7, "correlation": 161152907, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 161152907, "pid": 0, "tid": 7, "ts": 6300866022073.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866021566.285, "dur": 10.330, + "args": { + "External id": 86096, "cbid": 41, "correlation": 161152907 + } + }, + { + "ph": "s", "id": 161152907, "pid": 5714, "tid": 6744, "ts": 6300866021566.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866021576.975, "dur": 498.988, + "args": { + "External id": 86096, "cbid": 131, "correlation": 161152908 + } + }, + { + "ph": "s", "id": 161152908, "pid": 5714, "tid": 6744, "ts": 6300866021576.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866022105.460, "dur": 1.376, + "args": { + "External id": 86100, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152917, "registers 
per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161152917, "pid": 0, "tid": 7, "ts": 6300866022105.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022096.043, "dur": 9.471, + "args": { + "External id": 86100, "cbid": 211, "correlation": 161152917 + } + }, + { + "ph": "s", "id": 161152917, "pid": 5714, "tid": 6744, "ts": 6300866022096.043, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300866022126.100, "dur": 1.088, + "args": { + "External id": 86103, "device": 0, "context": 1, "stream": 7, "correlation": 161152923, "bytes": 1, "memory bandwidth (GB/s)": 0.0009191176470588235 + } + }, + { + "ph": "f", "id": 161152923, "pid": 0, "tid": 7, "ts": 6300866022126.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866022117.453, "dur": 8.030, + "args": { + "External id": 86103, "cbid": 41, "correlation": 161152923 + } + }, + { + "ph": "s", "id": 161152923, "pid": 5714, "tid": 6744, "ts": 6300866022117.453, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866022125.783, "dur": 3.920, + "args": { + "External id": 86103, "cbid": 131, "correlation": 161152924 + } + }, + { + "ph": "s", "id": 161152924, "pid": 5714, "tid": 6744, "ts": 6300866022125.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866022170.132, "dur": 
160.450, + "args": { + "External id": 86104, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152931, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152931, "pid": 0, "tid": 7, "ts": 6300866022170.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022160.853, "dur": 9.000, + "args": { + "External id": 86104, "cbid": 211, "correlation": 161152931 + } + }, + { + "ph": "s", "id": 161152931, "pid": 5714, "tid": 6744, "ts": 6300866022160.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866022331.222, "dur": 17.601, + "args": { + "External id": 86115, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152955, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152955, "pid": 0, "tid": 7, "ts": 6300866022331.222, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022226.113, "dur": 7.950, + "args": { + "External id": 86115, "cbid": 211, "correlation": 161152955 + } + }, + { + "ph": "s", "id": 161152955, "pid": 5714, "tid": 6744, "ts": 6300866022226.113, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866022349.559, "dur": 44.032, + "args": { + "External id": 86118, "device": 0, "context": 1, "stream": 7, "correlation": 161152962, "bytes": 25165824, "memory bandwidth (GB/s)": 571.5348837209302 + } + }, + { + "ph": "f", "id": 161152962, "pid": 0, "tid": 7, "ts": 6300866022349.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866022247.163, "dur": 13.140, + "args": { + "External id": 86118, "cbid": 41, "correlation": 161152962 + } + }, + { + "ph": "s", "id": 161152962, "pid": 5714, "tid": 6744, "ts": 6300866022247.163, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866022394.263, "dur": 86.817, + "args": { + "External id": 86125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152980, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152980, "pid": 0, "tid": 7, "ts": 6300866022394.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022305.253, "dur": 8.470, + "args": { + "External id": 86125, "cbid": 211, "correlation": 161152980 + } + }, + { + "ph": "s", "id": 161152980, "pid": 5714, "tid": 6744, "ts": 6300866022305.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866022481.784, "dur": 69.633, + "args": { + "External id": 86128, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161152988, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161152988, "pid": 0, "tid": 7, "ts": 6300866022481.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022330.353, "dur": 5.290, + "args": { + "External id": 86128, "cbid": 211, "correlation": 161152988 + } + }, + { + "ph": "s", "id": 161152988, "pid": 5714, "tid": 6744, "ts": 6300866022330.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866022552.057, "dur": 87.041, + "args": { + "External id": 86135, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153007, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153007, "pid": 0, "tid": 7, "ts": 6300866022552.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022370.473, "dur": 6.450, + "args": { + "External id": 86135, "cbid": 211, "correlation": 161153007 + } + }, + { + "ph": "s", "id": 161153007, "pid": 5714, "tid": 6744, "ts": 6300866022370.473, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866022641.082, "dur": 225.059, + "args": { + "External id": 86138, "device": 0, "context": 1, "stream": 7, "correlation": 161153014, "bytes": 100663296, "memory bandwidth (GB/s)": 447.2751411852003 + } + }, + { + "ph": "f", "id": 161153014, "pid": 0, "tid": 7, "ts": 6300866022641.082, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866022388.313, "dur": 15.750, + "args": { + "External id": 86138, "cbid": 41, "correlation": 161153014 + } + }, + { + "ph": "s", "id": 161153014, "pid": 5714, "tid": 6744, "ts": 6300866022388.313, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866022866.813, "dur": 98.113, + "args": { + "External id": 86145, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153032, "pid": 0, "tid": 7, "ts": 6300866022866.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022436.783, "dur": 7.190, + "args": { + "External id": 86145, "cbid": 211, "correlation": 161153032 + } + }, + { + "ph": "s", "id": 161153032, "pid": 5714, "tid": 6744, "ts": 6300866022436.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866022965.598, "dur": 224.514, + "args": { + "External id": 86148, "device": 0, "context": 1, "stream": 7, "correlation": 161153039, "bytes": 100663296, "memory bandwidth (GB/s)": 448.3608861808172 + } + }, + { + "ph": "f", "id": 161153039, "pid": 0, "tid": 7, "ts": 6300866022965.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866022454.313, "dur": 9.509, + "args": { + "External id": 86148, "cbid": 41, "correlation": 161153039 + } + }, + { + "ph": "s", "id": 161153039, "pid": 5714, "tid": 6744, "ts": 6300866022454.313, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866023190.720, "dur": 323.717, + "args": { + "External id": 86149, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153046, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153046, "pid": 0, "tid": 7, "ts": 6300866023190.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022476.493, "dur": 5.289, + "args": { + "External id": 86149, "cbid": 211, "correlation": 161153046 + } + }, + { + "ph": "s", "id": 161153046, "pid": 5714, "tid": 6744, "ts": 6300866022476.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866023515.077, "dur": 2.816, + "args": { + "External id": 86153, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153066, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161153066, "pid": 0, "tid": 7, "ts": 6300866023515.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866022516.613, "dur": 7.320, + "args": { + "External id": 86153, "cbid": 211, "correlation": 161153066 + } + }, + { + "ph": "s", "id": 161153066, "pid": 5714, "tid": 6744, "ts": 6300866022516.613, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6300866023518.821, "dur": 0.384, + "args": { + "External id": 86161, "device": 0, "context": 1, "stream": 7, "correlation": 161153084, "bytes": 4, "memory bandwidth (GB/s)": 0.010416666666666666 + } + }, + { + "ph": "f", "id": 161153084, "pid": 0, "tid": 7, "ts": 6300866023518.821, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866022587.422, "dur": 9.850, + "args": { + "External id": 86161, "cbid": 41, "correlation": 161153084 + } + }, + { + "ph": "s", "id": 161153084, "pid": 5714, "tid": 6744, "ts": 6300866022587.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866022597.632, "dur": 923.948, + "args": { + "External id": 86161, "cbid": 131, "correlation": 161153085 + } + }, + { + "ph": "s", "id": 161153085, "pid": 5714, "tid": 6744, "ts": 6300866022597.632, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866023553.509, "dur": 1.376, + "args": { + "External id": 86165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153094, "registers 
per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161153094, "pid": 0, "tid": 7, "ts": 6300866023553.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023543.460, "dur": 10.090, + "args": { + "External id": 86165, "cbid": 211, "correlation": 161153094 + } + }, + { + "ph": "s", "id": 161153094, "pid": 5714, "tid": 6744, "ts": 6300866023543.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300866023575.429, "dur": 1.024, + "args": { + "External id": 86168, "device": 0, "context": 1, "stream": 7, "correlation": 161153100, "bytes": 1, "memory bandwidth (GB/s)": 0.0009765625 + } + }, + { + "ph": "f", "id": 161153100, "pid": 0, "tid": 7, "ts": 6300866023575.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866023565.780, "dur": 9.020, + "args": { + "External id": 86168, "cbid": 41, "correlation": 161153100 + } + }, + { + "ph": "s", "id": 161153100, "pid": 5714, "tid": 6744, "ts": 6300866023565.780, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866023575.040, "dur": 3.800, + "args": { + "External id": 86168, "cbid": 131, "correlation": 161153101 + } + }, + { + "ph": "s", "id": 161153101, "pid": 5714, "tid": 6744, "ts": 6300866023575.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866023610.726, "dur": 159.202, 
+ "args": { + "External id": 86169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153108, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153108, "pid": 0, "tid": 7, "ts": 6300866023610.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023602.830, "dur": 7.270, + "args": { + "External id": 86169, "cbid": 211, "correlation": 161153108 + } + }, + { + "ph": "s", "id": 161153108, "pid": 5714, "tid": 6744, "ts": 6300866023602.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866023770.568, "dur": 17.343, + "args": { + "External id": 86180, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153132, "pid": 0, "tid": 7, "ts": 6300866023770.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023666.740, "dur": 8.220, + "args": { + "External id": 86180, "cbid": 211, "correlation": 161153132 + } + }, + { + "ph": "s", "id": 161153132, "pid": 5714, "tid": 6744, "ts": 6300866023666.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866023788.551, "dur": 45.857, + "args": { + "External id": 86183, "device": 0, "context": 1, "stream": 7, "correlation": 161153139, "bytes": 25165824, "memory bandwidth (GB/s)": 548.7891488758532 + } + }, + { + "ph": "f", "id": 161153139, "pid": 0, "tid": 7, "ts": 6300866023788.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866023686.600, "dur": 12.740, + "args": { + "External id": 86183, "cbid": 41, "correlation": 161153139 + } + }, + { + "ph": "s", "id": 161153139, "pid": 5714, "tid": 6744, "ts": 6300866023686.600, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866023835.112, "dur": 86.881, + "args": { + "External id": 86190, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153157, "pid": 0, "tid": 7, "ts": 6300866023835.112, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023738.620, "dur": 7.170, + "args": { + "External id": 86190, "cbid": 211, "correlation": 161153157 + } + }, + { + "ph": "s", "id": 161153157, "pid": 5714, "tid": 6744, "ts": 6300866023738.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866023922.697, "dur": 69.793, + "args": { + "External id": 86193, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153165, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153165, "pid": 0, "tid": 7, "ts": 6300866023922.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023760.630, "dur": 5.190, + "args": { + "External id": 86193, "cbid": 211, "correlation": 161153165 + } + }, + { + "ph": "s", "id": 161153165, "pid": 5714, "tid": 6744, "ts": 6300866023760.630, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866023993.194, "dur": 87.137, + "args": { + "External id": 86200, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153184, "pid": 0, "tid": 7, "ts": 6300866023993.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023798.030, "dur": 6.569, + "args": { + "External id": 86200, "cbid": 211, "correlation": 161153184 + } + }, + { + "ph": "s", "id": 161153184, "pid": 5714, "tid": 6744, "ts": 6300866023798.030, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866024080.939, "dur": 227.619, + "args": { + "External id": 86203, "device": 0, "context": 1, "stream": 7, "correlation": 161153191, "bytes": 100663296, "memory bandwidth (GB/s)": 442.24469837755197 + } + }, + { + "ph": "f", "id": 161153191, "pid": 0, "tid": 7, "ts": 6300866024080.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866023814.070, "dur": 10.820, + "args": { + "External id": 86203, "cbid": 41, "correlation": 161153191 + } + }, + { + "ph": "s", "id": 161153191, "pid": 5714, "tid": 6744, "ts": 6300866023814.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866024309.166, "dur": 96.577, + "args": { + "External id": 86210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153209, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153209, "pid": 0, "tid": 7, "ts": 6300866024309.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023860.290, "dur": 6.740, + "args": { + "External id": 86210, "cbid": 211, "correlation": 161153209 + } + }, + { + "ph": "s", "id": 161153209, "pid": 5714, "tid": 6744, "ts": 6300866023860.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866024406.383, "dur": 226.339, + "args": { + "External id": 86213, "device": 0, "context": 1, "stream": 7, "correlation": 161153216, "bytes": 100663296, "memory bandwidth (GB/s)": 444.7456956158682 + } + }, + { + "ph": "f", "id": 161153216, "pid": 0, "tid": 7, "ts": 6300866024406.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866023876.219, "dur": 9.660, + "args": { + "External id": 86213, "cbid": 41, "correlation": 161153216 + } + }, + { + "ph": "s", "id": 161153216, "pid": 5714, "tid": 6744, "ts": 6300866023876.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866024633.394, "dur": 324.035, + "args": { + "External id": 86214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153223, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153223, "pid": 0, "tid": 7, "ts": 6300866024633.394, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023898.279, "dur": 4.950, + "args": { + "External id": 86214, "cbid": 211, "correlation": 161153223 + } + }, + { + "ph": "s", "id": 161153223, "pid": 5714, "tid": 6744, "ts": 6300866023898.279, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866024958.037, "dur": 2.592, + "args": { + "External id": 86218, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153243, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161153243, "pid": 0, "tid": 7, "ts": 6300866024958.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866023936.459, "dur": 7.440, + "args": { + "External id": 86218, "cbid": 211, "correlation": 161153243 + } + }, + { + "ph": "s", "id": 161153243, "pid": 5714, "tid": 6744, "ts": 6300866023936.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6300866024961.525, "dur": 0.384, + "args": { + "External id": 86226, "device": 0, "context": 1, "stream": 7, "correlation": 161153261, "bytes": 4, "memory bandwidth (GB/s)": 0.010416666666666666 + } + }, + { + "ph": "f", "id": 161153261, "pid": 0, "tid": 7, "ts": 6300866024961.525, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866024013.249, "dur": 11.120, + "args": { + "External id": 86226, "cbid": 41, "correlation": 161153261 + } + }, + { + "ph": "s", "id": 161153261, "pid": 5714, "tid": 6744, "ts": 6300866024013.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866024024.719, "dur": 939.458, + "args": { + "External id": 86226, "cbid": 131, "correlation": 161153262 + } + }, + { + "ph": "s", "id": 161153262, "pid": 5714, "tid": 6744, "ts": 6300866024024.719, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866024995.286, "dur": 1.440, + "args": { + "External id": 86230, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153271, "registers 
per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161153271, "pid": 0, "tid": 7, "ts": 6300866024995.286, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866024985.287, "dur": 10.170, + "args": { + "External id": 86230, "cbid": 211, "correlation": 161153271 + } + }, + { + "ph": "s", "id": 161153271, "pid": 5714, "tid": 6744, "ts": 6300866024985.287, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300866025017.942, "dur": 0.992, + "args": { + "External id": 86233, "device": 0, "context": 1, "stream": 7, "correlation": 161153277, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 161153277, "pid": 0, "tid": 7, "ts": 6300866025017.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866025008.637, "dur": 8.610, + "args": { + "External id": 86233, "cbid": 41, "correlation": 161153277 + } + }, + { + "ph": "s", "id": 161153277, "pid": 5714, "tid": 6744, "ts": 6300866025008.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6300866025017.537, "dur": 3.790, + "args": { + "External id": 86233, "cbid": 131, "correlation": 161153278 + } + }, + { + "ph": "s", "id": 161153278, "pid": 5714, "tid": 6744, "ts": 6300866025017.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866025053.878, "dur": 
159.074, + "args": { + "External id": 86234, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153285, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153285, "pid": 0, "tid": 7, "ts": 6300866025053.878, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025045.937, "dur": 7.340, + "args": { + "External id": 86234, "cbid": 211, "correlation": 161153285 + } + }, + { + "ph": "s", "id": 161153285, "pid": 5714, "tid": 6744, "ts": 6300866025045.937, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866025213.624, "dur": 17.761, + "args": { + "External id": 86245, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153309, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153309, "pid": 0, "tid": 7, "ts": 6300866025213.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025109.267, "dur": 7.909, + "args": { + "External id": 86245, "cbid": 211, "correlation": 161153309 + } + }, + { + "ph": "s", "id": 161153309, "pid": 5714, "tid": 6744, "ts": 6300866025109.267, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866025232.057, "dur": 44.704, + "args": { + "External id": 86248, "device": 0, "context": 1, "stream": 7, "correlation": 161153316, "bytes": 25165824, "memory bandwidth (GB/s)": 562.9434502505369 + } + }, + { + "ph": "f", "id": 161153316, "pid": 0, "tid": 7, "ts": 6300866025232.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866025128.856, "dur": 13.631, + "args": { + "External id": 86248, "cbid": 41, "correlation": 161153316 + } + }, + { + "ph": "s", "id": 161153316, "pid": 5714, "tid": 6744, "ts": 6300866025128.856, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866025277.497, "dur": 87.137, + "args": { + "External id": 86255, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153334, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153334, "pid": 0, "tid": 7, "ts": 6300866025277.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025182.747, "dur": 7.369, + "args": { + "External id": 86255, "cbid": 211, "correlation": 161153334 + } + }, + { + "ph": "s", "id": 161153334, "pid": 5714, "tid": 6744, "ts": 6300866025182.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866025365.242, "dur": 68.641, + "args": { + "External id": 86258, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153342, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153342, "pid": 0, "tid": 7, "ts": 6300866025365.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025204.806, "dur": 5.340, + "args": { + "External id": 86258, "cbid": 211, "correlation": 161153342 + } + }, + { + "ph": "s", "id": 161153342, "pid": 5714, "tid": 6744, "ts": 6300866025204.806, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866025434.587, "dur": 88.033, + "args": { + "External id": 86265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153361, "pid": 0, "tid": 7, "ts": 6300866025434.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025243.566, "dur": 6.340, + "args": { + "External id": 86265, "cbid": 211, "correlation": 161153361 + } + }, + { + "ph": "s", "id": 161153361, "pid": 5714, "tid": 6744, "ts": 6300866025243.566, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866025523.356, "dur": 225.059, + "args": { + "External id": 86268, "device": 0, "context": 1, "stream": 7, "correlation": 161153368, "bytes": 100663296, "memory bandwidth (GB/s)": 447.2751411852003 + } + }, + { + "ph": "f", "id": 161153368, "pid": 0, "tid": 7, "ts": 6300866025523.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866025259.696, "dur": 19.270, + "args": { + "External id": 86268, "cbid": 41, "correlation": 161153368 + } + }, + { + "ph": "s", "id": 161153368, "pid": 5714, "tid": 6744, "ts": 6300866025259.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866025749.087, "dur": 98.529, + "args": { + "External id": 86275, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153386, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153386, "pid": 0, "tid": 7, "ts": 6300866025749.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025329.556, "dur": 8.470, + "args": { + "External id": 86275, "cbid": 211, "correlation": 161153386 + } + }, + { + "ph": "s", "id": 161153386, "pid": 5714, "tid": 6744, "ts": 6300866025329.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866025848.352, "dur": 225.986, + "args": { + "External id": 86278, "device": 0, "context": 1, "stream": 7, "correlation": 161153393, "bytes": 100663296, "memory bandwidth (GB/s)": 445.4404078128734 + } + }, + { + "ph": "f", "id": 161153393, "pid": 0, "tid": 7, "ts": 6300866025848.352, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866025349.286, "dur": 10.580, + "args": { + "External id": 86278, "cbid": 41, "correlation": 161153393 + } + }, + { + "ph": "s", "id": 161153393, "pid": 5714, "tid": 6744, "ts": 6300866025349.286, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866026075.010, "dur": 323.268, + "args": { + "External id": 86279, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153400, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153400, "pid": 0, "tid": 7, "ts": 6300866026075.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025373.446, "dur": 5.410, + "args": { + "External id": 86279, "cbid": 211, "correlation": 161153400 + } + }, + { + "ph": "s", "id": 161153400, "pid": 5714, "tid": 6744, "ts": 6300866025373.446, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866026398.918, "dur": 198.755, + "args": { + "External id": 86282, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153421, "registers per thread": 35, "shared memory": 1024, "blocks per SM": 16.031250, "warps per SM": 64.125000, "grid": [2052, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153421, "pid": 0, "tid": 7, "ts": 6300866026398.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025579.106, "dur": 10.849, + "args": { + "External id": 86282, "cbid": 307, "correlation": 161153421 + } + }, + { + "ph": "s", "id": 161153421, "pid": 5714, "tid": 6744, "ts": 6300866025579.106, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866026598.281, "dur": 4.704, + "args": { + "External id": 86283, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153429, "registers per thread": 21, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161153429, "pid": 0, "tid": 7, "ts": 6300866026598.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025629.186, "dur": 7.259, + "args": { + "External id": 86283, "cbid": 307, "correlation": 161153429 + } + }, + { + "ph": "s", "id": 161153429, "pid": 5714, "tid": 6744, "ts": 6300866025629.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866026603.689, "dur": 311.267, + "args": { + "External id": 86284, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153440, "registers per thread": 24, "shared memory": 32, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153440, "pid": 0, "tid": 7, "ts": 6300866026603.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025675.615, "dur": 7.350, + "args": { + "External id": 86284, "cbid": 307, "correlation": 161153440 + } + }, + { + "ph": "s", "id": 161153440, "pid": 5714, "tid": 6744, "ts": 6300866025675.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866026915.660, "dur": 327.876, + "args": { + "External id": 86313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153477, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153477, "pid": 0, "tid": 7, "ts": 6300866026915.660, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866025950.245, "dur": 13.960, + "args": { + "External id": 86313, "cbid": 211, "correlation": 161153477 + } + }, + { + "ph": "s", "id": 161153477, "pid": 5714, "tid": 6744, "ts": 6300866025950.245, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300866027244.208, "dur": 433.957, + "args": { + "External id": 86302, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153505, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153505, "pid": 0, "tid": 7, "ts": 6300866027244.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026048.425, "dur": 8.920, + "args": { + "External id": 86302, "cbid": 307, "correlation": 161153505 + } + }, + { + "ph": "s", "id": 161153505, "pid": 5714, "tid": 6744, "ts": 6300866026048.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866026189.534, "dur": 1.120, + "args": { + "External id": 86338, "cbid": 200, "correlation": 161153530 + } + }, + { + "ph": "f", "id": 161153530, "pid": 5714, "tid": 6744, "ts": 6300866026189.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866027679.093, "dur": 0.832, + "args": { + "External id": 86338, "device": 0, "context": 1, "stream": 7, "correlation": 
161153533, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161153533, "pid": 0, "tid": 7, "ts": 6300866027679.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866026194.234, "dur": 13.380, + "args": { + "External id": 86338, "cbid": 51, "correlation": 161153533 + } + }, + { + "ph": "s", "id": 161153533, "pid": 5714, "tid": 6744, "ts": 6300866026194.234, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866027681.205, "dur": 368.037, + "args": { + "External id": 86338, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153534, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153534, "pid": 0, "tid": 7, "ts": 6300866027681.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026207.904, "dur": 9.340, + "args": { + "External id": 86338, "cbid": 307, "correlation": 161153534 + } + }, + { + "ph": "s", "id": 161153534, "pid": 5714, "tid": 6744, "ts": 6300866026207.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866026360.994, "dur": 0.600, + "args": { + "External id": 86356, "cbid": 200, "correlation": 161153571 + } + }, + { + "ph": "f", "id": 161153571, "pid": 5714, "tid": 6744, "ts": 6300866026360.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866028050.042, "dur": 0.832, + "args": { + "External id": 86356, "device": 0, "context": 1, "stream": 7, "correlation": 161153574, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161153574, "pid": 0, "tid": 7, "ts": 6300866028050.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866026363.394, "dur": 7.060, + "args": { + "External id": 86356, "cbid": 51, "correlation": 161153574 + } + }, + { + "ph": "s", "id": 161153574, "pid": 5714, "tid": 6744, "ts": 6300866026363.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866028052.058, "dur": 353.700, + "args": { + "External id": 86356, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153575, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153575, "pid": 0, "tid": 7, "ts": 6300866028052.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026370.694, "dur": 8.890, + "args": { + "External id": 86356, "cbid": 307, "correlation": 161153575 + } + }, + { + "ph": "s", "id": 161153575, "pid": 5714, "tid": 6744, "ts": 6300866026370.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866026415.704, "dur": 0.320, + "args": { + "External id": 86363, "cbid": 200, "correlation": 161153600 + } + }, + { + "ph": "f", "id": 161153600, "pid": 5714, "tid": 6744, "ts": 6300866026415.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866028406.366, "dur": 353.988, + "args": { + "External id": 86363, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153603, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153603, "pid": 0, "tid": 7, "ts": 6300866028406.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026417.513, "dur": 6.800, + "args": { + "External id": 86363, "cbid": 307, "correlation": 161153603 + } + }, + { + "ph": "s", "id": 161153603, "pid": 5714, "tid": 6744, "ts": 6300866026417.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866026535.283, "dur": 0.400, + "args": { + "External id": 86386, "cbid": 200, "correlation": 161153648 + } + }, + { + "ph": "f", "id": 161153648, "pid": 5714, "tid": 6744, "ts": 6300866026535.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866028761.282, "dur": 0.768, + "args": { + "External id": 86386, "device": 0, "context": 1, "stream": 7, "correlation": 161153651, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 161153651, "pid": 0, "tid": 7, "ts": 6300866028761.282, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866026537.153, "dur": 6.400, + "args": { + "External id": 86386, "cbid": 51, "correlation": 161153651 + } + }, + { + "ph": "s", "id": 161153651, "pid": 5714, "tid": 6744, "ts": 6300866026537.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866028762.850, "dur": 357.956, + "args": { + "External id": 86386, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153652, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153652, "pid": 0, "tid": 7, "ts": 6300866028762.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026543.743, "dur": 7.470, + "args": { + "External id": 86386, "cbid": 307, "correlation": 161153652 + } + }, + { + "ph": "s", "id": 161153652, "pid": 5714, "tid": 6744, "ts": 6300866026543.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866026582.563, "dur": 0.260, + "args": { + "External id": 86393, "cbid": 200, "correlation": 161153677 + } + }, + { + "ph": "f", "id": 161153677, "pid": 5714, "tid": 6744, "ts": 6300866026582.563, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866029121.478, "dur": 356.549, + "args": { + "External id": 86393, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153680, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153680, "pid": 0, "tid": 7, "ts": 6300866029121.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026583.893, "dur": 5.500, + "args": { + "External id": 86393, "cbid": 307, "correlation": 161153680 + } + }, + { + "ph": "s", "id": 161153680, "pid": 5714, "tid": 6744, "ts": 6300866026583.893, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866029478.763, "dur": 51.424, + "args": { + "External id": 86398, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153694, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153694, "pid": 0, "tid": 7, "ts": 6300866029478.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026636.443, "dur": 8.110, + "args": { + "External id": 86398, "cbid": 211, "correlation": 161153694 + } + }, + { + "ph": "s", "id": 161153694, "pid": 5714, "tid": 6744, "ts": 6300866026636.443, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866029530.859, "dur": 45.345, + "args": { + "External id": 86410, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153718, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153718, "pid": 0, "tid": 7, "ts": 6300866029530.859, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026707.993, "dur": 10.340, + "args": { + "External id": 86410, "cbid": 211, "correlation": 161153718 + } + }, + { + "ph": "s", "id": 161153718, "pid": 5714, "tid": 6744, "ts": 6300866026707.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866029576.940, "dur": 24.832, + "args": { + "External id": 86411, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153728, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153728, "pid": 0, "tid": 7, "ts": 6300866029576.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026729.013, "dur": 6.020, + "args": { + "External id": 86411, "cbid": 211, "correlation": 161153728 + } + }, + { + "ph": "s", "id": 161153728, "pid": 5714, "tid": 6744, "ts": 6300866026729.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866029602.604, "dur": 0.768, + "args": { + "External id": 86412, "device": 0, "context": 1, "stream": 7, "correlation": 161153743, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 161153743, "pid": 0, "tid": 7, "ts": 6300866029602.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866026755.613, "dur": 7.580, + "args": { + "External id": 86412, "cbid": 51, "correlation": 161153743 + } + }, + { + "ph": "s", "id": 161153743, "pid": 5714, "tid": 6744, "ts": 6300866026755.613, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866029604.652, "dur": 42.464, + "args": { + "External id": 86412, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153745, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161153745, "pid": 0, "tid": 7, "ts": 6300866029604.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026764.903, "dur": 6.940, + "args": { + "External id": 86412, "cbid": 211, "correlation": 161153745 + } + }, + { + "ph": "s", "id": 161153745, "pid": 5714, "tid": 6744, "ts": 6300866026764.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866029647.852, "dur": 52.129, + "args": { + "External id": 86423, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153766, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153766, "pid": 0, "tid": 7, "ts": 6300866029647.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026837.033, "dur": 10.080, + "args": { + "External id": 86423, "cbid": 211, "correlation": 161153766 + } + }, + { + "ph": "s", "id": 161153766, "pid": 5714, "tid": 6744, "ts": 6300866026837.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866029700.653, "dur": 140.898, + "args": { + "External id": 86426, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153781, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153781, "pid": 0, "tid": 7, "ts": 6300866029700.653, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026874.503, "dur": 7.609, + "args": { + "External id": 86426, "cbid": 211, "correlation": 161153781 + } + }, + { + "ph": "s", "id": 161153781, "pid": 5714, "tid": 6744, "ts": 6300866026874.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866029842.223, "dur": 109.665, + "args": { + "External id": 86427, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153791, "pid": 0, "tid": 7, "ts": 6300866029842.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026893.712, "dur": 6.271, + "args": { + "External id": 86427, "cbid": 211, "correlation": 161153791 + } + }, + { + "ph": "s", "id": 161153791, "pid": 5714, "tid": 6744, "ts": 6300866026893.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866029952.624, "dur": 77.729, + "args": { + "External id": 86428, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153805, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153805, "pid": 0, "tid": 7, "ts": 6300866029952.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026914.843, "dur": 5.449, + "args": { + "External id": 86428, "cbid": 211, "correlation": 161153805 + } + }, + { + "ph": "s", "id": 161153805, "pid": 5714, "tid": 6744, "ts": 6300866026914.843, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030031.089, "dur": 1.472, + "args": { + "External id": 86431, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161153819, "pid": 0, "tid": 7, "ts": 6300866030031.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026961.172, "dur": 8.960, + "args": { + "External id": 86431, "cbid": 211, "correlation": 161153819 + } + }, + { + "ph": "s", "id": 161153819, "pid": 5714, "tid": 6744, "ts": 6300866026961.172, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030033.233, "dur": 1.248, + "args": { + "External id": 86435, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153829, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161153829, "pid": 0, "tid": 7, "ts": 6300866030033.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866026988.972, "dur": 6.620, + "args": { + "External id": 86435, "cbid": 211, "correlation": 161153829 + } + }, + { + "ph": "s", "id": 161153829, "pid": 5714, "tid": 6744, "ts": 6300866026988.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030035.121, "dur": 1.024, + "args": { + "External id": 86436, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153839, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161153839, "pid": 0, "tid": 7, "ts": 6300866030035.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027004.712, "dur": 4.260, + "args": { + "External id": 86436, "cbid": 211, "correlation": 161153839 + } + }, + { + "ph": "s", "id": 161153839, "pid": 5714, "tid": 6744, "ts": 6300866027004.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866030036.881, "dur": 26.784, + "args": { + "External id": 86444, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153857, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153857, "pid": 0, "tid": 7, "ts": 6300866030036.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027064.512, "dur": 9.310, + "args": { + "External id": 86444, "cbid": 211, "correlation": 161153857 + } + }, + { + "ph": "s", "id": 161153857, "pid": 5714, "tid": 6744, "ts": 6300866027064.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866030064.305, "dur": 113.218, + "args": { + "External id": 86450, "device": 0, "context": 1, "stream": 7, "correlation": 161153871, "bytes": 50331648, "memory bandwidth (GB/s)": 444.55517673868115 + } + }, + { + "ph": "f", "id": 161153871, "pid": 0, "tid": 7, "ts": 6300866030064.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866027105.292, "dur": 17.260, + "args": { + "External id": 86450, "cbid": 41, "correlation": 161153871 + } + }, + { + "ph": "s", "id": 161153871, "pid": 5714, "tid": 6744, "ts": 6300866027105.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030178.227, "dur": 71.072, + "args": { + "External id": 86452, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153883, "pid": 0, "tid": 7, "ts": 6300866030178.227, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027138.692, "dur": 5.850, + "args": { + "External id": 86452, "cbid": 211, "correlation": 161153883 + } + }, + { + "ph": "s", "id": 161153883, "pid": 5714, "tid": 6744, "ts": 6300866027138.692, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030250.035, "dur": 147.522, + "args": { + "External id": 86453, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153893, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153893, "pid": 0, "tid": 7, "ts": 6300866030250.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027153.112, "dur": 4.290, + "args": { + "External id": 86453, "cbid": 211, "correlation": 161153893 + } + }, + { + "ph": "s", "id": 161153893, "pid": 5714, "tid": 6744, "ts": 6300866027153.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030398.261, "dur": 143.778, + "args": { + "External id": 86454, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153900, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153900, "pid": 0, "tid": 7, "ts": 6300866030398.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027173.562, "dur": 6.140, + "args": { + "External id": 86454, "cbid": 211, "correlation": 161153900 + } + }, + { + "ph": "s", "id": 161153900, "pid": 5714, "tid": 6744, "ts": 6300866027173.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866030542.679, "dur": 47.169, + "args": { + "External id": 86460, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153919, "pid": 0, "tid": 7, "ts": 6300866030542.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027215.742, "dur": 8.100, + "args": { + "External id": 86460, "cbid": 211, "correlation": 161153919 + } + }, + { + "ph": "s", "id": 161153919, "pid": 5714, "tid": 6744, "ts": 6300866027215.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866030590.456, "dur": 57.408, + "args": { + "External id": 86461, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153931, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161153931, "pid": 0, "tid": 7, "ts": 6300866030590.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027240.342, "dur": 6.940, + "args": { + "External id": 86461, "cbid": 211, "correlation": 161153931 + } + }, + { + "ph": "s", "id": 161153931, "pid": 5714, "tid": 6744, "ts": 6300866027240.342, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866027346.722, "dur": 0.549, + "args": { + "External id": 86473, "cbid": 200, "correlation": 161153971 + } + }, + { + "ph": "f", "id": 161153971, "pid": 5714, "tid": 6744, "ts": 6300866027346.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866030648.632, "dur": 0.768, + "args": { + "External id": 86473, "device": 0, "context": 1, "stream": 7, "correlation": 161153974, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161153974, "pid": 0, "tid": 7, "ts": 6300866030648.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866027349.062, "dur": 7.469, + "args": { + "External id": 86473, "cbid": 51, "correlation": 161153974 + } + }, + { + "ph": "s", "id": 161153974, "pid": 5714, "tid": 6744, "ts": 6300866027349.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866030650.200, "dur": 137.826, + "args": { + "External id": 86473, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153975, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153975, "pid": 0, "tid": 7, "ts": 6300866030650.200, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027356.791, "dur": 7.991, + "args": { + "External id": 86473, "cbid": 307, "correlation": 161153975 + } + }, + { + "ph": "s", "id": 161153975, "pid": 5714, "tid": 6744, "ts": 6300866027356.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866030788.634, "dur": 122.145, + "args": { + "External id": 86480, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161153997, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161153997, "pid": 0, "tid": 7, "ts": 6300866030788.634, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027399.481, "dur": 7.920, + "args": { + "External id": 86480, "cbid": 211, "correlation": 161153997 + } + }, + { + "ph": "s", "id": 161153997, "pid": 5714, "tid": 6744, "ts": 6300866027399.481, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866027636.941, "dur": 0.760, + "args": { + "External id": 86506, "cbid": 200, "correlation": 161154044 + } + }, + { + "ph": "f", "id": 161154044, "pid": 5714, "tid": 6744, "ts": 6300866027636.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866027637.841, "dur": 0.220, + "args": { + "External id": 86506, "cbid": 200, "correlation": 161154045 + } + }, + { + "ph": "f", "id": 161154045, "pid": 5714, "tid": 6744, "ts": 6300866027637.841, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866027661.771, "dur": 0.340, + "args": { + "External id": 86506, "cbid": 200, "correlation": 161154063 + } + }, + { + "ph": "f", "id": 161154063, "pid": 5714, "tid": 6744, "ts": 6300866027661.771, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866030911.387, "dur": 92.225, + "args": { + "External id": 86506, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154064, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154064, "pid": 0, "tid": 7, "ts": 6300866030911.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027664.081, "dur": 13.500, + "args": { + "External id": 86506, "cbid": 211, "correlation": 161154064 + } + }, + { + "ph": "s", "id": 161154064, "pid": 5714, "tid": 6744, "ts": 6300866027664.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866027679.201, "dur": 1.430, + "args": { + "External id": 86506, "cbid": 273, "correlation": 161154066 + } + }, + { + "ph": "f", "id": 161154066, "pid": 5714, "tid": 6744, "ts": 6300866027679.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866031004.252, "dur": 991.244, + "args": { + "External id": 86506, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154067, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161154067, "pid": 0, "tid": 7, "ts": 6300866031004.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027680.941, "dur": 5.800, + "args": { + "External id": 86506, "cbid": 211, "correlation": 161154067 + } + }, + { + "ph": "s", "id": 161154067, "pid": 5714, "tid": 6744, "ts": 6300866027680.941, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866031996.136, "dur": 72.705, + "args": { + "External id": 86506, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154069, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161154069, "pid": 0, "tid": 7, "ts": 6300866031996.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866027687.581, "dur": 5.190, + "args": { + "External id": 86506, "cbid": 211, "correlation": 161154069 + } + }, + { + "ph": "s", "id": 161154069, "pid": 5714, "tid": 6744, "ts": 6300866027687.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866032069.577, "dur": 46.976, + "args": { + "External id": 86516, "device": 0, "context": 1, "stream": 7, "correlation": 161154095, "bytes": 25165824, "memory bandwidth (GB/s)": 535.716621253406 + } + }, + { + "ph": "f", "id": 161154095, "pid": 0, "tid": 7, "ts": 6300866032069.577, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 
6300866027852.150, "dur": 18.730, + "args": { + "External id": 86516, "cbid": 41, "correlation": 161154095 + } + }, + { + "ph": "s", "id": 161154095, "pid": 5714, "tid": 6744, "ts": 6300866027852.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866032117.257, "dur": 32.417, + "args": { + "External id": 86513, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154113, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154113, "pid": 0, "tid": 7, "ts": 6300866032117.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028033.990, "dur": 10.260, + "args": { + "External id": 86513, "cbid": 307, "correlation": 161154113 + } + }, + { + "ph": "s", "id": 161154113, "pid": 5714, "tid": 6744, "ts": 6300866028033.990, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866032150.314, "dur": 37.664, + "args": { + "External id": 86523, "device": 0, "context": 1, "stream": 7, "correlation": 161154128, "bytes": 25165824, "memory bandwidth (GB/s)": 668.1665250637213 + } + }, + { + "ph": "f", "id": 161154128, "pid": 0, "tid": 7, "ts": 6300866032150.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866028113.230, "dur": 15.000, + "args": { + "External id": 86523, "cbid": 41, "correlation": 161154128 + } + }, + { + "ph": "s", "id": 161154128, "pid": 5714, "tid": 6744, "ts": 6300866028113.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 
0, "tid": 7, + "ts": 6300866032188.650, "dur": 28.225, + "args": { + "External id": 86520, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154146, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154146, "pid": 0, "tid": 7, "ts": 6300866032188.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028226.580, "dur": 8.049, + "args": { + "External id": 86520, "cbid": 307, "correlation": 161154146 + } + }, + { + "ph": "s", "id": 161154146, "pid": 5714, "tid": 6744, "ts": 6300866028226.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866028484.069, "dur": 0.560, + "args": { + "External id": 86547, "cbid": 200, "correlation": 161154190 + } + }, + { + "ph": "f", "id": 161154190, "pid": 5714, "tid": 6744, "ts": 6300866028484.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866032217.771, "dur": 0.768, + "args": { + "External id": 86547, "device": 0, "context": 1, "stream": 7, "correlation": 161154193, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161154193, "pid": 0, "tid": 7, "ts": 6300866032217.771, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866028486.489, "dur": 7.660, + "args": { + "External id": 86547, "cbid": 51, "correlation": 161154193 + } + }, + { + "ph": "s", "id": 161154193, "pid": 5714, "tid": 6744, "ts": 6300866028486.489, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866032220.203, "dur": 140.961, + "args": { + "External id": 86547, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154194, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154194, "pid": 0, "tid": 7, "ts": 6300866032220.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028494.409, "dur": 8.610, + "args": { + "External id": 86547, "cbid": 307, "correlation": 161154194 + } + }, + { + "ph": "s", "id": 161154194, "pid": 5714, "tid": 6744, "ts": 6300866028494.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866032361.772, "dur": 122.562, + "args": { + "External id": 86554, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154216, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154216, "pid": 0, "tid": 7, "ts": 6300866032361.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028534.149, "dur": 6.570, + "args": { + "External id": 86554, "cbid": 211, "correlation": 161154216 + } + }, + { + "ph": "s", "id": 161154216, "pid": 5714, "tid": 6744, "ts": 6300866028534.149, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866028651.088, "dur": 0.511, + "args": { + "External id": 86577, "cbid": 200, "correlation": 161154262 + } + }, + { + "ph": "f", "id": 161154262, "pid": 5714, "tid": 6744, "ts": 6300866028651.088, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866032485.134, "dur": 0.768, + "args": { + "External id": 86577, "device": 0, "context": 1, "stream": 7, "correlation": 161154265, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161154265, "pid": 0, "tid": 7, "ts": 6300866032485.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866028653.208, "dur": 6.100, + "args": { + "External id": 86577, "cbid": 51, "correlation": 161154265 + } + }, + { + "ph": "s", "id": 161154265, "pid": 5714, "tid": 6744, "ts": 6300866028653.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866032486.702, "dur": 140.929, + "args": { + "External id": 86577, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154266, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154266, "pid": 0, "tid": 7, "ts": 6300866032486.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028659.499, "dur": 7.360, + "args": { + "External id": 86577, "cbid": 307, "correlation": 161154266 + } + }, + { + "ph": "s", "id": 161154266, "pid": 5714, "tid": 6744, "ts": 6300866028659.499, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866032628.271, "dur": 122.242, + "args": { + "External id": 86584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154288, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154288, "pid": 0, "tid": 7, "ts": 6300866032628.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028696.079, "dur": 5.909, + "args": { + "External id": 86584, "cbid": 211, "correlation": 161154288 + } + }, + { + "ph": "s", "id": 161154288, "pid": 5714, "tid": 6744, "ts": 6300866028696.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866032751.761, "dur": 39.840, + "args": { + "External id": 86589, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154303, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154303, "pid": 0, "tid": 7, "ts": 6300866032751.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028745.158, "dur": 8.030, + "args": { + "External id": 86589, "cbid": 211, "correlation": 161154303 + } + }, + { + "ph": "s", "id": 161154303, "pid": 5714, "tid": 6744, "ts": 6300866028745.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866028842.258, "dur": 0.520, + "args": { + "External id": 86608, "cbid": 200, "correlation": 161154347 + } + }, + { + "ph": "f", "id": 161154347, "pid": 5714, "tid": 6744, "ts": 6300866028842.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866032792.497, "dur": 0.800, + "args": { + "External id": 86608, "device": 0, "context": 1, "stream": 7, "correlation": 161154350, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 161154350, "pid": 0, "tid": 7, "ts": 6300866032792.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866028844.428, "dur": 6.230, + "args": { + "External id": 86608, "cbid": 51, "correlation": 161154350 + } + }, + { + "ph": "s", "id": 161154350, "pid": 5714, "tid": 6744, "ts": 6300866028844.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866032794.481, "dur": 142.338, + "args": { + "External id": 86608, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154351, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154351, "pid": 0, "tid": 7, "ts": 6300866032794.481, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028850.848, "dur": 7.260, + "args": { + "External id": 86608, "cbid": 307, "correlation": 161154351 + } + }, + { + "ph": "s", "id": 161154351, "pid": 5714, "tid": 6744, "ts": 6300866028850.848, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866032937.459, "dur": 122.370, + "args": { + "External id": 86615, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154373, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154373, "pid": 0, "tid": 7, "ts": 6300866032937.459, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028887.778, "dur": 5.700, + "args": { + "External id": 86615, "cbid": 211, "correlation": 161154373 + } + }, + { + "ph": "s", "id": 161154373, "pid": 5714, "tid": 6744, "ts": 6300866028887.778, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033060.501, "dur": 38.720, + "args": { + "External id": 86620, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154384, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154384, "pid": 0, "tid": 7, "ts": 6300866033060.501, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028935.018, "dur": 7.510, + "args": { + "External id": 86620, "cbid": 211, "correlation": 161154384 + } + }, + { + "ph": "s", "id": 161154384, "pid": 5714, "tid": 6744, "ts": 6300866028935.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033099.829, "dur": 42.240, + "args": { + "External id": 86632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154408, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154408, "pid": 0, "tid": 7, "ts": 6300866033099.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866028998.918, "dur": 8.960, + "args": { + "External id": 86632, "cbid": 211, "correlation": 161154408 + } + }, + { + "ph": "s", "id": 161154408, "pid": 5714, "tid": 6744, "ts": 6300866028998.918, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866033142.709, "dur": 25.089, + "args": { + "External id": 86633, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154418, "pid": 0, "tid": 7, "ts": 6300866033142.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029017.788, "dur": 4.600, + "args": { + "External id": 86633, "cbid": 211, "correlation": 161154418 + } + }, + { + "ph": "s", "id": 161154418, "pid": 5714, "tid": 6744, "ts": 6300866029017.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866033168.566, "dur": 0.768, + "args": { + "External id": 86634, "device": 0, "context": 1, "stream": 7, "correlation": 161154433, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 161154433, "pid": 0, "tid": 7, "ts": 6300866033168.566, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866029039.378, "dur": 6.000, + "args": { + "External id": 86634, "cbid": 51, "correlation": 161154433 + } + }, + { + "ph": "s", "id": 161154433, "pid": 5714, "tid": 6744, "ts": 6300866029039.378, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866033170.518, "dur": 41.664, + "args": { + "External id": 86634, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154435, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161154435, "pid": 0, "tid": 7, "ts": 6300866033170.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029046.628, "dur": 5.850, + "args": { + "External id": 86634, "cbid": 211, "correlation": 161154435 + } + }, + { + "ph": "s", "id": 161154435, "pid": 5714, "tid": 6744, "ts": 6300866029046.628, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866033212.822, "dur": 49.953, + "args": { + "External id": 86645, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154456, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154456, "pid": 0, "tid": 7, "ts": 6300866033212.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029111.558, "dur": 8.600, + "args": { + "External id": 86645, "cbid": 211, "correlation": 161154456 + } + }, + { + "ph": "s", "id": 161154456, "pid": 5714, "tid": 6744, "ts": 6300866029111.558, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033263.447, "dur": 141.794, + "args": { + "External id": 86648, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154471, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154471, "pid": 0, "tid": 7, "ts": 6300866033263.447, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029147.238, "dur": 6.089, + "args": { + "External id": 86648, "cbid": 211, "correlation": 161154471 + } + }, + { + "ph": "s", "id": 161154471, "pid": 5714, "tid": 6744, "ts": 6300866029147.238, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866033405.881, "dur": 108.801, + "args": { + "External id": 86649, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154481, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154481, "pid": 0, "tid": 7, "ts": 6300866033405.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029164.547, "dur": 4.550, + "args": { + "External id": 86649, "cbid": 211, "correlation": 161154481 + } + }, + { + "ph": "s", "id": 161154481, "pid": 5714, "tid": 6744, "ts": 6300866029164.547, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866033515.322, "dur": 77.409, + "args": { + "External id": 86650, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154495, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154495, "pid": 0, "tid": 7, "ts": 6300866033515.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029184.267, "dur": 5.230, + "args": { + "External id": 86650, "cbid": 211, "correlation": 161154495 + } + }, + { + "ph": "s", "id": 161154495, "pid": 5714, "tid": 6744, "ts": 6300866029184.267, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033593.435, "dur": 1.472, + "args": { + "External id": 86653, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154509, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161154509, "pid": 0, "tid": 7, "ts": 6300866033593.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029219.987, "dur": 7.180, + "args": { + "External id": 86653, "cbid": 211, "correlation": 161154509 + } + }, + { + "ph": "s", "id": 161154509, "pid": 5714, "tid": 6744, "ts": 6300866029219.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033595.579, "dur": 1.248, + "args": { + "External id": 86657, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154519, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161154519, "pid": 0, "tid": 7, "ts": 6300866033595.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029240.357, "dur": 4.670, + "args": { + "External id": 86657, "cbid": 211, "correlation": 161154519 + } + }, + { + "ph": "s", "id": 161154519, "pid": 5714, "tid": 6744, "ts": 6300866029240.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033597.531, "dur": 1.056, + "args": { + "External id": 86658, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154529, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161154529, "pid": 0, "tid": 7, "ts": 6300866033597.531, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029254.197, "dur": 4.260, + "args": { + "External id": 86658, "cbid": 211, "correlation": 161154529 + } + }, + { + "ph": "s", "id": 161154529, "pid": 5714, "tid": 6744, "ts": 6300866029254.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866033599.291, "dur": 27.232, + "args": { + "External id": 86666, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154547, "pid": 0, "tid": 7, "ts": 6300866033599.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029341.697, "dur": 9.030, + "args": { + "External id": 86666, "cbid": 211, "correlation": 161154547 + } + }, + { + "ph": "s", "id": 161154547, "pid": 5714, "tid": 6744, "ts": 6300866029341.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866033627.227, "dur": 112.961, + "args": { + "External id": 86672, "device": 0, "context": 1, "stream": 7, "correlation": 161154561, "bytes": 50331648, "memory bandwidth (GB/s)": 445.5665937801542 + } + }, + { + "ph": "f", "id": 161154561, "pid": 0, "tid": 7, "ts": 6300866033627.227, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866029383.827, "dur": 15.720, + "args": { + "External id": 86672, "cbid": 41, "correlation": 161154561 + } + }, + { + "ph": "s", "id": 161154561, "pid": 5714, "tid": 6744, "ts": 6300866029383.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033740.860, "dur": 71.777, + "args": { + "External id": 86674, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154573, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154573, "pid": 0, "tid": 7, "ts": 6300866033740.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029416.187, "dur": 5.790, + "args": { + "External id": 86674, "cbid": 211, "correlation": 161154573 + } + }, + { + "ph": "s", "id": 161154573, "pid": 5714, "tid": 6744, "ts": 6300866029416.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033813.277, "dur": 152.770, + "args": { + "External id": 86675, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154583, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154583, "pid": 0, "tid": 7, "ts": 6300866033813.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029431.907, "dur": 4.190, + "args": { + "External id": 86675, "cbid": 211, "correlation": 161154583 + } + }, + { + "ph": "s", "id": 161154583, "pid": 5714, "tid": 6744, "ts": 6300866029431.907, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866033966.687, "dur": 143.746, + "args": { + "External id": 86676, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154590, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154590, "pid": 0, "tid": 7, "ts": 6300866033966.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029449.707, "dur": 4.740, + "args": { + "External id": 86676, "cbid": 211, "correlation": 161154590 + } + }, + { + "ph": "s", "id": 161154590, "pid": 5714, "tid": 6744, "ts": 6300866029449.707, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866034111.137, "dur": 46.880, + "args": { + "External id": 86682, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154609, "pid": 0, "tid": 7, "ts": 6300866034111.137, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029488.227, "dur": 6.850, + "args": { + "External id": 86682, "cbid": 211, "correlation": 161154609 + } + }, + { + "ph": "s", "id": 161154609, "pid": 5714, "tid": 6744, "ts": 6300866029488.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866034158.689, "dur": 40.417, + "args": { + "External id": 86683, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154617, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154617, "pid": 0, "tid": 7, "ts": 6300866034158.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029506.146, "dur": 4.300, + "args": { + "External id": 86683, "cbid": 211, "correlation": 161154617 + } + }, + { + "ph": "s", "id": 161154617, "pid": 5714, "tid": 6744, "ts": 6300866029506.146, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866034199.746, "dur": 328.964, + "args": { + "External id": 86698, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154650, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154650, "pid": 0, "tid": 7, "ts": 6300866034199.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029613.576, "dur": 10.220, + "args": { + "External id": 86698, "cbid": 211, "correlation": 161154650 + } + }, + { + "ph": "s", "id": 161154650, "pid": 5714, "tid": 6744, "ts": 6300866029613.576, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300866034529.382, "dur": 427.941, + "args": { + "External id": 86687, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154678, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154678, "pid": 0, "tid": 7, "ts": 6300866034529.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029685.026, "dur": 7.340, + "args": { + "External id": 86687, "cbid": 307, "correlation": 161154678 + } + }, + { + "ph": "s", "id": 161154678, "pid": 5714, "tid": 6744, "ts": 6300866029685.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866029791.836, "dur": 0.590, + "args": { + "External id": 86723, "cbid": 200, "correlation": 161154703 + } + }, + { + "ph": "f", "id": 161154703, "pid": 5714, "tid": 6744, "ts": 6300866029791.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866034958.091, "dur": 0.800, + "args": { + "External id": 86723, "device": 0, "context": 1, "stream": 7, "correlation": 
161154706, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161154706, "pid": 0, "tid": 7, "ts": 6300866034958.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866029794.236, "dur": 7.630, + "args": { + "External id": 86723, "cbid": 51, "correlation": 161154706 + } + }, + { + "ph": "s", "id": 161154706, "pid": 5714, "tid": 6744, "ts": 6300866029794.236, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866034959.691, "dur": 367.172, + "args": { + "External id": 86723, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154707, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154707, "pid": 0, "tid": 7, "ts": 6300866034959.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029802.106, "dur": 7.970, + "args": { + "External id": 86723, "cbid": 307, "correlation": 161154707 + } + }, + { + "ph": "s", "id": 161154707, "pid": 5714, "tid": 6744, "ts": 6300866029802.106, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866029932.106, "dur": 0.470, + "args": { + "External id": 86741, "cbid": 200, "correlation": 161154744 + } + }, + { + "ph": "f", "id": 161154744, "pid": 5714, "tid": 6744, "ts": 6300866029932.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866035327.791, "dur": 0.832, + "args": { + "External id": 86741, "device": 0, "context": 1, "stream": 7, "correlation": 161154747, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161154747, "pid": 0, "tid": 7, "ts": 6300866035327.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866029934.196, "dur": 6.860, + "args": { + "External id": 86741, "cbid": 51, "correlation": 161154747 + } + }, + { + "ph": "s", "id": 161154747, "pid": 5714, "tid": 6744, "ts": 6300866029934.196, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866035329.775, "dur": 353.124, + "args": { + "External id": 86741, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154748, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154748, "pid": 0, "tid": 7, "ts": 6300866035329.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029941.256, "dur": 8.320, + "args": { + "External id": 86741, "cbid": 307, "correlation": 161154748 + } + }, + { + "ph": "s", "id": 161154748, "pid": 5714, "tid": 6744, "ts": 6300866029941.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866029981.976, "dur": 0.260, + "args": { + "External id": 86748, "cbid": 200, "correlation": 161154773 + } + }, + { + "ph": "f", "id": 161154773, "pid": 5714, "tid": 6744, "ts": 6300866029981.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866035683.539, "dur": 353.765, + "args": { + "External id": 86748, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154776, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154776, "pid": 0, "tid": 7, "ts": 6300866035683.539, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866029983.436, "dur": 5.869, + "args": { + "External id": 86748, "cbid": 307, "correlation": 161154776 + } + }, + { + "ph": "s", "id": 161154776, "pid": 5714, "tid": 6744, "ts": 6300866029983.436, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866030095.835, "dur": 0.460, + "args": { + "External id": 86771, "cbid": 200, "correlation": 161154821 + } + }, + { + "ph": "f", "id": 161154821, "pid": 5714, "tid": 6744, "ts": 6300866030095.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866036038.168, "dur": 0.800, + "args": { + "External id": 86771, "device": 0, "context": 1, "stream": 7, "correlation": 161154824, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161154824, "pid": 0, "tid": 7, "ts": 6300866036038.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866030097.785, "dur": 6.350, + "args": { + "External id": 86771, "cbid": 51, "correlation": 161154824 + } + }, + { + "ph": "s", "id": 161154824, "pid": 5714, "tid": 6744, "ts": 6300866030097.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866036040.120, "dur": 356.228, + "args": { + "External id": 86771, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154825, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154825, "pid": 0, "tid": 7, "ts": 6300866036040.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030104.345, "dur": 7.390, + "args": { + "External id": 86771, "cbid": 307, "correlation": 161154825 + } + }, + { + "ph": "s", "id": 161154825, "pid": 5714, "tid": 6744, "ts": 6300866030104.345, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866030142.815, "dur": 0.320, + "args": { + "External id": 86778, "cbid": 200, "correlation": 161154850 + } + }, + { + "ph": "f", "id": 161154850, "pid": 5714, "tid": 6744, "ts": 6300866030142.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866036397.052, "dur": 356.100, + "args": { + "External id": 86778, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154853, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161154853, "pid": 0, "tid": 7, "ts": 6300866036397.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030144.215, "dur": 5.870, + "args": { + "External id": 86778, "cbid": 307, "correlation": 161154853 + } + }, + { + "ph": "s", "id": 161154853, "pid": 5714, "tid": 6744, "ts": 6300866030144.215, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866036753.760, "dur": 52.609, + "args": { + "External id": 86783, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154867, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154867, "pid": 0, "tid": 7, "ts": 6300866036753.760, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030193.985, "dur": 7.760, + "args": { + "External id": 86783, "cbid": 211, "correlation": 161154867 + } + }, + { + "ph": "s", "id": 161154867, "pid": 5714, "tid": 6744, "ts": 6300866030193.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866036806.977, "dur": 45.664, + "args": { + "External id": 86795, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154891, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154891, "pid": 0, "tid": 7, "ts": 6300866036806.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030257.975, "dur": 8.100, + "args": { + "External id": 86795, "cbid": 211, "correlation": 161154891 + } + }, + { + "ph": "s", "id": 161154891, "pid": 5714, "tid": 6744, "ts": 6300866030257.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866036853.249, "dur": 27.424, + "args": { + "External id": 86796, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154901, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154901, "pid": 0, "tid": 7, "ts": 6300866036853.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030275.675, "dur": 4.390, + "args": { + "External id": 86796, "cbid": 211, "correlation": 161154901 + } + }, + { + "ph": "s", "id": 161154901, "pid": 5714, "tid": 6744, "ts": 6300866030275.675, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866036881.537, "dur": 0.736, + "args": { + "External id": 86797, "device": 0, "context": 1, "stream": 7, "correlation": 161154916, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 161154916, "pid": 0, "tid": 7, "ts": 6300866036881.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866030305.515, "dur": 6.250, + "args": { + "External id": 86797, "cbid": 51, "correlation": 161154916 + } + }, + { + "ph": "s", "id": 161154916, "pid": 5714, "tid": 6744, "ts": 6300866030305.515, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866036883.105, "dur": 43.393, + "args": { + "External id": 86797, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154918, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161154918, "pid": 0, "tid": 7, "ts": 6300866036883.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030313.045, "dur": 6.070, + "args": { + "External id": 86797, "cbid": 211, "correlation": 161154918 + } + }, + { + "ph": "s", "id": 161154918, "pid": 5714, "tid": 6744, "ts": 6300866030313.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866036927.202, "dur": 50.656, + "args": { + "External id": 86808, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154939, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154939, "pid": 0, "tid": 7, "ts": 6300866036927.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030381.335, "dur": 8.600, + "args": { + "External id": 86808, "cbid": 211, "correlation": 161154939 + } + }, + { + "ph": "s", "id": 161154939, "pid": 5714, "tid": 6744, "ts": 6300866030381.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866036978.562, "dur": 144.194, + "args": { + "External id": 86811, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154954, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154954, "pid": 0, "tid": 7, "ts": 6300866036978.562, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030416.204, "dur": 6.031, + "args": { + "External id": 86811, "cbid": 211, "correlation": 161154954 + } + }, + { + "ph": "s", "id": 161154954, "pid": 5714, "tid": 6744, "ts": 6300866030416.204, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866037123.428, "dur": 108.993, + "args": { + "External id": 86812, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154964, "pid": 0, "tid": 7, "ts": 6300866037123.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030432.664, "dur": 4.711, + "args": { + "External id": 86812, "cbid": 211, "correlation": 161154964 + } + }, + { + "ph": "s", "id": 161154964, "pid": 5714, "tid": 6744, "ts": 6300866030432.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866037232.997, "dur": 78.562, + "args": { + "External id": 86813, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154978, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161154978, "pid": 0, "tid": 7, "ts": 6300866037232.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030452.095, "dur": 5.049, + "args": { + "External id": 86813, "cbid": 211, "correlation": 161154978 + } + }, + { + "ph": "s", "id": 161154978, "pid": 5714, "tid": 6744, "ts": 6300866030452.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037312.295, "dur": 1.568, + "args": { + "External id": 86816, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161154992, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161154992, "pid": 0, "tid": 7, "ts": 6300866037312.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030488.094, "dur": 6.850, + "args": { + "External id": 86816, "cbid": 211, "correlation": 161154992 + } + }, + { + "ph": "s", "id": 161154992, "pid": 5714, "tid": 6744, "ts": 6300866030488.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037314.503, "dur": 1.120, + "args": { + "External id": 86820, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155002, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161155002, "pid": 0, "tid": 7, "ts": 6300866037314.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030509.154, "dur": 4.650, + "args": { + "External id": 86820, "cbid": 211, "correlation": 161155002 + } + }, + { + "ph": "s", "id": 161155002, "pid": 5714, "tid": 6744, "ts": 6300866030509.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037316.263, "dur": 1.024, + "args": { + "External id": 86821, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155012, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161155012, "pid": 0, "tid": 7, "ts": 6300866037316.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030521.834, "dur": 4.020, + "args": { + "External id": 86821, "cbid": 211, "correlation": 161155012 + } + }, + { + "ph": "s", "id": 161155012, "pid": 5714, "tid": 6744, "ts": 6300866030521.834, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866037318.022, "dur": 26.433, + "args": { + "External id": 86829, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155030, "pid": 0, "tid": 7, "ts": 6300866037318.022, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030573.104, "dur": 7.690, + "args": { + "External id": 86829, "cbid": 211, "correlation": 161155030 + } + }, + { + "ph": "s", "id": 161155030, "pid": 5714, "tid": 6744, "ts": 6300866030573.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866037345.191, "dur": 112.353, + "args": { + "External id": 86835, "device": 0, "context": 1, "stream": 7, "correlation": 161155044, "bytes": 50331648, "memory bandwidth (GB/s)": 447.977784304825 + } + }, + { + "ph": "f", "id": 161155044, "pid": 0, "tid": 7, "ts": 6300866037345.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866030610.014, "dur": 17.000, + "args": { + "External id": 86835, "cbid": 41, "correlation": 161155044 + } + }, + { + "ph": "s", "id": 161155044, "pid": 5714, "tid": 6744, "ts": 6300866030610.014, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037458.152, "dur": 71.681, + "args": { + "External id": 86837, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155056, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155056, "pid": 0, "tid": 7, "ts": 6300866037458.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030640.934, "dur": 5.930, + "args": { + "External id": 86837, "cbid": 211, "correlation": 161155056 + } + }, + { + "ph": "s", "id": 161155056, "pid": 5714, "tid": 6744, "ts": 6300866030640.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037530.537, "dur": 148.770, + "args": { + "External id": 86838, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155066, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155066, "pid": 0, "tid": 7, "ts": 6300866037530.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030656.364, "dur": 4.270, + "args": { + "External id": 86838, "cbid": 211, "correlation": 161155066 + } + }, + { + "ph": "s", "id": 161155066, "pid": 5714, "tid": 6744, "ts": 6300866030656.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037680.011, "dur": 146.849, + "args": { + "External id": 86839, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155073, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155073, "pid": 0, "tid": 7, "ts": 6300866037680.011, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030674.364, "dur": 4.660, + "args": { + "External id": 86839, "cbid": 211, "correlation": 161155073 + } + }, + { + "ph": "s", "id": 161155073, "pid": 5714, "tid": 6744, "ts": 6300866030674.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037827.468, "dur": 47.329, + "args": { + "External id": 86845, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155092, "pid": 0, "tid": 7, "ts": 6300866037827.468, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030712.364, "dur": 6.890, + "args": { + "External id": 86845, "cbid": 211, "correlation": 161155092 + } + }, + { + "ph": "s", "id": 161155092, "pid": 5714, "tid": 6744, "ts": 6300866030712.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866037875.437, "dur": 58.785, + "args": { + "External id": 86846, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155104, "pid": 0, "tid": 7, "ts": 6300866037875.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030735.144, "dur": 5.120, + "args": { + "External id": 86846, "cbid": 211, "correlation": 161155104 + } + }, + { + "ph": "s", "id": 161155104, "pid": 5714, "tid": 6744, "ts": 6300866030735.144, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866037934.830, "dur": 40.928, + "args": { + "External id": 86849, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155117, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155117, "pid": 0, "tid": 7, "ts": 6300866037934.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030767.994, "dur": 5.750, + "args": { + "External id": 86849, "cbid": 211, "correlation": 161155117 + } + }, + { + "ph": "s", "id": 161155117, "pid": 5714, "tid": 6744, "ts": 6300866030767.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866030833.674, "dur": 0.560, + "args": { + "External id": 86859, "cbid": 200, "correlation": 161155153 + } + }, + { + "ph": "f", "id": 161155153, "pid": 5714, "tid": 6744, "ts": 6300866030833.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866037976.622, "dur": 0.768, + "args": { + "External id": 86859, "device": 0, "context": 1, "stream": 7, "correlation": 161155156, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161155156, "pid": 0, "tid": 7, "ts": 6300866037976.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866030836.054, "dur": 7.209, + "args": { + "External id": 86859, "cbid": 51, "correlation": 161155156 + } + }, + { + "ph": "s", "id": 161155156, "pid": 5714, "tid": 6744, "ts": 6300866030836.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866037978.190, "dur": 138.530, + "args": { + "External id": 86859, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155157, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155157, "pid": 0, "tid": 7, "ts": 6300866037978.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030843.514, "dur": 6.960, + "args": { + "External id": 86859, "cbid": 307, "correlation": 161155157 + } + }, + { + "ph": "s", "id": 161155157, "pid": 5714, "tid": 6744, "ts": 6300866030843.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866038117.360, "dur": 122.305, + "args": { + "External id": 86866, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155179, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155179, "pid": 0, "tid": 7, "ts": 6300866038117.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866030880.543, "dur": 5.940, + "args": { + "External id": 86866, "cbid": 211, "correlation": 161155179 + } + }, + { + "ph": "s", "id": 161155179, "pid": 5714, "tid": 6744, "ts": 6300866030880.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866031065.603, "dur": 0.500, + "args": { + "External id": 86892, "cbid": 200, "correlation": 161155226 + } + }, + { + "ph": "f", "id": 161155226, "pid": 5714, "tid": 6744, "ts": 6300866031065.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866031066.243, "dur": 0.230, + "args": { + "External id": 86892, "cbid": 200, "correlation": 161155227 + } + }, + { + "ph": "f", "id": 161155227, "pid": 5714, "tid": 6744, "ts": 6300866031066.243, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866031085.163, "dur": 0.240, + "args": { + "External id": 86892, "cbid": 200, "correlation": 161155245 + } + }, + { + "ph": "f", "id": 161155245, "pid": 5714, "tid": 6744, "ts": 6300866031085.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866038240.273, "dur": 96.865, + "args": { + "External id": 86892, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155246, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155246, "pid": 0, "tid": 7, "ts": 6300866038240.273, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031086.633, "dur": 12.040, + "args": { + "External id": 86892, "cbid": 211, "correlation": 161155246 + } + }, + { + "ph": "s", "id": 161155246, "pid": 5714, "tid": 6744, "ts": 6300866031086.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866031099.613, "dur": 1.080, + "args": { + "External id": 86892, "cbid": 273, "correlation": 161155248 + } + }, + { + "ph": "f", "id": 161155248, "pid": 5714, "tid": 6744, "ts": 6300866031099.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866038337.746, "dur": 979.724, + "args": { + "External id": 86892, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155249, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161155249, "pid": 0, "tid": 7, "ts": 6300866038337.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031100.973, "dur": 4.420, + "args": { + "External id": 86892, "cbid": 211, "correlation": 161155249 + } + }, + { + "ph": "s", "id": 161155249, "pid": 5714, "tid": 6744, "ts": 6300866031100.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866039318.174, "dur": 75.745, + "args": { + "External id": 86892, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155251, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161155251, "pid": 0, "tid": 7, "ts": 6300866039318.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031105.973, "dur": 3.980, + "args": { + "External id": 86892, "cbid": 211, "correlation": 161155251 + } + }, + { + "ph": "s", "id": 161155251, "pid": 5714, "tid": 6744, "ts": 6300866031105.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866039394.591, "dur": 46.464, + "args": { + "External id": 86902, "device": 0, "context": 1, "stream": 7, "correlation": 161155277, "bytes": 25165824, "memory bandwidth (GB/s)": 541.6198347107438 + } + }, + { + "ph": "f", "id": 161155277, "pid": 0, "tid": 7, "ts": 6300866039394.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 
6300866031242.873, "dur": 18.560, + "args": { + "External id": 86902, "cbid": 41, "correlation": 161155277 + } + }, + { + "ph": "s", "id": 161155277, "pid": 5714, "tid": 6744, "ts": 6300866031242.873, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866039441.759, "dur": 41.409, + "args": { + "External id": 86899, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155295, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155295, "pid": 0, "tid": 7, "ts": 6300866039441.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031375.782, "dur": 9.420, + "args": { + "External id": 86899, "cbid": 307, "correlation": 161155295 + } + }, + { + "ph": "s", "id": 161155295, "pid": 5714, "tid": 6744, "ts": 6300866031375.782, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866039483.840, "dur": 40.481, + "args": { + "External id": 86909, "device": 0, "context": 1, "stream": 7, "correlation": 161155310, "bytes": 25165824, "memory bandwidth (GB/s)": 621.6700180331513 + } + }, + { + "ph": "f", "id": 161155310, "pid": 0, "tid": 7, "ts": 6300866039483.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866031454.272, "dur": 16.530, + "args": { + "External id": 86909, "cbid": 41, "correlation": 161155310 + } + }, + { + "ph": "s", "id": 161155310, "pid": 5714, "tid": 6744, "ts": 6300866031454.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 
0, "tid": 7, + "ts": 6300866039524.929, "dur": 27.296, + "args": { + "External id": 86906, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155328, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155328, "pid": 0, "tid": 7, "ts": 6300866039524.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031566.222, "dur": 8.230, + "args": { + "External id": 86906, "cbid": 307, "correlation": 161155328 + } + }, + { + "ph": "s", "id": 161155328, "pid": 5714, "tid": 6744, "ts": 6300866031566.222, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866031699.352, "dur": 0.549, + "args": { + "External id": 86933, "cbid": 200, "correlation": 161155372 + } + }, + { + "ph": "f", "id": 161155372, "pid": 5714, "tid": 6744, "ts": 6300866031699.352, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866039553.025, "dur": 0.768, + "args": { + "External id": 86933, "device": 0, "context": 1, "stream": 7, "correlation": 161155375, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161155375, "pid": 0, "tid": 7, "ts": 6300866039553.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866031701.812, "dur": 7.509, + "args": { + "External id": 86933, "cbid": 51, "correlation": 161155375 + } + }, + { + "ph": "s", "id": 161155375, "pid": 5714, "tid": 6744, "ts": 6300866031701.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866039555.073, "dur": 138.945, + "args": { + "External id": 86933, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155376, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155376, "pid": 0, "tid": 7, "ts": 6300866039555.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031709.561, "dur": 7.800, + "args": { + "External id": 86933, "cbid": 307, "correlation": 161155376 + } + }, + { + "ph": "s", "id": 161155376, "pid": 5714, "tid": 6744, "ts": 6300866031709.561, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866039694.626, "dur": 947.052, + "args": { + "External id": 86940, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155398, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155398, "pid": 0, "tid": 7, "ts": 6300866039694.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031749.212, "dur": 6.029, + "args": { + "External id": 86940, "cbid": 211, "correlation": 161155398 + } + }, + { + "ph": "s", "id": 161155398, "pid": 5714, "tid": 6744, "ts": 6300866031749.212, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866031863.061, "dur": 0.490, + "args": { + "External id": 86963, "cbid": 200, "correlation": 161155444 + } + }, + { + "ph": "f", "id": 161155444, "pid": 5714, "tid": 6744, "ts": 6300866031863.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866040704.654, "dur": 49.377, + "args": { + "External id": 86963, "device": 0, "context": 1, "stream": 7, "correlation": 161155447, "bytes": 576, "memory bandwidth (GB/s)": 0.011665350264293092 + } + }, + { + "ph": "f", "id": 161155447, "pid": 0, "tid": 7, "ts": 6300866040704.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866031865.071, "dur": 6.430, + "args": { + "External id": 86963, "cbid": 51, "correlation": 161155447 + } + }, + { + "ph": "s", "id": 161155447, "pid": 5714, "tid": 6744, "ts": 6300866031865.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866040810.223, "dur": 291.876, + "args": { + "External id": 86963, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155448, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155448, "pid": 0, "tid": 7, "ts": 6300866040810.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031871.711, "dur": 7.340, + "args": { + "External id": 86963, "cbid": 307, "correlation": 161155448 + } + }, + { + "ph": "s", "id": 161155448, "pid": 5714, "tid": 6744, "ts": 6300866031871.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866041102.835, "dur": 196.898, + "args": { + "External id": 86970, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155470, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155470, "pid": 0, "tid": 7, "ts": 6300866041102.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031908.191, "dur": 5.560, + "args": { + "External id": 86970, "cbid": 211, "correlation": 161155470 + } + }, + { + "ph": "s", "id": 161155470, "pid": 5714, "tid": 6744, "ts": 6300866031908.191, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866041300.949, "dur": 43.009, + "args": { + "External id": 86975, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155485, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155485, "pid": 0, "tid": 7, "ts": 6300866041300.949, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866031955.791, "dur": 8.020, + "args": { + "External id": 86975, "cbid": 211, "correlation": 161155485 + } + }, + { + "ph": "s", "id": 161155485, "pid": 5714, "tid": 6744, "ts": 6300866031955.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866032054.621, "dur": 0.510, + "args": { + "External id": 86994, "cbid": 200, "correlation": 161155529 + } + }, + { + "ph": "f", "id": 161155529, "pid": 5714, "tid": 6744, "ts": 6300866032054.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866041345.206, "dur": 1.056, + "args": { + "External id": 86994, "device": 0, "context": 1, "stream": 7, "correlation": 161155532, "bytes": 576, "memory bandwidth (GB/s)": 0.5454545454545454 + } + }, + { + "ph": "f", "id": 161155532, "pid": 0, "tid": 7, "ts": 6300866041345.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866032056.661, "dur": 6.520, + "args": { + "External id": 86994, "cbid": 51, "correlation": 161155532 + } + }, + { + "ph": "s", "id": 161155532, "pid": 5714, "tid": 6744, "ts": 6300866032056.661, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866041347.766, "dur": 147.074, + "args": { + "External id": 86994, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155533, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155533, "pid": 0, "tid": 7, "ts": 6300866041347.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032063.391, "dur": 7.350, + "args": { + "External id": 86994, "cbid": 307, "correlation": 161155533 + } + }, + { + "ph": "s", "id": 161155533, "pid": 5714, "tid": 6744, "ts": 6300866032063.391, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866041495.544, "dur": 141.601, + "args": { + "External id": 87001, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155555, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155555, "pid": 0, "tid": 7, "ts": 6300866041495.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032099.501, "dur": 5.700, + "args": { + "External id": 87001, "cbid": 211, "correlation": 161155555 + } + }, + { + "ph": "s", "id": 161155555, "pid": 5714, "tid": 6744, "ts": 6300866032099.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866041637.817, "dur": 38.336, + "args": { + "External id": 87006, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155566, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155566, "pid": 0, "tid": 7, "ts": 6300866041637.817, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032145.220, "dur": 7.420, + "args": { + "External id": 87006, "cbid": 211, "correlation": 161155566 + } + }, + { + "ph": "s", "id": 161155566, "pid": 5714, "tid": 6744, "ts": 6300866032145.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866041676.858, "dur": 43.712, + "args": { + "External id": 87018, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155590, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155590, "pid": 0, "tid": 7, "ts": 6300866041676.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032208.880, "dur": 8.120, + "args": { + "External id": 87018, "cbid": 211, "correlation": 161155590 + } + }, + { + "ph": "s", "id": 161155590, "pid": 5714, "tid": 6744, "ts": 6300866032208.880, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866041721.274, "dur": 24.865, + "args": { + "External id": 87019, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155600, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155600, "pid": 0, "tid": 7, "ts": 6300866041721.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032227.360, "dur": 4.700, + "args": { + "External id": 87019, "cbid": 211, "correlation": 161155600 + } + }, + { + "ph": "s", "id": 161155600, "pid": 5714, "tid": 6744, "ts": 6300866032227.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866041747.515, "dur": 1.088, + "args": { + "External id": 87020, "device": 0, "context": 1, "stream": 7, "correlation": 161155615, "bytes": 24, "memory bandwidth (GB/s)": 0.022058823529411766 + } + }, + { + "ph": "f", "id": 161155615, "pid": 0, "tid": 7, "ts": 6300866041747.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866032249.680, "dur": 5.800, + "args": { + "External id": 87020, "cbid": 51, "correlation": 161155615 + } + }, + { + "ph": "s", "id": 161155615, "pid": 5714, "tid": 6744, "ts": 6300866032249.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866041749.787, "dur": 43.488, + "args": { + "External id": 87020, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155617, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161155617, "pid": 0, "tid": 7, "ts": 6300866041749.787, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032256.680, "dur": 5.630, + "args": { + "External id": 87020, "cbid": 211, "correlation": 161155617 + } + }, + { + "ph": "s", "id": 161155617, "pid": 5714, "tid": 6744, "ts": 6300866032256.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866041793.979, "dur": 47.969, + "args": { + "External id": 87031, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155638, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155638, "pid": 0, "tid": 7, "ts": 6300866041793.979, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032329.400, "dur": 9.760, + "args": { + "External id": 87031, "cbid": 211, "correlation": 161155638 + } + }, + { + "ph": "s", "id": 161155638, "pid": 5714, "tid": 6744, "ts": 6300866032329.400, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866041842.588, "dur": 268.451, + "args": { + "External id": 87034, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155653, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155653, "pid": 0, "tid": 7, "ts": 6300866041842.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032367.020, "dur": 6.580, + "args": { + "External id": 87034, "cbid": 211, "correlation": 161155653 + } + }, + { + "ph": "s", "id": 161155653, "pid": 5714, "tid": 6744, "ts": 6300866032367.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866042111.679, "dur": 204.674, + "args": { + "External id": 87035, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155663, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155663, "pid": 0, "tid": 7, "ts": 6300866042111.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032382.830, "dur": 4.660, + "args": { + "External id": 87035, "cbid": 211, "correlation": 161155663 + } + }, + { + "ph": "s", "id": 161155663, "pid": 5714, "tid": 6744, "ts": 6300866032382.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866042317.025, "dur": 113.409, + "args": { + "External id": 87036, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155677, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155677, "pid": 0, "tid": 7, "ts": 6300866042317.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032402.830, "dur": 5.300, + "args": { + "External id": 87036, "cbid": 211, "correlation": 161155677 + } + }, + { + "ph": "s", "id": 161155677, "pid": 5714, "tid": 6744, "ts": 6300866032402.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866042431.106, "dur": 94.178, + "args": { + "External id": 87039, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161155691, "pid": 0, "tid": 7, "ts": 6300866042431.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032448.000, "dur": 7.480, + "args": { + "External id": 87039, "cbid": 211, "correlation": 161155691 + } + }, + { + "ph": "s", "id": 161155691, "pid": 5714, "tid": 6744, "ts": 6300866032448.000, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866042525.924, "dur": 105.889, + "args": { + "External id": 87043, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155701, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161155701, "pid": 0, "tid": 7, "ts": 6300866042525.924, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032469.750, "dur": 5.180, + "args": { + "External id": 87043, "cbid": 211, "correlation": 161155701 + } + }, + { + "ph": "s", "id": 161155701, "pid": 5714, "tid": 6744, "ts": 6300866032469.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866042632.453, "dur": 68.833, + "args": { + "External id": 87044, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155711, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161155711, "pid": 0, "tid": 7, "ts": 6300866042632.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032483.120, "dur": 4.040, + "args": { + "External id": 87044, "cbid": 211, "correlation": 161155711 + } + }, + { + "ph": "s", "id": 161155711, "pid": 5714, "tid": 6744, "ts": 6300866032483.120, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866042701.990, "dur": 32.800, + "args": { + "External id": 87052, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155729, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155729, "pid": 0, "tid": 7, "ts": 6300866042701.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032536.620, "dur": 7.640, + "args": { + "External id": 87052, "cbid": 211, "correlation": 161155729 + } + }, + { + "ph": "s", "id": 161155729, "pid": 5714, "tid": 6744, "ts": 6300866032536.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866042735.462, "dur": 115.522, + "args": { + "External id": 87058, "device": 0, "context": 1, "stream": 7, "correlation": 161155743, "bytes": 50331648, "memory bandwidth (GB/s)": 435.688855802358 + } + }, + { + "ph": "f", "id": 161155743, "pid": 0, "tid": 7, "ts": 6300866042735.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866032574.230, "dur": 15.780, + "args": { + "External id": 87058, "cbid": 41, "correlation": 161155743 + } + }, + { + "ph": "s", "id": 161155743, "pid": 5714, "tid": 6744, "ts": 6300866032574.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866042851.624, "dur": 79.072, + "args": { + "External id": 87060, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155755, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155755, "pid": 0, "tid": 7, "ts": 6300866042851.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032605.450, "dur": 5.809, + "args": { + "External id": 87060, "cbid": 211, "correlation": 161155755 + } + }, + { + "ph": "s", "id": 161155755, "pid": 5714, "tid": 6744, "ts": 6300866032605.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866042931.368, "dur": 150.850, + "args": { + "External id": 87061, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155765, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155765, "pid": 0, "tid": 7, "ts": 6300866042931.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032619.850, "dur": 4.080, + "args": { + "External id": 87061, "cbid": 211, "correlation": 161155765 + } + }, + { + "ph": "s", "id": 161155765, "pid": 5714, "tid": 6744, "ts": 6300866032619.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866043082.954, "dur": 143.522, + "args": { + "External id": 87062, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155772, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155772, "pid": 0, "tid": 7, "ts": 6300866043082.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032637.230, "dur": 5.049, + "args": { + "External id": 87062, "cbid": 211, "correlation": 161155772 + } + }, + { + "ph": "s", "id": 161155772, "pid": 5714, "tid": 6744, "ts": 6300866032637.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866043227.084, "dur": 60.960, + "args": { + "External id": 87068, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155791, "pid": 0, "tid": 7, "ts": 6300866043227.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032676.189, "dur": 6.580, + "args": { + "External id": 87068, "cbid": 211, "correlation": 161155791 + } + }, + { + "ph": "s", "id": 161155791, "pid": 5714, "tid": 6744, "ts": 6300866032676.189, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866043288.748, "dur": 45.537, + "args": { + "External id": 87069, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155799, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155799, "pid": 0, "tid": 7, "ts": 6300866043288.748, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032693.699, "dur": 4.390, + "args": { + "External id": 87069, "cbid": 211, "correlation": 161155799 + } + }, + { + "ph": "s", "id": 161155799, "pid": 5714, "tid": 6744, "ts": 6300866032693.699, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866043334.861, "dur": 326.244, + "args": { + "External id": 87084, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155832, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155832, "pid": 0, "tid": 7, "ts": 6300866043334.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032800.739, "dur": 9.770, + "args": { + "External id": 87084, "cbid": 211, "correlation": 161155832 + } + }, + { + "ph": "s", "id": 161155832, "pid": 5714, "tid": 6744, "ts": 6300866032800.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300866043661.809, "dur": 427.589, + "args": { + "External id": 87073, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155860, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161155860, "pid": 0, "tid": 7, "ts": 6300866043661.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032869.909, "dur": 6.970, + "args": { + "External id": 87073, "cbid": 307, "correlation": 161155860 + } + }, + { + "ph": "s", "id": 161155860, "pid": 5714, "tid": 6744, "ts": 6300866032869.909, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866032977.409, "dur": 0.550, + "args": { + "External id": 87109, "cbid": 200, "correlation": 161155885 + } + }, + { + "ph": "f", "id": 161155885, "pid": 5714, "tid": 6744, "ts": 6300866032977.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866044090.262, "dur": 0.800, + "args": { + "External id": 87109, "device": 0, "context": 1, "stream": 7, "correlation": 161155888, 
"bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161155888, "pid": 0, "tid": 7, "ts": 6300866044090.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866032979.849, "dur": 7.690, + "args": { + "External id": 87109, "cbid": 51, "correlation": 161155888 + } + }, + { + "ph": "s", "id": 161155888, "pid": 5714, "tid": 6744, "ts": 6300866032979.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866044092.214, "dur": 364.900, + "args": { + "External id": 87109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155889, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155889, "pid": 0, "tid": 7, "ts": 6300866044092.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866032987.799, "dur": 7.650, + "args": { + "External id": 87109, "cbid": 307, "correlation": 161155889 + } + }, + { + "ph": "s", "id": 161155889, "pid": 5714, "tid": 6744, "ts": 6300866032987.799, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866033118.048, "dur": 0.530, + "args": { + "External id": 87127, "cbid": 200, "correlation": 161155926 + } + }, + { + "ph": "f", "id": 161155926, "pid": 5714, "tid": 6744, "ts": 6300866033118.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866044457.946, "dur": 0.800, + "args": { + 
"External id": 87127, "device": 0, "context": 1, "stream": 7, "correlation": 161155929, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161155929, "pid": 0, "tid": 7, "ts": 6300866044457.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866033120.208, "dur": 6.660, + "args": { + "External id": 87127, "cbid": 51, "correlation": 161155929 + } + }, + { + "ph": "s", "id": 161155929, "pid": 5714, "tid": 6744, "ts": 6300866033120.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866044459.898, "dur": 351.524, + "args": { + "External id": 87127, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155930, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155930, "pid": 0, "tid": 7, "ts": 6300866044459.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033127.078, "dur": 8.310, + "args": { + "External id": 87127, "cbid": 307, "correlation": 161155930 + } + }, + { + "ph": "s", "id": 161155930, "pid": 5714, "tid": 6744, "ts": 6300866033127.078, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866033166.528, "dur": 0.320, + "args": { + "External id": 87134, "cbid": 200, "correlation": 161155955 + } + }, + { + "ph": "f", "id": 161155955, "pid": 5714, "tid": 6744, "ts": 6300866033166.528, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866044812.126, "dur": 355.685, + "args": { + "External id": 87134, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161155958, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161155958, "pid": 0, "tid": 7, "ts": 6300866044812.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033167.988, "dur": 5.730, + "args": { + "External id": 87134, "cbid": 307, "correlation": 161155958 + } + }, + { + "ph": "s", "id": 161155958, "pid": 5714, "tid": 6744, "ts": 6300866033167.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866033276.988, "dur": 0.400, + "args": { + "External id": 87157, "cbid": 200, "correlation": 161156003 + } + }, + { + "ph": "f", "id": 161156003, "pid": 5714, "tid": 6744, "ts": 6300866033276.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866045168.707, "dur": 0.800, + "args": { + "External id": 87157, "device": 0, "context": 1, "stream": 7, "correlation": 161156006, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161156006, "pid": 0, "tid": 7, "ts": 6300866045168.707, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866033278.838, "dur": 6.270, + "args": { + "External id": 87157, "cbid": 51, "correlation": 161156006 + } + }, + { + "ph": "s", "id": 161156006, "pid": 5714, "tid": 6744, "ts": 6300866033278.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866045170.659, "dur": 351.940, + "args": { + "External id": 87157, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156007, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156007, "pid": 0, "tid": 7, "ts": 6300866045170.659, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033285.308, "dur": 7.450, + "args": { + "External id": 87157, "cbid": 307, "correlation": 161156007 + } + }, + { + "ph": "s", "id": 161156007, "pid": 5714, "tid": 6744, "ts": 6300866033285.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866033334.298, "dur": 0.310, + "args": { + "External id": 87164, "cbid": 200, "correlation": 161156032 + } + }, + { + "ph": "f", "id": 161156032, "pid": 5714, "tid": 6744, "ts": 6300866033334.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866045523.335, "dur": 355.236, + "args": { + "External id": 87164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156035, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156035, "pid": 0, "tid": 7, "ts": 6300866045523.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033335.738, "dur": 6.280, + "args": { + "External id": 87164, "cbid": 307, "correlation": 161156035 + } + }, + { + "ph": "s", "id": 161156035, "pid": 5714, "tid": 6744, "ts": 6300866033335.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866045879.243, "dur": 50.913, + "args": { + "External id": 87169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156049, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156049, "pid": 0, "tid": 7, "ts": 6300866045879.243, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033387.188, "dur": 7.690, + "args": { + "External id": 87169, "cbid": 211, "correlation": 161156049 + } + }, + { + "ph": "s", "id": 161156049, "pid": 5714, "tid": 6744, "ts": 6300866033387.188, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866045930.764, "dur": 45.920, + "args": { + "External id": 87181, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156073, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156073, "pid": 0, "tid": 7, "ts": 6300866045930.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033450.328, "dur": 8.469, + "args": { + "External id": 87181, "cbid": 211, "correlation": 161156073 + } + }, + { + "ph": "s", "id": 161156073, "pid": 5714, "tid": 6744, "ts": 6300866033450.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866045977.388, "dur": 28.417, + "args": { + "External id": 87182, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156083, "pid": 0, "tid": 7, "ts": 6300866045977.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033468.677, "dur": 4.520, + "args": { + "External id": 87182, "cbid": 211, "correlation": 161156083 + } + }, + { + "ph": "s", "id": 161156083, "pid": 5714, "tid": 6744, "ts": 6300866033468.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866046006.637, "dur": 0.736, + "args": { + "External id": 87183, "device": 0, "context": 1, "stream": 7, "correlation": 161156098, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 161156098, "pid": 0, "tid": 7, "ts": 6300866046006.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866033490.408, "dur": 6.069, + "args": { + "External id": 87183, "cbid": 51, "correlation": 161156098 + } + }, + { + "ph": "s", "id": 161156098, "pid": 5714, "tid": 6744, "ts": 6300866033490.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866046008.589, "dur": 41.728, + "args": { + "External id": 87183, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156100, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161156100, "pid": 0, "tid": 7, "ts": 6300866046008.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033497.668, "dur": 5.389, + "args": { + "External id": 87183, "cbid": 211, "correlation": 161156100 + } + }, + { + "ph": "s", "id": 161156100, "pid": 5714, "tid": 6744, "ts": 6300866033497.668, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866046050.957, "dur": 50.593, + "args": { + "External id": 87194, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156121, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156121, "pid": 0, "tid": 7, "ts": 6300866046050.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033563.037, "dur": 8.830, + "args": { + "External id": 87194, "cbid": 211, "correlation": 161156121 + } + }, + { + "ph": "s", "id": 161156121, "pid": 5714, "tid": 6744, "ts": 6300866033563.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046102.190, "dur": 143.330, + "args": { + "External id": 87197, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156136, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156136, "pid": 0, "tid": 7, "ts": 6300866046102.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033598.327, "dur": 6.100, + "args": { + "External id": 87197, "cbid": 211, "correlation": 161156136 + } + }, + { + "ph": "s", "id": 161156136, "pid": 5714, "tid": 6744, "ts": 6300866033598.327, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866046246.192, "dur": 108.673, + "args": { + "External id": 87198, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156146, "pid": 0, "tid": 7, "ts": 6300866046246.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033613.897, "dur": 4.580, + "args": { + "External id": 87198, "cbid": 211, "correlation": 161156146 + } + }, + { + "ph": "s", "id": 161156146, "pid": 5714, "tid": 6744, "ts": 6300866033613.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866046355.537, "dur": 78.145, + "args": { + "External id": 87199, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156160, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156160, "pid": 0, "tid": 7, "ts": 6300866046355.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033632.827, "dur": 5.120, + "args": { + "External id": 87199, "cbid": 211, "correlation": 161156160 + } + }, + { + "ph": "s", "id": 161156160, "pid": 5714, "tid": 6744, "ts": 6300866033632.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046434.418, "dur": 1.440, + "args": { + "External id": 87202, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156174, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161156174, "pid": 0, "tid": 7, "ts": 6300866046434.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033668.717, "dur": 6.930, + "args": { + "External id": 87202, "cbid": 211, "correlation": 161156174 + } + }, + { + "ph": "s", "id": 161156174, "pid": 5714, "tid": 6744, "ts": 6300866033668.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046436.530, "dur": 1.216, + "args": { + "External id": 87206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156184, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161156184, "pid": 0, "tid": 7, "ts": 6300866046436.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033690.577, "dur": 4.650, + "args": { + "External id": 87206, "cbid": 211, "correlation": 161156184 + } + }, + { + "ph": "s", "id": 161156184, "pid": 5714, "tid": 6744, "ts": 6300866033690.577, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046438.386, "dur": 0.992, + "args": { + "External id": 87207, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156194, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161156194, "pid": 0, "tid": 7, "ts": 6300866046438.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033703.287, "dur": 3.950, + "args": { + "External id": 87207, "cbid": 211, "correlation": 161156194 + } + }, + { + "ph": "s", "id": 161156194, "pid": 5714, "tid": 6744, "ts": 6300866033703.287, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866046439.986, "dur": 27.616, + "args": { + "External id": 87215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156212, "pid": 0, "tid": 7, "ts": 6300866046439.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033755.357, "dur": 7.690, + "args": { + "External id": 87215, "cbid": 211, "correlation": 161156212 + } + }, + { + "ph": "s", "id": 161156212, "pid": 5714, "tid": 6744, "ts": 6300866033755.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866046468.338, "dur": 115.681, + "args": { + "External id": 87221, "device": 0, "context": 1, "stream": 7, "correlation": 161156226, "bytes": 50331648, "memory bandwidth (GB/s)": 435.09001478202987 + } + }, + { + "ph": "f", "id": 161156226, "pid": 0, "tid": 7, "ts": 6300866046468.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866033793.097, "dur": 19.420, + "args": { + "External id": 87221, "cbid": 41, "correlation": 161156226 + } + }, + { + "ph": "s", "id": 161156226, "pid": 5714, "tid": 6744, "ts": 6300866033793.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046584.723, "dur": 74.593, + "args": { + "External id": 87223, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156238, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156238, "pid": 0, "tid": 7, "ts": 6300866046584.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033827.407, "dur": 6.010, + "args": { + "External id": 87223, "cbid": 211, "correlation": 161156238 + } + }, + { + "ph": "s", "id": 161156238, "pid": 5714, "tid": 6744, "ts": 6300866033827.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046660.020, "dur": 147.586, + "args": { + "External id": 87224, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156248, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156248, "pid": 0, "tid": 7, "ts": 6300866046660.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033841.307, "dur": 3.970, + "args": { + "External id": 87224, "cbid": 211, "correlation": 161156248 + } + }, + { + "ph": "s", "id": 161156248, "pid": 5714, "tid": 6744, "ts": 6300866033841.307, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046808.342, "dur": 143.938, + "args": { + "External id": 87225, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156255, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156255, "pid": 0, "tid": 7, "ts": 6300866046808.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033859.117, "dur": 4.760, + "args": { + "External id": 87225, "cbid": 211, "correlation": 161156255 + } + }, + { + "ph": "s", "id": 161156255, "pid": 5714, "tid": 6744, "ts": 6300866033859.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866046952.952, "dur": 46.112, + "args": { + "External id": 87231, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156274, "pid": 0, "tid": 7, "ts": 6300866046952.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033896.227, "dur": 6.760, + "args": { + "External id": 87231, "cbid": 211, "correlation": 161156274 + } + }, + { + "ph": "s", "id": 161156274, "pid": 5714, "tid": 6744, "ts": 6300866033896.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866046999.672, "dur": 57.953, + "args": { + "External id": 87232, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156286, "pid": 0, "tid": 7, "ts": 6300866046999.672, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033918.556, "dur": 5.211, + "args": { + "External id": 87232, "cbid": 211, "correlation": 161156286 + } + }, + { + "ph": "s", "id": 161156286, "pid": 5714, "tid": 6744, "ts": 6300866033918.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866047058.297, "dur": 41.153, + "args": { + "External id": 87235, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156299, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156299, "pid": 0, "tid": 7, "ts": 6300866047058.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866033949.376, "dur": 5.931, + "args": { + "External id": 87235, "cbid": 211, "correlation": 161156299 + } + }, + { + "ph": "s", "id": 161156299, "pid": 5714, "tid": 6744, "ts": 6300866033949.376, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866034017.976, "dur": 0.530, + "args": { + "External id": 87245, "cbid": 200, "correlation": 161156335 + } + }, + { + "ph": "f", "id": 161156335, "pid": 5714, "tid": 6744, "ts": 6300866034017.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866047100.250, "dur": 0.768, + "args": { + "External id": 87245, "device": 0, "context": 1, "stream": 7, "correlation": 161156338, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161156338, "pid": 0, "tid": 7, "ts": 6300866047100.250, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866034020.216, "dur": 6.700, + "args": { + "External id": 87245, "cbid": 51, "correlation": 161156338 + } + }, + { + "ph": "s", "id": 161156338, "pid": 5714, "tid": 6744, "ts": 6300866034020.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866047102.170, "dur": 136.417, + "args": { + "External id": 87245, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156339, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156339, "pid": 0, "tid": 7, "ts": 6300866047102.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034027.156, "dur": 7.140, + "args": { + "External id": 87245, "cbid": 307, "correlation": 161156339 + } + }, + { + "ph": "s", "id": 161156339, "pid": 5714, "tid": 6744, "ts": 6300866034027.156, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866047239.291, "dur": 120.578, + "args": { + "External id": 87252, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156361, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156361, "pid": 0, "tid": 7, "ts": 6300866047239.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034064.216, "dur": 5.990, + "args": { + "External id": 87252, "cbid": 211, "correlation": 161156361 + } + }, + { + "ph": "s", "id": 161156361, "pid": 5714, "tid": 6744, "ts": 6300866034064.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866034248.506, "dur": 0.510, + "args": { + "External id": 87278, "cbid": 200, "correlation": 161156408 + } + }, + { + "ph": "f", "id": 161156408, "pid": 5714, "tid": 6744, "ts": 6300866034248.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866034249.136, "dur": 0.210, + "args": { + "External id": 87278, "cbid": 200, "correlation": 161156409 + } + }, + { + "ph": "f", "id": 161156409, "pid": 5714, "tid": 6744, "ts": 6300866034249.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866034267.156, "dur": 0.250, + "args": { + "External id": 87278, "cbid": 200, "correlation": 161156427 + } + }, + { + "ph": "f", "id": 161156427, "pid": 5714, "tid": 6744, "ts": 6300866034267.156, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866047360.477, "dur": 96.257, + "args": { + "External id": 87278, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156428, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156428, "pid": 0, "tid": 7, "ts": 6300866047360.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034268.706, "dur": 12.050, + "args": { + "External id": 87278, "cbid": 211, "correlation": 161156428 + } + }, + { + "ph": "s", "id": 161156428, "pid": 5714, "tid": 6744, "ts": 6300866034268.706, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866034281.616, "dur": 1.060, + "args": { + "External id": 87278, "cbid": 273, "correlation": 161156430 + } + }, + { + "ph": "f", "id": 161156430, "pid": 5714, "tid": 6744, "ts": 6300866034281.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866047457.438, "dur": 981.291, + "args": { + "External id": 87278, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156431, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161156431, "pid": 0, "tid": 7, "ts": 6300866047457.438, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034282.976, "dur": 4.350, + "args": { + "External id": 87278, "cbid": 211, "correlation": 161156431 + } + }, + { + "ph": "s", "id": 161156431, "pid": 5714, "tid": 6744, "ts": 6300866034282.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866048439.433, "dur": 70.401, + "args": { + "External id": 87278, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156433, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161156433, "pid": 0, "tid": 7, "ts": 6300866048439.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034287.916, "dur": 3.840, + "args": { + "External id": 87278, "cbid": 211, "correlation": 161156433 + } + }, + { + "ph": "s", "id": 161156433, "pid": 5714, "tid": 6744, "ts": 6300866034287.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866048510.538, "dur": 47.009, + "args": { + "External id": 87288, "device": 0, "context": 1, "stream": 7, "correlation": 161156459, "bytes": 25165824, "memory bandwidth (GB/s)": 535.340551809228 + } + }, + { + "ph": "f", "id": 161156459, "pid": 0, "tid": 7, "ts": 6300866048510.538, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 
6300866034435.665, "dur": 17.440, + "args": { + "External id": 87288, "cbid": 41, "correlation": 161156459 + } + }, + { + "ph": "s", "id": 161156459, "pid": 5714, "tid": 6744, "ts": 6300866034435.665, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866048558.219, "dur": 31.872, + "args": { + "External id": 87285, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156477, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156477, "pid": 0, "tid": 7, "ts": 6300866048558.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034557.895, "dur": 9.100, + "args": { + "External id": 87285, "cbid": 307, "correlation": 161156477 + } + }, + { + "ph": "s", "id": 161156477, "pid": 5714, "tid": 6744, "ts": 6300866034557.895, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866048590.699, "dur": 39.136, + "args": { + "External id": 87295, "device": 0, "context": 1, "stream": 7, "correlation": 161156492, "bytes": 25165824, "memory bandwidth (GB/s)": 643.0351594439902 + } + }, + { + "ph": "f", "id": 161156492, "pid": 0, "tid": 7, "ts": 6300866048590.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866034641.335, "dur": 17.690, + "args": { + "External id": 87295, "cbid": 41, "correlation": 161156492 + } + }, + { + "ph": "s", "id": 161156492, "pid": 5714, "tid": 6744, "ts": 6300866034641.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 
0, "tid": 7, + "ts": 6300866048630.507, "dur": 25.537, + "args": { + "External id": 87292, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156510, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156510, "pid": 0, "tid": 7, "ts": 6300866048630.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034756.115, "dur": 8.610, + "args": { + "External id": 87292, "cbid": 307, "correlation": 161156510 + } + }, + { + "ph": "s", "id": 161156510, "pid": 5714, "tid": 6744, "ts": 6300866034756.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866034886.184, "dur": 0.600, + "args": { + "External id": 87319, "cbid": 200, "correlation": 161156554 + } + }, + { + "ph": "f", "id": 161156554, "pid": 5714, "tid": 6744, "ts": 6300866034886.184, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866048656.972, "dur": 0.768, + "args": { + "External id": 87319, "device": 0, "context": 1, "stream": 7, "correlation": 161156557, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161156557, "pid": 0, "tid": 7, "ts": 6300866048656.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866034888.624, "dur": 7.510, + "args": { + "External id": 87319, "cbid": 51, "correlation": 161156557 + } + }, + { + "ph": "s", "id": 161156557, "pid": 5714, "tid": 6744, "ts": 6300866034888.624, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866048658.892, "dur": 140.129, + "args": { + "External id": 87319, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156558, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156558, "pid": 0, "tid": 7, "ts": 6300866048658.892, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034896.374, "dur": 7.960, + "args": { + "External id": 87319, "cbid": 307, "correlation": 161156558 + } + }, + { + "ph": "s", "id": 161156558, "pid": 5714, "tid": 6744, "ts": 6300866034896.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866048799.693, "dur": 121.026, + "args": { + "External id": 87326, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156580, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156580, "pid": 0, "tid": 7, "ts": 6300866048799.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866034934.644, "dur": 6.040, + "args": { + "External id": 87326, "cbid": 211, "correlation": 161156580 + } + }, + { + "ph": "s", "id": 161156580, "pid": 5714, "tid": 6744, "ts": 6300866034934.644, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866035046.684, "dur": 0.490, + "args": { + "External id": 87349, "cbid": 200, "correlation": 161156626 + } + }, + { + "ph": "f", "id": 161156626, "pid": 5714, "tid": 6744, "ts": 6300866035046.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866048921.519, "dur": 0.768, + "args": { + "External id": 87349, "device": 0, "context": 1, "stream": 7, "correlation": 161156629, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161156629, "pid": 0, "tid": 7, "ts": 6300866048921.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866035048.704, "dur": 6.260, + "args": { + "External id": 87349, "cbid": 51, "correlation": 161156629 + } + }, + { + "ph": "s", "id": 161156629, "pid": 5714, "tid": 6744, "ts": 6300866035048.704, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866048923.439, "dur": 138.849, + "args": { + "External id": 87349, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156630, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156630, "pid": 0, "tid": 7, "ts": 6300866048923.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035055.164, "dur": 7.460, + "args": { + "External id": 87349, "cbid": 307, "correlation": 161156630 + } + }, + { + "ph": "s", "id": 161156630, "pid": 5714, "tid": 6744, "ts": 6300866035055.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866049062.960, "dur": 120.674, + "args": { + "External id": 87356, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156652, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156652, "pid": 0, "tid": 7, "ts": 6300866049062.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035090.964, "dur": 5.380, + "args": { + "External id": 87356, "cbid": 211, "correlation": 161156652 + } + }, + { + "ph": "s", "id": 161156652, "pid": 5714, "tid": 6744, "ts": 6300866035090.964, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866049184.370, "dur": 42.337, + "args": { + "External id": 87361, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156667, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156667, "pid": 0, "tid": 7, "ts": 6300866049184.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035138.154, "dur": 7.970, + "args": { + "External id": 87361, "cbid": 211, "correlation": 161156667 + } + }, + { + "ph": "s", "id": 161156667, "pid": 5714, "tid": 6744, "ts": 6300866035138.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866035230.384, "dur": 0.549, + "args": { + "External id": 87380, "cbid": 200, "correlation": 161156711 + } + }, + { + "ph": "f", "id": 161156711, "pid": 5714, "tid": 6744, "ts": 6300866035230.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866049227.507, "dur": 0.768, + "args": { + "External id": 87380, "device": 0, "context": 1, "stream": 7, "correlation": 161156714, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161156714, "pid": 0, "tid": 7, "ts": 6300866049227.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866035232.513, "dur": 6.520, + "args": { + "External id": 87380, "cbid": 51, "correlation": 161156714 + } + }, + { + "ph": "s", "id": 161156714, "pid": 5714, "tid": 6744, "ts": 6300866035232.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866049229.555, "dur": 140.609, + "args": { + "External id": 87380, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156715, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156715, "pid": 0, "tid": 7, "ts": 6300866049229.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035239.233, "dur": 7.251, + "args": { + "External id": 87380, "cbid": 307, "correlation": 161156715 + } + }, + { + "ph": "s", "id": 161156715, "pid": 5714, "tid": 6744, "ts": 6300866035239.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866049370.804, "dur": 120.610, + "args": { + "External id": 87387, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156737, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161156737, "pid": 0, "tid": 7, "ts": 6300866049370.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035275.004, "dur": 5.629, + "args": { + "External id": 87387, "cbid": 211, "correlation": 161156737 + } + }, + { + "ph": "s", "id": 161156737, "pid": 5714, "tid": 6744, "ts": 6300866035275.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866049492.022, "dur": 37.792, + "args": { + "External id": 87392, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156748, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156748, "pid": 0, "tid": 7, "ts": 6300866049492.022, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035330.073, "dur": 8.250, + "args": { + "External id": 87392, "cbid": 211, "correlation": 161156748 + } + }, + { + "ph": "s", "id": 161156748, "pid": 5714, "tid": 6744, "ts": 6300866035330.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866049530.518, "dur": 42.305, + "args": { + "External id": 87404, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156772, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156772, "pid": 0, "tid": 7, "ts": 6300866049530.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035395.223, "dur": 8.280, + "args": { + "External id": 87404, "cbid": 211, "correlation": 161156772 + } + }, + { + "ph": "s", "id": 161156772, "pid": 5714, "tid": 6744, "ts": 6300866035395.223, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866049573.463, "dur": 24.160, + "args": { + "External id": 87405, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156782, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156782, "pid": 0, "tid": 7, "ts": 6300866049573.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035413.403, "dur": 4.510, + "args": { + "External id": 87405, "cbid": 211, "correlation": 161156782 + } + }, + { + "ph": "s", "id": 161156782, "pid": 5714, "tid": 6744, "ts": 6300866035413.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866049598.519, "dur": 0.768, + "args": { + "External id": 87406, "device": 0, "context": 1, "stream": 7, "correlation": 161156797, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 161156797, "pid": 0, "tid": 7, "ts": 6300866049598.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866035434.263, "dur": 6.260, + "args": { + "External id": 87406, "cbid": 51, "correlation": 161156797 + } + }, + { + "ph": "s", "id": 161156797, "pid": 5714, "tid": 6744, "ts": 6300866035434.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866049600.471, "dur": 41.504, + "args": { + "External id": 87406, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156799, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161156799, "pid": 0, "tid": 7, "ts": 6300866049600.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035441.743, "dur": 5.550, + "args": { + "External id": 87406, "cbid": 211, "correlation": 161156799 + } + }, + { + "ph": "s", "id": 161156799, "pid": 5714, "tid": 6744, "ts": 6300866035441.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866049642.615, "dur": 44.449, + "args": { + "External id": 87417, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156820, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156820, "pid": 0, "tid": 7, "ts": 6300866049642.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035503.763, "dur": 8.630, + "args": { + "External id": 87417, "cbid": 211, "correlation": 161156820 + } + }, + { + "ph": "s", "id": 161156820, "pid": 5714, "tid": 6744, "ts": 6300866035503.763, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866049687.736, "dur": 145.474, + "args": { + "External id": 87420, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156835, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156835, "pid": 0, "tid": 7, "ts": 6300866049687.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035539.393, "dur": 5.940, + "args": { + "External id": 87420, "cbid": 211, "correlation": 161156835 + } + }, + { + "ph": "s", "id": 161156835, "pid": 5714, "tid": 6744, "ts": 6300866035539.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866049833.850, "dur": 109.377, + "args": { + "External id": 87421, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156845, "pid": 0, "tid": 7, "ts": 6300866049833.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035554.923, "dur": 4.700, + "args": { + "External id": 87421, "cbid": 211, "correlation": 161156845 + } + }, + { + "ph": "s", "id": 161156845, "pid": 5714, "tid": 6744, "ts": 6300866035554.923, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866049943.835, "dur": 77.825, + "args": { + "External id": 87422, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156859, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156859, "pid": 0, "tid": 7, "ts": 6300866049943.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035573.783, "dur": 5.090, + "args": { + "External id": 87422, "cbid": 211, "correlation": 161156859 + } + }, + { + "ph": "s", "id": 161156859, "pid": 5714, "tid": 6744, "ts": 6300866035573.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050022.268, "dur": 1.472, + "args": { + "External id": 87425, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156873, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161156873, "pid": 0, "tid": 7, "ts": 6300866050022.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035609.333, "dur": 7.050, + "args": { + "External id": 87425, "cbid": 211, "correlation": 161156873 + } + }, + { + "ph": "s", "id": 161156873, "pid": 5714, "tid": 6744, "ts": 6300866035609.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050024.412, "dur": 0.992, + "args": { + "External id": 87429, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156883, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161156883, "pid": 0, "tid": 7, "ts": 6300866050024.412, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035629.703, "dur": 4.680, + "args": { + "External id": 87429, "cbid": 211, "correlation": 161156883 + } + }, + { + "ph": "s", "id": 161156883, "pid": 5714, "tid": 6744, "ts": 6300866035629.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050026.108, "dur": 1.024, + "args": { + "External id": 87430, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156893, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161156893, "pid": 0, "tid": 7, "ts": 6300866050026.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035643.483, "dur": 4.150, + "args": { + "External id": 87430, "cbid": 211, "correlation": 161156893 + } + }, + { + "ph": "s", "id": 161156893, "pid": 5714, "tid": 6744, "ts": 6300866035643.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866050027.740, "dur": 26.240, + "args": { + "External id": 87438, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156911, "pid": 0, "tid": 7, "ts": 6300866050027.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035694.852, "dur": 7.611, + "args": { + "External id": 87438, "cbid": 211, "correlation": 161156911 + } + }, + { + "ph": "s", "id": 161156911, "pid": 5714, "tid": 6744, "ts": 6300866035694.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866050054.652, "dur": 112.385, + "args": { + "External id": 87444, "device": 0, "context": 1, "stream": 7, "correlation": 161156925, "bytes": 50331648, "memory bandwidth (GB/s)": 447.85022912310365 + } + }, + { + "ph": "f", "id": 161156925, "pid": 0, "tid": 7, "ts": 6300866050054.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866035731.732, "dur": 15.151, + "args": { + "External id": 87444, "cbid": 41, "correlation": 161156925 + } + }, + { + "ph": "s", "id": 161156925, "pid": 5714, "tid": 6744, "ts": 6300866035731.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050167.709, "dur": 71.265, + "args": { + "External id": 87446, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156937, "pid": 0, "tid": 7, "ts": 6300866050167.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035760.972, "dur": 6.050, + "args": { + "External id": 87446, "cbid": 211, "correlation": 161156937 + } + }, + { + "ph": "s", "id": 161156937, "pid": 5714, "tid": 6744, "ts": 6300866035760.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050239.678, "dur": 146.594, + "args": { + "External id": 87447, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156947, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156947, "pid": 0, "tid": 7, "ts": 6300866050239.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035777.322, "dur": 4.360, + "args": { + "External id": 87447, "cbid": 211, "correlation": 161156947 + } + }, + { + "ph": "s", "id": 161156947, "pid": 5714, "tid": 6744, "ts": 6300866035777.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050386.944, "dur": 152.066, + "args": { + "External id": 87448, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156954, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156954, "pid": 0, "tid": 7, "ts": 6300866050386.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035795.252, "dur": 4.890, + "args": { + "External id": 87448, "cbid": 211, "correlation": 161156954 + } + }, + { + "ph": "s", "id": 161156954, "pid": 5714, "tid": 6744, "ts": 6300866035795.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050539.714, "dur": 48.513, + "args": { + "External id": 87454, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156973, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156973, "pid": 0, "tid": 7, "ts": 6300866050539.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035833.792, "dur": 6.810, + "args": { + "External id": 87454, "cbid": 211, "correlation": 161156973 + } + }, + { + "ph": "s", "id": 161156973, "pid": 5714, "tid": 6744, "ts": 6300866035833.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866050588.867, "dur": 40.512, + "args": { + "External id": 87455, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161156981, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161156981, "pid": 0, "tid": 7, "ts": 6300866050588.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035852.332, "dur": 4.330, + "args": { + "External id": 87455, "cbid": 211, "correlation": 161156981 + } + }, + { + "ph": "s", "id": 161156981, "pid": 5714, "tid": 6744, "ts": 6300866035852.332, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866050629.987, "dur": 325.444, + "args": { + "External id": 87470, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157014, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157014, "pid": 0, "tid": 7, "ts": 6300866050629.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866035956.512, "dur": 9.840, + "args": { + "External id": 87470, "cbid": 211, "correlation": 161157014 + } + }, + { + "ph": "s", "id": 161157014, "pid": 5714, "tid": 6744, "ts": 6300866035956.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6300866050956.135, "dur": 429.669, + "args": { + "External id": 87459, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157042, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157042, "pid": 0, "tid": 7, "ts": 6300866050956.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036026.742, "dur": 7.330, + "args": { + "External id": 87459, "cbid": 307, "correlation": 161157042 + } + }, + { + "ph": "s", "id": 161157042, "pid": 5714, "tid": 6744, "ts": 6300866036026.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866036128.122, "dur": 0.600, + "args": { + "External id": 87495, "cbid": 200, "correlation": 161157067 + } + }, + { + "ph": "f", "id": 161157067, "pid": 5714, "tid": 6744, "ts": 6300866036128.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866051386.636, "dur": 0.768, + "args": { + "External id": 87495, "device": 0, "context": 1, "stream": 7, "correlation": 161157070, 
"bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 161157070, "pid": 0, "tid": 7, "ts": 6300866051386.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866036130.582, "dur": 7.669, + "args": { + "External id": 87495, "cbid": 51, "correlation": 161157070 + } + }, + { + "ph": "s", "id": 161157070, "pid": 5714, "tid": 6744, "ts": 6300866036130.582, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866051388.556, "dur": 368.068, + "args": { + "External id": 87495, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157071, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157071, "pid": 0, "tid": 7, "ts": 6300866051388.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036138.511, "dur": 7.751, + "args": { + "External id": 87495, "cbid": 307, "correlation": 161157071 + } + }, + { + "ph": "s", "id": 161157071, "pid": 5714, "tid": 6744, "ts": 6300866036138.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866036267.521, "dur": 0.550, + "args": { + "External id": 87513, "cbid": 200, "correlation": 161157108 + } + }, + { + "ph": "f", "id": 161157108, "pid": 5714, "tid": 6744, "ts": 6300866036267.521, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866051757.424, "dur": 0.800, + "args": { + "External 
id": 87513, "device": 0, "context": 1, "stream": 7, "correlation": 161157111, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161157111, "pid": 0, "tid": 7, "ts": 6300866051757.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866036269.671, "dur": 6.620, + "args": { + "External id": 87513, "cbid": 51, "correlation": 161157111 + } + }, + { + "ph": "s", "id": 161157111, "pid": 5714, "tid": 6744, "ts": 6300866036269.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866051759.376, "dur": 351.204, + "args": { + "External id": 87513, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157112, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157112, "pid": 0, "tid": 7, "ts": 6300866051759.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036276.511, "dur": 8.310, + "args": { + "External id": 87513, "cbid": 307, "correlation": 161157112 + } + }, + { + "ph": "s", "id": 161157112, "pid": 5714, "tid": 6744, "ts": 6300866036276.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866036328.821, "dur": 0.330, + "args": { + "External id": 87520, "cbid": 200, "correlation": 161157137 + } + }, + { + "ph": "f", "id": 161157137, "pid": 5714, "tid": 6744, "ts": 6300866036328.821, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866052111.284, "dur": 353.477, + "args": { + "External id": 87520, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157140, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157140, "pid": 0, "tid": 7, "ts": 6300866052111.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036330.311, "dur": 6.470, + "args": { + "External id": 87520, "cbid": 307, "correlation": 161157140 + } + }, + { + "ph": "s", "id": 161157140, "pid": 5714, "tid": 6744, "ts": 6300866036330.311, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866036440.741, "dur": 0.400, + "args": { + "External id": 87543, "cbid": 200, "correlation": 161157185 + } + }, + { + "ph": "f", "id": 161157185, "pid": 5714, "tid": 6744, "ts": 6300866036440.741, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866052465.561, "dur": 0.800, + "args": { + "External id": 87543, "device": 0, "context": 1, "stream": 7, "correlation": 161157188, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 161157188, "pid": 0, "tid": 7, "ts": 6300866052465.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866036442.621, "dur": 6.400, + "args": { + "External id": 87543, "cbid": 51, "correlation": 161157188 + } + }, + { + "ph": "s", "id": 161157188, "pid": 5714, "tid": 6744, "ts": 6300866036442.621, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866052467.513, "dur": 352.452, + "args": { + "External id": 87543, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157189, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157189, "pid": 0, "tid": 7, "ts": 6300866052467.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036449.221, "dur": 7.200, + "args": { + "External id": 87543, "cbid": 307, "correlation": 161157189 + } + }, + { + "ph": "s", "id": 161157189, "pid": 5714, "tid": 6744, "ts": 6300866036449.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866036487.441, "dur": 0.270, + "args": { + "External id": 87550, "cbid": 200, "correlation": 161157214 + } + }, + { + "ph": "f", "id": 161157214, "pid": 5714, "tid": 6744, "ts": 6300866036487.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866052820.669, "dur": 356.292, + "args": { + "External id": 87550, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157217, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157217, "pid": 0, "tid": 7, "ts": 6300866052820.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036488.701, "dur": 5.300, + "args": { + "External id": 87550, "cbid": 307, "correlation": 161157217 + } + }, + { + "ph": "s", "id": 161157217, "pid": 5714, "tid": 6744, "ts": 6300866036488.701, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053177.601, "dur": 51.488, + "args": { + "External id": 87555, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157231, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157231, "pid": 0, "tid": 7, "ts": 6300866053177.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036554.181, "dur": 8.440, + "args": { + "External id": 87555, "cbid": 211, "correlation": 161157231 + } + }, + { + "ph": "s", "id": 161157231, "pid": 5714, "tid": 6744, "ts": 6300866036554.181, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053229.793, "dur": 43.361, + "args": { + "External id": 87567, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157255, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157255, "pid": 0, "tid": 7, "ts": 6300866053229.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036621.290, "dur": 8.360, + "args": { + "External id": 87567, "cbid": 211, "correlation": 161157255 + } + }, + { + "ph": "s", "id": 161157255, "pid": 5714, "tid": 6744, "ts": 6300866036621.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866053273.762, "dur": 26.048, + "args": { + "External id": 87568, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157265, "pid": 0, "tid": 7, "ts": 6300866053273.762, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036639.530, "dur": 4.520, + "args": { + "External id": 87568, "cbid": 211, "correlation": 161157265 + } + }, + { + "ph": "s", "id": 161157265, "pid": 5714, "tid": 6744, "ts": 6300866036639.530, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866053300.706, "dur": 0.736, + "args": { + "External id": 87569, "device": 0, "context": 1, "stream": 7, "correlation": 161157280, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 161157280, "pid": 0, "tid": 7, "ts": 6300866053300.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866036661.670, "dur": 6.260, + "args": { + "External id": 87569, "cbid": 51, "correlation": 161157280 + } + }, + { + "ph": "s", "id": 161157280, "pid": 5714, "tid": 6744, "ts": 6300866036661.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866053302.626, "dur": 41.953, + "args": { + "External id": 87569, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157282, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161157282, "pid": 0, "tid": 7, "ts": 6300866053302.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036669.160, "dur": 5.330, + "args": { + "External id": 87569, "cbid": 211, "correlation": 161157282 + } + }, + { + "ph": "s", "id": 161157282, "pid": 5714, "tid": 6744, "ts": 6300866036669.160, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866053345.219, "dur": 49.728, + "args": { + "External id": 87580, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157303, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157303, "pid": 0, "tid": 7, "ts": 6300866053345.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036732.590, "dur": 8.510, + "args": { + "External id": 87580, "cbid": 211, "correlation": 161157303 + } + }, + { + "ph": "s", "id": 161157303, "pid": 5714, "tid": 6744, "ts": 6300866036732.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053395.523, "dur": 145.314, + "args": { + "External id": 87583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157318, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157318, "pid": 0, "tid": 7, "ts": 6300866053395.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036768.820, "dur": 6.210, + "args": { + "External id": 87583, "cbid": 211, "correlation": 161157318 + } + }, + { + "ph": "s", "id": 161157318, "pid": 5714, "tid": 6744, "ts": 6300866036768.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866053541.445, "dur": 108.289, + "args": { + "External id": 87584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157328, "pid": 0, "tid": 7, "ts": 6300866053541.445, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036784.220, "dur": 4.440, + "args": { + "External id": 87584, "cbid": 211, "correlation": 161157328 + } + }, + { + "ph": "s", "id": 161157328, "pid": 5714, "tid": 6744, "ts": 6300866036784.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866053650.438, "dur": 77.729, + "args": { + "External id": 87585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157342, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157342, "pid": 0, "tid": 7, "ts": 6300866053650.438, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036803.180, "dur": 5.000, + "args": { + "External id": 87585, "cbid": 211, "correlation": 161157342 + } + }, + { + "ph": "s", "id": 161157342, "pid": 5714, "tid": 6744, "ts": 6300866036803.180, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053728.871, "dur": 1.472, + "args": { + "External id": 87588, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157356, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161157356, "pid": 0, "tid": 7, "ts": 6300866053728.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036837.580, "dur": 6.740, + "args": { + "External id": 87588, "cbid": 211, "correlation": 161157356 + } + }, + { + "ph": "s", "id": 161157356, "pid": 5714, "tid": 6744, "ts": 6300866036837.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053730.983, "dur": 1.120, + "args": { + "External id": 87592, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157366, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161157366, "pid": 0, "tid": 7, "ts": 6300866053730.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036859.220, "dur": 5.050, + "args": { + "External id": 87592, "cbid": 211, "correlation": 161157366 + } + }, + { + "ph": "s", "id": 161157366, "pid": 5714, "tid": 6744, "ts": 6300866036859.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053732.743, "dur": 0.992, + "args": { + "External id": 87593, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157376, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161157376, "pid": 0, "tid": 7, "ts": 6300866053732.743, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036872.750, "dur": 3.970, + "args": { + "External id": 87593, "cbid": 211, "correlation": 161157376 + } + }, + { + "ph": "s", "id": 161157376, "pid": 5714, "tid": 6744, "ts": 6300866036872.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866053734.471, "dur": 26.945, + "args": { + "External id": 87601, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157394, "pid": 0, "tid": 7, "ts": 6300866053734.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036924.310, "dur": 7.680, + "args": { + "External id": 87601, "cbid": 211, "correlation": 161157394 + } + }, + { + "ph": "s", "id": 161157394, "pid": 5714, "tid": 6744, "ts": 6300866036924.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866053762.120, "dur": 111.649, + "args": { + "External id": 87607, "device": 0, "context": 1, "stream": 7, "correlation": 161157408, "bytes": 50331648, "memory bandwidth (GB/s)": 450.8024971114833 + } + }, + { + "ph": "f", "id": 161157408, "pid": 0, "tid": 7, "ts": 6300866053762.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866036962.260, "dur": 17.549, + "args": { + "External id": 87607, "cbid": 41, "correlation": 161157408 + } + }, + { + "ph": "s", "id": 161157408, "pid": 5714, "tid": 6744, "ts": 6300866036962.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053874.409, "dur": 68.385, + "args": { + "External id": 87609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157420, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157420, "pid": 0, "tid": 7, "ts": 6300866053874.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866036994.100, "dur": 6.209, + "args": { + "External id": 87609, "cbid": 211, "correlation": 161157420 + } + }, + { + "ph": "s", "id": 161157420, "pid": 5714, "tid": 6744, "ts": 6300866036994.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866053943.498, "dur": 150.145, + "args": { + "External id": 87610, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157430, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157430, "pid": 0, "tid": 7, "ts": 6300866053943.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037009.780, "dur": 3.860, + "args": { + "External id": 87610, "cbid": 211, "correlation": 161157430 + } + }, + { + "ph": "s", "id": 161157430, "pid": 5714, "tid": 6744, "ts": 6300866037009.780, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866054094.315, "dur": 137.058, + "args": { + "External id": 87611, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157437, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157437, "pid": 0, "tid": 7, "ts": 6300866054094.315, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037027.100, "dur": 4.869, + "args": { + "External id": 87611, "cbid": 211, "correlation": 161157437 + } + }, + { + "ph": "s", "id": 161157437, "pid": 5714, "tid": 6744, "ts": 6300866037027.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866054232.077, "dur": 43.937, + "args": { + "External id": 87617, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157456, "pid": 0, "tid": 7, "ts": 6300866054232.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037065.180, "dur": 6.509, + "args": { + "External id": 87617, "cbid": 211, "correlation": 161157456 + } + }, + { + "ph": "s", "id": 161157456, "pid": 5714, "tid": 6744, "ts": 6300866037065.180, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866054276.622, "dur": 57.952, + "args": { + "External id": 87618, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157468, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157468, "pid": 0, "tid": 7, "ts": 6300866054276.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037087.379, "dur": 5.110, + "args": { + "External id": 87618, "cbid": 211, "correlation": 161157468 + } + }, + { + "ph": "s", "id": 161157468, "pid": 5714, "tid": 6744, "ts": 6300866037087.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866054335.246, "dur": 41.985, + "args": { + "External id": 87621, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157481, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157481, "pid": 0, "tid": 7, "ts": 6300866054335.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037119.669, "dur": 6.030, + "args": { + "External id": 87621, "cbid": 211, "correlation": 161157481 + } + }, + { + "ph": "s", "id": 161157481, "pid": 5714, "tid": 6744, "ts": 6300866037119.669, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866037186.829, "dur": 0.570, + "args": { + "External id": 87631, "cbid": 200, "correlation": 161157517 + } + }, + { + "ph": "f", "id": 161157517, "pid": 5714, "tid": 6744, "ts": 6300866037186.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866054378.063, "dur": 0.768, + "args": { + "External id": 87631, "device": 0, "context": 1, "stream": 7, "correlation": 161157520, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161157520, "pid": 0, "tid": 7, "ts": 6300866054378.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866037189.179, "dur": 6.810, + "args": { + "External id": 87631, "cbid": 51, "correlation": 161157520 + } + }, + { + "ph": "s", "id": 161157520, "pid": 5714, "tid": 6744, "ts": 6300866037189.179, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866054380.015, "dur": 136.321, + "args": { + "External id": 87631, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157521, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157521, "pid": 0, "tid": 7, "ts": 6300866054380.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037196.209, "dur": 6.740, + "args": { + "External id": 87631, "cbid": 307, "correlation": 161157521 + } + }, + { + "ph": "s", "id": 161157521, "pid": 5714, "tid": 6744, "ts": 6300866037196.209, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866054517.008, "dur": 120.642, + "args": { + "External id": 87638, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157543, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157543, "pid": 0, "tid": 7, "ts": 6300866054517.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037232.709, "dur": 6.150, + "args": { + "External id": 87638, "cbid": 211, "correlation": 161157543 + } + }, + { + "ph": "s", "id": 161157543, "pid": 5714, "tid": 6744, "ts": 6300866037232.709, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866037426.488, "dur": 0.531, + "args": { + "External id": 87664, "cbid": 200, "correlation": 161157590 + } + }, + { + "ph": "f", "id": 161157590, "pid": 5714, "tid": 6744, "ts": 6300866037426.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866037427.159, "dur": 0.220, + "args": { + "External id": 87664, "cbid": 200, "correlation": 161157591 + } + }, + { + "ph": "f", "id": 161157591, "pid": 5714, "tid": 6744, "ts": 6300866037427.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866037446.259, "dur": 0.249, + "args": { + "External id": 87664, "cbid": 200, "correlation": 161157609 + } + }, + { + "ph": "f", "id": 161157609, "pid": 5714, "tid": 6744, "ts": 6300866037446.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866054638.738, "dur": 97.889, + "args": { + "External id": 87664, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157610, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157610, "pid": 0, "tid": 7, "ts": 6300866054638.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037447.748, "dur": 12.340, + "args": { + "External id": 87664, "cbid": 211, "correlation": 161157610 + } + }, + { + "ph": "s", "id": 161157610, "pid": 5714, "tid": 6744, "ts": 6300866037447.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866037460.959, "dur": 1.089, + "args": { + "External id": 87664, "cbid": 273, "correlation": 161157612 + } + }, + { + "ph": "f", "id": 161157612, "pid": 5714, "tid": 6744, "ts": 6300866037460.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866054737.299, "dur": 980.459, + "args": { + "External id": 87664, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157613, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161157613, "pid": 0, "tid": 7, "ts": 6300866054737.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037462.379, "dur": 4.409, + "args": { + "External id": 87664, "cbid": 211, "correlation": 161157613 + } + }, + { + "ph": "s", "id": 161157613, "pid": 5714, "tid": 6744, "ts": 6300866037462.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866055718.398, "dur": 71.266, + "args": { + "External id": 87664, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157615, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161157615, "pid": 0, "tid": 7, "ts": 6300866055718.398, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037467.408, "dur": 3.871, + "args": { + "External id": 87664, "cbid": 211, "correlation": 161157615 + } + }, + { + "ph": "s", "id": 161157615, "pid": 5714, "tid": 6744, "ts": 6300866037467.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866055790.272, "dur": 47.680, + "args": { + "External id": 87674, "device": 0, "context": 1, "stream": 7, "correlation": 161157641, "bytes": 25165824, "memory bandwidth (GB/s)": 527.806711409396 + } + }, + { + "ph": "f", "id": 161157641, "pid": 0, "tid": 7, "ts": 6300866055790.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 
6300866037604.328, "dur": 18.970, + "args": { + "External id": 87674, "cbid": 41, "correlation": 161157641 + } + }, + { + "ph": "s", "id": 161157641, "pid": 5714, "tid": 6744, "ts": 6300866037604.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866055838.656, "dur": 31.744, + "args": { + "External id": 87671, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157659, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157659, "pid": 0, "tid": 7, "ts": 6300866055838.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037728.658, "dur": 8.860, + "args": { + "External id": 87671, "cbid": 307, "correlation": 161157659 + } + }, + { + "ph": "s", "id": 161157659, "pid": 5714, "tid": 6744, "ts": 6300866037728.658, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866055871.104, "dur": 37.889, + "args": { + "External id": 87681, "device": 0, "context": 1, "stream": 7, "correlation": 161157674, "bytes": 25165824, "memory bandwidth (GB/s)": 664.198685634353 + } + }, + { + "ph": "f", "id": 161157674, "pid": 0, "tid": 7, "ts": 6300866055871.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866037805.908, "dur": 14.200, + "args": { + "External id": 87681, "cbid": 41, "correlation": 161157674 + } + }, + { + "ph": "s", "id": 161157674, "pid": 5714, "tid": 6744, "ts": 6300866037805.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, 
"tid": 7, + "ts": 6300866055909.665, "dur": 27.008, + "args": { + "External id": 87678, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157692, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157692, "pid": 0, "tid": 7, "ts": 6300866055909.665, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866037916.098, "dur": 8.460, + "args": { + "External id": 87678, "cbid": 307, "correlation": 161157692 + } + }, + { + "ph": "s", "id": 161157692, "pid": 5714, "tid": 6744, "ts": 6300866037916.098, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866038045.727, "dur": 0.580, + "args": { + "External id": 87705, "cbid": 200, "correlation": 161157736 + } + }, + { + "ph": "f", "id": 161157736, "pid": 5714, "tid": 6744, "ts": 6300866038045.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866055937.729, "dur": 0.832, + "args": { + "External id": 87705, "device": 0, "context": 1, "stream": 7, "correlation": 161157739, "bytes": 576, "memory bandwidth (GB/s)": 0.6923076923076923 + } + }, + { + "ph": "f", "id": 161157739, "pid": 0, "tid": 7, "ts": 6300866055937.729, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866038048.157, "dur": 7.550, + "args": { + "External id": 87705, "cbid": 51, "correlation": 161157739 + } + }, + { + "ph": "s", "id": 161157739, "pid": 5714, "tid": 6744, "ts": 6300866038048.157, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866055939.713, "dur": 139.138, + "args": { + "External id": 87705, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157740, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157740, "pid": 0, "tid": 7, "ts": 6300866055939.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038055.957, "dur": 8.320, + "args": { + "External id": 87705, "cbid": 307, "correlation": 161157740 + } + }, + { + "ph": "s", "id": 161157740, "pid": 5714, "tid": 6744, "ts": 6300866038055.957, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866056079.459, "dur": 120.737, + "args": { + "External id": 87712, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157762, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157762, "pid": 0, "tid": 7, "ts": 6300866056079.459, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038106.527, "dur": 6.680, + "args": { + "External id": 87712, "cbid": 211, "correlation": 161157762 + } + }, + { + "ph": "s", "id": 161157762, "pid": 5714, "tid": 6744, "ts": 6300866038106.527, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866038220.347, "dur": 0.540, + "args": { + "External id": 87735, "cbid": 200, "correlation": 161157808 + } + }, + { + "ph": "f", "id": 161157808, "pid": 5714, "tid": 6744, "ts": 6300866038220.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866056200.964, "dur": 0.768, + "args": { + "External id": 87735, "device": 0, "context": 1, "stream": 7, "correlation": 161157811, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161157811, "pid": 0, "tid": 7, "ts": 6300866056200.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866038222.467, "dur": 6.620, + "args": { + "External id": 87735, "cbid": 51, "correlation": 161157811 + } + }, + { + "ph": "s", "id": 161157811, "pid": 5714, "tid": 6744, "ts": 6300866038222.467, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866056203.044, "dur": 138.946, + "args": { + "External id": 87735, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157812, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157812, "pid": 0, "tid": 7, "ts": 6300866056203.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038229.297, "dur": 7.580, + "args": { + "External id": 87735, "cbid": 307, "correlation": 161157812 + } + }, + { + "ph": "s", "id": 161157812, "pid": 5714, "tid": 6744, "ts": 6300866038229.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866056342.630, "dur": 120.321, + "args": { + "External id": 87742, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157834, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157834, "pid": 0, "tid": 7, "ts": 6300866056342.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038266.117, "dur": 5.830, + "args": { + "External id": 87742, "cbid": 211, "correlation": 161157834 + } + }, + { + "ph": "s", "id": 161157834, "pid": 5714, "tid": 6744, "ts": 6300866038266.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866056463.719, "dur": 41.537, + "args": { + "External id": 87747, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157849, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157849, "pid": 0, "tid": 7, "ts": 6300866056463.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038323.677, "dur": 8.409, + "args": { + "External id": 87747, "cbid": 211, "correlation": 161157849 + } + }, + { + "ph": "s", "id": 161157849, "pid": 5714, "tid": 6744, "ts": 6300866038323.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866038419.156, "dur": 0.540, + "args": { + "External id": 87766, "cbid": 200, "correlation": 161157893 + } + }, + { + "ph": "f", "id": 161157893, "pid": 5714, "tid": 6744, "ts": 6300866038419.156, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866056506.120, "dur": 0.768, + "args": { + "External id": 87766, "device": 0, "context": 1, "stream": 7, "correlation": 161157896, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161157896, "pid": 0, "tid": 7, "ts": 6300866056506.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866038421.316, "dur": 6.600, + "args": { + "External id": 87766, "cbid": 51, "correlation": 161157896 + } + }, + { + "ph": "s", "id": 161157896, "pid": 5714, "tid": 6744, "ts": 6300866038421.316, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866056508.040, "dur": 141.122, + "args": { + "External id": 87766, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157897, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157897, "pid": 0, "tid": 7, "ts": 6300866056508.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038428.116, "dur": 7.210, + "args": { + "External id": 87766, "cbid": 307, "correlation": 161157897 + } + }, + { + "ph": "s", "id": 161157897, "pid": 5714, "tid": 6744, "ts": 6300866038428.116, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866056649.802, "dur": 120.897, + "args": { + "External id": 87773, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157919, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161157919, "pid": 0, "tid": 7, "ts": 6300866056649.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038464.856, "dur": 5.410, + "args": { + "External id": 87773, "cbid": 211, "correlation": 161157919 + } + }, + { + "ph": "s", "id": 161157919, "pid": 5714, "tid": 6744, "ts": 6300866038464.856, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866056771.339, "dur": 38.561, + "args": { + "External id": 87778, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157930, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157930, "pid": 0, "tid": 7, "ts": 6300866056771.339, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038509.966, "dur": 7.430, + "args": { + "External id": 87778, "cbid": 211, "correlation": 161157930 + } + }, + { + "ph": "s", "id": 161157930, "pid": 5714, "tid": 6744, "ts": 6300866038509.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866056810.540, "dur": 48.640, + "args": { + "External id": 87790, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157954, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157954, "pid": 0, "tid": 7, "ts": 6300866056810.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038572.466, "dur": 8.280, + "args": { + "External id": 87790, "cbid": 211, "correlation": 161157954 + } + }, + { + "ph": "s", "id": 161157954, "pid": 5714, "tid": 6744, "ts": 6300866038572.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866056859.852, "dur": 31.872, + "args": { + "External id": 87791, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157964, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161157964, "pid": 0, "tid": 7, "ts": 6300866056859.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038591.436, "dur": 4.670, + "args": { + "External id": 87791, "cbid": 211, "correlation": 161157964 + } + }, + { + "ph": "s", "id": 161157964, "pid": 5714, "tid": 6744, "ts": 6300866038591.436, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866056892.940, "dur": 0.736, + "args": { + "External id": 87792, "device": 0, "context": 1, "stream": 7, "correlation": 161157979, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 161157979, "pid": 0, "tid": 7, "ts": 6300866056892.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866038614.466, "dur": 6.110, + "args": { + "External id": 87792, "cbid": 51, "correlation": 161157979 + } + }, + { + "ph": "s", "id": 161157979, "pid": 5714, "tid": 6744, "ts": 6300866038614.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6300866056894.860, "dur": 44.353, + "args": { + "External id": 87792, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161157981, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161157981, "pid": 0, "tid": 7, "ts": 6300866056894.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038621.816, "dur": 5.480, + "args": { + "External id": 87792, "cbid": 211, "correlation": 161157981 + } + }, + { + "ph": "s", "id": 161157981, "pid": 5714, "tid": 6744, "ts": 6300866038621.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6300866056939.885, "dur": 51.137, + "args": { + "External id": 87803, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158002, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158002, "pid": 0, "tid": 7, "ts": 6300866056939.885, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038684.826, "dur": 8.470, + "args": { + "External id": 87803, "cbid": 211, "correlation": 161158002 + } + }, + { + "ph": "s", "id": 161158002, "pid": 5714, "tid": 6744, "ts": 6300866038684.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866056991.630, "dur": 148.065, + "args": { + "External id": 87806, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158017, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158017, "pid": 0, "tid": 7, "ts": 6300866056991.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038719.566, "dur": 6.070, + "args": { + "External id": 87806, "cbid": 211, "correlation": 161158017 + } + }, + { + "ph": "s", "id": 161158017, "pid": 5714, "tid": 6744, "ts": 6300866038719.566, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866057140.303, "dur": 107.650, + "args": { + "External id": 87807, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158027, "pid": 0, "tid": 7, "ts": 6300866057140.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038734.756, "dur": 4.720, + "args": { + "External id": 87807, "cbid": 211, "correlation": 161158027 + } + }, + { + "ph": "s", "id": 161158027, "pid": 5714, "tid": 6744, "ts": 6300866038734.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866057248.561, "dur": 78.176, + "args": { + "External id": 87808, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158041, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158041, "pid": 0, "tid": 7, "ts": 6300866057248.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038753.876, "dur": 5.060, + "args": { + "External id": 87808, "cbid": 211, "correlation": 161158041 + } + }, + { + "ph": "s", "id": 161158041, "pid": 5714, "tid": 6744, "ts": 6300866038753.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057327.473, "dur": 1.440, + "args": { + "External id": 87811, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161158055, "pid": 0, "tid": 7, "ts": 6300866057327.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038788.605, "dur": 6.811, + "args": { + "External id": 87811, "cbid": 211, "correlation": 161158055 + } + }, + { + "ph": "s", "id": 161158055, "pid": 5714, "tid": 6744, "ts": 6300866038788.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057329.553, "dur": 0.992, + "args": { + "External id": 87815, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158065, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161158065, "pid": 0, "tid": 7, "ts": 6300866057329.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038811.365, "dur": 5.280, + "args": { + "External id": 87815, "cbid": 211, "correlation": 161158065 + } + }, + { + "ph": "s", "id": 161158065, "pid": 5714, "tid": 6744, "ts": 6300866038811.365, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057331.249, "dur": 1.024, + "args": { + "External id": 87816, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158075, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161158075, "pid": 0, "tid": 7, "ts": 6300866057331.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038824.925, "dur": 4.091, + "args": { + "External id": 87816, "cbid": 211, "correlation": 161158075 + } + }, + { + "ph": "s", "id": 161158075, "pid": 5714, "tid": 6744, "ts": 6300866038824.925, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6300866057332.977, "dur": 28.065, + "args": { + "External id": 87824, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158093, "pid": 0, "tid": 7, "ts": 6300866057332.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038876.965, "dur": 7.940, + "args": { + "External id": 87824, "cbid": 211, "correlation": 161158093 + } + }, + { + "ph": "s", "id": 161158093, "pid": 5714, "tid": 6744, "ts": 6300866038876.965, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866057361.746, "dur": 113.665, + "args": { + "External id": 87830, "device": 0, "context": 1, "stream": 7, "correlation": 161158107, "bytes": 50331648, "memory bandwidth (GB/s)": 442.80691505740555 + } + }, + { + "ph": "f", "id": 161158107, "pid": 0, "tid": 7, "ts": 6300866057361.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866038915.075, "dur": 15.930, + "args": { + "External id": 87830, "cbid": 41, "correlation": 161158107 + } + }, + { + "ph": "s", "id": 161158107, "pid": 5714, "tid": 6744, "ts": 6300866038915.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057476.019, "dur": 69.345, + "args": { + "External id": 87832, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158119, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158119, "pid": 0, "tid": 7, "ts": 6300866057476.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038945.415, "dur": 6.000, + "args": { + "External id": 87832, "cbid": 211, "correlation": 161158119 + } + }, + { + "ph": "s", "id": 161158119, "pid": 5714, "tid": 6744, "ts": 6300866038945.415, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057546.004, "dur": 143.618, + "args": { + "External id": 87833, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158129, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158129, "pid": 0, "tid": 7, "ts": 6300866057546.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038960.135, "dur": 4.100, + "args": { + "External id": 87833, "cbid": 211, "correlation": 161158129 + } + }, + { + "ph": "s", "id": 161158129, "pid": 5714, "tid": 6744, "ts": 6300866038960.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057690.262, "dur": 144.065, + "args": { + "External id": 87834, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158136, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158136, "pid": 0, "tid": 7, "ts": 6300866057690.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866038977.605, "dur": 4.840, + "args": { + "External id": 87834, "cbid": 211, "correlation": 161158136 + } + }, + { + "ph": "s", "id": 161158136, "pid": 5714, "tid": 6744, "ts": 6300866038977.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057834.967, "dur": 47.937, + "args": { + "External id": 87840, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158155, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158155, "pid": 0, "tid": 7, "ts": 6300866057834.967, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866039017.065, "dur": 6.750, + "args": { + "External id": 87840, "cbid": 211, "correlation": 161158155 + } + }, + { + "ph": "s", "id": 161158155, "pid": 5714, "tid": 6744, "ts": 6300866039017.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866057883.576, "dur": 40.577, + "args": { + "External id": 87841, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158163, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158163, "pid": 0, "tid": 7, "ts": 6300866057883.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866039035.065, "dur": 4.610, + "args": { + "External id": 87841, "cbid": 211, "correlation": 161158163 + } + }, + { + "ph": "s", "id": 161158163, "pid": 5714, "tid": 6744, "ts": 6300866039035.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866039286.494, "dur": 18.272, + "args": { + "External id": 87857, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161158186, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161158186, "pid": 0, "tid": 17, "ts": 6300866039286.494, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866039273.464, "dur": 13.710, + "args": { + "External id": 87857, "cbid": 211, "correlation": 161158186 + } + }, + { + "ph": "s", "id": 161158186, "pid": 5714, "tid": 6744, "ts": 6300866039273.464, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866039438.943, "dur": 42.785, + "args": { + "External id": 87873, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161158199, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161158199, "pid": 0, "tid": 17, "ts": 6300866039438.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866039427.044, "dur": 11.370, + "args": { + "External id": 87873, "cbid": 211, "correlation": 161158199 + } + }, + { + "ph": "s", "id": 161158199, "pid": 5714, "tid": 6744, "ts": 6300866039427.044, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039468.404, "dur": 2.040, + "args": { + "External id": 87842, "cbid": 135, "correlation": 161158209 + } + }, + { + "ph": "f", "id": 161158209, "pid": 5714, "tid": 6744, "ts": 6300866039468.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866039473.404, "dur": 2.240, + "args": { + "External id": 87842, "cbid": 147, "correlation": 161158213 + } + }, + { + "ph": "s", "id": 161158213, "pid": 5714, "tid": 6744, "ts": 6300866039473.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866039558.634, "dur": 1.550, + "args": { + "External id": 87875, "cbid": 317, "correlation": 161158226 + } + }, + { + "ph": "f", "id": 161158226, "pid": 5714, "tid": 6744, "ts": 6300866039558.634, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039563.154, "dur": 1.620, + "args": { + "External id": 87875, "cbid": 135, "correlation": 161158228 + } + }, + { + "ph": "f", "id": 161158228, "pid": 5714, "tid": 6744, "ts": 6300866039563.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866039567.024, "dur": 1.590, + "args": { + "External id": 87875, "cbid": 147, "correlation": 161158232 + } + }, + { + "ph": "s", "id": 161158232, "pid": 5714, "tid": 6744, "ts": 6300866039567.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866039594.664, "dur": 1.330, + "args": { + "External id": 87875, "cbid": 409, "correlation": 161158235 + } + }, + { + "ph": "f", "id": 161158235, "pid": 5714, "tid": 6744, "ts": 6300866039594.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039605.104, "dur": 1.250, + "args": { + "External id": 87875, "cbid": 135, "correlation": 161158238 + } + }, + { + "ph": "f", "id": 161158238, "pid": 5714, "tid": 6744, "ts": 6300866039605.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866039606.494, "dur": 1.090, + "args": { + "External id": 87875, "cbid": 147, "correlation": 161158239 + } + }, + { + "ph": "s", "id": 161158239, "pid": 5714, "tid": 6744, "ts": 6300866039606.494, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866039684.066, "dur": 3667.467, + "args": { + "External id": 87875, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161158241, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161158241, "pid": 0, "tid": 20, "ts": 6300866039684.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866039609.934, "dur": 13.020, + "args": { + "External id": 87875, "cbid": 430, "correlation": 161158241 + } + }, + { + "ph": "s", "id": 161158241, "pid": 5714, "tid": 6744, "ts": 6300866039609.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039624.523, "dur": 0.431, + "args": { + "External id": 87875, "cbid": 135, "correlation": 161158243 + } + }, + { + "ph": "f", "id": 161158243, "pid": 5714, "tid": 6744, "ts": 6300866039624.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866039625.083, "dur": 0.611, + "args": { + "External id": 87875, "cbid": 147, "correlation": 161158244 + } + }, + { + "ph": "s", "id": 161158244, "pid": 5714, "tid": 6744, "ts": 6300866039625.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039627.383, "dur": 0.891, + "args": { + "External id": 87875, "cbid": 135, "correlation": 161158247 + } + }, + { + "ph": "f", "id": 161158247, "pid": 5714, "tid": 6744, "ts": 6300866039627.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039639.774, "dur": 0.469, + "args": { + "External id": 
87875, "cbid": 135, "correlation": 161158254 + } + }, + { + "ph": "f", "id": 161158254, "pid": 5714, "tid": 6744, "ts": 6300866039639.774, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866039675.954, "dur": 1.240, + "args": { + "External id": 87877, "cbid": 147, "correlation": 161158259 + } + }, + { + "ph": "s", "id": 161158259, "pid": 5714, "tid": 6744, "ts": 6300866039675.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866039697.703, "dur": 1.111, + "args": { + "External id": 87842, "cbid": 135, "correlation": 161158274 + } + }, + { + "ph": "f", "id": 161158274, "pid": 5714, "tid": 6744, "ts": 6300866039697.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866057924.793, "dur": 320.771, + "args": { + "External id": 87879, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158299, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158299, "pid": 0, "tid": 7, "ts": 6300866057924.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866039877.273, "dur": 12.170, + "args": { + "External id": 87879, "cbid": 211, "correlation": 161158299 + } + }, + { + "ph": "s", "id": 161158299, "pid": 5714, "tid": 6744, "ts": 6300866039877.273, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866058246.268, "dur": 431.686, + "args": { + "External id": 87880, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158322, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161158322, "pid": 0, "tid": 7, "ts": 6300866058246.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866039947.793, "dur": 7.660, + "args": { + "External id": 87880, "cbid": 307, "correlation": 161158322 + } + }, + { + "ph": "s", "id": 161158322, "pid": 5714, "tid": 6744, "ts": 6300866039947.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866039996.193, "dur": 0.610, + "args": { + "External id": 87881, "cbid": 200, "correlation": 161158345 + } + }, + { + "ph": "f", "id": 161158345, "pid": 5714, "tid": 6744, "ts": 6300866039996.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866058678.882, "dur": 0.768, + "args": { + "External id": 87881, "device": 0, "context": 1, "stream": 7, 
"correlation": 161158348, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 161158348, "pid": 0, "tid": 7, "ts": 6300866058678.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866039998.633, "dur": 7.270, + "args": { + "External id": 87881, "cbid": 51, "correlation": 161158348 + } + }, + { + "ph": "s", "id": 161158348, "pid": 5714, "tid": 6744, "ts": 6300866039998.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866058680.802, "dur": 357.476, + "args": { + "External id": 87881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158349, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158349, "pid": 0, "tid": 7, "ts": 6300866058680.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040006.143, "dur": 6.420, + "args": { + "External id": 87881, "cbid": 307, "correlation": 161158349 + } + }, + { + "ph": "s", "id": 161158349, "pid": 5714, "tid": 6744, "ts": 6300866040006.143, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040043.213, "dur": 0.350, + "args": { + "External id": 87882, "cbid": 200, "correlation": 161158374 + } + }, + { + "ph": "f", "id": 161158374, "pid": 5714, "tid": 6744, "ts": 6300866040043.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866059039.174, "dur": 0.768, + "args": { + "External id": 87882, "device": 0, "context": 1, "stream": 7, "correlation": 161158377, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 161158377, "pid": 0, "tid": 7, "ts": 6300866059039.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866040044.703, "dur": 4.640, + "args": { + "External id": 87882, "cbid": 51, "correlation": 161158377 + } + }, + { + "ph": "s", "id": 161158377, "pid": 5714, "tid": 6744, "ts": 6300866040044.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866059041.094, "dur": 351.652, + "args": { + "External id": 87882, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158378, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158378, "pid": 0, "tid": 7, "ts": 6300866059041.094, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040049.493, "dur": 5.430, + "args": { + "External id": 87882, "cbid": 307, "correlation": 161158378 + } + }, + { + "ph": "s", "id": 161158378, "pid": 5714, "tid": 6744, "ts": 6300866040049.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040081.762, "dur": 0.311, + "args": { + "External id": 87883, "cbid": 200, "correlation": 161158403 + } + }, + { + "ph": "f", "id": 161158403, "pid": 5714, "tid": 6744, "ts": 6300866040081.762, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866059393.322, "dur": 354.660, + "args": { + "External id": 87883, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158406, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158406, "pid": 0, "tid": 7, "ts": 6300866059393.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040083.333, "dur": 5.609, + "args": { + "External id": 87883, "cbid": 307, "correlation": 161158406 + } + }, + { + "ph": "s", "id": 161158406, "pid": 5714, "tid": 6744, "ts": 6300866040083.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040111.822, "dur": 0.271, + "args": { + "External id": 87884, "cbid": 200, "correlation": 161158431 + } + }, + { + "ph": "f", "id": 161158431, "pid": 5714, "tid": 6744, "ts": 6300866040111.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866059748.814, "dur": 0.832, + "args": { + "External id": 87884, "device": 0, "context": 1, "stream": 7, "correlation": 161158434, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 161158434, "pid": 0, "tid": 7, "ts": 6300866059748.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866040113.193, "dur": 4.849, + "args": { + "External id": 87884, "cbid": 51, "correlation": 161158434 + } + }, + { + "ph": "s", "id": 161158434, "pid": 5714, "tid": 6744, "ts": 6300866040113.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866059750.798, "dur": 351.524, + "args": { + "External id": 87884, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158435, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158435, "pid": 0, "tid": 7, "ts": 6300866059750.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040118.202, "dur": 5.240, + "args": { + "External id": 87884, "cbid": 307, "correlation": 161158435 + } + }, + { + "ph": "s", "id": 161158435, "pid": 5714, "tid": 6744, "ts": 6300866040118.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040148.202, "dur": 0.291, + "args": { + "External id": 87885, "cbid": 200, "correlation": 161158460 + } + }, + { + "ph": "f", "id": 161158460, "pid": 5714, "tid": 6744, "ts": 6300866040148.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866060103.058, "dur": 356.068, + "args": { + "External id": 87885, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158463, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158463, "pid": 0, "tid": 7, "ts": 6300866060103.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040149.622, "dur": 5.450, + "args": { + "External id": 87885, "cbid": 307, "correlation": 161158463 + } + }, + { + "ph": "s", "id": 161158463, "pid": 5714, "tid": 6744, "ts": 6300866040149.622, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866060459.830, "dur": 78.017, + "args": { + "External id": 87886, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158476, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158476, "pid": 0, "tid": 7, "ts": 6300866060459.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040199.772, "dur": 7.190, + "args": { + "External id": 87886, "cbid": 307, "correlation": 161158476 + } + }, + { + "ph": "s", "id": 161158476, "pid": 5714, "tid": 6744, "ts": 6300866040199.772, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866060538.583, "dur": 1.952, + "args": { + "External id": 87887, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158484, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161158484, "pid": 0, "tid": 7, "ts": 6300866060538.583, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040237.992, "dur": 6.770, + "args": { + "External id": 87887, "cbid": 307, "correlation": 161158484 + } + }, + { + "ph": "s", "id": 161158484, "pid": 5714, "tid": 6744, "ts": 6300866040237.992, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866060541.272, "dur": 112.097, + "args": { + "External id": 87888, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158492, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158492, "pid": 0, "tid": 7, "ts": 6300866060541.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040279.592, "dur": 7.100, + "args": { + "External id": 87888, "cbid": 307, "correlation": 161158492 + } + }, + { + "ph": "s", "id": 161158492, "pid": 5714, "tid": 6744, "ts": 6300866040279.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040525.832, "dur": 0.560, + "args": { + "External id": 87907, "cbid": 200, "correlation": 161158538 + } + }, + { + "ph": "f", "id": 161158538, "pid": 5714, "tid": 6744, "ts": 6300866040525.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866060654.137, "dur": 0.768, + "args": { + "External id": 87907, "device": 0, "context": 1, "stream": 7, 
"correlation": 161158541, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161158541, "pid": 0, "tid": 7, "ts": 6300866060654.137, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866040528.192, "dur": 8.440, + "args": { + "External id": 87907, "cbid": 51, "correlation": 161158541 + } + }, + { + "ph": "s", "id": 161158541, "pid": 5714, "tid": 6744, "ts": 6300866040528.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866060656.057, "dur": 137.921, + "args": { + "External id": 87907, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158542, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158542, "pid": 0, "tid": 7, "ts": 6300866060656.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040536.881, "dur": 8.971, + "args": { + "External id": 87907, "cbid": 307, "correlation": 161158542 + } + }, + { + "ph": "s", "id": 161158542, "pid": 5714, "tid": 6744, "ts": 6300866040536.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866060794.650, "dur": 120.834, + "args": { + "External id": 87908, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158564, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158564, "pid": 0, "tid": 7, "ts": 6300866060794.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040574.761, "dur": 6.180, + "args": { + "External id": 87908, "cbid": 211, "correlation": 161158564 + } + }, + { + "ph": "s", "id": 161158564, "pid": 5714, "tid": 6744, "ts": 6300866040574.761, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040675.601, "dur": 0.490, + "args": { + "External id": 87909, "cbid": 200, "correlation": 161158582 + } + }, + { + "ph": "f", "id": 161158582, "pid": 5714, "tid": 6744, "ts": 6300866040675.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040676.241, "dur": 0.210, + "args": { + "External id": 87909, "cbid": 200, "correlation": 161158583 + } + }, + { + "ph": "f", "id": 161158583, "pid": 5714, "tid": 6744, "ts": 6300866040676.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040699.471, "dur": 0.250, + "args": { + "External id": 87909, "cbid": 200, "correlation": 161158601 + } + }, + { + "ph": "f", "id": 161158601, "pid": 5714, "tid": 6744, "ts": 6300866040699.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866060916.188, "dur": 91.809, + "args": { + "External id": 87909, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158602, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158602, "pid": 0, "tid": 7, "ts": 6300866060916.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040701.061, "dur": 10.390, + "args": { + "External id": 87909, "cbid": 211, "correlation": 161158602 + } + }, + { + "ph": "s", "id": 161158602, "pid": 5714, "tid": 6744, "ts": 6300866040701.061, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866040712.321, "dur": 1.000, + "args": { + "External id": 87909, "cbid": 273, "correlation": 161158604 + } + }, + { + "ph": "f", "id": 161158604, "pid": 5714, "tid": 6744, "ts": 6300866040712.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866061008.701, "dur": 981.227, + "args": { + "External id": 87909, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158605, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161158605, "pid": 0, "tid": 7, "ts": 6300866061008.701, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040713.711, "dur": 4.430, + "args": { + "External id": 87909, "cbid": 211, "correlation": 161158605 + } + }, + { + "ph": "s", "id": 161158605, "pid": 5714, "tid": 6744, "ts": 6300866040713.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866061990.632, "dur": 71.681, + "args": { + "External id": 87909, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158607, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161158607, "pid": 0, "tid": 7, "ts": 6300866061990.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040718.731, "dur": 3.930, + "args": { + "External id": 87909, "cbid": 211, "correlation": 161158607 + } + }, + { + "ph": "s", "id": 161158607, "pid": 5714, "tid": 6744, "ts": 6300866040718.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866062062.953, "dur": 47.649, + "args": { + "External id": 87920, "device": 0, "context": 1, "stream": 7, "correlation": 161158629, "bytes": 25165824, "memory bandwidth (GB/s)": 528.1500975886167 + } + }, + { + "ph": "f", "id": 161158629, "pid": 0, "tid": 7, "ts": 6300866062062.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 
6300866040867.141, "dur": 18.770, + "args": { + "External id": 87920, "cbid": 41, "correlation": 161158629 + } + }, + { + "ph": "s", "id": 161158629, "pid": 5714, "tid": 6744, "ts": 6300866040867.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866062111.210, "dur": 31.424, + "args": { + "External id": 87917, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158647, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158647, "pid": 0, "tid": 7, "ts": 6300866062111.210, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866040992.971, "dur": 9.189, + "args": { + "External id": 87917, "cbid": 307, "correlation": 161158647 + } + }, + { + "ph": "s", "id": 161158647, "pid": 5714, "tid": 6744, "ts": 6300866040992.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866062143.306, "dur": 39.008, + "args": { + "External id": 87927, "device": 0, "context": 1, "stream": 7, "correlation": 161158662, "bytes": 25165824, "memory bandwidth (GB/s)": 645.1452009844135 + } + }, + { + "ph": "f", "id": 161158662, "pid": 0, "tid": 7, "ts": 6300866062143.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866041068.640, "dur": 14.790, + "args": { + "External id": 87927, "cbid": 41, "correlation": 161158662 + } + }, + { + "ph": "s", "id": 161158662, "pid": 5714, "tid": 6744, "ts": 6300866041068.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 
0, "tid": 7, + "ts": 6300866062182.954, "dur": 26.785, + "args": { + "External id": 87924, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158680, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158680, "pid": 0, "tid": 7, "ts": 6300866062182.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041179.700, "dur": 8.160, + "args": { + "External id": 87924, "cbid": 307, "correlation": 161158680 + } + }, + { + "ph": "s", "id": 161158680, "pid": 5714, "tid": 6744, "ts": 6300866041179.700, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866041333.870, "dur": 0.590, + "args": { + "External id": 87932, "cbid": 200, "correlation": 161158710 + } + }, + { + "ph": "f", "id": 161158710, "pid": 5714, "tid": 6744, "ts": 6300866041333.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866062210.603, "dur": 0.768, + "args": { + "External id": 87932, "device": 0, "context": 1, "stream": 7, "correlation": 161158713, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161158713, "pid": 0, "tid": 7, "ts": 6300866062210.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866041336.390, "dur": 8.110, + "args": { + "External id": 87932, "cbid": 51, "correlation": 161158713 + } + }, + { + "ph": "s", "id": 161158713, "pid": 5714, "tid": 6744, "ts": 6300866041336.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866062212.491, "dur": 139.874, + "args": { + "External id": 87932, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158714, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158714, "pid": 0, "tid": 7, "ts": 6300866062212.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041344.750, "dur": 8.610, + "args": { + "External id": 87932, "cbid": 307, "correlation": 161158714 + } + }, + { + "ph": "s", "id": 161158714, "pid": 5714, "tid": 6744, "ts": 6300866041344.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866041384.219, "dur": 0.311, + "args": { + "External id": 87933, "cbid": 200, "correlation": 161158739 + } + }, + { + "ph": "f", "id": 161158739, "pid": 5714, "tid": 6744, "ts": 6300866041384.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866062353.197, "dur": 0.768, + "args": { + "External id": 87933, "device": 0, "context": 1, "stream": 7, "correlation": 161158742, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161158742, "pid": 0, "tid": 7, "ts": 6300866062353.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866041385.690, "dur": 5.280, + "args": { + "External id": 87933, "cbid": 51, "correlation": 161158742 + } + }, + { + "ph": "s", "id": 161158742, "pid": 5714, "tid": 6744, "ts": 6300866041385.690, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866062355.085, "dur": 137.793, + "args": { + "External id": 87933, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158743, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158743, "pid": 0, "tid": 7, "ts": 6300866062355.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041391.130, "dur": 5.449, + "args": { + "External id": 87933, "cbid": 307, "correlation": 161158743 + } + }, + { + "ph": "s", "id": 161158743, "pid": 5714, "tid": 6744, "ts": 6300866041391.130, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866041420.959, "dur": 0.300, + "args": { + "External id": 87934, "cbid": 200, "correlation": 161158768 + } + }, + { + "ph": "f", "id": 161158768, "pid": 5714, "tid": 6744, "ts": 6300866041420.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866062493.742, "dur": 0.800, + "args": { + "External id": 87934, "device": 0, "context": 1, "stream": 7, "correlation": 161158771, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 161158771, "pid": 0, "tid": 7, "ts": 6300866062493.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866041422.310, "dur": 4.500, + "args": { + "External id": 87934, "cbid": 51, "correlation": 161158771 + } + }, + { + 
"ph": "s", "id": 161158771, "pid": 5714, "tid": 6744, "ts": 6300866041422.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866062495.694, "dur": 135.170, + "args": { + "External id": 87934, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158772, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158772, "pid": 0, "tid": 7, "ts": 6300866062495.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041426.970, "dur": 5.009, + "args": { + "External id": 87934, "cbid": 307, "correlation": 161158772 + } + }, + { + "ph": "s", "id": 161158772, "pid": 5714, "tid": 6744, "ts": 6300866041426.970, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866062631.504, "dur": 120.737, + "args": { + "External id": 87935, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158794, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158794, "pid": 0, "tid": 7, "ts": 6300866062631.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041458.610, "dur": 6.160, + "args": { + "External id": 87935, "cbid": 211, "correlation": 161158794 + } + }, + { + "ph": "s", "id": 161158794, "pid": 5714, "tid": 6744, "ts": 6300866041458.610, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866062752.945, "dur": 121.186, + "args": { + "External id": 87936, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158817, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158817, "pid": 0, "tid": 7, "ts": 6300866062752.945, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041485.009, "dur": 5.060, + "args": { + "External id": 87936, "cbid": 211, "correlation": 161158817 + } + }, + { + "ph": "s", "id": 161158817, "pid": 5714, "tid": 6744, "ts": 6300866041485.009, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866062874.803, "dur": 122.113, + "args": { + "External id": 87937, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158840, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161158840, "pid": 0, "tid": 7, "ts": 6300866062874.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041509.529, "dur": 5.040, + "args": { + "External id": 87937, "cbid": 211, "correlation": 161158840 + } + }, + { + "ph": "s", "id": 161158840, "pid": 5714, "tid": 6744, "ts": 6300866041509.529, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866062997.556, "dur": 80.673, + "args": { + "External id": 87938, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158848, "pid": 0, "tid": 7, "ts": 6300866062997.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041554.249, "dur": 6.450, + "args": { + "External id": 87938, "cbid": 307, "correlation": 161158848 + } + }, + { + "ph": "s", "id": 161158848, "pid": 5714, "tid": 6744, "ts": 6300866041554.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866063078.901, "dur": 42.273, + "args": { + "External id": 87953, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158877, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158877, "pid": 0, "tid": 7, "ts": 6300866063078.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041740.259, "dur": 10.350, + "args": { + "External id": 87953, "cbid": 307, "correlation": 161158877 + } + }, + { + "ph": "s", "id": 161158877, "pid": 5714, "tid": 6744, "ts": 6300866041740.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866063121.782, "dur": 1.792, + "args": { + "External id": 87954, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158885, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161158885, "pid": 0, "tid": 7, "ts": 6300866063121.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041781.169, "dur": 6.850, + "args": { + "External id": 87954, "cbid": 307, "correlation": 161158885 + } + }, + { + "ph": "s", "id": 161158885, "pid": 5714, "tid": 6744, "ts": 6300866041781.169, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866063124.214, "dur": 47.584, + "args": { + "External id": 87955, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158896, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158896, "pid": 0, "tid": 7, "ts": 6300866063124.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041820.709, "dur": 6.549, + "args": { + "External id": 87955, "cbid": 307, "correlation": 161158896 + } + }, + { + "ph": "s", "id": 161158896, "pid": 5714, "tid": 6744, "ts": 6300866041820.709, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866063172.406, "dur": 45.633, + "args": { + "External id": 87956, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158901, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158901, "pid": 0, "tid": 7, "ts": 6300866063172.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866041865.629, "dur": 7.480, + "args": { + "External id": 87956, "cbid": 211, "correlation": 161158901 + } + }, + { + "ph": "s", "id": 161158901, "pid": 5714, "tid": 6744, "ts": 6300866041865.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866063222.007, "dur": 2.304, + "args": { + "External id": 87974, "device": 0, "context": 1, "stream": 7, "correlation": 161158932, "bytes": 28112, "memory bandwidth (GB/s)": 12.20138888888889 + } + }, + { + "ph": "f", "id": 161158932, "pid": 0, "tid": 7, "ts": 6300866063222.007, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866042254.358, "dur": 13.579, + "args": { + "External id": 87974, "cbid": 41, "correlation": 161158932 + } + }, + { + "ph": "s", "id": 161158932, "pid": 5714, "tid": 6744, "ts": 6300866042254.358, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042273.277, "dur": 2.160, + "args": { + "External id": 87969, "cbid": 135, "correlation": 161158936 + } + }, + { + "ph": "f", "id": 161158936, "pid": 5714, "tid": 6744, "ts": 6300866042273.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866063226.263, "dur": 34.272, + "args": { + "External id": 87969, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161158940, "registers 
per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161158940, "pid": 0, "tid": 7, "ts": 6300866063226.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866042279.268, "dur": 12.300, + "args": { + "External id": 87969, "cbid": 211, "correlation": 161158940 + } + }, + { + "ph": "s", "id": 161158940, "pid": 5714, "tid": 6744, "ts": 6300866042279.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042346.297, "dur": 1.300, + "args": { + "External id": 87962, "cbid": 135, "correlation": 161158951 + } + }, + { + "ph": "f", "id": 161158951, "pid": 5714, "tid": 6744, "ts": 6300866042346.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866042350.128, "dur": 1.820, + "args": { + "External id": 87962, "cbid": 147, "correlation": 161158955 + } + }, + { + "ph": "s", "id": 161158955, "pid": 5714, "tid": 6744, "ts": 6300866042350.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866042441.097, "dur": 1.150, + "args": { + "External id": 87978, "cbid": 317, "correlation": 161158975 + } + }, + { + "ph": "f", "id": 161158975, "pid": 5714, "tid": 6744, "ts": 6300866042441.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042444.257, "dur": 1.490, + "args": { + "External id": 87978, "cbid": 135, "correlation": 161158977 + } + }, + { + "ph": "f", "id": 161158977, "pid": 5714, "tid": 6744, "ts": 6300866042444.257, 
+ "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866042447.177, "dur": 1.260, + "args": { + "External id": 87978, "cbid": 147, "correlation": 161158981 + } + }, + { + "ph": "s", "id": 161158981, "pid": 5714, "tid": 6744, "ts": 6300866042447.177, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866042464.957, "dur": 0.810, + "args": { + "External id": 87978, "cbid": 409, "correlation": 161158984 + } + }, + { + "ph": "f", "id": 161158984, "pid": 5714, "tid": 6744, "ts": 6300866042464.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042470.127, "dur": 0.910, + "args": { + "External id": 87978, "cbid": 135, "correlation": 161158987 + } + }, + { + "ph": "f", "id": 161158987, "pid": 5714, "tid": 6744, "ts": 6300866042470.127, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866042471.227, "dur": 0.940, + "args": { + "External id": 87978, "cbid": 147, "correlation": 161158988 + } + }, + { + "ph": "s", "id": 161158988, "pid": 5714, "tid": 6744, "ts": 6300866042471.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866063262.263, "dur": 7716.187, + "args": { + "External id": 87978, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161158990, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161158990, "pid": 0, "tid": 20, "ts": 6300866063262.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866042473.997, "dur": 11.550, + "args": { + "External id": 87978, "cbid": 430, "correlation": 161158990 + } + }, + { + "ph": "s", "id": 161158990, "pid": 5714, "tid": 6744, "ts": 6300866042473.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042486.597, "dur": 0.400, + "args": { + "External id": 87978, "cbid": 135, "correlation": 161158992 + } + }, + { + "ph": "f", "id": 161158992, "pid": 5714, "tid": 6744, "ts": 6300866042486.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866042487.127, "dur": 0.620, + "args": { + "External id": 87978, "cbid": 147, "correlation": 161158993 + } + }, + { + "ph": "s", "id": 161158993, "pid": 5714, "tid": 6744, "ts": 6300866042487.127, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042489.617, "dur": 0.920, + "args": { + "External id": 87978, "cbid": 135, "correlation": 161158996 + } + }, + { + "ph": "f", "id": 161158996, "pid": 5714, "tid": 6744, "ts": 6300866042489.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042498.927, "dur": 0.510, + "args": { + "External id": 
87978, "cbid": 135, "correlation": 161159003 + } + }, + { + "ph": "f", "id": 161159003, "pid": 5714, "tid": 6744, "ts": 6300866042498.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866042528.447, "dur": 1.250, + "args": { + "External id": 87980, "cbid": 147, "correlation": 161159008 + } + }, + { + "ph": "s", "id": 161159008, "pid": 5714, "tid": 6744, "ts": 6300866042528.447, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042547.087, "dur": 0.850, + "args": { + "External id": 87962, "cbid": 135, "correlation": 161159023 + } + }, + { + "ph": "f", "id": 161159023, "pid": 5714, "tid": 6744, "ts": 6300866042547.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866042770.927, "dur": 1.300, + "args": { + "External id": 87962, "cbid": 135, "correlation": 161159036 + } + }, + { + "ph": "f", "id": 161159036, "pid": 5714, "tid": 6744, "ts": 6300866042770.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866042888.806, "dur": 3.790, + "args": { + "External id": 87990, "cbid": 147, "correlation": 161159047 + } + }, + { + "ph": "s", "id": 161159047, "pid": 5714, "tid": 6744, "ts": 6300866042888.806, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866043017.906, "dur": 1.390, + "args": { + "External id": 88004, "cbid": 317, "correlation": 161159088 + } + }, + { + "ph": "f", "id": 161159088, "pid": 5714, "tid": 6744, "ts": 6300866043017.906, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", 
"pid": 5714, "tid": 6744, + "ts": 6300866043028.756, "dur": 2.700, + "args": { + "External id": 88005, "cbid": 138, "correlation": 161159091 + } + }, + { + "ph": "f", "id": 161159091, "pid": 5714, "tid": 6744, "ts": 6300866043028.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866063264.119, "dur": 1.504, + "args": { + "External id": 88009, "device": 0, "context": 1, "stream": 7, "correlation": 161159102, "bytes": 7224, "memory bandwidth (GB/s)": 4.803191489361702 + } + }, + { + "ph": "f", "id": 161159102, "pid": 0, "tid": 7, "ts": 6300866063264.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866043055.426, "dur": 12.320, + "args": { + "External id": 88009, "cbid": 41, "correlation": 161159102 + } + }, + { + "ph": "s", "id": 161159102, "pid": 5714, "tid": 6744, "ts": 6300866043055.426, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043072.586, "dur": 2.150, + "args": { + "External id": 88004, "cbid": 135, "correlation": 161159106 + } + }, + { + "ph": "f", "id": 161159106, "pid": 5714, "tid": 6744, "ts": 6300866043072.586, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866063267.640, "dur": 324.579, + "args": { + "External id": 88004, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159110, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159110, "pid": 0, "tid": 7, "ts": 6300866063267.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866043077.626, "dur": 11.890, + "args": { + "External id": 88004, "cbid": 211, "correlation": 161159110 + } + }, + { + "ph": "s", "id": 161159110, "pid": 5714, "tid": 6744, "ts": 6300866043077.626, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043188.595, "dur": 1.540, + "args": { + "External id": 87990, "cbid": 135, "correlation": 161159121 + } + }, + { + "ph": "f", "id": 161159121, "pid": 5714, "tid": 6744, "ts": 6300866043188.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866043194.006, "dur": 1.320, + "args": { + "External id": 87990, "cbid": 147, "correlation": 161159125 + } + }, + { + "ph": "s", "id": 161159125, "pid": 5714, "tid": 6744, "ts": 6300866043194.006, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866043197.415, "dur": 0.860, + "args": { + "External id": 87990, "cbid": 147, "correlation": 161159129 + } + }, + { + "ph": "s", "id": 161159129, "pid": 5714, "tid": 6744, "ts": 6300866043197.415, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866063645.020, "dur": 410.308, + "args": { + "External id": 88023, "queued": 0, "device": 0, "context": 1, "stream": 17, 
"correlation": 161159153, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161159153, "pid": 0, "tid": 17, "ts": 6300866063645.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866043372.095, "dur": 12.660, + "args": { + "External id": 88023, "cbid": 211, "correlation": 161159153 + } + }, + { + "ph": "s", "id": 161159153, "pid": 5714, "tid": 6744, "ts": 6300866043372.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866064390.245, "dur": 545.030, + "args": { + "External id": 88039, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161159166, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161159166, "pid": 0, "tid": 17, "ts": 6300866064390.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866043504.745, "dur": 10.280, + "args": { + "External id": 88039, "cbid": 211, "correlation": 161159166 + } + }, + { + "ph": "s", "id": 161159166, "pid": 5714, "tid": 6744, "ts": 6300866043504.745, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043540.845, "dur": 1.400, + "args": { + "External id": 87990, "cbid": 135, "correlation": 161159176 + } + }, + { + "ph": "f", "id": 161159176, "pid": 5714, "tid": 6744, "ts": 6300866043540.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866043544.195, "dur": 1.270, + "args": { + "External id": 87990, "cbid": 147, "correlation": 161159180 + } + }, + { + "ph": "s", "id": 161159180, "pid": 5714, "tid": 6744, "ts": 6300866043544.195, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866043601.465, "dur": 0.889, + "args": { + "External id": 88041, "cbid": 317, "correlation": 161159193 + } + }, + { + "ph": "f", "id": 161159193, "pid": 5714, "tid": 6744, "ts": 6300866043601.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043604.365, "dur": 1.240, + "args": { + "External id": 88041, "cbid": 135, "correlation": 161159195 + } + }, + { + "ph": "f", "id": 161159195, "pid": 5714, "tid": 6744, "ts": 6300866043604.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866043607.214, "dur": 1.311, + "args": { + "External id": 88041, "cbid": 147, "correlation": 161159199 + } + }, + { + "ph": "s", "id": 161159199, "pid": 5714, "tid": 6744, "ts": 6300866043607.214, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866043624.205, "dur": 0.709, + "args": { + "External id": 88041, "cbid": 409, "correlation": 161159202 + } + }, + { + "ph": "f", "id": 161159202, "pid": 5714, "tid": 6744, "ts": 6300866043624.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043629.234, "dur": 0.900, + "args": { + "External id": 88041, "cbid": 135, "correlation": 161159205 + } + }, + { + "ph": "f", "id": 161159205, "pid": 5714, "tid": 6744, "ts": 6300866043629.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866043630.345, "dur": 0.980, + "args": { + "External id": 88041, "cbid": 147, "correlation": 161159206 + } + }, + { + "ph": "s", "id": 161159206, "pid": 5714, "tid": 6744, "ts": 6300866043630.345, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866071101.588, "dur": 4414.515, + "args": { + "External id": 88041, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161159208, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161159208, "pid": 0, "tid": 20, "ts": 6300866071101.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866043632.454, "dur": 10.240, + "args": { + "External id": 88041, "cbid": 430, "correlation": 161159208 + } + }, + { + "ph": "s", "id": 161159208, "pid": 5714, "tid": 6744, "ts": 6300866043632.454, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043643.834, "dur": 0.431, + "args": { + "External id": 88041, "cbid": 135, "correlation": 161159210 + } + }, + { + "ph": "f", "id": 161159210, "pid": 5714, "tid": 6744, "ts": 6300866043643.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866043644.385, "dur": 0.600, + "args": { + "External id": 88041, "cbid": 147, "correlation": 161159211 + } + }, + { + "ph": "s", "id": 161159211, "pid": 5714, "tid": 6744, "ts": 6300866043644.385, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043646.614, "dur": 0.940, + "args": { + "External id": 88041, "cbid": 135, "correlation": 161159214 + } + }, + { + "ph": "f", "id": 161159214, "pid": 5714, "tid": 6744, "ts": 6300866043646.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043655.714, "dur": 0.491, + "args": { + "External id": 
88041, "cbid": 135, "correlation": 161159221 + } + }, + { + "ph": "f", "id": 161159221, "pid": 5714, "tid": 6744, "ts": 6300866043655.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866043684.634, "dur": 1.060, + "args": { + "External id": 88043, "cbid": 147, "correlation": 161159226 + } + }, + { + "ph": "s", "id": 161159226, "pid": 5714, "tid": 6744, "ts": 6300866043684.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866043703.224, "dur": 1.020, + "args": { + "External id": 87990, "cbid": 135, "correlation": 161159241 + } + }, + { + "ph": "f", "id": 161159241, "pid": 5714, "tid": 6744, "ts": 6300866043703.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866063698.556, "dur": 1918.327, + "args": { + "External id": 88045, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159266, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159266, "pid": 0, "tid": 7, "ts": 6300866063698.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866043858.474, "dur": 12.140, + "args": { + "External id": 88045, "cbid": 211, "correlation": 161159266 + } + }, + { + "ph": "s", "id": 161159266, "pid": 5714, "tid": 6744, "ts": 6300866043858.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866065617.619, "dur": 558.022, + "args": { + "External id": 88046, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159289, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161159289, "pid": 0, "tid": 7, "ts": 6300866065617.619, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866043922.894, "dur": 6.660, + "args": { + "External id": 88046, "cbid": 307, "correlation": 161159289 + } + }, + { + "ph": "s", "id": 161159289, "pid": 5714, "tid": 6744, "ts": 6300866043922.894, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866043968.954, "dur": 0.590, + "args": { + "External id": 88047, "cbid": 200, "correlation": 161159312 + } + }, + { + "ph": "f", "id": 161159312, "pid": 5714, "tid": 6744, "ts": 6300866043968.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866066218.682, "dur": 58.464, + "args": { + "External id": 88047, "device": 0, "context": 1, "stream": 7, 
"correlation": 161159315, "bytes": 1536, "memory bandwidth (GB/s)": 0.026272577996715927 + } + }, + { + "ph": "f", "id": 161159315, "pid": 0, "tid": 7, "ts": 6300866066218.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866043971.474, "dur": 6.990, + "args": { + "External id": 88047, "cbid": 51, "correlation": 161159315 + } + }, + { + "ph": "s", "id": 161159315, "pid": 5714, "tid": 6744, "ts": 6300866043971.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866066336.795, "dur": 369.637, + "args": { + "External id": 88047, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159316, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159316, "pid": 0, "tid": 7, "ts": 6300866066336.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866043978.684, "dur": 6.350, + "args": { + "External id": 88047, "cbid": 307, "correlation": 161159316 + } + }, + { + "ph": "s", "id": 161159316, "pid": 5714, "tid": 6744, "ts": 6300866043978.684, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044017.364, "dur": 0.329, + "args": { + "External id": 88048, "cbid": 200, "correlation": 161159341 + } + }, + { + "ph": "f", "id": 161159341, "pid": 5714, "tid": 6744, "ts": 6300866044017.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866066707.712, "dur": 1.312, + "args": { + "External id": 88048, "device": 0, "context": 1, "stream": 7, "correlation": 161159344, "bytes": 1536, "memory bandwidth (GB/s)": 1.170731707317073 + } + }, + { + "ph": "f", "id": 161159344, "pid": 0, "tid": 7, "ts": 6300866066707.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866044018.884, "dur": 5.129, + "args": { + "External id": 88048, "cbid": 51, "correlation": 161159344 + } + }, + { + "ph": "s", "id": 161159344, "pid": 5714, "tid": 6744, "ts": 6300866044018.884, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866066710.528, "dur": 354.116, + "args": { + "External id": 88048, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159345, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159345, "pid": 0, "tid": 7, "ts": 6300866066710.528, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044024.164, "dur": 5.520, + "args": { + "External id": 88048, "cbid": 307, "correlation": 161159345 + } + }, + { + "ph": "s", "id": 161159345, "pid": 5714, "tid": 6744, "ts": 6300866044024.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044056.313, "dur": 0.331, + "args": { + "External id": 88049, "cbid": 200, "correlation": 161159370 + } + }, + { + "ph": "f", "id": 161159370, "pid": 5714, "tid": 6744, "ts": 6300866044056.313, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866067065.252, "dur": 358.692, + "args": { + "External id": 88049, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159373, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159373, "pid": 0, "tid": 7, "ts": 6300866067065.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044057.933, "dur": 5.951, + "args": { + "External id": 88049, "cbid": 307, "correlation": 161159373 + } + }, + { + "ph": "s", "id": 161159373, "pid": 5714, "tid": 6744, "ts": 6300866044057.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044089.453, "dur": 0.271, + "args": { + "External id": 88050, "cbid": 200, "correlation": 161159398 + } + }, + { + "ph": "f", "id": 161159398, "pid": 5714, "tid": 6744, "ts": 6300866044089.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866067425.160, "dur": 1.280, + "args": { + "External id": 88050, "device": 0, "context": 1, "stream": 7, "correlation": 161159401, "bytes": 1536, "memory bandwidth (GB/s)": 1.2 + } + }, + { + "ph": "f", "id": 161159401, "pid": 0, "tid": 7, "ts": 6300866067425.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866044090.913, "dur": 5.080, + "args": { + "External id": 88050, "cbid": 51, "correlation": 161159401 + } + }, + { + "ph": "s", "id": 161159401, "pid": 5714, "tid": 6744, "ts": 6300866044090.913, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866067427.656, "dur": 355.364, + "args": { + "External id": 88050, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159402, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159402, "pid": 0, "tid": 7, "ts": 6300866067427.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044096.153, "dur": 5.300, + "args": { + "External id": 88050, "cbid": 307, "correlation": 161159402 + } + }, + { + "ph": "s", "id": 161159402, "pid": 5714, "tid": 6744, "ts": 6300866044096.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044127.853, "dur": 0.350, + "args": { + "External id": 88051, "cbid": 200, "correlation": 161159427 + } + }, + { + "ph": "f", "id": 161159427, "pid": 5714, "tid": 6744, "ts": 6300866044127.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866067783.724, "dur": 360.804, + "args": { + "External id": 88051, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159430, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159430, "pid": 0, "tid": 7, "ts": 6300866067783.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044129.393, "dur": 5.740, + "args": { + "External id": 88051, "cbid": 307, "correlation": 161159430 + } + }, + { + "ph": "s", "id": 161159430, "pid": 5714, "tid": 6744, "ts": 6300866044129.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866068145.168, "dur": 86.050, + "args": { + "External id": 88052, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159443, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159443, "pid": 0, "tid": 7, "ts": 6300866068145.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044178.013, "dur": 6.300, + "args": { + "External id": 88052, "cbid": 307, "correlation": 161159443 + } + }, + { + "ph": "s", "id": 161159443, "pid": 5714, "tid": 6744, "ts": 6300866044178.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866068231.922, "dur": 3.872, + "args": { + "External id": 88053, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159451, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161159451, "pid": 0, "tid": 7, "ts": 6300866068231.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044211.673, "dur": 5.790, + "args": { + "External id": 88053, "cbid": 307, "correlation": 161159451 + } + }, + { + "ph": "s", "id": 161159451, "pid": 5714, "tid": 6744, "ts": 6300866044211.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866068236.434, "dur": 113.857, + "args": { + "External id": 88054, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159459, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159459, "pid": 0, "tid": 7, "ts": 6300866068236.434, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044245.983, "dur": 5.690, + "args": { + "External id": 88054, "cbid": 307, "correlation": 161159459 + } + }, + { + "ph": "s", "id": 161159459, "pid": 5714, "tid": 6744, "ts": 6300866044245.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044536.872, "dur": 0.631, + "args": { + "External id": 88073, "cbid": 200, "correlation": 161159505 + } + }, + { + "ph": "f", "id": 161159505, "pid": 5714, "tid": 6744, "ts": 6300866044536.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866068351.507, "dur": 1.248, + "args": { + "External id": 88073, "device": 0, "context": 1, "stream": 7, 
"correlation": 161159508, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 161159508, "pid": 0, "tid": 7, "ts": 6300866068351.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866044539.403, "dur": 8.949, + "args": { + "External id": 88073, "cbid": 51, "correlation": 161159508 + } + }, + { + "ph": "s", "id": 161159508, "pid": 5714, "tid": 6744, "ts": 6300866044539.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866068354.003, "dur": 143.458, + "args": { + "External id": 88073, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159509, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159509, "pid": 0, "tid": 7, "ts": 6300866068354.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044548.612, "dur": 9.500, + "args": { + "External id": 88073, "cbid": 307, "correlation": 161159509 + } + }, + { + "ph": "s", "id": 161159509, "pid": 5714, "tid": 6744, "ts": 6300866044548.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866068498.037, "dur": 141.633, + "args": { + "External id": 88074, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159531, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159531, "pid": 0, "tid": 7, "ts": 6300866068498.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044589.302, "dur": 6.670, + "args": { + "External id": 88074, "cbid": 211, "correlation": 161159531 + } + }, + { + "ph": "s", "id": 161159531, "pid": 5714, "tid": 6744, "ts": 6300866044589.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044690.002, "dur": 0.540, + "args": { + "External id": 88075, "cbid": 200, "correlation": 161159549 + } + }, + { + "ph": "f", "id": 161159549, "pid": 5714, "tid": 6744, "ts": 6300866044690.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044690.662, "dur": 0.240, + "args": { + "External id": 88075, "cbid": 200, "correlation": 161159550 + } + }, + { + "ph": "f", "id": 161159550, "pid": 5714, "tid": 6744, "ts": 6300866044690.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044714.042, "dur": 0.260, + "args": { + "External id": 88075, "cbid": 200, "correlation": 161159568 + } + }, + { + "ph": "f", "id": 161159568, "pid": 5714, "tid": 6744, "ts": 6300866044714.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866068640.342, "dur": 177.379, + "args": { + "External id": 88075, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159569, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159569, "pid": 0, "tid": 7, "ts": 6300866068640.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044715.762, "dur": 10.670, + "args": { + "External id": 88075, "cbid": 211, "correlation": 161159569 + } + }, + { + "ph": "s", "id": 161159569, "pid": 5714, "tid": 6744, "ts": 6300866044715.762, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866044727.322, "dur": 1.100, + "args": { + "External id": 88075, "cbid": 273, "correlation": 161159571 + } + }, + { + "ph": "f", "id": 161159571, "pid": 5714, "tid": 6744, "ts": 6300866044727.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866068818.425, "dur": 1547.474, + "args": { + "External id": 88075, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159572, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161159572, "pid": 0, "tid": 7, "ts": 6300866068818.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044728.792, "dur": 4.720, + "args": { + "External id": 88075, "cbid": 211, "correlation": 161159572 + } + }, + { + "ph": "s", "id": 161159572, "pid": 5714, "tid": 6744, "ts": 6300866044728.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866070366.955, "dur": 166.530, + "args": { + "External id": 88075, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159574, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161159574, "pid": 0, "tid": 7, "ts": 6300866070366.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866044734.162, "dur": 4.260, + "args": { + "External id": 88075, "cbid": 211, "correlation": 161159574 + } + }, + { + "ph": "s", "id": 161159574, "pid": 5714, "tid": 6744, "ts": 6300866044734.162, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866070534.157, "dur": 130.913, + "args": { + "External id": 88086, "device": 0, "context": 1, "stream": 7, "correlation": 161159596, "bytes": 25165824, "memory bandwidth (GB/s)": 192.23319303659682 + } + }, + { + "ph": "f", "id": 161159596, "pid": 0, "tid": 7, "ts": 6300866070534.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866044891.002, "dur": 19.849, + "args": { + "External id": 88086, "cbid": 41, "correlation": 161159596 + } + }, + { + "ph": "s", "id": 161159596, "pid": 5714, "tid": 6744, "ts": 6300866044891.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866070665.646, "dur": 40.321, + "args": { + "External id": 88083, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159614, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159614, "pid": 0, "tid": 7, "ts": 6300866070665.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045026.561, "dur": 9.940, + "args": { + "External id": 88083, "cbid": 307, "correlation": 161159614 + } + }, + { + "ph": "s", "id": 161159614, "pid": 5714, "tid": 6744, "ts": 6300866045026.561, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866070706.639, "dur": 57.888, + "args": { + "External id": 88093, "device": 0, "context": 1, "stream": 7, "correlation": 161159629, "bytes": 25165824, "memory bandwidth (GB/s)": 434.7330016583748 + } + }, + { + "ph": "f", "id": 161159629, "pid": 0, "tid": 7, "ts": 6300866070706.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866045108.411, "dur": 16.210, + "args": { + "External id": 88093, "cbid": 41, "correlation": 161159629 + } + }, + { + "ph": "s", "id": 161159629, "pid": 5714, "tid": 6744, "ts": 6300866045108.411, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866070765.167, "dur": 44.545, + "args": { + "External id": 88090, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159647, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159647, "pid": 0, "tid": 7, "ts": 6300866070765.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045228.381, "dur": 9.040, + "args": { + "External id": 88090, "cbid": 307, "correlation": 161159647 + } + }, + { + "ph": "s", "id": 161159647, "pid": 5714, "tid": 6744, "ts": 6300866045228.381, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866045387.121, "dur": 0.660, + "args": { + "External id": 88098, "cbid": 200, "correlation": 161159677 + } + }, + { + "ph": "f", "id": 161159677, "pid": 5714, "tid": 6744, "ts": 6300866045387.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866070817.072, "dur": 6.560, + "args": { + "External id": 88098, "device": 0, "context": 1, "stream": 7, "correlation": 161159680, "bytes": 576, "memory bandwidth (GB/s)": 0.08780487804878048 + } + }, + { + "ph": "f", "id": 161159680, "pid": 0, "tid": 7, "ts": 6300866070817.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866045389.890, "dur": 8.780, + "args": { + "External id": 88098, "cbid": 51, "correlation": 161159680 + } + }, + { + "ph": "s", "id": 161159680, "pid": 5714, "tid": 6744, "ts": 6300866045389.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866070826.672, "dur": 147.682, + "args": { + "External id": 88098, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159681, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159681, "pid": 0, "tid": 7, "ts": 6300866070826.672, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045398.950, "dur": 9.480, + "args": { + "External id": 88098, "cbid": 307, "correlation": 161159681 + } + }, + { + "ph": "s", "id": 161159681, "pid": 5714, "tid": 6744, "ts": 6300866045398.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866045440.040, "dur": 0.330, + "args": { + "External id": 88099, "cbid": 200, "correlation": 161159706 + } + }, + { + "ph": "f", "id": 161159706, "pid": 5714, "tid": 6744, "ts": 6300866045440.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866070975.250, "dur": 0.768, + "args": { + "External id": 88099, "device": 0, "context": 1, "stream": 7, "correlation": 161159709, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161159709, "pid": 0, "tid": 7, "ts": 6300866070975.250, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866045441.600, "dur": 5.770, + "args": { + "External id": 88099, "cbid": 51, "correlation": 161159709 + } + }, + { + "ph": "s", "id": 161159709, "pid": 5714, "tid": 6744, "ts": 6300866045441.600, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866070977.426, "dur": 453.157, + "args": { + "External id": 88099, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159710, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159710, "pid": 0, "tid": 7, "ts": 6300866070977.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045447.540, "dur": 6.120, + "args": { + "External id": 88099, "cbid": 307, "correlation": 161159710 + } + }, + { + "ph": "s", "id": 161159710, "pid": 5714, "tid": 6744, "ts": 6300866045447.540, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866045482.870, "dur": 0.340, + "args": { + "External id": 88100, "cbid": 200, "correlation": 161159735 + } + }, + { + "ph": "f", "id": 161159735, "pid": 5714, "tid": 6744, "ts": 6300866045482.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866071491.736, "dur": 1.888, + "args": { + "External id": 88100, "device": 0, "context": 1, "stream": 7, "correlation": 161159738, "bytes": 576, "memory bandwidth (GB/s)": 0.3050847457627119 + } + }, + { + "ph": "f", "id": 161159738, "pid": 0, "tid": 7, "ts": 6300866071491.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866045484.350, "dur": 5.280, + "args": { + "External id": 88100, "cbid": 51, "correlation": 161159738 + 
} + }, + { + "ph": "s", "id": 161159738, "pid": 5714, "tid": 6744, "ts": 6300866045484.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866071495.192, "dur": 141.793, + "args": { + "External id": 88100, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159739, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159739, "pid": 0, "tid": 7, "ts": 6300866071495.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045489.800, "dur": 5.420, + "args": { + "External id": 88100, "cbid": 307, "correlation": 161159739 + } + }, + { + "ph": "s", "id": 161159739, "pid": 5714, "tid": 6744, "ts": 6300866045489.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866071637.722, "dur": 141.441, + "args": { + "External id": 88101, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159761, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159761, "pid": 0, "tid": 7, "ts": 6300866071637.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045524.860, "dur": 6.790, + "args": { + "External id": 88101, "cbid": 211, "correlation": 161159761 + } + }, + { + "ph": "s", "id": 161159761, "pid": 5714, "tid": 6744, "ts": 6300866045524.860, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866071779.899, "dur": 142.434, + "args": { + "External id": 88102, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159784, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159784, "pid": 0, "tid": 7, "ts": 6300866071779.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045553.460, "dur": 5.530, + "args": { + "External id": 88102, "cbid": 211, "correlation": 161159784 + } + }, + { + "ph": "s", "id": 161159784, "pid": 5714, "tid": 6744, "ts": 6300866045553.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866071922.973, "dur": 142.626, + "args": { + "External id": 88103, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159807, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161159807, "pid": 0, "tid": 7, "ts": 6300866071922.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045579.390, "dur": 5.480, + "args": { + "External id": 88103, "cbid": 211, "correlation": 161159807 + } + }, + { + "ph": "s", "id": 161159807, "pid": 5714, "tid": 6744, "ts": 6300866045579.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866072066.303, "dur": 80.833, + "args": { + "External id": 88104, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159815, "pid": 0, "tid": 7, "ts": 6300866072066.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045625.180, "dur": 6.170, + "args": { + "External id": 88104, "cbid": 307, "correlation": 161159815 + } + }, + { + "ph": "s", "id": 161159815, "pid": 5714, "tid": 6744, "ts": 6300866045625.180, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866072147.744, "dur": 47.712, + "args": { + "External id": 88119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159844, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159844, "pid": 0, "tid": 7, "ts": 6300866072147.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045818.289, "dur": 10.791, + "args": { + "External id": 88119, "cbid": 307, "correlation": 161159844 + } + }, + { + "ph": "s", "id": 161159844, "pid": 5714, "tid": 6744, "ts": 6300866045818.289, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866072196.064, "dur": 3.488, + "args": { + "External id": 88120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159852, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161159852, "pid": 0, "tid": 7, "ts": 6300866072196.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045861.309, "dur": 6.100, + "args": { + "External id": 88120, "cbid": 307, "correlation": 161159852 + } + }, + { + "ph": "s", "id": 161159852, "pid": 5714, "tid": 6744, "ts": 6300866045861.309, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866072200.160, "dur": 51.489, + "args": { + "External id": 88121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159863, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159863, "pid": 0, "tid": 7, "ts": 6300866072200.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045901.119, "dur": 6.070, + "args": { + "External id": 88121, "cbid": 307, "correlation": 161159863 + } + }, + { + "ph": "s", "id": 161159863, "pid": 5714, "tid": 6744, "ts": 6300866045901.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866072252.321, "dur": 101.761, + "args": { + "External id": 88122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159868, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159868, "pid": 0, "tid": 7, "ts": 6300866072252.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866045950.419, "dur": 8.320, + "args": { + "External id": 88122, "cbid": 211, "correlation": 161159868 + } + }, + { + "ph": "s", "id": 161159868, "pid": 5714, "tid": 6744, "ts": 6300866045950.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046152.049, "dur": 2.970, + "args": { + "External id": 88128, "cbid": 147, "correlation": 161159885 + } + }, + { + "ph": "s", "id": 161159885, "pid": 5714, "tid": 6744, "ts": 6300866046152.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866046313.119, "dur": 2.819, + "args": { + "External id": 88136, "cbid": 138, "correlation": 161159900 + } + }, + { + "ph": "f", "id": 161159900, "pid": 5714, "tid": 6744, "ts": 6300866046313.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866072461.988, "dur": 111.073, + "args": { + "External id": 88140, "device": 0, "context": 1, "stream": 7, "correlation": 161159911, "bytes": 28112, "memory bandwidth (GB/s)": 0.25309481152035146 + } + }, + { + "ph": "f", "id": 161159911, "pid": 0, "tid": 7, "ts": 6300866072461.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866046340.428, "dur": 13.440, + "args": { + "External id": 88140, "cbid": 41, "correlation": 161159911 + } + }, + { + "ph": "s", "id": 161159911, "pid": 5714, "tid": 6744, "ts": 6300866046340.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046358.748, "dur": 2.260, + "args": { + "External id": 88135, "cbid": 135, "correlation": 161159915 + } + }, + { + "ph": "f", "id": 161159915, "pid": 5714, "tid": 6744, "ts": 6300866046358.748, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866072681.542, "dur": 36.352, + "args": { + "External id": 88135, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161159919, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161159919, "pid": 0, "tid": 7, "ts": 6300866072681.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866046364.588, "dur": 11.560, + "args": { + "External id": 88135, "cbid": 211, "correlation": 161159919 + } + }, + { + "ph": "s", "id": 161159919, "pid": 5714, "tid": 6744, "ts": 6300866046364.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046421.458, "dur": 1.190, + "args": { + "External id": 88128, "cbid": 135, "correlation": 161159930 + } + }, + { + "ph": "f", "id": 161159930, "pid": 5714, "tid": 6744, "ts": 6300866046421.458, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046424.948, "dur": 1.650, + "args": { + "External id": 88128, "cbid": 147, "correlation": 161159934 + } + }, + { + "ph": "s", "id": 161159934, "pid": 5714, "tid": 6744, "ts": 
6300866046424.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866046505.248, "dur": 1.120, + "args": { + "External id": 88144, "cbid": 317, "correlation": 161159954 + } + }, + { + "ph": "f", "id": 161159954, "pid": 5714, "tid": 6744, "ts": 6300866046505.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046509.248, "dur": 1.480, + "args": { + "External id": 88144, "cbid": 135, "correlation": 161159956 + } + }, + { + "ph": "f", "id": 161159956, "pid": 5714, "tid": 6744, "ts": 6300866046509.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046512.248, "dur": 0.990, + "args": { + "External id": 88144, "cbid": 147, "correlation": 161159960 + } + }, + { + "ph": "s", "id": 161159960, "pid": 5714, "tid": 6744, "ts": 6300866046512.248, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866046529.478, "dur": 0.770, + "args": { + "External id": 88144, "cbid": 409, "correlation": 161159963 + } + }, + { + "ph": "f", "id": 161159963, "pid": 5714, "tid": 6744, "ts": 6300866046529.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046534.828, "dur": 0.900, + "args": { + "External id": 88144, "cbid": 135, "correlation": 161159966 + } + }, + { + "ph": "f", "id": 161159966, "pid": 5714, "tid": 6744, "ts": 6300866046534.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046535.928, "dur": 1.080, + "args": { + "External id": 88144, "cbid": 
147, "correlation": 161159967 + } + }, + { + "ph": "s", "id": 161159967, "pid": 5714, "tid": 6744, "ts": 6300866046535.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866075517.159, "dur": 8599.685, + "args": { + "External id": 88144, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161159969, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161159969, "pid": 0, "tid": 20, "ts": 6300866075517.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866046538.228, "dur": 11.100, + "args": { + "External id": 88144, "cbid": 430, "correlation": 161159969 + } + }, + { + "ph": "s", "id": 161159969, "pid": 5714, "tid": 6744, "ts": 6300866046538.228, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046550.438, "dur": 0.470, + "args": { + "External id": 88144, "cbid": 135, "correlation": 161159971 + } + }, + { + "ph": "f", "id": 161159971, "pid": 5714, "tid": 6744, "ts": 6300866046550.438, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046551.038, "dur": 0.600, + "args": { + "External id": 88144, "cbid": 147, "correlation": 161159972 + } + }, + { + "ph": 
"s", "id": 161159972, "pid": 5714, "tid": 6744, "ts": 6300866046551.038, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046553.538, "dur": 0.820, + "args": { + "External id": 88144, "cbid": 135, "correlation": 161159975 + } + }, + { + "ph": "f", "id": 161159975, "pid": 5714, "tid": 6744, "ts": 6300866046553.538, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046562.908, "dur": 0.490, + "args": { + "External id": 88144, "cbid": 135, "correlation": 161159982 + } + }, + { + "ph": "f", "id": 161159982, "pid": 5714, "tid": 6744, "ts": 6300866046562.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046592.418, "dur": 0.990, + "args": { + "External id": 88146, "cbid": 147, "correlation": 161159987 + } + }, + { + "ph": "s", "id": 161159987, "pid": 5714, "tid": 6744, "ts": 6300866046592.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046611.778, "dur": 0.870, + "args": { + "External id": 88128, "cbid": 135, "correlation": 161160002 + } + }, + { + "ph": "f", "id": 161160002, "pid": 5714, "tid": 6744, "ts": 6300866046611.778, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866046820.867, "dur": 1.290, + "args": { + "External id": 88128, "cbid": 135, "correlation": 161160015 + } + }, + { + "ph": "f", "id": 161160015, "pid": 5714, "tid": 6744, "ts": 6300866046820.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866046940.217, "dur": 3.430, + 
"args": { + "External id": 88156, "cbid": 147, "correlation": 161160026 + } + }, + { + "ph": "s", "id": 161160026, "pid": 5714, "tid": 6744, "ts": 6300866046940.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866047065.687, "dur": 1.240, + "args": { + "External id": 88170, "cbid": 317, "correlation": 161160067 + } + }, + { + "ph": "f", "id": 161160067, "pid": 5714, "tid": 6744, "ts": 6300866047065.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866047075.567, "dur": 2.240, + "args": { + "External id": 88171, "cbid": 138, "correlation": 161160070 + } + }, + { + "ph": "f", "id": 161160070, "pid": 5714, "tid": 6744, "ts": 6300866047075.567, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866075518.343, "dur": 2.016, + "args": { + "External id": 88175, "device": 0, "context": 1, "stream": 7, "correlation": 161160081, "bytes": 7224, "memory bandwidth (GB/s)": 3.5833333333333335 + } + }, + { + "ph": "f", "id": 161160081, "pid": 0, "tid": 7, "ts": 6300866075518.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866047099.977, "dur": 12.409, + "args": { + "External id": 88175, "cbid": 41, "correlation": 161160081 + } + }, + { + "ph": "s", "id": 161160081, "pid": 5714, "tid": 6744, "ts": 6300866047099.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047117.126, "dur": 1.851, + "args": { + "External id": 88170, "cbid": 135, "correlation": 161160085 + } + }, + { + "ph": "f", "id": 161160085, "pid": 5714, "tid": 6744, "ts": 6300866047117.126, + "cat": 
"ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866075522.311, "dur": 12.352, + "args": { + "External id": 88170, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160089, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160089, "pid": 0, "tid": 7, "ts": 6300866075522.311, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866047121.686, "dur": 11.231, + "args": { + "External id": 88170, "cbid": 211, "correlation": 161160089 + } + }, + { + "ph": "s", "id": 161160089, "pid": 5714, "tid": 6744, "ts": 6300866047121.686, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047230.766, "dur": 1.380, + "args": { + "External id": 88156, "cbid": 135, "correlation": 161160100 + } + }, + { + "ph": "f", "id": 161160100, "pid": 5714, "tid": 6744, "ts": 6300866047230.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866047235.576, "dur": 1.300, + "args": { + "External id": 88156, "cbid": 147, "correlation": 161160104 + } + }, + { + "ph": "s", "id": 161160104, "pid": 5714, "tid": 6744, "ts": 6300866047235.576, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866047238.706, "dur": 0.830, + "args": { + "External id": 88156, "cbid": 147, "correlation": 161160108 + } + }, + { + "ph": "s", "id": 161160108, "pid": 5714, 
"tid": 6744, "ts": 6300866047238.706, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866075568.712, "dur": 28.480, + "args": { + "External id": 88189, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161160132, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161160132, "pid": 0, "tid": 17, "ts": 6300866075568.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866047406.886, "dur": 12.670, + "args": { + "External id": 88189, "cbid": 211, "correlation": 161160132 + } + }, + { + "ph": "s", "id": 161160132, "pid": 5714, "tid": 6744, "ts": 6300866047406.886, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866075607.304, "dur": 11.872, + "args": { + "External id": 88205, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161160145, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161160145, "pid": 0, "tid": 17, "ts": 6300866075607.304, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866047527.106, "dur": 10.299, + "args": { + "External id": 88205, "cbid": 211, "correlation": 161160145 + } + }, + { + "ph": "s", "id": 161160145, "pid": 5714, "tid": 6744, "ts": 6300866047527.106, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047563.036, "dur": 1.440, + "args": { + "External id": 88156, "cbid": 135, "correlation": 161160155 + } + }, + { + "ph": "f", "id": 161160155, "pid": 5714, "tid": 6744, "ts": 6300866047563.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866047566.445, "dur": 1.280, + "args": { + "External id": 88156, "cbid": 147, "correlation": 161160159 + } + }, + { + "ph": "s", "id": 161160159, "pid": 5714, "tid": 6744, "ts": 6300866047566.445, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866047622.496, "dur": 1.000, + "args": { + "External id": 88207, "cbid": 317, "correlation": 161160172 + } + }, + { + "ph": "f", "id": 161160172, "pid": 5714, "tid": 6744, "ts": 6300866047622.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047625.416, "dur": 1.280, + "args": { + "External id": 88207, "cbid": 135, "correlation": 161160174 + } + }, + { + "ph": "f", "id": 161160174, "pid": 5714, "tid": 6744, "ts": 6300866047625.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866047628.256, "dur": 1.280, + "args": { + "External id": 88207, "cbid": 147, "correlation": 161160178 + } + }, + { + "ph": "s", "id": 161160178, "pid": 5714, "tid": 6744, "ts": 6300866047628.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866047644.395, "dur": 0.720, + "args": { + "External id": 88207, "cbid": 409, "correlation": 161160181 + } + }, + { + "ph": "f", "id": 161160181, "pid": 5714, "tid": 6744, "ts": 6300866047644.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047649.495, "dur": 0.850, + "args": { + "External id": 88207, "cbid": 135, "correlation": 161160184 + } + }, + { + "ph": "f", "id": 161160184, "pid": 5714, "tid": 6744, "ts": 6300866047649.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866047650.545, "dur": 0.940, + "args": { + "External id": 88207, "cbid": 147, "correlation": 161160185 + } + }, + { + "ph": "s", "id": 161160185, "pid": 5714, "tid": 6744, "ts": 6300866047650.545, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866084194.125, "dur": 5286.686, + "args": { + "External id": 88207, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161160187, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161160187, "pid": 0, "tid": 20, "ts": 6300866084194.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866047652.615, "dur": 10.320, + "args": { + "External id": 88207, "cbid": 430, "correlation": 161160187 + } + }, + { + "ph": "s", "id": 161160187, "pid": 5714, "tid": 6744, "ts": 6300866047652.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047663.995, "dur": 0.420, + "args": { + "External id": 88207, "cbid": 135, "correlation": 161160189 + } + }, + { + "ph": "f", "id": 161160189, "pid": 5714, "tid": 6744, "ts": 6300866047663.995, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866047664.535, "dur": 0.530, + "args": { + "External id": 88207, "cbid": 147, "correlation": 161160190 + } + }, + { + "ph": "s", "id": 161160190, "pid": 5714, "tid": 6744, "ts": 6300866047664.535, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047666.635, "dur": 0.820, + "args": { + "External id": 88207, "cbid": 135, "correlation": 161160193 + } + }, + { + "ph": "f", "id": 161160193, "pid": 5714, "tid": 6744, "ts": 6300866047666.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047675.485, "dur": 0.480, + "args": { + "External id": 
88207, "cbid": 135, "correlation": 161160200 + } + }, + { + "ph": "f", "id": 161160200, "pid": 5714, "tid": 6744, "ts": 6300866047675.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866047702.895, "dur": 1.050, + "args": { + "External id": 88209, "cbid": 147, "correlation": 161160205 + } + }, + { + "ph": "s", "id": 161160205, "pid": 5714, "tid": 6744, "ts": 6300866047702.895, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866047721.495, "dur": 0.960, + "args": { + "External id": 88156, "cbid": 135, "correlation": 161160220 + } + }, + { + "ph": "f", "id": 161160220, "pid": 5714, "tid": 6744, "ts": 6300866047721.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866075535.367, "dur": 2131.386, + "args": { + "External id": 88211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160245, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160245, "pid": 0, "tid": 7, "ts": 6300866075535.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866047877.225, "dur": 11.750, + "args": { + "External id": 88211, "cbid": 211, "correlation": 161160245 + } + }, + { + "ph": "s", "id": 161160245, "pid": 5714, "tid": 6744, "ts": 6300866047877.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866077667.457, "dur": 561.030, + "args": { + "External id": 88212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160268, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161160268, "pid": 0, "tid": 7, "ts": 6300866077667.457, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866047940.135, "dur": 6.640, + "args": { + "External id": 88212, "cbid": 307, "correlation": 161160268 + } + }, + { + "ph": "s", "id": 161160268, "pid": 5714, "tid": 6744, "ts": 6300866047940.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866047985.295, "dur": 0.609, + "args": { + "External id": 88213, "cbid": 200, "correlation": 161160291 + } + }, + { + "ph": "f", "id": 161160291, "pid": 5714, "tid": 6744, "ts": 6300866047985.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866078290.504, "dur": 59.424, + "args": { + "External id": 88213, "device": 0, "context": 1, "stream": 7, 
"correlation": 161160294, "bytes": 1536, "memory bandwidth (GB/s)": 0.025848142164781908 + } + }, + { + "ph": "f", "id": 161160294, "pid": 0, "tid": 7, "ts": 6300866078290.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866047987.775, "dur": 7.069, + "args": { + "External id": 88213, "cbid": 51, "correlation": 161160294 + } + }, + { + "ph": "s", "id": 161160294, "pid": 5714, "tid": 6744, "ts": 6300866047987.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866078394.217, "dur": 512.326, + "args": { + "External id": 88213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160295, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160295, "pid": 0, "tid": 7, "ts": 6300866078394.217, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866047995.084, "dur": 6.371, + "args": { + "External id": 88213, "cbid": 307, "correlation": 161160295 + } + }, + { + "ph": "s", "id": 161160295, "pid": 5714, "tid": 6744, "ts": 6300866047995.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048031.804, "dur": 0.320, + "args": { + "External id": 88214, "cbid": 200, "correlation": 161160320 + } + }, + { + "ph": "f", "id": 161160320, "pid": 5714, "tid": 6744, "ts": 6300866048031.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866078907.759, "dur": 1.248, + "args": { + "External id": 88214, "device": 0, "context": 1, "stream": 7, "correlation": 161160323, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 161160323, "pid": 0, "tid": 7, "ts": 6300866078907.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866048033.295, "dur": 4.769, + "args": { + "External id": 88214, "cbid": 51, "correlation": 161160323 + } + }, + { + "ph": "s", "id": 161160323, "pid": 5714, "tid": 6744, "ts": 6300866048033.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866078910.671, "dur": 354.148, + "args": { + "External id": 88214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160324, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160324, "pid": 0, "tid": 7, "ts": 6300866078910.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048038.215, "dur": 5.429, + "args": { + "External id": 88214, "cbid": 307, "correlation": 161160324 + } + }, + { + "ph": "s", "id": 161160324, "pid": 5714, "tid": 6744, "ts": 6300866048038.215, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048069.055, "dur": 0.320, + "args": { + "External id": 88215, "cbid": 200, "correlation": 161160349 + } + }, + { + "ph": "f", "id": 161160349, "pid": 5714, "tid": 6744, "ts": 6300866048069.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866079265.523, "dur": 359.364, + "args": { + "External id": 88215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160352, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160352, "pid": 0, "tid": 7, "ts": 6300866079265.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048070.684, "dur": 5.810, + "args": { + "External id": 88215, "cbid": 307, "correlation": 161160352 + } + }, + { + "ph": "s", "id": 161160352, "pid": 5714, "tid": 6744, "ts": 6300866048070.684, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048100.114, "dur": 0.260, + "args": { + "External id": 88216, "cbid": 200, "correlation": 161160377 + } + }, + { + "ph": "f", "id": 161160377, "pid": 5714, "tid": 6744, "ts": 6300866048100.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866079626.135, "dur": 1.089, + "args": { + "External id": 88216, "device": 0, "context": 1, "stream": 7, "correlation": 161160380, "bytes": 1536, "memory bandwidth (GB/s)": 1.4104683195592287 + } + }, + { + "ph": "f", "id": 161160380, "pid": 0, "tid": 7, "ts": 6300866079626.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866048101.524, "dur": 4.870, + "args": { + "External id": 88216, "cbid": 51, "correlation": 161160380 + } + }, + { + "ph": "s", "id": 161160380, "pid": 5714, "tid": 6744, "ts": 6300866048101.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866079628.824, "dur": 357.636, + "args": { + "External id": 88216, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160381, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160381, "pid": 0, "tid": 7, "ts": 6300866079628.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048106.544, "dur": 5.250, + "args": { + "External id": 88216, "cbid": 307, "correlation": 161160381 + } + }, + { + "ph": "s", "id": 161160381, "pid": 5714, "tid": 6744, "ts": 6300866048106.544, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048137.524, "dur": 0.320, + "args": { + "External id": 88217, "cbid": 200, "correlation": 161160406 + } + }, + { + "ph": "f", "id": 161160406, "pid": 5714, "tid": 6744, "ts": 6300866048137.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866079987.132, "dur": 359.204, + "args": { + "External id": 88217, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160409, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160409, "pid": 0, "tid": 7, "ts": 6300866079987.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048138.964, "dur": 5.690, + "args": { + "External id": 88217, "cbid": 307, "correlation": 161160409 + } + }, + { + "ph": "s", "id": 161160409, "pid": 5714, "tid": 6744, "ts": 6300866048138.964, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866080347.040, "dur": 86.241, + "args": { + "External id": 88218, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160422, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160422, "pid": 0, "tid": 7, "ts": 6300866080347.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048184.914, "dur": 6.090, + "args": { + "External id": 88218, "cbid": 307, "correlation": 161160422 + } + }, + { + "ph": "s", "id": 161160422, "pid": 5714, "tid": 6744, "ts": 6300866048184.914, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866080433.953, "dur": 4.096, + "args": { + "External id": 88219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160430, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161160430, "pid": 0, "tid": 7, "ts": 6300866080433.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048216.794, "dur": 6.080, + "args": { + "External id": 88219, "cbid": 307, "correlation": 161160430 + } + }, + { + "ph": "s", "id": 161160430, "pid": 5714, "tid": 6744, "ts": 6300866048216.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866080438.721, "dur": 114.465, + "args": { + "External id": 88220, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160438, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160438, "pid": 0, "tid": 7, "ts": 6300866080438.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048250.154, "dur": 5.430, + "args": { + "External id": 88220, "cbid": 307, "correlation": 161160438 + } + }, + { + "ph": "s", "id": 161160438, "pid": 5714, "tid": 6744, "ts": 6300866048250.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048454.354, "dur": 0.560, + "args": { + "External id": 88239, "cbid": 200, "correlation": 161160484 + } + }, + { + "ph": "f", "id": 161160484, "pid": 5714, "tid": 6744, "ts": 6300866048454.354, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866080554.434, "dur": 1.120, + "args": { + "External id": 88239, "device": 0, "context": 1, "stream": 7, 
"correlation": 161160487, "bytes": 576, "memory bandwidth (GB/s)": 0.5142857142857142 + } + }, + { + "ph": "f", "id": 161160487, "pid": 0, "tid": 7, "ts": 6300866080554.434, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866048456.743, "dur": 8.060, + "args": { + "External id": 88239, "cbid": 51, "correlation": 161160487 + } + }, + { + "ph": "s", "id": 161160487, "pid": 5714, "tid": 6744, "ts": 6300866048456.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866080557.122, "dur": 177.730, + "args": { + "External id": 88239, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160488, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160488, "pid": 0, "tid": 7, "ts": 6300866080557.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048465.063, "dur": 9.160, + "args": { + "External id": 88239, "cbid": 307, "correlation": 161160488 + } + }, + { + "ph": "s", "id": 161160488, "pid": 5714, "tid": 6744, "ts": 6300866048465.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866080735.556, "dur": 141.890, + "args": { + "External id": 88240, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160510, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160510, "pid": 0, "tid": 7, "ts": 6300866080735.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048505.554, "dur": 6.349, + "args": { + "External id": 88240, "cbid": 211, "correlation": 161160510 + } + }, + { + "ph": "s", "id": 161160510, "pid": 5714, "tid": 6744, "ts": 6300866048505.554, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048591.453, "dur": 0.500, + "args": { + "External id": 88241, "cbid": 200, "correlation": 161160528 + } + }, + { + "ph": "f", "id": 161160528, "pid": 5714, "tid": 6744, "ts": 6300866048591.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048592.073, "dur": 0.220, + "args": { + "External id": 88241, "cbid": 200, "correlation": 161160529 + } + }, + { + "ph": "f", "id": 161160529, "pid": 5714, "tid": 6744, "ts": 6300866048592.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048613.693, "dur": 0.250, + "args": { + "External id": 88241, "cbid": 200, "correlation": 161160547 + } + }, + { + "ph": "f", "id": 161160547, "pid": 5714, "tid": 6744, "ts": 6300866048613.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866080878.150, "dur": 91.361, + "args": { + "External id": 88241, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160548, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160548, "pid": 0, "tid": 7, "ts": 6300866080878.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048615.343, "dur": 9.840, + "args": { + "External id": 88241, "cbid": 211, "correlation": 161160548 + } + }, + { + "ph": "s", "id": 161160548, "pid": 5714, "tid": 6744, "ts": 6300866048615.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866048626.033, "dur": 1.000, + "args": { + "External id": 88241, "cbid": 273, "correlation": 161160550 + } + }, + { + "ph": "f", "id": 161160550, "pid": 5714, "tid": 6744, "ts": 6300866048626.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866080970.183, "dur": 1309.072, + "args": { + "External id": 88241, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160551, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161160551, "pid": 0, "tid": 7, "ts": 6300866080970.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048627.383, "dur": 4.530, + "args": { + "External id": 88241, "cbid": 211, "correlation": 161160551 + } + }, + { + "ph": "s", "id": 161160551, "pid": 5714, "tid": 6744, "ts": 6300866048627.383, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866082279.991, "dur": 163.937, + "args": { + "External id": 88241, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160553, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161160553, "pid": 0, "tid": 7, "ts": 6300866082279.991, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048632.493, "dur": 4.320, + "args": { + "External id": 88241, "cbid": 211, "correlation": 161160553 + } + }, + { + "ph": "s", "id": 161160553, "pid": 5714, "tid": 6744, "ts": 6300866048632.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866082444.568, "dur": 55.425, + "args": { + "External id": 88252, "device": 0, "context": 1, "stream": 7, "correlation": 161160575, "bytes": 25165824, "memory bandwidth (GB/s)": 454.0518538565629 + } + }, + { + "ph": "f", "id": 161160575, "pid": 0, "tid": 7, "ts": 6300866082444.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866048779.773, "dur": 18.780, + "args": { + "External id": 88252, "cbid": 41, "correlation": 161160575 + } + }, + { + "ph": "s", "id": 161160575, "pid": 5714, "tid": 6744, "ts": 6300866048779.773, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866082500.697, "dur": 33.793, + "args": { + "External id": 88249, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160593, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160593, "pid": 0, "tid": 7, "ts": 6300866082500.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866048906.042, "dur": 8.951, + "args": { + "External id": 88249, "cbid": 307, "correlation": 161160593 + } + }, + { + "ph": "s", "id": 161160593, "pid": 5714, "tid": 6744, "ts": 6300866048906.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866082535.674, "dur": 40.544, + "args": { + "External id": 88259, "device": 0, "context": 1, "stream": 7, "correlation": 161160608, "bytes": 25165824, "memory bandwidth (GB/s)": 620.7040252565115 + } + }, + { + "ph": "f", "id": 161160608, "pid": 0, "tid": 7, "ts": 6300866082535.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866048979.232, "dur": 15.290, + "args": { + "External id": 88259, "cbid": 41, "correlation": 161160608 + } + }, + { + "ph": "s", "id": 161160608, "pid": 5714, "tid": 6744, "ts": 6300866048979.232, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866082576.890, "dur": 28.384, + "args": { + "External id": 88256, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160626, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160626, "pid": 0, "tid": 7, "ts": 6300866082576.890, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049092.132, "dur": 8.290, + "args": { + "External id": 88256, "cbid": 307, "correlation": 161160626 + } + }, + { + "ph": "s", "id": 161160626, "pid": 5714, "tid": 6744, "ts": 6300866049092.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866049225.312, "dur": 0.580, + "args": { + "External id": 88264, "cbid": 200, "correlation": 161160656 + } + }, + { + "ph": "f", "id": 161160656, "pid": 5714, "tid": 6744, "ts": 6300866049225.312, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866082606.522, "dur": 1.217, + "args": { + "External id": 88264, "device": 0, "context": 1, "stream": 7, "correlation": 161160659, "bytes": 576, "memory bandwidth (GB/s)": 0.4732949876746097 + } + }, + { + "ph": "f", "id": 161160659, "pid": 0, "tid": 7, "ts": 6300866082606.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866049227.722, "dur": 7.930, + "args": { + "External id": 88264, "cbid": 51, "correlation": 161160659 + } + }, + { + "ph": "s", "id": 161160659, "pid": 5714, "tid": 6744, "ts": 6300866049227.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866082609.659, "dur": 146.721, + "args": { + "External id": 88264, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160660, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160660, "pid": 0, "tid": 7, "ts": 6300866082609.659, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049235.922, "dur": 8.440, + "args": { + "External id": 88264, "cbid": 307, "correlation": 161160660 + } + }, + { + "ph": "s", "id": 161160660, "pid": 5714, "tid": 6744, "ts": 6300866049235.922, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866049272.732, "dur": 0.360, + "args": { + "External id": 88265, "cbid": 200, "correlation": 161160685 + } + }, + { + "ph": "f", "id": 161160685, "pid": 5714, "tid": 6744, "ts": 6300866049272.732, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866082757.436, "dur": 1.216, + "args": { + "External id": 88265, "device": 0, "context": 1, "stream": 7, "correlation": 161160688, "bytes": 576, "memory bandwidth (GB/s)": 0.47368421052631576 + } + }, + { + "ph": "f", "id": 161160688, "pid": 0, "tid": 7, "ts": 6300866082757.436, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866049274.252, "dur": 4.850, + "args": { + "External id": 88265, "cbid": 51, "correlation": 161160688 + } + }, + { + "ph": "s", "id": 161160688, "pid": 5714, "tid": 6744, "ts": 
6300866049274.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866082760.028, "dur": 158.914, + "args": { + "External id": 88265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160689, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160689, "pid": 0, "tid": 7, "ts": 6300866082760.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049279.252, "dur": 5.660, + "args": { + "External id": 88265, "cbid": 307, "correlation": 161160689 + } + }, + { + "ph": "s", "id": 161160689, "pid": 5714, "tid": 6744, "ts": 6300866049279.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866049320.252, "dur": 0.309, + "args": { + "External id": 88266, "cbid": 200, "correlation": 161160714 + } + }, + { + "ph": "f", "id": 161160714, "pid": 5714, "tid": 6744, "ts": 6300866049320.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866082936.190, "dur": 13.504, + "args": { + "External id": 88266, "device": 0, "context": 1, "stream": 7, "correlation": 161160717, "bytes": 576, "memory bandwidth (GB/s)": 0.04265402843601896 + } + }, + { + "ph": "f", "id": 161160717, "pid": 0, "tid": 7, "ts": 6300866082936.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866049321.781, "dur": 5.040, + "args": { + "External id": 88266, "cbid": 51, 
"correlation": 161160717 + } + }, + { + "ph": "s", "id": 161160717, "pid": 5714, "tid": 6744, "ts": 6300866049321.781, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866082975.295, "dur": 480.773, + "args": { + "External id": 88266, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160718, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160718, "pid": 0, "tid": 7, "ts": 6300866082975.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049326.972, "dur": 5.600, + "args": { + "External id": 88266, "cbid": 307, "correlation": 161160718 + } + }, + { + "ph": "s", "id": 161160718, "pid": 5714, "tid": 6744, "ts": 6300866049326.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866083456.772, "dur": 364.421, + "args": { + "External id": 88267, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160740, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160740, "pid": 0, "tid": 7, "ts": 6300866083456.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049361.601, "dur": 5.980, + "args": { + "External id": 88267, "cbid": 211, "correlation": 161160740 + } + }, + { + "ph": "s", "id": 161160740, "pid": 5714, "tid": 6744, "ts": 6300866049361.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866083821.865, "dur": 152.130, + "args": { + "External id": 88268, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160763, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160763, "pid": 0, "tid": 7, "ts": 6300866083821.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049387.092, "dur": 5.160, + "args": { + "External id": 88268, "cbid": 211, "correlation": 161160763 + } + }, + { + "ph": "s", "id": 161160763, "pid": 5714, "tid": 6744, "ts": 6300866049387.092, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866083974.635, "dur": 142.017, + "args": { + "External id": 88269, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160786, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161160786, "pid": 0, "tid": 7, "ts": 6300866083974.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049410.551, "dur": 4.860, + "args": { + "External id": 88269, "cbid": 211, "correlation": 161160786 + } + }, + { + "ph": "s", "id": 161160786, "pid": 5714, "tid": 6744, "ts": 6300866049410.551, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866084117.292, "dur": 80.737, + "args": { + "External id": 88270, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160794, "pid": 0, "tid": 7, "ts": 6300866084117.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049459.581, "dur": 5.710, + "args": { + "External id": 88270, "cbid": 307, "correlation": 161160794 + } + }, + { + "ph": "s", "id": 161160794, "pid": 5714, "tid": 6744, "ts": 6300866049459.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866084198.637, "dur": 46.753, + "args": { + "External id": 88285, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160823, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160823, "pid": 0, "tid": 7, "ts": 6300866084198.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049633.771, "dur": 9.820, + "args": { + "External id": 88285, "cbid": 307, "correlation": 161160823 + } + }, + { + "ph": "s", "id": 161160823, "pid": 5714, "tid": 6744, "ts": 6300866049633.771, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866084246.126, "dur": 3.520, + "args": { + "External id": 88286, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160831, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161160831, "pid": 0, "tid": 7, "ts": 6300866084246.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049670.281, "dur": 5.560, + "args": { + "External id": 88286, "cbid": 307, "correlation": 161160831 + } + }, + { + "ph": "s", "id": 161160831, "pid": 5714, "tid": 6744, "ts": 6300866049670.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866084250.222, "dur": 50.784, + "args": { + "External id": 88287, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160842, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160842, "pid": 0, "tid": 7, "ts": 6300866084250.222, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049705.391, "dur": 5.640, + "args": { + "External id": 88287, "cbid": 307, "correlation": 161160842 + } + }, + { + "ph": "s", "id": 161160842, "pid": 5714, "tid": 6744, "ts": 6300866049705.391, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866084301.614, "dur": 399.781, + "args": { + "External id": 88288, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160847, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160847, "pid": 0, "tid": 7, "ts": 6300866084301.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866049749.300, "dur": 7.371, + "args": { + "External id": 88288, "cbid": 211, "correlation": 161160847 + } + }, + { + "ph": "s", "id": 161160847, "pid": 5714, "tid": 6744, "ts": 6300866049749.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866049935.550, "dur": 2.980, + "args": { + "External id": 88294, "cbid": 147, "correlation": 161160864 + } + }, + { + "ph": "s", "id": 161160864, "pid": 5714, "tid": 6744, "ts": 6300866049935.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866050056.570, "dur": 2.660, + "args": { + "External id": 88302, "cbid": 138, "correlation": 161160879 + } + }, + { + "ph": "f", "id": 161160879, "pid": 5714, "tid": 6744, "ts": 6300866050056.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866084757.844, "dur": 5.696, + "args": { + "External id": 88306, "device": 0, "context": 1, "stream": 7, "correlation": 161160890, "bytes": 28112, "memory bandwidth (GB/s)": 4.935393258426966 + } + }, + { + "ph": "f", "id": 161160890, "pid": 0, "tid": 7, "ts": 6300866084757.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866050082.510, "dur": 12.840, + "args": { + "External id": 88306, "cbid": 41, "correlation": 161160890 + } + }, + { + "ph": "s", "id": 161160890, "pid": 5714, "tid": 6744, "ts": 6300866050082.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050099.710, "dur": 2.070, + "args": { + "External id": 88301, "cbid": 135, "correlation": 161160894 + } + }, + { + "ph": "f", "id": 161160894, "pid": 5714, "tid": 6744, "ts": 6300866050099.710, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866084766.292, "dur": 36.064, + "args": { + "External id": 88301, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161160898, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161160898, "pid": 0, "tid": 7, "ts": 6300866084766.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866050105.240, "dur": 10.960, + "args": { + "External id": 88301, "cbid": 211, "correlation": 161160898 + } + }, + { + "ph": "s", "id": 161160898, "pid": 5714, "tid": 6744, "ts": 6300866050105.240, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050158.970, "dur": 1.010, + "args": { + "External id": 88294, "cbid": 135, "correlation": 161160909 + } + }, + { + "ph": "f", "id": 161160909, "pid": 5714, "tid": 6744, "ts": 6300866050158.970, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050162.180, "dur": 1.410, + "args": { + "External id": 88294, "cbid": 147, "correlation": 161160913 + } + }, + { + "ph": "s", "id": 161160913, "pid": 5714, "tid": 6744, "ts": 6300866050162.180, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866050236.550, "dur": 1.069, + "args": { + "External id": 88310, "cbid": 317, "correlation": 161160933 + } + }, + { + "ph": "f", "id": 161160933, "pid": 5714, "tid": 6744, "ts": 6300866050236.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050239.770, "dur": 1.409, + "args": { + "External id": 88310, "cbid": 135, "correlation": 161160935 + } + }, + { + "ph": "f", "id": 161160935, "pid": 5714, "tid": 6744, "ts": 6300866050239.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050242.719, "dur": 0.880, + "args": { + "External id": 88310, "cbid": 147, "correlation": 161160939 + } + }, + { + "ph": "s", "id": 161160939, "pid": 5714, "tid": 6744, "ts": 6300866050242.719, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866050258.770, "dur": 0.769, + "args": { + "External id": 88310, "cbid": 409, "correlation": 161160942 + } + }, + { + "ph": "f", "id": 161160942, "pid": 5714, "tid": 6744, "ts": 6300866050258.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050263.930, "dur": 0.829, + "args": { + "External id": 88310, "cbid": 135, "correlation": 161160945 + } + }, + { + "ph": "f", "id": 161160945, "pid": 5714, "tid": 6744, "ts": 6300866050263.930, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050264.950, "dur": 1.120, + "args": { + "External id": 88310, "cbid": 147, "correlation": 
161160946 + } + }, + { + "ph": "s", "id": 161160946, "pid": 5714, "tid": 6744, "ts": 6300866050264.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866089482.539, "dur": 8511.780, + "args": { + "External id": 88310, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161160948, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161160948, "pid": 0, "tid": 20, "ts": 6300866089482.539, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866050267.270, "dur": 10.499, + "args": { + "External id": 88310, "cbid": 430, "correlation": 161160948 + } + }, + { + "ph": "s", "id": 161160948, "pid": 5714, "tid": 6744, "ts": 6300866050267.270, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050278.829, "dur": 0.430, + "args": { + "External id": 88310, "cbid": 135, "correlation": 161160950 + } + }, + { + "ph": "f", "id": 161160950, "pid": 5714, "tid": 6744, "ts": 6300866050278.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050279.379, "dur": 0.590, + "args": { + "External id": 88310, "cbid": 147, "correlation": 161160951 + } + }, + { + "ph": "s", "id": 
161160951, "pid": 5714, "tid": 6744, "ts": 6300866050279.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050281.529, "dur": 0.830, + "args": { + "External id": 88310, "cbid": 135, "correlation": 161160954 + } + }, + { + "ph": "f", "id": 161160954, "pid": 5714, "tid": 6744, "ts": 6300866050281.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050290.389, "dur": 0.500, + "args": { + "External id": 88310, "cbid": 135, "correlation": 161160961 + } + }, + { + "ph": "f", "id": 161160961, "pid": 5714, "tid": 6744, "ts": 6300866050290.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050328.199, "dur": 1.230, + "args": { + "External id": 88312, "cbid": 147, "correlation": 161160966 + } + }, + { + "ph": "s", "id": 161160966, "pid": 5714, "tid": 6744, "ts": 6300866050328.199, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050347.639, "dur": 0.950, + "args": { + "External id": 88294, "cbid": 135, "correlation": 161160981 + } + }, + { + "ph": "f", "id": 161160981, "pid": 5714, "tid": 6744, "ts": 6300866050347.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050549.219, "dur": 1.170, + "args": { + "External id": 88294, "cbid": 135, "correlation": 161160994 + } + }, + { + "ph": "f", "id": 161160994, "pid": 5714, "tid": 6744, "ts": 6300866050549.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050664.049, "dur": 3.440, + "args": { + 
"External id": 88322, "cbid": 147, "correlation": 161161005 + } + }, + { + "ph": "s", "id": 161161005, "pid": 5714, "tid": 6744, "ts": 6300866050664.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866050782.008, "dur": 1.360, + "args": { + "External id": 88336, "cbid": 317, "correlation": 161161046 + } + }, + { + "ph": "f", "id": 161161046, "pid": 5714, "tid": 6744, "ts": 6300866050782.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866050791.758, "dur": 2.320, + "args": { + "External id": 88337, "cbid": 138, "correlation": 161161049 + } + }, + { + "ph": "f", "id": 161161049, "pid": 5714, "tid": 6744, "ts": 6300866050791.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866089484.843, "dur": 2.369, + "args": { + "External id": 88341, "device": 0, "context": 1, "stream": 7, "correlation": 161161060, "bytes": 7224, "memory bandwidth (GB/s)": 3.0493879273955256 + } + }, + { + "ph": "f", "id": 161161060, "pid": 0, "tid": 7, "ts": 6300866089484.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866050817.218, "dur": 13.040, + "args": { + "External id": 88341, "cbid": 41, "correlation": 161161060 + } + }, + { + "ph": "s", "id": 161161060, "pid": 5714, "tid": 6744, "ts": 6300866050817.218, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050835.188, "dur": 1.960, + "args": { + "External id": 88336, "cbid": 135, "correlation": 161161064 + } + }, + { + "ph": "f", "id": 161161064, "pid": 5714, "tid": 6744, "ts": 6300866050835.188, + "cat": "ac2g", 
"name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866089489.195, "dur": 10.688, + "args": { + "External id": 88336, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161068, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161068, "pid": 0, "tid": 7, "ts": 6300866089489.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866050839.978, "dur": 11.050, + "args": { + "External id": 88336, "cbid": 211, "correlation": 161161068 + } + }, + { + "ph": "s", "id": 161161068, "pid": 5714, "tid": 6744, "ts": 6300866050839.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866050947.268, "dur": 1.530, + "args": { + "External id": 88322, "cbid": 135, "correlation": 161161079 + } + }, + { + "ph": "f", "id": 161161079, "pid": 5714, "tid": 6744, "ts": 6300866050947.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050952.078, "dur": 1.330, + "args": { + "External id": 88322, "cbid": 147, "correlation": 161161083 + } + }, + { + "ph": "s", "id": 161161083, "pid": 5714, "tid": 6744, "ts": 6300866050952.078, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866050955.268, "dur": 0.860, + "args": { + "External id": 88322, "cbid": 147, "correlation": 161161087 + } + }, + { + "ph": "s", "id": 161161087, "pid": 5714, "tid": 
6744, "ts": 6300866050955.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866089533.644, "dur": 31.936, + "args": { + "External id": 88355, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161161111, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161161111, "pid": 0, "tid": 17, "ts": 6300866089533.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051110.057, "dur": 12.231, + "args": { + "External id": 88355, "cbid": 211, "correlation": 161161111 + } + }, + { + "ph": "s", "id": 161161111, "pid": 5714, "tid": 6744, "ts": 6300866051110.057, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866089635.629, "dur": 456.518, + "args": { + "External id": 88371, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161161124, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161161124, "pid": 0, "tid": 17, "ts": 6300866089635.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051229.067, "dur": 10.040, + "args": { + "External id": 88371, "cbid": 211, "correlation": 161161124 + } + }, + { + "ph": "s", "id": 161161124, "pid": 5714, "tid": 6744, "ts": 6300866051229.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051263.987, "dur": 1.360, + "args": { + "External id": 88322, "cbid": 135, "correlation": 161161134 + } + }, + { + "ph": "f", "id": 161161134, "pid": 5714, "tid": 6744, "ts": 6300866051263.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866051267.297, "dur": 1.290, + "args": { + "External id": 88322, "cbid": 147, "correlation": 161161138 + } + }, + { + "ph": "s", "id": 161161138, "pid": 5714, "tid": 6744, "ts": 6300866051267.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866051358.107, "dur": 1.030, + "args": { + "External id": 88373, "cbid": 317, "correlation": 161161151 + } + }, + { + "ph": "f", "id": 161161151, "pid": 5714, "tid": 6744, "ts": 6300866051358.107, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051361.097, "dur": 1.370, + "args": { + "External id": 88373, "cbid": 135, "correlation": 161161153 + } + }, + { + "ph": "f", "id": 161161153, "pid": 5714, "tid": 6744, "ts": 6300866051361.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866051364.247, "dur": 1.240, + "args": { + "External id": 88373, "cbid": 147, "correlation": 161161157 + } + }, + { + "ph": "s", "id": 161161157, "pid": 5714, "tid": 6744, "ts": 6300866051364.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866051380.567, "dur": 0.730, + "args": { + "External id": 88373, "cbid": 409, "correlation": 161161160 + } + }, + { + "ph": "f", "id": 161161160, "pid": 5714, "tid": 6744, "ts": 6300866051380.567, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051385.657, "dur": 0.710, + "args": { + "External id": 88373, "cbid": 135, "correlation": 161161163 + } + }, + { + "ph": "f", "id": 161161163, "pid": 5714, "tid": 6744, "ts": 6300866051385.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866051386.557, "dur": 0.950, + "args": { + "External id": 88373, "cbid": 147, "correlation": 161161164 + } + }, + { + "ph": "s", "id": 161161164, "pid": 5714, "tid": 6744, "ts": 6300866051386.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866098077.744, "dur": 5008.763, + "args": { + "External id": 88373, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161161166, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161161166, "pid": 0, "tid": 20, "ts": 6300866098077.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866051388.607, "dur": 10.830, + "args": { + "External id": 88373, "cbid": 430, "correlation": 161161166 + } + }, + { + "ph": "s", "id": 161161166, "pid": 5714, "tid": 6744, "ts": 6300866051388.607, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051400.527, "dur": 0.450, + "args": { + "External id": 88373, "cbid": 135, "correlation": 161161168 + } + }, + { + "ph": "f", "id": 161161168, "pid": 5714, "tid": 6744, "ts": 6300866051400.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866051401.137, "dur": 0.550, + "args": { + "External id": 88373, "cbid": 147, "correlation": 161161169 + } + }, + { + "ph": "s", "id": 161161169, "pid": 5714, "tid": 6744, "ts": 6300866051401.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051403.337, "dur": 0.710, + "args": { + "External id": 88373, "cbid": 135, "correlation": 161161172 + } + }, + { + "ph": "f", "id": 161161172, "pid": 5714, "tid": 6744, "ts": 6300866051403.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051413.157, "dur": 0.470, + "args": { + "External id": 
88373, "cbid": 135, "correlation": 161161179 + } + }, + { + "ph": "f", "id": 161161179, "pid": 5714, "tid": 6744, "ts": 6300866051413.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866051441.747, "dur": 0.990, + "args": { + "External id": 88375, "cbid": 147, "correlation": 161161184 + } + }, + { + "ph": "s", "id": 161161184, "pid": 5714, "tid": 6744, "ts": 6300866051441.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866051460.297, "dur": 0.930, + "args": { + "External id": 88322, "cbid": 135, "correlation": 161161199 + } + }, + { + "ph": "f", "id": 161161199, "pid": 5714, "tid": 6744, "ts": 6300866051460.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866089500.555, "dur": 2261.691, + "args": { + "External id": 88377, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161224, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161224, "pid": 0, "tid": 7, "ts": 6300866089500.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051615.206, "dur": 12.030, + "args": { + "External id": 88377, "cbid": 211, "correlation": 161161224 + } + }, + { + "ph": "s", "id": 161161224, "pid": 5714, "tid": 6744, "ts": 6300866051615.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866091762.950, "dur": 569.671, + "args": { + "External id": 88378, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161247, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161161247, "pid": 0, "tid": 7, "ts": 6300866091762.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051679.496, "dur": 6.500, + "args": { + "External id": 88378, "cbid": 307, "correlation": 161161247 + } + }, + { + "ph": "s", "id": 161161247, "pid": 5714, "tid": 6744, "ts": 6300866051679.496, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866051723.346, "dur": 0.590, + "args": { + "External id": 88379, "cbid": 200, "correlation": 161161270 + } + }, + { + "ph": "f", "id": 161161270, "pid": 5714, "tid": 6744, "ts": 6300866051723.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866092361.485, "dur": 100.161, + "args": { + "External id": 88379, "device": 0, "context": 1, "stream": 7, 
"correlation": 161161273, "bytes": 1536, "memory bandwidth (GB/s)": 0.015335310150657441 + } + }, + { + "ph": "f", "id": 161161273, "pid": 0, "tid": 7, "ts": 6300866092361.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866051725.766, "dur": 7.320, + "args": { + "External id": 88379, "cbid": 51, "correlation": 161161273 + } + }, + { + "ph": "s", "id": 161161273, "pid": 5714, "tid": 6744, "ts": 6300866051725.766, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866092539.247, "dur": 769.641, + "args": { + "External id": 88379, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161274, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161274, "pid": 0, "tid": 7, "ts": 6300866092539.247, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051733.336, "dur": 6.430, + "args": { + "External id": 88379, "cbid": 307, "correlation": 161161274 + } + }, + { + "ph": "s", "id": 161161274, "pid": 5714, "tid": 6744, "ts": 6300866051733.336, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866051770.736, "dur": 0.330, + "args": { + "External id": 88380, "cbid": 200, "correlation": 161161299 + } + }, + { + "ph": "f", "id": 161161299, "pid": 5714, "tid": 6744, "ts": 6300866051770.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866093310.168, "dur": 1.600, + "args": { + "External id": 88380, "device": 0, "context": 1, "stream": 7, "correlation": 161161302, "bytes": 1536, "memory bandwidth (GB/s)": 0.96 + } + }, + { + "ph": "f", "id": 161161302, "pid": 0, "tid": 7, "ts": 6300866093310.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866051772.206, "dur": 5.190, + "args": { + "External id": 88380, "cbid": 51, "correlation": 161161302 + } + }, + { + "ph": "s", "id": 161161302, "pid": 5714, "tid": 6744, "ts": 6300866051772.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866093312.952, "dur": 353.540, + "args": { + "External id": 88380, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161303, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161303, "pid": 0, "tid": 7, "ts": 6300866093312.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051777.556, "dur": 5.540, + "args": { + "External id": 88380, "cbid": 307, "correlation": 161161303 + } + }, + { + "ph": "s", "id": 161161303, "pid": 5714, "tid": 6744, "ts": 6300866051777.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866051808.786, "dur": 0.330, + "args": { + "External id": 88381, "cbid": 200, "correlation": 161161328 + } + }, + { + "ph": "f", "id": 161161328, "pid": 5714, "tid": 6744, "ts": 6300866051808.786, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866093667.196, "dur": 360.901, + "args": { + "External id": 88381, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161331, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161331, "pid": 0, "tid": 7, "ts": 6300866093667.196, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051810.286, "dur": 5.650, + "args": { + "External id": 88381, "cbid": 307, "correlation": 161161331 + } + }, + { + "ph": "s", "id": 161161331, "pid": 5714, "tid": 6744, "ts": 6300866051810.286, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866051840.296, "dur": 0.230, + "args": { + "External id": 88382, "cbid": 200, "correlation": 161161356 + } + }, + { + "ph": "f", "id": 161161356, "pid": 5714, "tid": 6744, "ts": 6300866051840.296, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866094029.665, "dur": 1.280, + "args": { + "External id": 88382, "device": 0, "context": 1, "stream": 7, "correlation": 161161359, "bytes": 1536, "memory bandwidth (GB/s)": 1.2 + } + }, + { + "ph": "f", "id": 161161359, "pid": 0, "tid": 7, "ts": 6300866094029.665, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866051841.546, "dur": 4.750, + "args": { + "External id": 88382, "cbid": 51, "correlation": 161161359 + } + }, + { + "ph": "s", "id": 161161359, "pid": 5714, "tid": 6744, "ts": 6300866051841.546, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866094032.161, "dur": 353.604, + "args": { + "External id": 88382, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161360, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161360, "pid": 0, "tid": 7, "ts": 6300866094032.161, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051846.466, "dur": 5.210, + "args": { + "External id": 88382, "cbid": 307, "correlation": 161161360 + } + }, + { + "ph": "s", "id": 161161360, "pid": 5714, "tid": 6744, "ts": 6300866051846.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866051876.096, "dur": 0.300, + "args": { + "External id": 88383, "cbid": 200, "correlation": 161161385 + } + }, + { + "ph": "f", "id": 161161385, "pid": 5714, "tid": 6744, "ts": 6300866051876.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866094386.405, "dur": 361.060, + "args": { + "External id": 88383, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161388, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161388, "pid": 0, "tid": 7, "ts": 6300866094386.405, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051877.476, "dur": 5.570, + "args": { + "External id": 88383, "cbid": 307, "correlation": 161161388 + } + }, + { + "ph": "s", "id": 161161388, "pid": 5714, "tid": 6744, "ts": 6300866051877.476, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866094748.137, "dur": 89.857, + "args": { + "External id": 88384, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161401, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161401, "pid": 0, "tid": 7, "ts": 6300866094748.137, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051923.846, "dur": 6.020, + "args": { + "External id": 88384, "cbid": 307, "correlation": 161161401 + } + }, + { + "ph": "s", "id": 161161401, "pid": 5714, "tid": 6744, "ts": 6300866051923.846, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866094838.602, "dur": 4.000, + "args": { + "External id": 88385, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161409, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161161409, "pid": 0, "tid": 7, "ts": 6300866094838.602, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051958.206, "dur": 5.609, + "args": { + "External id": 88385, "cbid": 307, "correlation": 161161409 + } + }, + { + "ph": "s", "id": 161161409, "pid": 5714, "tid": 6744, "ts": 6300866051958.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866094843.242, "dur": 115.266, + "args": { + "External id": 88386, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161417, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161417, "pid": 0, "tid": 7, "ts": 6300866094843.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866051991.126, "dur": 5.509, + "args": { + "External id": 88386, "cbid": 307, "correlation": 161161417 + } + }, + { + "ph": "s", "id": 161161417, "pid": 5714, "tid": 6744, "ts": 6300866051991.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866052184.095, "dur": 0.570, + "args": { + "External id": 88405, "cbid": 200, "correlation": 161161463 + } + }, + { + "ph": "f", "id": 161161463, "pid": 5714, "tid": 6744, "ts": 6300866052184.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866094959.820, "dur": 0.768, + "args": { + "External id": 88405, "device": 0, "context": 1, "stream": 7, 
"correlation": 161161466, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 161161466, "pid": 0, "tid": 7, "ts": 6300866094959.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866052186.455, "dur": 7.880, + "args": { + "External id": 88405, "cbid": 51, "correlation": 161161466 + } + }, + { + "ph": "s", "id": 161161466, "pid": 5714, "tid": 6744, "ts": 6300866052186.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866094961.772, "dur": 142.945, + "args": { + "External id": 88405, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161467, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161467, "pid": 0, "tid": 7, "ts": 6300866094961.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052194.605, "dur": 8.970, + "args": { + "External id": 88405, "cbid": 307, "correlation": 161161467 + } + }, + { + "ph": "s", "id": 161161467, "pid": 5714, "tid": 6744, "ts": 6300866052194.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866095105.357, "dur": 141.634, + "args": { + "External id": 88406, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161489, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161489, "pid": 0, "tid": 7, "ts": 6300866095105.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052233.295, "dur": 6.040, + "args": { + "External id": 88406, "cbid": 211, "correlation": 161161489 + } + }, + { + "ph": "s", "id": 161161489, "pid": 5714, "tid": 6744, "ts": 6300866052233.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866052327.735, "dur": 0.510, + "args": { + "External id": 88407, "cbid": 200, "correlation": 161161507 + } + }, + { + "ph": "f", "id": 161161507, "pid": 5714, "tid": 6744, "ts": 6300866052327.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866052328.375, "dur": 0.230, + "args": { + "External id": 88407, "cbid": 200, "correlation": 161161508 + } + }, + { + "ph": "f", "id": 161161508, "pid": 5714, "tid": 6744, "ts": 6300866052328.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866052349.555, "dur": 0.260, + "args": { + "External id": 88407, "cbid": 200, "correlation": 161161526 + } + }, + { + "ph": "f", "id": 161161526, "pid": 5714, "tid": 6744, "ts": 6300866052349.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866095247.695, "dur": 201.283, + "args": { + "External id": 88407, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161527, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161527, "pid": 0, "tid": 7, "ts": 6300866095247.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052351.165, "dur": 10.400, + "args": { + "External id": 88407, "cbid": 211, "correlation": 161161527 + } + }, + { + "ph": "s", "id": 161161527, "pid": 5714, "tid": 6744, "ts": 6300866052351.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866052362.435, "dur": 1.000, + "args": { + "External id": 88407, "cbid": 273, "correlation": 161161529 + } + }, + { + "ph": "f", "id": 161161529, "pid": 5714, "tid": 6744, "ts": 6300866052362.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866095464.977, "dur": 1565.555, + "args": { + "External id": 88407, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161530, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161161530, "pid": 0, "tid": 7, "ts": 6300866095464.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052363.815, "dur": 4.530, + "args": { + "External id": 88407, "cbid": 211, "correlation": 161161530 + } + }, + { + "ph": "s", "id": 161161530, "pid": 5714, "tid": 6744, "ts": 6300866052363.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866097031.204, "dur": 208.962, + "args": { + "External id": 88407, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161532, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161161532, "pid": 0, "tid": 7, "ts": 6300866097031.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052368.935, "dur": 4.030, + "args": { + "External id": 88407, "cbid": 211, "correlation": 161161532 + } + }, + { + "ph": "s", "id": 161161532, "pid": 5714, "tid": 6744, "ts": 6300866052368.935, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866097240.806, "dur": 205.347, + "args": { + "External id": 88418, "device": 0, "context": 1, "stream": 7, "correlation": 161161554, "bytes": 25165824, "memory bandwidth (GB/s)": 122.55267425382401 + } + }, + { + "ph": "f", "id": 161161554, "pid": 0, "tid": 7, "ts": 6300866097240.806, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866052515.504, "dur": 18.990, + "args": { + "External id": 88418, "cbid": 41, "correlation": 161161554 + } + }, + { + "ph": "s", "id": 161161554, "pid": 5714, "tid": 6744, "ts": 6300866052515.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866097446.761, "dur": 162.466, + "args": { + "External id": 88415, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161572, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161572, "pid": 0, "tid": 7, "ts": 6300866097446.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052640.694, "dur": 8.970, + "args": { + "External id": 88415, "cbid": 307, "correlation": 161161572 + } + }, + { + "ph": "s", "id": 161161572, "pid": 5714, "tid": 6744, "ts": 6300866052640.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866097633.675, "dur": 69.089, + "args": { + "External id": 88425, "device": 0, "context": 1, "stream": 7, "correlation": 161161587, "bytes": 25165824, "memory bandwidth (GB/s)": 364.25225433860675 + } + }, + { + "ph": "f", "id": 161161587, "pid": 0, "tid": 7, "ts": 6300866097633.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866052713.914, "dur": 15.080, + "args": { + "External id": 88425, "cbid": 41, "correlation": 161161587 + } + }, + { + "ph": "s", "id": 161161587, "pid": 5714, "tid": 6744, "ts": 6300866052713.914, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866097703.372, "dur": 42.208, + "args": { + "External id": 88422, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161605, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161605, "pid": 0, "tid": 7, "ts": 6300866097703.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052823.333, "dur": 8.140, + "args": { + "External id": 88422, "cbid": 307, "correlation": 161161605 + } + }, + { + "ph": "s", "id": 161161605, "pid": 5714, "tid": 6744, "ts": 6300866052823.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866052958.233, "dur": 0.580, + "args": { + "External id": 88430, "cbid": 200, "correlation": 161161635 + } + }, + { + "ph": "f", "id": 161161635, "pid": 5714, "tid": 6744, "ts": 6300866052958.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866097753.036, "dur": 7.872, + "args": { + "External id": 88430, "device": 0, "context": 1, "stream": 7, "correlation": 161161638, "bytes": 576, "memory bandwidth (GB/s)": 0.07317073170731707 + } + }, + { + "ph": "f", "id": 161161638, "pid": 0, "tid": 7, "ts": 6300866097753.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866052960.643, "dur": 8.260, + "args": { + "External id": 88430, "cbid": 51, "correlation": 161161638 + } + }, + { + "ph": "s", "id": 161161638, "pid": 5714, "tid": 6744, "ts": 6300866052960.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866097768.365, "dur": 161.634, + "args": { + "External id": 88430, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161639, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161639, "pid": 0, "tid": 7, "ts": 6300866097768.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866052969.153, "dur": 8.370, + "args": { + "External id": 88430, "cbid": 307, "correlation": 161161639 + } + }, + { + "ph": "s", "id": 161161639, "pid": 5714, "tid": 6744, "ts": 6300866052969.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866053005.793, "dur": 0.320, + "args": { + "External id": 88431, "cbid": 200, "correlation": 161161664 + } + }, + { + "ph": "f", "id": 161161664, "pid": 5714, "tid": 6744, "ts": 6300866053005.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866097937.711, "dur": 7.200, + "args": { + "External id": 88431, "device": 0, "context": 1, "stream": 7, "correlation": 161161667, "bytes": 576, "memory bandwidth (GB/s)": 0.08 + } + }, + { + "ph": "f", "id": 161161667, "pid": 0, "tid": 7, "ts": 6300866097937.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866053007.253, "dur": 4.570, + "args": { + "External id": 88431, "cbid": 51, "correlation": 161161667 + } + }, + { + "ph": "s", "id": 161161667, "pid": 5714, "tid": 6744, "ts": 6300866053007.253, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866097952.399, "dur": 139.425, + "args": { + "External id": 88431, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161668, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161668, "pid": 0, "tid": 7, "ts": 6300866097952.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053011.973, "dur": 5.220, + "args": { + "External id": 88431, "cbid": 307, "correlation": 161161668 + } + }, + { + "ph": "s", "id": 161161668, "pid": 5714, "tid": 6744, "ts": 6300866053011.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866053041.773, "dur": 0.310, + "args": { + "External id": 88432, "cbid": 200, "correlation": 161161693 + } + }, + { + "ph": "f", "id": 161161693, "pid": 5714, "tid": 6744, "ts": 6300866053041.773, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866098093.040, "dur": 1.664, + "args": { + "External id": 88432, "device": 0, "context": 1, "stream": 7, "correlation": 161161696, "bytes": 576, "memory bandwidth (GB/s)": 0.34615384615384615 + } + }, + { + "ph": "f", "id": 161161696, "pid": 0, "tid": 7, "ts": 6300866098093.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866053043.233, "dur": 4.320, + "args": { + "External id": 88432, "cbid": 51, "correlation": 161161696 
+ } + }, + { + "ph": "s", "id": 161161696, "pid": 5714, "tid": 6744, "ts": 6300866053043.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866098095.920, "dur": 221.955, + "args": { + "External id": 88432, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161697, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161697, "pid": 0, "tid": 7, "ts": 6300866098095.920, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053047.703, "dur": 5.050, + "args": { + "External id": 88432, "cbid": 307, "correlation": 161161697 + } + }, + { + "ph": "s", "id": 161161697, "pid": 5714, "tid": 6744, "ts": 6300866053047.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866098318.579, "dur": 446.341, + "args": { + "External id": 88433, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161719, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161719, "pid": 0, "tid": 7, "ts": 6300866098318.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053078.793, "dur": 6.190, + "args": { + "External id": 88433, "cbid": 211, "correlation": 161161719 + } + }, + { + "ph": "s", "id": 161161719, "pid": 5714, "tid": 6744, "ts": 6300866053078.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866098765.528, "dur": 141.250, + "args": { + "External id": 88434, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161742, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161742, "pid": 0, "tid": 7, "ts": 6300866098765.528, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053105.773, "dur": 5.380, + "args": { + "External id": 88434, "cbid": 211, "correlation": 161161742 + } + }, + { + "ph": "s", "id": 161161742, "pid": 5714, "tid": 6744, "ts": 6300866053105.773, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866098907.450, "dur": 141.794, + "args": { + "External id": 88435, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161765, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161161765, "pid": 0, "tid": 7, "ts": 6300866098907.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053129.723, "dur": 4.770, + "args": { + "External id": 88435, "cbid": 211, "correlation": 161161765 + } + }, + { + "ph": "s", "id": 161161765, "pid": 5714, "tid": 6744, "ts": 6300866053129.723, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866099049.980, "dur": 82.241, + "args": { + "External id": 88436, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161773, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161773, "pid": 0, "tid": 7, "ts": 6300866099049.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053170.523, "dur": 5.720, + "args": { + "External id": 88436, "cbid": 307, "correlation": 161161773 + } + }, + { + "ph": "s", "id": 161161773, "pid": 5714, "tid": 6744, "ts": 6300866053170.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866099132.861, "dur": 45.568, + "args": { + "External id": 88451, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161802, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161802, "pid": 0, "tid": 7, "ts": 6300866099132.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053363.592, "dur": 10.330, + "args": { + "External id": 88451, "cbid": 307, "correlation": 161161802 + } + }, + { + "ph": "s", "id": 161161802, "pid": 5714, "tid": 6744, "ts": 6300866053363.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866099179.037, "dur": 3.424, + "args": { + "External id": 88452, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161810, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161161810, "pid": 0, "tid": 7, "ts": 6300866099179.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053401.522, "dur": 5.850, + "args": { + "External id": 88452, "cbid": 307, "correlation": 161161810 + } + }, + { + "ph": "s", "id": 161161810, "pid": 5714, "tid": 6744, "ts": 6300866053401.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866099183.133, "dur": 49.473, + "args": { + "External id": 88453, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161821, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161821, "pid": 0, "tid": 7, "ts": 6300866099183.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053436.722, "dur": 5.420, + "args": { + "External id": 88453, "cbid": 307, "correlation": 161161821 + } + }, + { + "ph": "s", "id": 161161821, "pid": 5714, "tid": 6744, "ts": 6300866053436.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866099233.310, "dur": 48.448, + "args": { + "External id": 88454, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161826, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161826, "pid": 0, "tid": 7, "ts": 6300866099233.310, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053481.542, "dur": 7.560, + "args": { + "External id": 88454, "cbid": 211, "correlation": 161161826 + } + }, + { + "ph": "s", "id": 161161826, "pid": 5714, "tid": 6744, "ts": 6300866053481.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866053665.772, "dur": 2.870, + "args": { + "External id": 88460, "cbid": 147, "correlation": 161161843 + } + }, + { + "ph": "s", "id": 161161843, "pid": 5714, "tid": 6744, "ts": 6300866053665.772, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866053783.082, "dur": 2.620, + "args": { + "External id": 88468, "cbid": 138, "correlation": 161161858 + } + }, + { + "ph": "f", "id": 161161858, "pid": 5714, "tid": 6744, "ts": 6300866053783.082, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866099285.407, "dur": 6.144, + "args": { + "External id": 88472, "device": 0, "context": 1, "stream": 7, "correlation": 161161869, "bytes": 28112, "memory bandwidth (GB/s)": 4.575520833333333 + } + }, + { + "ph": "f", "id": 161161869, "pid": 0, "tid": 7, "ts": 6300866099285.407, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866053808.391, "dur": 12.570, + "args": { + "External id": 88472, "cbid": 41, "correlation": 161161869 + } + }, + { + "ph": "s", "id": 161161869, "pid": 5714, "tid": 6744, "ts": 6300866053808.391, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866053825.461, "dur": 2.220, + "args": { + "External id": 88467, "cbid": 135, "correlation": 161161873 + } + }, + { + "ph": "f", "id": 161161873, "pid": 5714, "tid": 6744, "ts": 6300866053825.461, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866099294.239, "dur": 38.272, + "args": { + "External id": 88467, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161161877, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161161877, "pid": 0, "tid": 7, "ts": 6300866099294.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866053830.981, "dur": 10.950, + "args": { + "External id": 88467, "cbid": 211, "correlation": 161161877 + } + }, + { + "ph": "s", "id": 161161877, "pid": 5714, "tid": 6744, "ts": 6300866053830.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866053885.751, "dur": 1.110, + "args": { + "External id": 88460, "cbid": 135, "correlation": 161161888 + } + }, + { + "ph": "f", "id": 161161888, "pid": 5714, "tid": 6744, "ts": 6300866053885.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866053889.061, "dur": 1.610, + "args": { + "External id": 88460, "cbid": 147, "correlation": 161161892 + } + }, + { + "ph": "s", "id": 161161892, "pid": 5714, "tid": 6744, "ts": 6300866053889.061, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866053963.981, "dur": 1.130, + "args": { + "External id": 88476, "cbid": 317, "correlation": 161161912 + } + }, + { + "ph": "f", "id": 161161912, "pid": 5714, "tid": 6744, "ts": 6300866053963.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866053967.161, "dur": 1.420, + "args": { + "External id": 88476, "cbid": 135, "correlation": 161161914 + } + }, + { + "ph": "f", "id": 161161914, "pid": 5714, "tid": 6744, "ts": 6300866053967.161, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866053970.091, "dur": 0.980, + "args": { + "External id": 88476, "cbid": 147, "correlation": 161161918 + } + }, + { + "ph": "s", "id": 161161918, "pid": 5714, "tid": 6744, "ts": 6300866053970.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866053986.291, "dur": 0.800, + "args": { + "External id": 88476, "cbid": 409, "correlation": 161161921 + } + }, + { + "ph": "f", "id": 161161921, "pid": 5714, "tid": 6744, "ts": 6300866053986.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866053991.401, "dur": 0.830, + "args": { + "External id": 88476, "cbid": 135, "correlation": 161161924 + } + }, + { + "ph": "f", "id": 161161924, "pid": 5714, "tid": 6744, "ts": 6300866053991.401, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866053992.411, "dur": 1.070, + "args": { + "External id": 88476, "cbid": 147, "correlation": 
161161925 + } + }, + { + "ph": "s", "id": 161161925, "pid": 5714, "tid": 6744, "ts": 6300866053992.411, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866103088.107, "dur": 8676.518, + "args": { + "External id": 88476, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161161927, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161161927, "pid": 0, "tid": 20, "ts": 6300866103088.107, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866053994.671, "dur": 11.340, + "args": { + "External id": 88476, "cbid": 430, "correlation": 161161927 + } + }, + { + "ph": "s", "id": 161161927, "pid": 5714, "tid": 6744, "ts": 6300866053994.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054007.121, "dur": 0.430, + "args": { + "External id": 88476, "cbid": 135, "correlation": 161161929 + } + }, + { + "ph": "f", "id": 161161929, "pid": 5714, "tid": 6744, "ts": 6300866054007.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866054007.681, "dur": 0.550, + "args": { + "External id": 88476, "cbid": 147, "correlation": 161161930 + } + }, + { + "ph": "s", "id": 
161161930, "pid": 5714, "tid": 6744, "ts": 6300866054007.681, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054009.741, "dur": 0.960, + "args": { + "External id": 88476, "cbid": 135, "correlation": 161161933 + } + }, + { + "ph": "f", "id": 161161933, "pid": 5714, "tid": 6744, "ts": 6300866054009.741, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054018.881, "dur": 0.480, + "args": { + "External id": 88476, "cbid": 135, "correlation": 161161940 + } + }, + { + "ph": "f", "id": 161161940, "pid": 5714, "tid": 6744, "ts": 6300866054018.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866054046.971, "dur": 1.010, + "args": { + "External id": 88478, "cbid": 147, "correlation": 161161945 + } + }, + { + "ph": "s", "id": 161161945, "pid": 5714, "tid": 6744, "ts": 6300866054046.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054065.251, "dur": 0.980, + "args": { + "External id": 88460, "cbid": 135, "correlation": 161161960 + } + }, + { + "ph": "f", "id": 161161960, "pid": 5714, "tid": 6744, "ts": 6300866054065.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054263.570, "dur": 1.980, + "args": { + "External id": 88460, "cbid": 135, "correlation": 161161973 + } + }, + { + "ph": "f", "id": 161161973, "pid": 5714, "tid": 6744, "ts": 6300866054263.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866054387.360, "dur": 3.510, + "args": { + 
"External id": 88488, "cbid": 147, "correlation": 161161984 + } + }, + { + "ph": "s", "id": 161161984, "pid": 5714, "tid": 6744, "ts": 6300866054387.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866054506.410, "dur": 1.240, + "args": { + "External id": 88502, "cbid": 317, "correlation": 161162025 + } + }, + { + "ph": "f", "id": 161162025, "pid": 5714, "tid": 6744, "ts": 6300866054506.410, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866054515.630, "dur": 2.280, + "args": { + "External id": 88503, "cbid": 138, "correlation": 161162028 + } + }, + { + "ph": "f", "id": 161162028, "pid": 5714, "tid": 6744, "ts": 6300866054515.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866103090.891, "dur": 2.528, + "args": { + "External id": 88507, "device": 0, "context": 1, "stream": 7, "correlation": 161162039, "bytes": 7224, "memory bandwidth (GB/s)": 2.857594936708861 + } + }, + { + "ph": "f", "id": 161162039, "pid": 0, "tid": 7, "ts": 6300866103090.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866054538.870, "dur": 12.710, + "args": { + "External id": 88507, "cbid": 41, "correlation": 161162039 + } + }, + { + "ph": "s", "id": 161162039, "pid": 5714, "tid": 6744, "ts": 6300866054538.870, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054556.470, "dur": 2.010, + "args": { + "External id": 88502, "cbid": 135, "correlation": 161162043 + } + }, + { + "ph": "f", "id": 161162043, "pid": 5714, "tid": 6744, "ts": 6300866054556.470, + "cat": "ac2g", 
"name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866103095.947, "dur": 12.160, + "args": { + "External id": 88502, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162047, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162047, "pid": 0, "tid": 7, "ts": 6300866103095.947, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866054561.100, "dur": 11.870, + "args": { + "External id": 88502, "cbid": 211, "correlation": 161162047 + } + }, + { + "ph": "s", "id": 161162047, "pid": 5714, "tid": 6744, "ts": 6300866054561.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054669.200, "dur": 1.369, + "args": { + "External id": 88488, "cbid": 135, "correlation": 161162058 + } + }, + { + "ph": "f", "id": 161162058, "pid": 5714, "tid": 6744, "ts": 6300866054669.200, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866054674.060, "dur": 1.279, + "args": { + "External id": 88488, "cbid": 147, "correlation": 161162062 + } + }, + { + "ph": "s", "id": 161162062, "pid": 5714, "tid": 6744, "ts": 6300866054674.060, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866054677.259, "dur": 0.850, + "args": { + "External id": 88488, "cbid": 147, "correlation": 161162066 + } + }, + { + "ph": "s", "id": 161162066, "pid": 5714, "tid": 
6744, "ts": 6300866054677.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866103141.484, "dur": 325.507, + "args": { + "External id": 88521, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161162090, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161162090, "pid": 0, "tid": 17, "ts": 6300866103141.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866054831.659, "dur": 12.170, + "args": { + "External id": 88521, "cbid": 211, "correlation": 161162090 + } + }, + { + "ph": "s", "id": 161162090, "pid": 5714, "tid": 6744, "ts": 6300866054831.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866103641.746, "dur": 513.030, + "args": { + "External id": 88537, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161162103, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161162103, "pid": 0, "tid": 17, "ts": 6300866103641.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866054949.999, "dur": 10.030, + "args": { + "External id": 88537, "cbid": 211, "correlation": 161162103 + } + }, + { + "ph": "s", "id": 161162103, "pid": 5714, "tid": 6744, "ts": 6300866054949.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866054985.469, "dur": 1.430, + "args": { + "External id": 88488, "cbid": 135, "correlation": 161162113 + } + }, + { + "ph": "f", "id": 161162113, "pid": 5714, "tid": 6744, "ts": 6300866054985.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866054988.869, "dur": 1.340, + "args": { + "External id": 88488, "cbid": 147, "correlation": 161162117 + } + }, + { + "ph": "s", "id": 161162117, "pid": 5714, "tid": 6744, "ts": 6300866054988.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866055044.608, "dur": 0.960, + "args": { + "External id": 88539, "cbid": 317, "correlation": 161162130 + } + }, + { + "ph": "f", "id": 161162130, "pid": 5714, "tid": 6744, "ts": 6300866055044.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866055047.548, "dur": 1.180, + "args": { + "External id": 88539, "cbid": 135, "correlation": 161162132 + } + }, + { + "ph": "f", "id": 161162132, "pid": 5714, "tid": 6744, "ts": 6300866055047.548, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866055050.219, "dur": 1.320, + "args": { + "External id": 88539, "cbid": 147, "correlation": 161162136 + } + }, + { + "ph": "s", "id": 161162136, "pid": 5714, "tid": 6744, "ts": 6300866055050.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866055066.848, "dur": 0.791, + "args": { + "External id": 88539, "cbid": 409, "correlation": 161162139 + } + }, + { + "ph": "f", "id": 161162139, "pid": 5714, "tid": 6744, "ts": 6300866055066.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866055071.948, "dur": 0.900, + "args": { + "External id": 88539, "cbid": 135, "correlation": 161162142 + } + }, + { + "ph": "f", "id": 161162142, "pid": 5714, "tid": 6744, "ts": 6300866055071.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866055073.039, "dur": 1.040, + "args": { + "External id": 88539, "cbid": 147, "correlation": 161162143 + } + }, + { + "ph": "s", "id": 161162143, "pid": 5714, "tid": 6744, "ts": 6300866055073.039, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866111815.314, "dur": 5064.091, + "args": { + "External id": 88539, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161162145, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161162145, "pid": 0, "tid": 20, "ts": 6300866111815.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866055075.259, "dur": 10.089, + "args": { + "External id": 88539, "cbid": 430, "correlation": 161162145 + } + }, + { + "ph": "s", "id": 161162145, "pid": 5714, "tid": 6744, "ts": 6300866055075.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866055086.408, "dur": 0.451, + "args": { + "External id": 88539, "cbid": 135, "correlation": 161162147 + } + }, + { + "ph": "f", "id": 161162147, "pid": 5714, "tid": 6744, "ts": 6300866055086.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866055086.968, "dur": 0.520, + "args": { + "External id": 88539, "cbid": 147, "correlation": 161162148 + } + }, + { + "ph": "s", "id": 161162148, "pid": 5714, "tid": 6744, "ts": 6300866055086.968, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866055089.039, "dur": 0.780, + "args": { + "External id": 88539, "cbid": 135, "correlation": 161162151 + } + }, + { + "ph": "f", "id": 161162151, "pid": 5714, "tid": 6744, "ts": 6300866055089.039, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866055098.039, "dur": 0.469, + "args": { + "External id": 
88539, "cbid": 135, "correlation": 161162158 + } + }, + { + "ph": "f", "id": 161162158, "pid": 5714, "tid": 6744, "ts": 6300866055098.039, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866055126.268, "dur": 1.050, + "args": { + "External id": 88541, "cbid": 147, "correlation": 161162163 + } + }, + { + "ph": "s", "id": 161162163, "pid": 5714, "tid": 6744, "ts": 6300866055126.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866055144.558, "dur": 1.050, + "args": { + "External id": 88488, "cbid": 135, "correlation": 161162178 + } + }, + { + "ph": "f", "id": 161162178, "pid": 5714, "tid": 6744, "ts": 6300866055144.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866103109.003, "dur": 2431.005, + "args": { + "External id": 88543, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162203, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162203, "pid": 0, "tid": 7, "ts": 6300866103109.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055306.648, "dur": 12.420, + "args": { + "External id": 88543, "cbid": 211, "correlation": 161162203 + } + }, + { + "ph": "s", "id": 161162203, "pid": 5714, "tid": 6744, "ts": 6300866055306.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866105559.272, "dur": 633.031, + "args": { + "External id": 88544, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162226, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161162226, "pid": 0, "tid": 7, "ts": 6300866105559.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055375.198, "dur": 6.750, + "args": { + "External id": 88544, "cbid": 307, "correlation": 161162226 + } + }, + { + "ph": "s", "id": 161162226, "pid": 5714, "tid": 6744, "ts": 6300866055375.198, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866055420.038, "dur": 0.580, + "args": { + "External id": 88545, "cbid": 200, "correlation": 161162249 + } + }, + { + "ph": "f", "id": 161162249, "pid": 5714, "tid": 6744, "ts": 6300866055420.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866106249.776, "dur": 76.065, + "args": { + "External id": 88545, "device": 0, "context": 1, "stream": 7, 
"correlation": 161162252, "bytes": 1536, "memory bandwidth (GB/s)": 0.02019325576809308 + } + }, + { + "ph": "f", "id": 161162252, "pid": 0, "tid": 7, "ts": 6300866106249.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866055422.448, "dur": 7.260, + "args": { + "External id": 88545, "cbid": 51, "correlation": 161162252 + } + }, + { + "ph": "s", "id": 161162252, "pid": 5714, "tid": 6744, "ts": 6300866055422.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866106377.170, "dur": 630.343, + "args": { + "External id": 88545, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162253, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162253, "pid": 0, "tid": 7, "ts": 6300866106377.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055429.948, "dur": 6.170, + "args": { + "External id": 88545, "cbid": 307, "correlation": 161162253 + } + }, + { + "ph": "s", "id": 161162253, "pid": 5714, "tid": 6744, "ts": 6300866055429.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866055466.627, "dur": 0.331, + "args": { + "External id": 88546, "cbid": 200, "correlation": 161162278 + } + }, + { + "ph": "f", "id": 161162278, "pid": 5714, "tid": 6744, "ts": 6300866055466.627, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866107008.857, "dur": 1.216, + "args": { + "External id": 88546, "device": 0, "context": 1, "stream": 7, "correlation": 161162281, "bytes": 1536, "memory bandwidth (GB/s)": 1.263157894736842 + } + }, + { + "ph": "f", "id": 161162281, "pid": 0, "tid": 7, "ts": 6300866107008.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866055468.087, "dur": 5.431, + "args": { + "External id": 88546, "cbid": 51, "correlation": 161162281 + } + }, + { + "ph": "s", "id": 161162281, "pid": 5714, "tid": 6744, "ts": 6300866055468.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866107012.057, "dur": 354.500, + "args": { + "External id": 88546, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162282, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162282, "pid": 0, "tid": 7, "ts": 6300866107012.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055473.667, "dur": 5.331, + "args": { + "External id": 88546, "cbid": 307, "correlation": 161162282 + } + }, + { + "ph": "s", "id": 161162282, "pid": 5714, "tid": 6744, "ts": 6300866055473.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866055503.878, "dur": 0.300, + "args": { + "External id": 88547, "cbid": 200, "correlation": 161162307 + } + }, + { + "ph": "f", "id": 161162307, "pid": 5714, "tid": 6744, "ts": 6300866055503.878, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866107367.229, "dur": 358.661, + "args": { + "External id": 88547, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162310, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162310, "pid": 0, "tid": 7, "ts": 6300866107367.229, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055505.498, "dur": 5.749, + "args": { + "External id": 88547, "cbid": 307, "correlation": 161162310 + } + }, + { + "ph": "s", "id": 161162310, "pid": 5714, "tid": 6744, "ts": 6300866055505.498, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866055534.878, "dur": 0.269, + "args": { + "External id": 88548, "cbid": 200, "correlation": 161162335 + } + }, + { + "ph": "f", "id": 161162335, "pid": 5714, "tid": 6744, "ts": 6300866055534.878, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866107727.138, "dur": 1.536, + "args": { + "External id": 88548, "device": 0, "context": 1, "stream": 7, "correlation": 161162338, "bytes": 1536, "memory bandwidth (GB/s)": 1 + } + }, + { + "ph": "f", "id": 161162338, "pid": 0, "tid": 7, "ts": 6300866107727.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866055536.258, "dur": 4.780, + "args": { + "External id": 88548, "cbid": 51, "correlation": 161162338 + } + }, + { + "ph": "s", "id": 161162338, "pid": 5714, "tid": 6744, "ts": 6300866055536.258, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866107729.858, "dur": 358.244, + "args": { + "External id": 88548, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162339, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162339, "pid": 0, "tid": 7, "ts": 6300866107729.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055541.187, "dur": 5.240, + "args": { + "External id": 88548, "cbid": 307, "correlation": 161162339 + } + }, + { + "ph": "s", "id": 161162339, "pid": 5714, "tid": 6744, "ts": 6300866055541.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866055570.637, "dur": 0.340, + "args": { + "External id": 88549, "cbid": 200, "correlation": 161162364 + } + }, + { + "ph": "f", "id": 161162364, "pid": 5714, "tid": 6744, "ts": 6300866055570.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866108088.806, "dur": 358.788, + "args": { + "External id": 88549, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162367, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162367, "pid": 0, "tid": 7, "ts": 6300866108088.806, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055572.117, "dur": 5.480, + "args": { + "External id": 88549, "cbid": 307, "correlation": 161162367 + } + }, + { + "ph": "s", "id": 161162367, "pid": 5714, "tid": 6744, "ts": 6300866055572.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866108448.298, "dur": 90.817, + "args": { + "External id": 88550, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162380, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162380, "pid": 0, "tid": 7, "ts": 6300866108448.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055617.677, "dur": 6.590, + "args": { + "External id": 88550, "cbid": 307, "correlation": 161162380 + } + }, + { + "ph": "s", "id": 161162380, "pid": 5714, "tid": 6744, "ts": 6300866055617.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866108539.723, "dur": 4.096, + "args": { + "External id": 88551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162388, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161162388, "pid": 0, "tid": 7, "ts": 6300866108539.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055650.947, "dur": 5.520, + "args": { + "External id": 88551, "cbid": 307, "correlation": 161162388 + } + }, + { + "ph": "s", "id": 161162388, "pid": 5714, "tid": 6744, "ts": 6300866055650.947, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866108544.523, "dur": 113.921, + "args": { + "External id": 88552, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162396, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162396, "pid": 0, "tid": 7, "ts": 6300866108544.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055683.657, "dur": 5.280, + "args": { + "External id": 88552, "cbid": 307, "correlation": 161162396 + } + }, + { + "ph": "s", "id": 161162396, "pid": 5714, "tid": 6744, "ts": 6300866055683.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866055876.837, "dur": 0.560, + "args": { + "External id": 88571, "cbid": 200, "correlation": 161162442 + } + }, + { + "ph": "f", "id": 161162442, "pid": 5714, "tid": 6744, "ts": 6300866055876.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866108659.756, "dur": 1.184, + "args": { + "External id": 88571, "device": 0, "context": 1, "stream": 7, 
"correlation": 161162445, "bytes": 576, "memory bandwidth (GB/s)": 0.4864864864864865 + } + }, + { + "ph": "f", "id": 161162445, "pid": 0, "tid": 7, "ts": 6300866108659.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866055879.167, "dur": 7.950, + "args": { + "External id": 88571, "cbid": 51, "correlation": 161162445 + } + }, + { + "ph": "s", "id": 161162445, "pid": 5714, "tid": 6744, "ts": 6300866055879.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866108662.572, "dur": 143.202, + "args": { + "External id": 88571, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162446, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162446, "pid": 0, "tid": 7, "ts": 6300866108662.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055887.367, "dur": 8.780, + "args": { + "External id": 88571, "cbid": 307, "correlation": 161162446 + } + }, + { + "ph": "s", "id": 161162446, "pid": 5714, "tid": 6744, "ts": 6300866055887.367, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866108806.446, "dur": 141.282, + "args": { + "External id": 88572, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162468, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162468, "pid": 0, "tid": 7, "ts": 6300866108806.446, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866055926.437, "dur": 6.349, + "args": { + "External id": 88572, "cbid": 211, "correlation": 161162468 + } + }, + { + "ph": "s", "id": 161162468, "pid": 5714, "tid": 6744, "ts": 6300866055926.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056010.136, "dur": 0.510, + "args": { + "External id": 88573, "cbid": 200, "correlation": 161162486 + } + }, + { + "ph": "f", "id": 161162486, "pid": 5714, "tid": 6744, "ts": 6300866056010.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056010.826, "dur": 0.230, + "args": { + "External id": 88573, "cbid": 200, "correlation": 161162487 + } + }, + { + "ph": "f", "id": 161162487, "pid": 5714, "tid": 6744, "ts": 6300866056010.826, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056032.066, "dur": 0.260, + "args": { + "External id": 88573, "cbid": 200, "correlation": 161162505 + } + }, + { + "ph": "f", "id": 161162505, "pid": 5714, "tid": 6744, "ts": 6300866056032.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866108948.400, "dur": 92.833, + "args": { + "External id": 88573, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162506, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162506, "pid": 0, "tid": 7, "ts": 6300866108948.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056033.636, "dur": 10.030, + "args": { + "External id": 88573, "cbid": 211, "correlation": 161162506 + } + }, + { + "ph": "s", "id": 161162506, "pid": 5714, "tid": 6744, "ts": 6300866056033.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056044.526, "dur": 1.050, + "args": { + "External id": 88573, "cbid": 273, "correlation": 161162508 + } + }, + { + "ph": "f", "id": 161162508, "pid": 5714, "tid": 6744, "ts": 6300866056044.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866109041.905, "dur": 1603.411, + "args": { + "External id": 88573, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162509, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161162509, "pid": 0, "tid": 7, "ts": 6300866109041.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056045.926, "dur": 4.600, + "args": { + "External id": 88573, "cbid": 211, "correlation": 161162509 + } + }, + { + "ph": "s", "id": 161162509, "pid": 5714, "tid": 6744, "ts": 6300866056045.926, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866110645.956, "dur": 190.210, + "args": { + "External id": 88573, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162511, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161162511, "pid": 0, "tid": 7, "ts": 6300866110645.956, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056051.116, "dur": 3.890, + "args": { + "External id": 88573, "cbid": 211, "correlation": 161162511 + } + }, + { + "ph": "s", "id": 161162511, "pid": 5714, "tid": 6744, "ts": 6300866056051.116, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866110836.870, "dur": 211.363, + "args": { + "External id": 88584, "device": 0, "context": 1, "stream": 7, "correlation": 161162533, "bytes": 25165824, "memory bandwidth (GB/s)": 119.0644720220663 + } + }, + { + "ph": "f", "id": 161162533, "pid": 0, "tid": 7, "ts": 6300866110836.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866056240.876, "dur": 19.550, + "args": { + "External id": 88584, "cbid": 41, "correlation": 161162533 + } + }, + { + "ph": "s", "id": 161162533, "pid": 5714, "tid": 6744, "ts": 6300866056240.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866111048.969, "dur": 203.074, + "args": { + "External id": 88581, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162551, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162551, "pid": 0, "tid": 7, "ts": 6300866111048.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056375.936, "dur": 9.380, + "args": { + "External id": 88581, "cbid": 307, "correlation": 161162551 + } + }, + { + "ph": "s", "id": 161162551, "pid": 5714, "tid": 6744, "ts": 6300866056375.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866111252.747, "dur": 164.674, + "args": { + "External id": 88591, "device": 0, "context": 1, "stream": 7, "correlation": 161162566, "bytes": 25165824, "memory bandwidth (GB/s)": 152.82208484642385 + } + }, + { + "ph": "f", "id": 161162566, "pid": 0, "tid": 7, "ts": 6300866111252.747, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866056451.335, "dur": 15.100, + "args": { + "External id": 88591, "cbid": 41, "correlation": 161162566 + } + }, + { + "ph": "s", "id": 161162566, "pid": 5714, "tid": 6744, "ts": 6300866056451.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866111417.997, "dur": 56.513, + "args": { + "External id": 88588, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162584, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162584, "pid": 0, "tid": 7, "ts": 6300866111417.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056561.475, "dur": 8.400, + "args": { + "External id": 88588, "cbid": 307, "correlation": 161162584 + } + }, + { + "ph": "s", "id": 161162584, "pid": 5714, "tid": 6744, "ts": 6300866056561.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056696.075, "dur": 0.590, + "args": { + "External id": 88596, "cbid": 200, "correlation": 161162614 + } + }, + { + "ph": "f", "id": 161162614, "pid": 5714, "tid": 6744, "ts": 6300866056696.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866111489.582, "dur": 6.560, + "args": { + "External id": 88596, "device": 0, "context": 1, "stream": 7, "correlation": 161162617, "bytes": 576, "memory bandwidth (GB/s)": 0.08780487804878048 + } + }, + { + "ph": "f", "id": 161162617, "pid": 0, "tid": 7, "ts": 6300866111489.582, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866056698.535, "dur": 8.020, + "args": { + "External id": 88596, "cbid": 51, "correlation": 161162617 + } + }, + { + "ph": "s", "id": 161162617, "pid": 5714, "tid": 6744, "ts": 6300866056698.535, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866111504.366, "dur": 160.802, + "args": { + "External id": 88596, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162618, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162618, "pid": 0, "tid": 7, "ts": 6300866111504.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056706.815, "dur": 8.330, + "args": { + "External id": 88596, "cbid": 307, "correlation": 161162618 + } + }, + { + "ph": "s", "id": 161162618, "pid": 5714, "tid": 6744, "ts": 6300866056706.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056743.555, "dur": 0.330, + "args": { + "External id": 88597, "cbid": 200, "correlation": 161162643 + } + }, + { + "ph": "f", "id": 161162643, "pid": 5714, "tid": 6744, "ts": 6300866056743.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866111673.136, "dur": 7.584, + "args": { + "External id": 88597, "device": 0, "context": 1, "stream": 7, "correlation": 161162646, "bytes": 576, "memory bandwidth (GB/s)": 0.0759493670886076 + } + }, + { + "ph": "f", "id": 161162646, "pid": 0, "tid": 7, "ts": 6300866111673.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866056745.035, "dur": 4.710, + "args": { + "External id": 88597, "cbid": 51, "correlation": 161162646 + } + }, + { + "ph": "s", "id": 161162646, "pid": 5714, "tid": 6744, "ts": 
6300866056745.035, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866111688.624, "dur": 142.754, + "args": { + "External id": 88597, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162647, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162647, "pid": 0, "tid": 7, "ts": 6300866111688.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056749.915, "dur": 5.480, + "args": { + "External id": 88597, "cbid": 307, "correlation": 161162647 + } + }, + { + "ph": "s", "id": 161162647, "pid": 5714, "tid": 6744, "ts": 6300866056749.915, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866056780.175, "dur": 0.289, + "args": { + "External id": 88598, "cbid": 200, "correlation": 161162672 + } + }, + { + "ph": "f", "id": 161162672, "pid": 5714, "tid": 6744, "ts": 6300866056780.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866111832.658, "dur": 1.248, + "args": { + "External id": 88598, "device": 0, "context": 1, "stream": 7, "correlation": 161162675, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 161162675, "pid": 0, "tid": 7, "ts": 6300866111832.658, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866056781.535, "dur": 4.500, + "args": { + "External id": 88598, "cbid": 51, 
"correlation": 161162675 + } + }, + { + "ph": "s", "id": 161162675, "pid": 5714, "tid": 6744, "ts": 6300866056781.535, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866111835.282, "dur": 552.038, + "args": { + "External id": 88598, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162676, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162676, "pid": 0, "tid": 7, "ts": 6300866111835.282, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056786.184, "dur": 5.320, + "args": { + "External id": 88598, "cbid": 307, "correlation": 161162676 + } + }, + { + "ph": "s", "id": 161162676, "pid": 5714, "tid": 6744, "ts": 6300866056786.184, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866112388.024, "dur": 143.042, + "args": { + "External id": 88599, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162698, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162698, "pid": 0, "tid": 7, "ts": 6300866112388.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056817.315, "dur": 6.100, + "args": { + "External id": 88599, "cbid": 211, "correlation": 161162698 + } + }, + { + "ph": "s", "id": 161162698, "pid": 5714, "tid": 6744, "ts": 6300866056817.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866112531.674, "dur": 141.954, + "args": { + "External id": 88600, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162721, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162721, "pid": 0, "tid": 7, "ts": 6300866112531.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056842.844, "dur": 5.200, + "args": { + "External id": 88600, "cbid": 211, "correlation": 161162721 + } + }, + { + "ph": "s", "id": 161162721, "pid": 5714, "tid": 6744, "ts": 6300866056842.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866112674.332, "dur": 142.561, + "args": { + "External id": 88601, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162744, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161162744, "pid": 0, "tid": 7, "ts": 6300866112674.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056866.364, "dur": 5.231, + "args": { + "External id": 88601, "cbid": 211, "correlation": 161162744 + } + }, + { + "ph": "s", "id": 161162744, "pid": 5714, "tid": 6744, "ts": 6300866056866.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866112817.533, "dur": 81.057, + "args": { + "External id": 88602, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162752, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162752, "pid": 0, "tid": 7, "ts": 6300866112817.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866056907.104, "dur": 5.840, + "args": { + "External id": 88602, "cbid": 307, "correlation": 161162752 + } + }, + { + "ph": "s", "id": 161162752, "pid": 5714, "tid": 6744, "ts": 6300866056907.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866112899.230, "dur": 46.689, + "args": { + "External id": 88617, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162781, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162781, "pid": 0, "tid": 7, "ts": 6300866112899.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866057080.644, "dur": 10.040, + "args": { + "External id": 88617, "cbid": 307, "correlation": 161162781 + } + }, + { + "ph": "s", "id": 161162781, "pid": 5714, "tid": 6744, "ts": 6300866057080.644, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866112946.527, "dur": 3.136, + "args": { + "External id": 88618, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162789, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161162789, "pid": 0, "tid": 7, "ts": 6300866112946.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866057117.914, "dur": 5.570, + "args": { + "External id": 88618, "cbid": 307, "correlation": 161162789 + } + }, + { + "ph": "s", "id": 161162789, "pid": 5714, "tid": 6744, "ts": 6300866057117.914, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866112950.335, "dur": 50.304, + "args": { + "External id": 88619, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162800, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162800, "pid": 0, "tid": 7, "ts": 6300866112950.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866057152.724, "dur": 5.600, + "args": { + "External id": 88619, "cbid": 307, "correlation": 161162800 + } + }, + { + "ph": "s", "id": 161162800, "pid": 5714, "tid": 6744, "ts": 6300866057152.724, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866113001.311, "dur": 45.505, + "args": { + "External id": 88620, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162805, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162805, "pid": 0, "tid": 7, "ts": 6300866113001.311, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866057196.574, "dur": 7.680, + "args": { + "External id": 88620, "cbid": 211, "correlation": 161162805 + } + }, + { + "ph": "s", "id": 161162805, "pid": 5714, "tid": 6744, "ts": 6300866057196.574, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866057395.023, "dur": 3.200, + "args": { + "External id": 88626, "cbid": 147, "correlation": 161162822 + } + }, + { + "ph": "s", "id": 161162822, "pid": 5714, "tid": 6744, "ts": 6300866057395.023, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866057539.913, "dur": 2.600, + "args": { + "External id": 88634, "cbid": 138, "correlation": 161162837 + } + }, + { + "ph": "f", "id": 161162837, "pid": 5714, "tid": 6744, "ts": 6300866057539.913, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866113053.408, "dur": 6.080, + "args": { + "External id": 88638, "device": 0, "context": 1, "stream": 7, "correlation": 161162848, "bytes": 28112, "memory bandwidth (GB/s)": 4.623684210526315 + } + }, + { + "ph": "f", "id": 161162848, "pid": 0, "tid": 7, "ts": 6300866113053.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866057566.263, "dur": 12.890, + "args": { + "External id": 88638, "cbid": 41, "correlation": 161162848 + } + }, + { + "ph": "s", "id": 161162848, "pid": 5714, "tid": 6744, "ts": 6300866057566.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057583.823, "dur": 2.120, + "args": { + "External id": 88633, "cbid": 135, "correlation": 161162852 + } + }, + { + "ph": "f", "id": 161162852, "pid": 5714, "tid": 6744, "ts": 6300866057583.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866113062.016, "dur": 39.329, + "args": { + "External id": 88633, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161162856, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161162856, "pid": 0, "tid": 7, "ts": 6300866113062.016, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866057589.483, "dur": 11.100, + "args": { + "External id": 88633, "cbid": 211, "correlation": 161162856 + } + }, + { + "ph": "s", "id": 161162856, "pid": 5714, "tid": 6744, "ts": 6300866057589.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057643.203, "dur": 1.070, + "args": { + "External id": 88626, "cbid": 135, "correlation": 161162867 + } + }, + { + "ph": "f", "id": 161162867, "pid": 5714, "tid": 6744, "ts": 6300866057643.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866057646.453, "dur": 1.460, + "args": { + "External id": 88626, "cbid": 147, "correlation": 161162871 + } + }, + { + "ph": "s", "id": 161162871, "pid": 5714, "tid": 6744, "ts": 6300866057646.453, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866057721.573, "dur": 1.089, + "args": { + "External id": 88642, "cbid": 317, "correlation": 161162891 + } + }, + { + "ph": "f", "id": 161162891, "pid": 5714, "tid": 6744, "ts": 6300866057721.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057724.613, "dur": 1.369, + "args": { + "External id": 88642, "cbid": 135, "correlation": 161162893 + } + }, + { + "ph": "f", "id": 161162893, "pid": 5714, "tid": 6744, "ts": 6300866057724.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866057727.433, "dur": 1.100, + "args": { + "External id": 88642, "cbid": 147, "correlation": 161162897 + } + }, + { + "ph": "s", "id": 161162897, "pid": 5714, "tid": 6744, "ts": 6300866057727.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866057743.673, "dur": 0.829, + "args": { + "External id": 88642, "cbid": 409, "correlation": 161162900 + } + }, + { + "ph": "f", "id": 161162900, "pid": 5714, "tid": 6744, "ts": 6300866057743.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057748.822, "dur": 0.980, + "args": { + "External id": 88642, "cbid": 135, "correlation": 161162903 + } + }, + { + "ph": "f", "id": 161162903, "pid": 5714, "tid": 6744, "ts": 6300866057748.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866057750.002, "dur": 1.000, + "args": { + "External id": 88642, "cbid": 147, "correlation": 
161162904 + } + }, + { + "ph": "s", "id": 161162904, "pid": 5714, "tid": 6744, "ts": 6300866057750.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866116881.645, "dur": 9344.942, + "args": { + "External id": 88642, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161162906, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161162906, "pid": 0, "tid": 20, "ts": 6300866116881.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866057752.222, "dur": 10.490, + "args": { + "External id": 88642, "cbid": 430, "correlation": 161162906 + } + }, + { + "ph": "s", "id": 161162906, "pid": 5714, "tid": 6744, "ts": 6300866057752.222, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057763.802, "dur": 0.470, + "args": { + "External id": 88642, "cbid": 135, "correlation": 161162908 + } + }, + { + "ph": "f", "id": 161162908, "pid": 5714, "tid": 6744, "ts": 6300866057763.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866057764.402, "dur": 0.620, + "args": { + "External id": 88642, "cbid": 147, "correlation": 161162909 + } + }, + { + "ph": "s", "id": 
161162909, "pid": 5714, "tid": 6744, "ts": 6300866057764.402, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057766.652, "dur": 0.890, + "args": { + "External id": 88642, "cbid": 135, "correlation": 161162912 + } + }, + { + "ph": "f", "id": 161162912, "pid": 5714, "tid": 6744, "ts": 6300866057766.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057776.022, "dur": 0.490, + "args": { + "External id": 88642, "cbid": 135, "correlation": 161162919 + } + }, + { + "ph": "f", "id": 161162919, "pid": 5714, "tid": 6744, "ts": 6300866057776.022, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866057804.482, "dur": 1.130, + "args": { + "External id": 88644, "cbid": 147, "correlation": 161162924 + } + }, + { + "ph": "s", "id": 161162924, "pid": 5714, "tid": 6744, "ts": 6300866057804.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866057823.052, "dur": 0.950, + "args": { + "External id": 88626, "cbid": 135, "correlation": 161162939 + } + }, + { + "ph": "f", "id": 161162939, "pid": 5714, "tid": 6744, "ts": 6300866057823.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058027.442, "dur": 1.320, + "args": { + "External id": 88626, "cbid": 135, "correlation": 161162952 + } + }, + { + "ph": "f", "id": 161162952, "pid": 5714, "tid": 6744, "ts": 6300866058027.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058139.972, "dur": 3.260, + "args": { + 
"External id": 88654, "cbid": 147, "correlation": 161162963 + } + }, + { + "ph": "s", "id": 161162963, "pid": 5714, "tid": 6744, "ts": 6300866058139.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866058259.641, "dur": 1.300, + "args": { + "External id": 88668, "cbid": 317, "correlation": 161163004 + } + }, + { + "ph": "f", "id": 161163004, "pid": 5714, "tid": 6744, "ts": 6300866058259.641, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866058268.541, "dur": 2.410, + "args": { + "External id": 88669, "cbid": 138, "correlation": 161163007 + } + }, + { + "ph": "f", "id": 161163007, "pid": 5714, "tid": 6744, "ts": 6300866058268.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866116881.293, "dur": 2.496, + "args": { + "External id": 88673, "device": 0, "context": 1, "stream": 7, "correlation": 161163018, "bytes": 7224, "memory bandwidth (GB/s)": 2.894230769230769 + } + }, + { + "ph": "f", "id": 161163018, "pid": 0, "tid": 7, "ts": 6300866116881.293, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866058292.141, "dur": 22.390, + "args": { + "External id": 88673, "cbid": 41, "correlation": 161163018 + } + }, + { + "ph": "s", "id": 161163018, "pid": 5714, "tid": 6744, "ts": 6300866058292.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058319.731, "dur": 2.170, + "args": { + "External id": 88668, "cbid": 135, "correlation": 161163022 + } + }, + { + "ph": "f", "id": 161163022, "pid": 5714, "tid": 6744, "ts": 6300866058319.731, + "cat": "ac2g", 
"name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866116885.901, "dur": 11.680, + "args": { + "External id": 88668, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163026, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163026, "pid": 0, "tid": 7, "ts": 6300866116885.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866058324.731, "dur": 11.180, + "args": { + "External id": 88668, "cbid": 211, "correlation": 161163026 + } + }, + { + "ph": "s", "id": 161163026, "pid": 5714, "tid": 6744, "ts": 6300866058324.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058432.261, "dur": 1.340, + "args": { + "External id": 88654, "cbid": 135, "correlation": 161163037 + } + }, + { + "ph": "f", "id": 161163037, "pid": 5714, "tid": 6744, "ts": 6300866058432.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058436.971, "dur": 1.310, + "args": { + "External id": 88654, "cbid": 147, "correlation": 161163041 + } + }, + { + "ph": "s", "id": 161163041, "pid": 5714, "tid": 6744, "ts": 6300866058436.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058440.081, "dur": 0.740, + "args": { + "External id": 88654, "cbid": 147, "correlation": 161163045 + } + }, + { + "ph": "s", "id": 161163045, "pid": 5714, "tid": 
6744, "ts": 6300866058440.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866116931.918, "dur": 29.632, + "args": { + "External id": 88687, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161163069, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161163069, "pid": 0, "tid": 17, "ts": 6300866116931.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866058593.740, "dur": 12.111, + "args": { + "External id": 88687, "cbid": 211, "correlation": 161163069 + } + }, + { + "ph": "s", "id": 161163069, "pid": 5714, "tid": 6744, "ts": 6300866058593.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866116969.486, "dur": 11.872, + "args": { + "External id": 88703, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161163082, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161163082, "pid": 0, "tid": 17, "ts": 6300866116969.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866058710.280, "dur": 10.080, + "args": { + "External id": 88703, "cbid": 211, "correlation": 161163082 + } + }, + { + "ph": "s", "id": 161163082, "pid": 5714, "tid": 6744, "ts": 6300866058710.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058745.440, "dur": 1.340, + "args": { + "External id": 88654, "cbid": 135, "correlation": 161163092 + } + }, + { + "ph": "f", "id": 161163092, "pid": 5714, "tid": 6744, "ts": 6300866058745.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058748.720, "dur": 1.290, + "args": { + "External id": 88654, "cbid": 147, "correlation": 161163096 + } + }, + { + "ph": "s", "id": 161163096, "pid": 5714, "tid": 6744, "ts": 6300866058748.720, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866058803.610, "dur": 0.980, + "args": { + "External id": 88705, "cbid": 317, "correlation": 161163109 + } + }, + { + "ph": "f", "id": 161163109, "pid": 5714, "tid": 6744, "ts": 6300866058803.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058806.580, "dur": 1.280, + "args": { + "External id": 88705, "cbid": 135, "correlation": 161163111 + } + }, + { + "ph": "f", "id": 161163111, "pid": 5714, "tid": 6744, "ts": 6300866058806.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866058809.370, "dur": 1.190, + "args": { + "External id": 88705, "cbid": 147, "correlation": 161163115 + } + }, + { + "ph": "s", "id": 161163115, "pid": 5714, "tid": 6744, "ts": 6300866058809.370, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866058825.280, "dur": 0.750, + "args": { + "External id": 88705, "cbid": 409, "correlation": 161163118 + } + }, + { + "ph": "f", "id": 161163118, "pid": 5714, "tid": 6744, "ts": 6300866058825.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058830.390, "dur": 0.810, + "args": { + "External id": 88705, "cbid": 135, "correlation": 161163121 + } + }, + { + "ph": "f", "id": 161163121, "pid": 5714, "tid": 6744, "ts": 6300866058830.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058831.390, "dur": 0.940, + "args": { + "External id": 88705, "cbid": 147, "correlation": 161163122 + } + }, + { + "ph": "s", "id": 161163122, "pid": 5714, "tid": 6744, "ts": 6300866058831.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866126241.851, "dur": 4885.145, + "args": { + "External id": 88705, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161163124, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161163124, "pid": 0, "tid": 20, "ts": 6300866126241.851, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866058833.460, "dur": 10.430, + "args": { + "External id": 88705, "cbid": 430, "correlation": 161163124 + } + }, + { + "ph": "s", "id": 161163124, "pid": 5714, "tid": 6744, "ts": 6300866058833.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058844.980, "dur": 0.410, + "args": { + "External id": 88705, "cbid": 135, "correlation": 161163126 + } + }, + { + "ph": "f", "id": 161163126, "pid": 5714, "tid": 6744, "ts": 6300866058844.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058845.570, "dur": 0.530, + "args": { + "External id": 88705, "cbid": 147, "correlation": 161163127 + } + }, + { + "ph": "s", "id": 161163127, "pid": 5714, "tid": 6744, "ts": 6300866058845.570, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058847.680, "dur": 0.910, + "args": { + "External id": 88705, "cbid": 135, "correlation": 161163130 + } + }, + { + "ph": "f", "id": 161163130, "pid": 5714, "tid": 6744, "ts": 6300866058847.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058856.820, "dur": 0.480, + "args": { + "External id": 
88705, "cbid": 135, "correlation": 161163137 + } + }, + { + "ph": "f", "id": 161163137, "pid": 5714, "tid": 6744, "ts": 6300866058856.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866058884.620, "dur": 1.080, + "args": { + "External id": 88707, "cbid": 147, "correlation": 161163142 + } + }, + { + "ph": "s", "id": 161163142, "pid": 5714, "tid": 6744, "ts": 6300866058884.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866058902.730, "dur": 1.030, + "args": { + "External id": 88654, "cbid": 135, "correlation": 161163157 + } + }, + { + "ph": "f", "id": 161163157, "pid": 5714, "tid": 6744, "ts": 6300866058902.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866116898.285, "dur": 2235.259, + "args": { + "External id": 88709, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163182, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163182, "pid": 0, "tid": 7, "ts": 6300866116898.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059058.070, "dur": 11.949, + "args": { + "External id": 88709, "cbid": 211, "correlation": 161163182 + } + }, + { + "ph": "s", "id": 161163182, "pid": 5714, "tid": 6744, "ts": 6300866059058.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866119134.152, "dur": 559.110, + "args": { + "External id": 88710, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163205, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161163205, "pid": 0, "tid": 7, "ts": 6300866119134.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059121.629, "dur": 6.890, + "args": { + "External id": 88710, "cbid": 307, "correlation": 161163205 + } + }, + { + "ph": "s", "id": 161163205, "pid": 5714, "tid": 6744, "ts": 6300866059121.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059165.919, "dur": 0.550, + "args": { + "External id": 88711, "cbid": 200, "correlation": 161163228 + } + }, + { + "ph": "f", "id": 161163228, "pid": 5714, "tid": 6744, "ts": 6300866059165.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866119749.711, "dur": 86.721, + "args": { + "External id": 88711, "device": 0, "context": 1, "stream": 7, 
"correlation": 161163231, "bytes": 1536, "memory bandwidth (GB/s)": 0.01771197287854153 + } + }, + { + "ph": "f", "id": 161163231, "pid": 0, "tid": 7, "ts": 6300866119749.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866059168.319, "dur": 7.090, + "args": { + "External id": 88711, "cbid": 51, "correlation": 161163231 + } + }, + { + "ph": "s", "id": 161163231, "pid": 5714, "tid": 6744, "ts": 6300866059168.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866119859.152, "dur": 874.282, + "args": { + "External id": 88711, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163232, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163232, "pid": 0, "tid": 7, "ts": 6300866119859.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059175.649, "dur": 6.410, + "args": { + "External id": 88711, "cbid": 307, "correlation": 161163232 + } + }, + { + "ph": "s", "id": 161163232, "pid": 5714, "tid": 6744, "ts": 6300866059175.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059211.579, "dur": 0.310, + "args": { + "External id": 88712, "cbid": 200, "correlation": 161163257 + } + }, + { + "ph": "f", "id": 161163257, "pid": 5714, "tid": 6744, "ts": 6300866059211.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866120754.907, "dur": 2.272, + "args": { + "External id": 88712, "device": 0, "context": 1, "stream": 7, "correlation": 161163260, "bytes": 1536, "memory bandwidth (GB/s)": 0.676056338028169 + } + }, + { + "ph": "f", "id": 161163260, "pid": 0, "tid": 7, "ts": 6300866120754.907, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866059213.059, "dur": 4.830, + "args": { + "External id": 88712, "cbid": 51, "correlation": 161163260 + } + }, + { + "ph": "s", "id": 161163260, "pid": 5714, "tid": 6744, "ts": 6300866059213.059, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866120758.459, "dur": 354.884, + "args": { + "External id": 88712, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163261, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163261, "pid": 0, "tid": 7, "ts": 6300866120758.459, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059218.029, "dur": 5.380, + "args": { + "External id": 88712, "cbid": 307, "correlation": 161163261 + } + }, + { + "ph": "s", "id": 161163261, "pid": 5714, "tid": 6744, "ts": 6300866059218.029, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059248.059, "dur": 0.290, + "args": { + "External id": 88713, "cbid": 200, "correlation": 161163286 + } + }, + { + "ph": "f", "id": 161163286, "pid": 5714, "tid": 6744, "ts": 6300866059248.059, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866121114.015, "dur": 358.532, + "args": { + "External id": 88713, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163289, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163289, "pid": 0, "tid": 7, "ts": 6300866121114.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059249.559, "dur": 5.780, + "args": { + "External id": 88713, "cbid": 307, "correlation": 161163289 + } + }, + { + "ph": "s", "id": 161163289, "pid": 5714, "tid": 6744, "ts": 6300866059249.559, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059278.979, "dur": 0.270, + "args": { + "External id": 88714, "cbid": 200, "correlation": 161163314 + } + }, + { + "ph": "f", "id": 161163314, "pid": 5714, "tid": 6744, "ts": 6300866059278.979, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866121473.955, "dur": 1.376, + "args": { + "External id": 88714, "device": 0, "context": 1, "stream": 7, "correlation": 161163317, "bytes": 1536, "memory bandwidth (GB/s)": 1.1162790697674418 + } + }, + { + "ph": "f", "id": 161163317, "pid": 0, "tid": 7, "ts": 6300866121473.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866059280.279, "dur": 5.000, + "args": { + "External id": 88714, "cbid": 51, "correlation": 161163317 + } + }, + { + "ph": "s", "id": 161163317, "pid": 5714, "tid": 6744, "ts": 6300866059280.279, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866121476.739, "dur": 355.620, + "args": { + "External id": 88714, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163318, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163318, "pid": 0, "tid": 7, "ts": 6300866121476.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059285.429, "dur": 5.170, + "args": { + "External id": 88714, "cbid": 307, "correlation": 161163318 + } + }, + { + "ph": "s", "id": 161163318, "pid": 5714, "tid": 6744, "ts": 6300866059285.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059327.519, "dur": 0.360, + "args": { + "External id": 88715, "cbid": 200, "correlation": 161163343 + } + }, + { + "ph": "f", "id": 161163343, "pid": 5714, "tid": 6744, "ts": 6300866059327.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866121833.063, "dur": 357.604, + "args": { + "External id": 88715, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163346, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163346, "pid": 0, "tid": 7, "ts": 6300866121833.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059328.989, "dur": 6.470, + "args": { + "External id": 88715, "cbid": 307, "correlation": 161163346 + } + }, + { + "ph": "s", "id": 161163346, "pid": 5714, "tid": 6744, "ts": 6300866059328.989, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866122191.275, "dur": 88.674, + "args": { + "External id": 88716, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163359, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163359, "pid": 0, "tid": 7, "ts": 6300866122191.275, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059376.849, "dur": 6.340, + "args": { + "External id": 88716, "cbid": 307, "correlation": 161163359 + } + }, + { + "ph": "s", "id": 161163359, "pid": 5714, "tid": 6744, "ts": 6300866059376.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866122280.621, "dur": 3.520, + "args": { + "External id": 88717, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163367, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161163367, "pid": 0, "tid": 7, "ts": 6300866122280.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059409.449, "dur": 5.360, + "args": { + "External id": 88717, "cbid": 307, "correlation": 161163367 + } + }, + { + "ph": "s", "id": 161163367, "pid": 5714, "tid": 6744, "ts": 6300866059409.449, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866122284.845, "dur": 113.601, + "args": { + "External id": 88718, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163375, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163375, "pid": 0, "tid": 7, "ts": 6300866122284.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059442.758, "dur": 5.451, + "args": { + "External id": 88718, "cbid": 307, "correlation": 161163375 + } + }, + { + "ph": "s", "id": 161163375, "pid": 5714, "tid": 6744, "ts": 6300866059442.758, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059636.148, "dur": 0.590, + "args": { + "External id": 88737, "cbid": 200, "correlation": 161163421 + } + }, + { + "ph": "f", "id": 161163421, "pid": 5714, "tid": 6744, "ts": 6300866059636.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866122399.758, "dur": 1.216, + "args": { + "External id": 88737, "device": 0, "context": 1, "stream": 7, 
"correlation": 161163424, "bytes": 576, "memory bandwidth (GB/s)": 0.47368421052631576 + } + }, + { + "ph": "f", "id": 161163424, "pid": 0, "tid": 7, "ts": 6300866122399.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866059638.518, "dur": 7.810, + "args": { + "External id": 88737, "cbid": 51, "correlation": 161163424 + } + }, + { + "ph": "s", "id": 161163424, "pid": 5714, "tid": 6744, "ts": 6300866059638.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866122402.638, "dur": 144.065, + "args": { + "External id": 88737, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163425, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163425, "pid": 0, "tid": 7, "ts": 6300866122402.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059646.588, "dur": 8.710, + "args": { + "External id": 88737, "cbid": 307, "correlation": 161163425 + } + }, + { + "ph": "s", "id": 161163425, "pid": 5714, "tid": 6744, "ts": 6300866059646.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866122547.375, "dur": 141.602, + "args": { + "External id": 88738, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163447, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163447, "pid": 0, "tid": 7, "ts": 6300866122547.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059684.068, "dur": 6.150, + "args": { + "External id": 88738, "cbid": 211, "correlation": 161163447 + } + }, + { + "ph": "s", "id": 161163447, "pid": 5714, "tid": 6744, "ts": 6300866059684.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059781.318, "dur": 0.510, + "args": { + "External id": 88739, "cbid": 200, "correlation": 161163465 + } + }, + { + "ph": "f", "id": 161163465, "pid": 5714, "tid": 6744, "ts": 6300866059781.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059781.958, "dur": 0.230, + "args": { + "External id": 88739, "cbid": 200, "correlation": 161163466 + } + }, + { + "ph": "f", "id": 161163466, "pid": 5714, "tid": 6744, "ts": 6300866059781.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059802.968, "dur": 0.240, + "args": { + "External id": 88739, "cbid": 200, "correlation": 161163484 + } + }, + { + "ph": "f", "id": 161163484, "pid": 5714, "tid": 6744, "ts": 6300866059802.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866122689.713, "dur": 94.306, + "args": { + "External id": 88739, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163485, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163485, "pid": 0, "tid": 7, "ts": 6300866122689.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059804.508, "dur": 10.490, + "args": { + "External id": 88739, "cbid": 211, "correlation": 161163485 + } + }, + { + "ph": "s", "id": 161163485, "pid": 5714, "tid": 6744, "ts": 6300866059804.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866059815.838, "dur": 1.090, + "args": { + "External id": 88739, "cbid": 273, "correlation": 161163487 + } + }, + { + "ph": "f", "id": 161163487, "pid": 5714, "tid": 6744, "ts": 6300866059815.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866122784.690, "dur": 1157.390, + "args": { + "External id": 88739, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163488, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161163488, "pid": 0, "tid": 7, "ts": 6300866122784.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059817.278, "dur": 4.440, + "args": { + "External id": 88739, "cbid": 211, "correlation": 161163488 + } + }, + { + "ph": "s", "id": 161163488, "pid": 5714, "tid": 6744, "ts": 6300866059817.278, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866123942.784, "dur": 216.546, + "args": { + "External id": 88739, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163490, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161163490, "pid": 0, "tid": 7, "ts": 6300866123942.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866059822.458, "dur": 3.950, + "args": { + "External id": 88739, "cbid": 211, "correlation": 161163490 + } + }, + { + "ph": "s", "id": 161163490, "pid": 5714, "tid": 6744, "ts": 6300866059822.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866124159.938, "dur": 215.395, + "args": { + "External id": 88750, "device": 0, "context": 1, "stream": 7, "correlation": 161163512, "bytes": 25165824, "memory bandwidth (GB/s)": 116.83569256482276 + } + }, + { + "ph": "f", "id": 161163512, "pid": 0, "tid": 7, "ts": 6300866124159.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866059967.967, "dur": 19.040, + "args": { + "External id": 88750, "cbid": 41, "correlation": 161163512 + } + }, + { + "ph": "s", "id": 161163512, "pid": 5714, "tid": 6744, "ts": 6300866059967.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866124376.037, "dur": 112.705, + "args": { + "External id": 88747, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163530, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163530, "pid": 0, "tid": 7, "ts": 6300866124376.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060093.347, "dur": 9.260, + "args": { + "External id": 88747, "cbid": 307, "correlation": 161163530 + } + }, + { + "ph": "s", "id": 161163530, "pid": 5714, "tid": 6744, "ts": 6300866060093.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866124489.415, "dur": 78.496, + "args": { + "External id": 88757, "device": 0, "context": 1, "stream": 7, "correlation": 161163545, "bytes": 25165824, "memory bandwidth (GB/s)": 320.60008153281694 + } + }, + { + "ph": "f", "id": 161163545, "pid": 0, "tid": 7, "ts": 6300866124489.415, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866060168.857, "dur": 15.060, + "args": { + "External id": 88757, "cbid": 41, "correlation": 161163545 + } + }, + { + "ph": "s", "id": 161163545, "pid": 5714, "tid": 6744, "ts": 6300866060168.857, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866124568.551, "dur": 28.481, + "args": { + "External id": 88754, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163563, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163563, "pid": 0, "tid": 7, "ts": 6300866124568.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060279.947, "dur": 8.080, + "args": { + "External id": 88754, "cbid": 307, "correlation": 161163563 + } + }, + { + "ph": "s", "id": 161163563, "pid": 5714, "tid": 6744, "ts": 6300866060279.947, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866060423.206, "dur": 0.590, + "args": { + "External id": 88762, "cbid": 200, "correlation": 161163593 + } + }, + { + "ph": "f", "id": 161163593, "pid": 5714, "tid": 6744, "ts": 6300866060423.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866124598.344, "dur": 0.864, + "args": { + "External id": 88762, "device": 0, "context": 1, "stream": 7, "correlation": 161163596, "bytes": 576, "memory bandwidth (GB/s)": 0.6666666666666666 + } + }, + { + "ph": "f", "id": 161163596, "pid": 0, "tid": 7, "ts": 6300866124598.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866060425.686, "dur": 8.040, + "args": { + "External id": 88762, "cbid": 51, "correlation": 161163596 + } + }, + { + "ph": "s", "id": 161163596, "pid": 5714, "tid": 6744, "ts": 6300866060425.686, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866124600.808, "dur": 146.626, + "args": { + "External id": 88762, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163597, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163597, "pid": 0, "tid": 7, "ts": 6300866124600.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060433.976, "dur": 8.740, + "args": { + "External id": 88762, "cbid": 307, "correlation": 161163597 + } + }, + { + "ph": "s", "id": 161163597, "pid": 5714, "tid": 6744, "ts": 6300866060433.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866060471.216, "dur": 0.310, + "args": { + "External id": 88763, "cbid": 200, "correlation": 161163622 + } + }, + { + "ph": "f", "id": 161163622, "pid": 5714, "tid": 6744, "ts": 6300866060471.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866124748.682, "dur": 1.088, + "args": { + "External id": 88763, "device": 0, "context": 1, "stream": 7, "correlation": 161163625, "bytes": 576, "memory bandwidth (GB/s)": 0.5294117647058824 + } + }, + { + "ph": "f", "id": 161163625, "pid": 0, "tid": 7, "ts": 6300866124748.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866060472.586, "dur": 4.550, + "args": { + "External id": 88763, "cbid": 51, "correlation": 161163625 + } + }, + { + "ph": "s", "id": 161163625, "pid": 5714, "tid": 6744, "ts": 
6300866060472.586, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866124751.402, "dur": 141.345, + "args": { + "External id": 88763, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163626, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163626, "pid": 0, "tid": 7, "ts": 6300866124751.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060477.296, "dur": 5.710, + "args": { + "External id": 88763, "cbid": 307, "correlation": 161163626 + } + }, + { + "ph": "s", "id": 161163626, "pid": 5714, "tid": 6744, "ts": 6300866060477.296, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866060506.856, "dur": 0.290, + "args": { + "External id": 88764, "cbid": 200, "correlation": 161163651 + } + }, + { + "ph": "f", "id": 161163651, "pid": 5714, "tid": 6744, "ts": 6300866060506.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866124894.091, "dur": 1.312, + "args": { + "External id": 88764, "device": 0, "context": 1, "stream": 7, "correlation": 161163654, "bytes": 576, "memory bandwidth (GB/s)": 0.43902439024390244 + } + }, + { + "ph": "f", "id": 161163654, "pid": 0, "tid": 7, "ts": 6300866124894.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866060508.156, "dur": 4.200, + "args": { + "External id": 88764, "cbid": 51, 
"correlation": 161163654 + } + }, + { + "ph": "s", "id": 161163654, "pid": 5714, "tid": 6744, "ts": 6300866060508.156, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866124896.715, "dur": 269.763, + "args": { + "External id": 88764, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163655, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163655, "pid": 0, "tid": 7, "ts": 6300866124896.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060512.506, "dur": 5.230, + "args": { + "External id": 88764, "cbid": 307, "correlation": 161163655 + } + }, + { + "ph": "s", "id": 161163655, "pid": 5714, "tid": 6744, "ts": 6300866060512.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866125167.214, "dur": 612.903, + "args": { + "External id": 88765, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163677, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163677, "pid": 0, "tid": 7, "ts": 6300866125167.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060543.666, "dur": 6.230, + "args": { + "External id": 88765, "cbid": 211, "correlation": 161163677 + } + }, + { + "ph": "s", "id": 161163677, "pid": 5714, "tid": 6744, "ts": 6300866060543.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866125796.534, "dur": 163.554, + "args": { + "External id": 88766, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163700, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163700, "pid": 0, "tid": 7, "ts": 6300866125796.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060569.186, "dur": 4.990, + "args": { + "External id": 88766, "cbid": 211, "correlation": 161163700 + } + }, + { + "ph": "s", "id": 161163700, "pid": 5714, "tid": 6744, "ts": 6300866060569.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866125960.696, "dur": 155.426, + "args": { + "External id": 88767, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163723, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161163723, "pid": 0, "tid": 7, "ts": 6300866125960.696, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060592.076, "dur": 4.700, + "args": { + "External id": 88767, "cbid": 211, "correlation": 161163723 + } + }, + { + "ph": "s", "id": 161163723, "pid": 5714, "tid": 6744, "ts": 6300866060592.076, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866126116.730, "dur": 85.921, + "args": { + "External id": 88768, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163731, "pid": 0, "tid": 7, "ts": 6300866126116.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060632.826, "dur": 5.780, + "args": { + "External id": 88768, "cbid": 307, "correlation": 161163731 + } + }, + { + "ph": "s", "id": 161163731, "pid": 5714, "tid": 6744, "ts": 6300866060632.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866126203.355, "dur": 48.352, + "args": { + "External id": 88783, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163760, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163760, "pid": 0, "tid": 7, "ts": 6300866126203.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060806.386, "dur": 9.700, + "args": { + "External id": 88783, "cbid": 307, "correlation": 161163760 + } + }, + { + "ph": "s", "id": 161163760, "pid": 5714, "tid": 6744, "ts": 6300866060806.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866126252.379, "dur": 3.296, + "args": { + "External id": 88784, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163768, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161163768, "pid": 0, "tid": 7, "ts": 6300866126252.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060842.475, "dur": 5.760, + "args": { + "External id": 88784, "cbid": 307, "correlation": 161163768 + } + }, + { + "ph": "s", "id": 161163768, "pid": 5714, "tid": 6744, "ts": 6300866060842.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866126256.315, "dur": 51.297, + "args": { + "External id": 88785, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163779, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163779, "pid": 0, "tid": 7, "ts": 6300866126256.315, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060876.285, "dur": 5.460, + "args": { + "External id": 88785, "cbid": 307, "correlation": 161163779 + } + }, + { + "ph": "s", "id": 161163779, "pid": 5714, "tid": 6744, "ts": 6300866060876.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866126308.252, "dur": 46.688, + "args": { + "External id": 88786, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163784, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163784, "pid": 0, "tid": 7, "ts": 6300866126308.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866060919.045, "dur": 7.770, + "args": { + "External id": 88786, "cbid": 211, "correlation": 161163784 + } + }, + { + "ph": "s", "id": 161163784, "pid": 5714, "tid": 6744, "ts": 6300866060919.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061104.575, "dur": 3.180, + "args": { + "External id": 88792, "cbid": 147, "correlation": 161163801 + } + }, + { + "ph": "s", "id": 161163801, "pid": 5714, "tid": 6744, "ts": 6300866061104.575, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866061249.785, "dur": 2.500, + "args": { + "External id": 88800, "cbid": 138, "correlation": 161163816 + } + }, + { + "ph": "f", "id": 161163816, "pid": 5714, "tid": 6744, "ts": 6300866061249.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866126357.564, "dur": 6.049, + "args": { + "External id": 88804, "device": 0, "context": 1, "stream": 7, "correlation": 161163827, "bytes": 28112, "memory bandwidth (GB/s)": 4.647379732187138 + } + }, + { + "ph": "f", "id": 161163827, "pid": 0, "tid": 7, "ts": 6300866126357.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866061275.414, "dur": 12.420, + "args": { + "External id": 88804, "cbid": 41, "correlation": 161163827 + } + }, + { + "ph": "s", "id": 161163827, "pid": 5714, "tid": 6744, "ts": 6300866061275.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061292.454, "dur": 2.200, + "args": { + "External id": 88799, "cbid": 135, "correlation": 161163831 + } + }, + { + "ph": "f", "id": 161163831, "pid": 5714, "tid": 6744, "ts": 6300866061292.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866126365.597, "dur": 182.785, + "args": { + "External id": 88799, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161163835, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161163835, "pid": 0, "tid": 7, "ts": 6300866126365.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866061305.834, "dur": 11.440, + "args": { + "External id": 88799, "cbid": 211, "correlation": 161163835 + } + }, + { + "ph": "s", "id": 161163835, "pid": 5714, "tid": 6744, "ts": 6300866061305.834, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061359.944, "dur": 1.160, + "args": { + "External id": 88792, "cbid": 135, "correlation": 161163846 + } + }, + { + "ph": "f", "id": 161163846, "pid": 5714, "tid": 6744, "ts": 6300866061359.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061363.284, "dur": 1.510, + "args": { + "External id": 88792, "cbid": 147, "correlation": 161163850 + } + }, + { + "ph": "s", "id": 161163850, "pid": 5714, "tid": 6744, "ts": 6300866061363.284, + 
"cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866061439.234, "dur": 1.130, + "args": { + "External id": 88808, "cbid": 317, "correlation": 161163870 + } + }, + { + "ph": "f", "id": 161163870, "pid": 5714, "tid": 6744, "ts": 6300866061439.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061442.424, "dur": 1.410, + "args": { + "External id": 88808, "cbid": 135, "correlation": 161163872 + } + }, + { + "ph": "f", "id": 161163872, "pid": 5714, "tid": 6744, "ts": 6300866061442.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061445.354, "dur": 1.130, + "args": { + "External id": 88808, "cbid": 147, "correlation": 161163876 + } + }, + { + "ph": "s", "id": 161163876, "pid": 5714, "tid": 6744, "ts": 6300866061445.354, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866061461.404, "dur": 0.770, + "args": { + "External id": 88808, "cbid": 409, "correlation": 161163879 + } + }, + { + "ph": "f", "id": 161163879, "pid": 5714, "tid": 6744, "ts": 6300866061461.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061466.544, "dur": 0.860, + "args": { + "External id": 88808, "cbid": 135, "correlation": 161163882 + } + }, + { + "ph": "f", "id": 161163882, "pid": 5714, "tid": 6744, "ts": 6300866061466.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061467.594, "dur": 0.920, + "args": { + "External id": 88808, "cbid": 147, "correlation": 
161163883 + } + }, + { + "ph": "s", "id": 161163883, "pid": 5714, "tid": 6744, "ts": 6300866061467.594, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866131129.684, "dur": 9336.686, + "args": { + "External id": 88808, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161163885, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161163885, "pid": 0, "tid": 20, "ts": 6300866131129.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866061469.714, "dur": 10.400, + "args": { + "External id": 88808, "cbid": 430, "correlation": 161163885 + } + }, + { + "ph": "s", "id": 161163885, "pid": 5714, "tid": 6744, "ts": 6300866061469.714, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061481.154, "dur": 0.410, + "args": { + "External id": 88808, "cbid": 135, "correlation": 161163887 + } + }, + { + "ph": "f", "id": 161163887, "pid": 5714, "tid": 6744, "ts": 6300866061481.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061481.694, "dur": 0.670, + "args": { + "External id": 88808, "cbid": 147, "correlation": 161163888 + } + }, + { + "ph": "s", "id": 
161163888, "pid": 5714, "tid": 6744, "ts": 6300866061481.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061484.184, "dur": 0.930, + "args": { + "External id": 88808, "cbid": 135, "correlation": 161163891 + } + }, + { + "ph": "f", "id": 161163891, "pid": 5714, "tid": 6744, "ts": 6300866061484.184, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061493.484, "dur": 0.480, + "args": { + "External id": 88808, "cbid": 135, "correlation": 161163898 + } + }, + { + "ph": "f", "id": 161163898, "pid": 5714, "tid": 6744, "ts": 6300866061493.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061521.934, "dur": 1.060, + "args": { + "External id": 88810, "cbid": 147, "correlation": 161163903 + } + }, + { + "ph": "s", "id": 161163903, "pid": 5714, "tid": 6744, "ts": 6300866061521.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061540.254, "dur": 0.940, + "args": { + "External id": 88792, "cbid": 135, "correlation": 161163918 + } + }, + { + "ph": "f", "id": 161163918, "pid": 5714, "tid": 6744, "ts": 6300866061540.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866061740.143, "dur": 1.390, + "args": { + "External id": 88792, "cbid": 135, "correlation": 161163931 + } + }, + { + "ph": "f", "id": 161163931, "pid": 5714, "tid": 6744, "ts": 6300866061740.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866061854.183, "dur": 3.260, + "args": { + 
"External id": 88820, "cbid": 147, "correlation": 161163942 + } + }, + { + "ph": "s", "id": 161163942, "pid": 5714, "tid": 6744, "ts": 6300866061854.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866061974.543, "dur": 1.270, + "args": { + "External id": 88834, "cbid": 317, "correlation": 161163983 + } + }, + { + "ph": "f", "id": 161163983, "pid": 5714, "tid": 6744, "ts": 6300866061974.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866061983.583, "dur": 2.370, + "args": { + "External id": 88835, "cbid": 138, "correlation": 161163986 + } + }, + { + "ph": "f", "id": 161163986, "pid": 5714, "tid": 6744, "ts": 6300866061983.583, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866131129.652, "dur": 2.432, + "args": { + "External id": 88839, "device": 0, "context": 1, "stream": 7, "correlation": 161163997, "bytes": 7224, "memory bandwidth (GB/s)": 2.9703947368421053 + } + }, + { + "ph": "f", "id": 161163997, "pid": 0, "tid": 7, "ts": 6300866131129.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866062007.783, "dur": 12.990, + "args": { + "External id": 88839, "cbid": 41, "correlation": 161163997 + } + }, + { + "ph": "s", "id": 161163997, "pid": 5714, "tid": 6744, "ts": 6300866062007.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062025.493, "dur": 2.090, + "args": { + "External id": 88834, "cbid": 135, "correlation": 161164001 + } + }, + { + "ph": "f", "id": 161164001, "pid": 5714, "tid": 6744, "ts": 6300866062025.493, + "cat": "ac2g", 
"name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866131134.164, "dur": 11.616, + "args": { + "External id": 88834, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164005, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164005, "pid": 0, "tid": 7, "ts": 6300866131134.164, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062030.153, "dur": 11.230, + "args": { + "External id": 88834, "cbid": 211, "correlation": 161164005 + } + }, + { + "ph": "s", "id": 161164005, "pid": 5714, "tid": 6744, "ts": 6300866062030.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062135.763, "dur": 1.449, + "args": { + "External id": 88820, "cbid": 135, "correlation": 161164016 + } + }, + { + "ph": "f", "id": 161164016, "pid": 5714, "tid": 6744, "ts": 6300866062135.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866062140.523, "dur": 1.249, + "args": { + "External id": 88820, "cbid": 147, "correlation": 161164020 + } + }, + { + "ph": "s", "id": 161164020, "pid": 5714, "tid": 6744, "ts": 6300866062140.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866062143.672, "dur": 0.780, + "args": { + "External id": 88820, "cbid": 147, "correlation": 161164024 + } + }, + { + "ph": "s", "id": 161164024, "pid": 5714, "tid": 
6744, "ts": 6300866062143.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866131179.509, "dur": 28.576, + "args": { + "External id": 88853, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161164048, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161164048, "pid": 0, "tid": 17, "ts": 6300866131179.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062326.222, "dur": 12.750, + "args": { + "External id": 88853, "cbid": 211, "correlation": 161164048 + } + }, + { + "ph": "s", "id": 161164048, "pid": 5714, "tid": 6744, "ts": 6300866062326.222, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866131217.718, "dur": 11.744, + "args": { + "External id": 88869, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161164061, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161164061, "pid": 0, "tid": 17, "ts": 6300866131217.718, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062448.852, "dur": 10.050, + "args": { + "External id": 88869, "cbid": 211, "correlation": 161164061 + } + }, + { + "ph": "s", "id": 161164061, "pid": 5714, "tid": 6744, "ts": 6300866062448.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062484.522, "dur": 1.350, + "args": { + "External id": 88820, "cbid": 135, "correlation": 161164071 + } + }, + { + "ph": "f", "id": 161164071, "pid": 5714, "tid": 6744, "ts": 6300866062484.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866062487.772, "dur": 1.300, + "args": { + "External id": 88820, "cbid": 147, "correlation": 161164075 + } + }, + { + "ph": "s", "id": 161164075, "pid": 5714, "tid": 6744, "ts": 6300866062487.772, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866062544.751, "dur": 1.011, + "args": { + "External id": 88871, "cbid": 317, "correlation": 161164088 + } + }, + { + "ph": "f", "id": 161164088, "pid": 5714, "tid": 6744, "ts": 6300866062544.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062547.791, "dur": 1.191, + "args": { + "External id": 88871, "cbid": 135, "correlation": 161164090 + } + }, + { + "ph": "f", "id": 161164090, "pid": 5714, "tid": 6744, "ts": 6300866062547.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866062550.531, "dur": 1.291, + "args": { + "External id": 88871, "cbid": 147, "correlation": 161164094 + } + }, + { + "ph": "s", "id": 161164094, "pid": 5714, "tid": 6744, "ts": 6300866062550.531, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866062566.842, "dur": 0.829, + "args": { + "External id": 88871, "cbid": 409, "correlation": 161164097 + } + }, + { + "ph": "f", "id": 161164097, "pid": 5714, "tid": 6744, "ts": 6300866062566.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062571.951, "dur": 0.720, + "args": { + "External id": 88871, "cbid": 135, "correlation": 161164100 + } + }, + { + "ph": "f", "id": 161164100, "pid": 5714, "tid": 6744, "ts": 6300866062571.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866062572.851, "dur": 0.900, + "args": { + "External id": 88871, "cbid": 147, "correlation": 161164101 + } + }, + { + "ph": "s", "id": 161164101, "pid": 5714, "tid": 6744, "ts": 6300866062572.851, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866140472.834, "dur": 5374.239, + "args": { + "External id": 88871, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161164103, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161164103, "pid": 0, "tid": 20, "ts": 6300866140472.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866062574.891, "dur": 10.211, + "args": { + "External id": 88871, "cbid": 430, "correlation": 161164103 + } + }, + { + "ph": "s", "id": 161164103, "pid": 5714, "tid": 6744, "ts": 6300866062574.891, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062586.131, "dur": 0.440, + "args": { + "External id": 88871, "cbid": 135, "correlation": 161164105 + } + }, + { + "ph": "f", "id": 161164105, "pid": 5714, "tid": 6744, "ts": 6300866062586.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866062586.702, "dur": 0.509, + "args": { + "External id": 88871, "cbid": 147, "correlation": 161164106 + } + }, + { + "ph": "s", "id": 161164106, "pid": 5714, "tid": 6744, "ts": 6300866062586.702, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062588.891, "dur": 0.871, + "args": { + "External id": 88871, "cbid": 135, "correlation": 161164109 + } + }, + { + "ph": "f", "id": 161164109, "pid": 5714, "tid": 6744, "ts": 6300866062588.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062597.881, "dur": 0.500, + "args": { + "External id": 
88871, "cbid": 135, "correlation": 161164116 + } + }, + { + "ph": "f", "id": 161164116, "pid": 5714, "tid": 6744, "ts": 6300866062597.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866062626.521, "dur": 1.120, + "args": { + "External id": 88873, "cbid": 147, "correlation": 161164121 + } + }, + { + "ph": "s", "id": 161164121, "pid": 5714, "tid": 6744, "ts": 6300866062626.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866062644.901, "dur": 0.930, + "args": { + "External id": 88820, "cbid": 135, "correlation": 161164136 + } + }, + { + "ph": "f", "id": 161164136, "pid": 5714, "tid": 6744, "ts": 6300866062644.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866131146.420, "dur": 2385.373, + "args": { + "External id": 88875, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164161, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164161, "pid": 0, "tid": 7, "ts": 6300866131146.420, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062798.891, "dur": 12.190, + "args": { + "External id": 88875, "cbid": 211, "correlation": 161164161 + } + }, + { + "ph": "s", "id": 161164161, "pid": 5714, "tid": 6744, "ts": 6300866062798.891, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866133532.497, "dur": 549.222, + "args": { + "External id": 88876, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164184, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161164184, "pid": 0, "tid": 7, "ts": 6300866133532.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062863.801, "dur": 6.780, + "args": { + "External id": 88876, "cbid": 307, "correlation": 161164184 + } + }, + { + "ph": "s", "id": 161164184, "pid": 5714, "tid": 6744, "ts": 6300866062863.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866062906.851, "dur": 0.590, + "args": { + "External id": 88877, "cbid": 200, "correlation": 161164207 + } + }, + { + "ph": "f", "id": 161164207, "pid": 5714, "tid": 6744, "ts": 6300866062906.851, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866134190.969, "dur": 35.552, + "args": { + "External id": 88877, "device": 0, "context": 1, "stream": 7, 
"correlation": 161164210, "bytes": 1536, "memory bandwidth (GB/s)": 0.043204320432043204 + } + }, + { + "ph": "f", "id": 161164210, "pid": 0, "tid": 7, "ts": 6300866134190.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866062909.361, "dur": 7.370, + "args": { + "External id": 88877, "cbid": 51, "correlation": 161164210 + } + }, + { + "ph": "s", "id": 161164210, "pid": 5714, "tid": 6744, "ts": 6300866062909.361, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866134299.002, "dur": 760.905, + "args": { + "External id": 88877, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164211, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164211, "pid": 0, "tid": 7, "ts": 6300866134299.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062916.951, "dur": 6.370, + "args": { + "External id": 88877, "cbid": 307, "correlation": 161164211 + } + }, + { + "ph": "s", "id": 161164211, "pid": 5714, "tid": 6744, "ts": 6300866062916.951, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866062952.570, "dur": 0.311, + "args": { + "External id": 88878, "cbid": 200, "correlation": 161164236 + } + }, + { + "ph": "f", "id": 161164236, "pid": 5714, "tid": 6744, "ts": 6300866062952.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866135061.219, "dur": 1.344, + "args": { + "External id": 88878, "device": 0, "context": 1, "stream": 7, "correlation": 161164239, "bytes": 1536, "memory bandwidth (GB/s)": 1.1428571428571428 + } + }, + { + "ph": "f", "id": 161164239, "pid": 0, "tid": 7, "ts": 6300866135061.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866062954.010, "dur": 4.531, + "args": { + "External id": 88878, "cbid": 51, "correlation": 161164239 + } + }, + { + "ph": "s", "id": 161164239, "pid": 5714, "tid": 6744, "ts": 6300866062954.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866135064.547, "dur": 354.212, + "args": { + "External id": 88878, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164240, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164240, "pid": 0, "tid": 7, "ts": 6300866135064.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062958.670, "dur": 5.560, + "args": { + "External id": 88878, "cbid": 307, "correlation": 161164240 + } + }, + { + "ph": "s", "id": 161164240, "pid": 5714, "tid": 6744, "ts": 6300866062958.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866062990.061, "dur": 0.309, + "args": { + "External id": 88879, "cbid": 200, "correlation": 161164265 + } + }, + { + "ph": "f", "id": 161164265, "pid": 5714, "tid": 6744, "ts": 6300866062990.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866135419.399, "dur": 359.684, + "args": { + "External id": 88879, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164268, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164268, "pid": 0, "tid": 7, "ts": 6300866135419.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866062991.661, "dur": 5.860, + "args": { + "External id": 88879, "cbid": 307, "correlation": 161164268 + } + }, + { + "ph": "s", "id": 161164268, "pid": 5714, "tid": 6744, "ts": 6300866062991.661, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063020.530, "dur": 0.260, + "args": { + "External id": 88880, "cbid": 200, "correlation": 161164293 + } + }, + { + "ph": "f", "id": 161164293, "pid": 5714, "tid": 6744, "ts": 6300866063020.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866135780.427, "dur": 1.568, + "args": { + "External id": 88880, "device": 0, "context": 1, "stream": 7, "correlation": 161164296, "bytes": 1536, "memory bandwidth (GB/s)": 0.9795918367346939 + } + }, + { + "ph": "f", "id": 161164296, "pid": 0, "tid": 7, "ts": 6300866135780.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866063021.890, "dur": 4.760, + "args": { + "External id": 88880, "cbid": 51, "correlation": 161164296 + } + }, + { + "ph": "s", "id": 161164296, "pid": 5714, "tid": 6744, "ts": 6300866063021.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866135783.179, "dur": 355.300, + "args": { + "External id": 88880, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164297, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164297, "pid": 0, "tid": 7, "ts": 6300866135783.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063026.801, "dur": 5.380, + "args": { + "External id": 88880, "cbid": 307, "correlation": 161164297 + } + }, + { + "ph": "s", "id": 161164297, "pid": 5714, "tid": 6744, "ts": 6300866063026.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063056.490, "dur": 0.310, + "args": { + "External id": 88881, "cbid": 200, "correlation": 161164322 + } + }, + { + "ph": "f", "id": 161164322, "pid": 5714, "tid": 6744, "ts": 6300866063056.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866136139.183, "dur": 356.804, + "args": { + "External id": 88881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164325, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164325, "pid": 0, "tid": 7, "ts": 6300866136139.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063057.880, "dur": 5.530, + "args": { + "External id": 88881, "cbid": 307, "correlation": 161164325 + } + }, + { + "ph": "s", "id": 161164325, "pid": 5714, "tid": 6744, "ts": 6300866063057.880, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866136496.691, "dur": 92.193, + "args": { + "External id": 88882, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164338, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164338, "pid": 0, "tid": 7, "ts": 6300866136496.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063103.180, "dur": 6.540, + "args": { + "External id": 88882, "cbid": 307, "correlation": 161164338 + } + }, + { + "ph": "s", "id": 161164338, "pid": 5714, "tid": 6744, "ts": 6300866063103.180, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866136589.620, "dur": 3.584, + "args": { + "External id": 88883, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164346, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161164346, "pid": 0, "tid": 7, "ts": 6300866136589.620, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063136.340, "dur": 5.480, + "args": { + "External id": 88883, "cbid": 307, "correlation": 161164346 + } + }, + { + "ph": "s", "id": 161164346, "pid": 5714, "tid": 6744, "ts": 6300866063136.340, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866136593.844, "dur": 114.146, + "args": { + "External id": 88884, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164354, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164354, "pid": 0, "tid": 7, "ts": 6300866136593.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063168.410, "dur": 5.290, + "args": { + "External id": 88884, "cbid": 307, "correlation": 161164354 + } + }, + { + "ph": "s", "id": 161164354, "pid": 5714, "tid": 6744, "ts": 6300866063168.410, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063370.390, "dur": 0.550, + "args": { + "External id": 88903, "cbid": 200, "correlation": 161164400 + } + }, + { + "ph": "f", "id": 161164400, "pid": 5714, "tid": 6744, "ts": 6300866063370.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866136709.238, "dur": 1.408, + "args": { + "External id": 88903, "device": 0, "context": 1, "stream": 7, 
"correlation": 161164403, "bytes": 576, "memory bandwidth (GB/s)": 0.4090909090909091 + } + }, + { + "ph": "f", "id": 161164403, "pid": 0, "tid": 7, "ts": 6300866136709.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866063372.720, "dur": 7.849, + "args": { + "External id": 88903, "cbid": 51, "correlation": 161164403 + } + }, + { + "ph": "s", "id": 161164403, "pid": 5714, "tid": 6744, "ts": 6300866063372.720, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866136712.022, "dur": 142.978, + "args": { + "External id": 88903, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164404, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164404, "pid": 0, "tid": 7, "ts": 6300866136712.022, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063380.809, "dur": 9.191, + "args": { + "External id": 88903, "cbid": 307, "correlation": 161164404 + } + }, + { + "ph": "s", "id": 161164404, "pid": 5714, "tid": 6744, "ts": 6300866063380.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866136855.608, "dur": 141.761, + "args": { + "External id": 88904, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164426, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164426, "pid": 0, "tid": 7, "ts": 6300866136855.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063420.409, "dur": 6.500, + "args": { + "External id": 88904, "cbid": 211, "correlation": 161164426 + } + }, + { + "ph": "s", "id": 161164426, "pid": 5714, "tid": 6744, "ts": 6300866063420.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063505.349, "dur": 0.500, + "args": { + "External id": 88905, "cbid": 200, "correlation": 161164444 + } + }, + { + "ph": "f", "id": 161164444, "pid": 5714, "tid": 6744, "ts": 6300866063505.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063505.979, "dur": 0.260, + "args": { + "External id": 88905, "cbid": 200, "correlation": 161164445 + } + }, + { + "ph": "f", "id": 161164445, "pid": 5714, "tid": 6744, "ts": 6300866063505.979, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063526.799, "dur": 0.260, + "args": { + "External id": 88905, "cbid": 200, "correlation": 161164463 + } + }, + { + "ph": "f", "id": 161164463, "pid": 5714, "tid": 6744, "ts": 6300866063526.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866136998.041, "dur": 92.641, + "args": { + "External id": 88905, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164464, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164464, "pid": 0, "tid": 7, "ts": 6300866136998.041, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063528.419, "dur": 9.800, + "args": { + "External id": 88905, "cbid": 211, "correlation": 161164464 + } + }, + { + "ph": "s", "id": 161164464, "pid": 5714, "tid": 6744, "ts": 6300866063528.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866063539.079, "dur": 1.020, + "args": { + "External id": 88905, "cbid": 273, "correlation": 161164466 + } + }, + { + "ph": "f", "id": 161164466, "pid": 5714, "tid": 6744, "ts": 6300866063539.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866137091.290, "dur": 1130.254, + "args": { + "External id": 88905, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164467, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161164467, "pid": 0, "tid": 7, "ts": 6300866137091.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063540.439, "dur": 4.650, + "args": { + "External id": 88905, "cbid": 211, "correlation": 161164467 + } + }, + { + "ph": "s", "id": 161164467, "pid": 5714, "tid": 6744, "ts": 6300866063540.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866138228.232, "dur": 214.498, + "args": { + "External id": 88905, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164469, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161164469, "pid": 0, "tid": 7, "ts": 6300866138228.232, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063545.669, "dur": 3.930, + "args": { + "External id": 88905, "cbid": 211, "correlation": 161164469 + } + }, + { + "ph": "s", "id": 161164469, "pid": 5714, "tid": 6744, "ts": 6300866063545.669, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866138443.338, "dur": 214.627, + "args": { + "External id": 88916, "device": 0, "context": 1, "stream": 7, "correlation": 161164491, "bytes": 25165824, "memory bandwidth (GB/s)": 117.25376583561248 + } + }, + { + "ph": "f", "id": 161164491, "pid": 0, "tid": 7, "ts": 6300866138443.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866063691.809, "dur": 18.280, + "args": { + "External id": 88916, "cbid": 41, "correlation": 161164491 + } + }, + { + "ph": "s", "id": 161164491, "pid": 5714, "tid": 6744, "ts": 6300866063691.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866138658.637, "dur": 137.922, + "args": { + "External id": 88913, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164509, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164509, "pid": 0, "tid": 7, "ts": 6300866138658.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866063816.499, "dur": 8.660, + "args": { + "External id": 88913, "cbid": 307, "correlation": 161164509 + } + }, + { + "ph": "s", "id": 161164509, "pid": 5714, "tid": 6744, "ts": 6300866063816.499, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866138797.263, "dur": 49.632, + "args": { + "External id": 88923, "device": 0, "context": 1, "stream": 7, "correlation": 161164524, "bytes": 25165824, "memory bandwidth (GB/s)": 507.04835589941973 + } + }, + { + "ph": "f", "id": 161164524, "pid": 0, "tid": 7, "ts": 6300866138797.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866063889.928, "dur": 14.620, + "args": { + "External id": 88923, "cbid": 41, "correlation": 161164524 + } + }, + { + "ph": "s", "id": 161164524, "pid": 5714, "tid": 6744, "ts": 6300866063889.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866138847.503, "dur": 28.608, + "args": { + "External id": 88920, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164542, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164542, "pid": 0, "tid": 7, "ts": 6300866138847.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064001.148, "dur": 8.060, + "args": { + "External id": 88920, "cbid": 307, "correlation": 161164542 + } + }, + { + "ph": "s", "id": 161164542, "pid": 5714, "tid": 6744, "ts": 6300866064001.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866064133.948, "dur": 0.580, + "args": { + "External id": 88928, "cbid": 200, "correlation": 161164572 + } + }, + { + "ph": "f", "id": 161164572, "pid": 5714, "tid": 6744, "ts": 6300866064133.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866138877.423, "dur": 1.472, + "args": { + "External id": 88928, "device": 0, "context": 1, "stream": 7, "correlation": 161164575, "bytes": 576, "memory bandwidth (GB/s)": 0.391304347826087 + } + }, + { + "ph": "f", "id": 161164575, "pid": 0, "tid": 7, "ts": 6300866138877.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866064136.468, "dur": 7.830, + "args": { + "External id": 88928, "cbid": 51, "correlation": 161164575 + } + }, + { + "ph": "s", "id": 161164575, "pid": 5714, "tid": 6744, "ts": 6300866064136.468, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866138880.175, "dur": 145.570, + "args": { + "External id": 88928, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164576, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164576, "pid": 0, "tid": 7, "ts": 6300866138880.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064144.548, "dur": 8.340, + "args": { + "External id": 88928, "cbid": 307, "correlation": 161164576 + } + }, + { + "ph": "s", "id": 161164576, "pid": 5714, "tid": 6744, "ts": 6300866064144.548, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866064181.308, "dur": 0.320, + "args": { + "External id": 88929, "cbid": 200, "correlation": 161164601 + } + }, + { + "ph": "f", "id": 161164601, "pid": 5714, "tid": 6744, "ts": 6300866064181.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866139027.153, "dur": 0.832, + "args": { + "External id": 88929, "device": 0, "context": 1, "stream": 7, "correlation": 161164604, "bytes": 576, "memory bandwidth (GB/s)": 0.6923076923076923 + } + }, + { + "ph": "f", "id": 161164604, "pid": 0, "tid": 7, "ts": 6300866139027.153, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866064182.818, "dur": 4.730, + "args": { + "External id": 88929, "cbid": 51, "correlation": 161164604 + } + }, + { + "ph": "s", "id": 161164604, "pid": 5714, "tid": 6744, "ts": 
6300866064182.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866139029.649, "dur": 140.898, + "args": { + "External id": 88929, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164605, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164605, "pid": 0, "tid": 7, "ts": 6300866139029.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064187.708, "dur": 5.760, + "args": { + "External id": 88929, "cbid": 307, "correlation": 161164605 + } + }, + { + "ph": "s", "id": 161164605, "pid": 5714, "tid": 6744, "ts": 6300866064187.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866064218.038, "dur": 0.320, + "args": { + "External id": 88930, "cbid": 200, "correlation": 161164630 + } + }, + { + "ph": "f", "id": 161164630, "pid": 5714, "tid": 6744, "ts": 6300866064218.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866139171.827, "dur": 0.800, + "args": { + "External id": 88930, "device": 0, "context": 1, "stream": 7, "correlation": 161164633, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 161164633, "pid": 0, "tid": 7, "ts": 6300866139171.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866064219.408, "dur": 4.170, + "args": { + "External id": 88930, "cbid": 51, "correlation": 
161164633 + } + }, + { + "ph": "s", "id": 161164633, "pid": 5714, "tid": 6744, "ts": 6300866064219.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866139173.811, "dur": 237.699, + "args": { + "External id": 88930, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164634, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164634, "pid": 0, "tid": 7, "ts": 6300866139173.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064223.748, "dur": 5.280, + "args": { + "External id": 88930, "cbid": 307, "correlation": 161164634 + } + }, + { + "ph": "s", "id": 161164634, "pid": 5714, "tid": 6744, "ts": 6300866064223.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866139417.846, "dur": 460.421, + "args": { + "External id": 88931, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164656, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164656, "pid": 0, "tid": 7, "ts": 6300866139417.846, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064254.008, "dur": 5.979, + "args": { + "External id": 88931, "cbid": 211, "correlation": 161164656 + } + }, + { + "ph": "s", "id": 161164656, "pid": 5714, "tid": 6744, "ts": 6300866064254.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866139878.939, "dur": 302.532, + "args": { + "External id": 88932, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164679, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164679, "pid": 0, "tid": 7, "ts": 6300866139878.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064279.287, "dur": 5.020, + "args": { + "External id": 88932, "cbid": 211, "correlation": 161164679 + } + }, + { + "ph": "s", "id": 161164679, "pid": 5714, "tid": 6744, "ts": 6300866064279.287, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866140182.079, "dur": 157.249, + "args": { + "External id": 88933, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164702, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161164702, "pid": 0, "tid": 7, "ts": 6300866140182.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064312.558, "dur": 5.669, + "args": { + "External id": 88933, "cbid": 211, "correlation": 161164702 + } + }, + { + "ph": "s", "id": 161164702, "pid": 5714, "tid": 6744, "ts": 6300866064312.558, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866140340.032, "dur": 94.786, + "args": { + "External id": 88934, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164710, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164710, "pid": 0, "tid": 7, "ts": 6300866140340.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064356.507, "dur": 5.790, + "args": { + "External id": 88934, "cbid": 307, "correlation": 161164710 + } + }, + { + "ph": "s", "id": 161164710, "pid": 5714, "tid": 6744, "ts": 6300866064356.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866140435.489, "dur": 52.641, + "args": { + "External id": 88949, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164739, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164739, "pid": 0, "tid": 7, "ts": 6300866140435.489, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064530.547, "dur": 9.570, + "args": { + "External id": 88949, "cbid": 307, "correlation": 161164739 + } + }, + { + "ph": "s", "id": 161164739, "pid": 5714, "tid": 6744, "ts": 6300866064530.547, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866140488.770, "dur": 4.000, + "args": { + "External id": 88950, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164747, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161164747, "pid": 0, "tid": 7, "ts": 6300866140488.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064567.727, "dur": 5.830, + "args": { + "External id": 88950, "cbid": 307, "correlation": 161164747 + } + }, + { + "ph": "s", "id": 161164747, "pid": 5714, "tid": 6744, "ts": 6300866064567.727, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866140493.410, "dur": 51.777, + "args": { + "External id": 88951, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164758, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164758, "pid": 0, "tid": 7, "ts": 6300866140493.410, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064602.537, "dur": 5.570, + "args": { + "External id": 88951, "cbid": 307, "correlation": 161164758 + } + }, + { + "ph": "s", "id": 161164758, "pid": 5714, "tid": 6744, "ts": 6300866064602.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866140545.891, "dur": 47.328, + "args": { + "External id": 88952, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164763, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164763, "pid": 0, "tid": 7, "ts": 6300866140545.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866064645.977, "dur": 9.820, + "args": { + "External id": 88952, "cbid": 211, "correlation": 161164763 + } + }, + { + "ph": "s", "id": 161164763, "pid": 5714, "tid": 6744, "ts": 6300866064645.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866064833.866, "dur": 3.040, + "args": { + "External id": 88958, "cbid": 147, "correlation": 161164780 + } + }, + { + "ph": "s", "id": 161164780, "pid": 5714, "tid": 6744, "ts": 6300866064833.866, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866064953.836, "dur": 2.980, + "args": { + "External id": 88966, "cbid": 138, "correlation": 161164795 + } + }, + { + "ph": "f", "id": 161164795, "pid": 5714, "tid": 6744, "ts": 6300866064953.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866064957.476, "dur": 0.690, + "args": { + "External id": 88966, "cbid": 138, "correlation": 161164796 + } + }, + { + "ph": "f", "id": 161164796, "pid": 5714, "tid": 6744, "ts": 6300866064957.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866064958.386, "dur": 0.640, + "args": { + "External id": 88966, "cbid": 138, "correlation": 161164797 + } + }, + { + "ph": "f", "id": 161164797, "pid": 5714, "tid": 6744, "ts": 6300866064958.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + 
"ts": 6300866140600.356, "dur": 6.048, + "args": { + "External id": 88970, "device": 0, "context": 1, "stream": 7, "correlation": 161164808, "bytes": 28112, "memory bandwidth (GB/s)": 4.648148148148148 + } + }, + { + "ph": "f", "id": 161164808, "pid": 0, "tid": 7, "ts": 6300866140600.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866064981.716, "dur": 12.660, + "args": { + "External id": 88970, "cbid": 41, "correlation": 161164808 + } + }, + { + "ph": "s", "id": 161164808, "pid": 5714, "tid": 6744, "ts": 6300866064981.716, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866064998.756, "dur": 1.890, + "args": { + "External id": 88965, "cbid": 135, "correlation": 161164812 + } + }, + { + "ph": "f", "id": 161164812, "pid": 5714, "tid": 6744, "ts": 6300866064998.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866140608.516, "dur": 181.698, + "args": { + "External id": 88965, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164816, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164816, "pid": 0, "tid": 7, "ts": 6300866140608.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866065003.716, "dur": 10.960, + "args": { + "External id": 88965, "cbid": 211, "correlation": 161164816 + } + }, + { + "ph": "s", "id": 161164816, "pid": 5714, "tid": 6744, "ts": 6300866065003.716, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065057.306, "dur": 1.060, + "args": { + "External id": 88958, "cbid": 135, "correlation": 161164827 + } + }, + { + "ph": "f", "id": 161164827, "pid": 5714, "tid": 6744, "ts": 6300866065057.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065060.526, "dur": 1.490, + "args": { + "External id": 88958, "cbid": 147, "correlation": 161164831 + } + }, + { + "ph": "s", "id": 161164831, "pid": 5714, "tid": 6744, "ts": 6300866065060.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866065133.746, "dur": 1.080, + "args": { + "External id": 88974, "cbid": 317, "correlation": 161164851 + } + }, + { + "ph": "f", "id": 161164851, "pid": 5714, "tid": 6744, "ts": 6300866065133.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065136.836, "dur": 1.470, + "args": { + "External id": 88974, "cbid": 135, "correlation": 161164853 + } + }, + { + "ph": "f", "id": 161164853, "pid": 5714, "tid": 6744, "ts": 6300866065136.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866065139.765, "dur": 1.051, + "args": { + "External id": 88974, "cbid": 147, "correlation": 161164857 + } + }, + { + "ph": "s", "id": 161164857, "pid": 5714, "tid": 6744, "ts": 6300866065139.765, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866065155.776, "dur": 0.929, + "args": { + "External id": 88974, "cbid": 409, "correlation": 161164860 + } + }, + { + "ph": "f", "id": 161164860, "pid": 5714, "tid": 6744, "ts": 6300866065155.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065161.096, "dur": 0.929, + "args": { + "External id": 88974, "cbid": 135, "correlation": 161164863 + } + }, + { + "ph": "f", "id": 161164863, "pid": 5714, "tid": 6744, "ts": 6300866065161.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065162.205, "dur": 0.931, + "args": { + "External id": 88974, "cbid": 147, "correlation": 161164864 + } + }, + { + "ph": "s", "id": 161164864, "pid": 5714, "tid": 6744, "ts": 6300866065162.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866145848.769, "dur": 8407.299, + "args": { + "External id": 88974, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161164866, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161164866, "pid": 0, "tid": 20, "ts": 6300866145848.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866065164.356, "dur": 10.649, + "args": { + "External id": 88974, "cbid": 430, "correlation": 161164866 + } + }, + { + "ph": "s", "id": 161164866, "pid": 5714, "tid": 6744, "ts": 6300866065164.356, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065176.085, "dur": 0.440, + "args": { + "External id": 88974, "cbid": 135, "correlation": 161164868 + } + }, + { + "ph": "f", "id": 161164868, "pid": 5714, "tid": 6744, "ts": 6300866065176.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065176.696, "dur": 0.560, + "args": { + "External id": 88974, "cbid": 147, "correlation": 161164869 + } + }, + { + "ph": "s", "id": 161164869, "pid": 5714, "tid": 6744, "ts": 6300866065176.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065178.796, "dur": 0.860, + "args": { + "External id": 88974, "cbid": 135, "correlation": 161164872 + } + }, + { + "ph": "f", "id": 161164872, "pid": 5714, "tid": 6744, "ts": 6300866065178.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065189.056, "dur": 0.489, + "args": { + "External id": 
88974, "cbid": 135, "correlation": 161164879 + } + }, + { + "ph": "f", "id": 161164879, "pid": 5714, "tid": 6744, "ts": 6300866065189.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065217.756, "dur": 1.069, + "args": { + "External id": 88976, "cbid": 147, "correlation": 161164884 + } + }, + { + "ph": "s", "id": 161164884, "pid": 5714, "tid": 6744, "ts": 6300866065217.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065236.205, "dur": 0.870, + "args": { + "External id": 88958, "cbid": 135, "correlation": 161164899 + } + }, + { + "ph": "f", "id": 161164899, "pid": 5714, "tid": 6744, "ts": 6300866065236.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065449.365, "dur": 2.050, + "args": { + "External id": 88958, "cbid": 135, "correlation": 161164912 + } + }, + { + "ph": "f", "id": 161164912, "pid": 5714, "tid": 6744, "ts": 6300866065449.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065564.275, "dur": 3.250, + "args": { + "External id": 88986, "cbid": 147, "correlation": 161164923 + } + }, + { + "ph": "s", "id": 161164923, "pid": 5714, "tid": 6744, "ts": 6300866065564.275, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866065683.484, "dur": 1.300, + "args": { + "External id": 89000, "cbid": 317, "correlation": 161164964 + } + }, + { + "ph": "f", "id": 161164964, "pid": 5714, "tid": 6744, "ts": 6300866065683.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", 
"pid": 5714, "tid": 6744, + "ts": 6300866065692.534, "dur": 2.490, + "args": { + "External id": 89001, "cbid": 138, "correlation": 161164967 + } + }, + { + "ph": "f", "id": 161164967, "pid": 5714, "tid": 6744, "ts": 6300866065692.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866145852.737, "dur": 2.688, + "args": { + "External id": 89005, "device": 0, "context": 1, "stream": 7, "correlation": 161164978, "bytes": 7224, "memory bandwidth (GB/s)": 2.6875 + } + }, + { + "ph": "f", "id": 161164978, "pid": 0, "tid": 7, "ts": 6300866145852.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866065717.034, "dur": 12.870, + "args": { + "External id": 89005, "cbid": 41, "correlation": 161164978 + } + }, + { + "ph": "s", "id": 161164978, "pid": 5714, "tid": 6744, "ts": 6300866065717.034, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065734.514, "dur": 1.700, + "args": { + "External id": 89000, "cbid": 135, "correlation": 161164982 + } + }, + { + "ph": "f", "id": 161164982, "pid": 5714, "tid": 6744, "ts": 6300866065734.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866145857.825, "dur": 471.302, + "args": { + "External id": 89000, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161164986, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161164986, "pid": 0, "tid": 7, "ts": 6300866145857.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866065738.854, "dur": 11.140, + "args": { + "External id": 89000, "cbid": 211, "correlation": 161164986 + } + }, + { + "ph": "s", "id": 161164986, "pid": 5714, "tid": 6744, "ts": 6300866065738.854, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866065844.984, "dur": 1.420, + "args": { + "External id": 88986, "cbid": 135, "correlation": 161164997 + } + }, + { + "ph": "f", "id": 161164997, "pid": 5714, "tid": 6744, "ts": 6300866065844.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065849.754, "dur": 1.220, + "args": { + "External id": 88986, "cbid": 147, "correlation": 161165001 + } + }, + { + "ph": "s", "id": 161165001, "pid": 5714, "tid": 6744, "ts": 6300866065849.754, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866065853.374, "dur": 0.760, + "args": { + "External id": 88986, "cbid": 147, "correlation": 161165005 + } + }, + { + "ph": "s", "id": 161165005, "pid": 5714, "tid": 6744, "ts": 6300866065853.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866147190.993, "dur": 632.327, + "args": { + "External id": 89019, "queued": 0, "device": 0, "context": 1, "stream": 17, 
"correlation": 161165029, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161165029, "pid": 0, "tid": 17, "ts": 6300866147190.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066007.864, "dur": 12.350, + "args": { + "External id": 89019, "cbid": 211, "correlation": 161165029 + } + }, + { + "ph": "s", "id": 161165029, "pid": 5714, "tid": 6744, "ts": 6300866066007.864, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866147851.449, "dur": 161.250, + "args": { + "External id": 89035, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161165042, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161165042, "pid": 0, "tid": 17, "ts": 6300866147851.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066126.123, "dur": 10.270, + "args": { + "External id": 89035, "cbid": 211, "correlation": 161165042 + } + }, + { + "ph": "s", "id": 161165042, "pid": 5714, "tid": 6744, "ts": 6300866066126.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066161.873, "dur": 1.340, + "args": { + "External id": 88986, "cbid": 135, "correlation": 161165052 + } + }, + { + "ph": "f", "id": 161165052, "pid": 5714, "tid": 6744, "ts": 6300866066161.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866066165.123, "dur": 1.310, + "args": { + "External id": 88986, "cbid": 147, "correlation": 161165056 + } + }, + { + "ph": "s", "id": 161165056, "pid": 5714, "tid": 6744, "ts": 6300866066165.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866066221.433, "dur": 0.960, + "args": { + "External id": 89037, "cbid": 317, "correlation": 161165069 + } + }, + { + "ph": "f", "id": 161165069, "pid": 5714, "tid": 6744, "ts": 6300866066221.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066224.433, "dur": 1.130, + "args": { + "External id": 89037, "cbid": 135, "correlation": 161165071 + } + }, + { + "ph": "f", "id": 161165071, "pid": 5714, "tid": 6744, "ts": 6300866066224.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866066227.083, "dur": 1.250, + "args": { + "External id": 89037, "cbid": 147, "correlation": 161165075 + } + }, + { + "ph": "s", "id": 161165075, "pid": 5714, "tid": 6744, "ts": 6300866066227.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866066243.263, "dur": 0.760, + "args": { + "External id": 89037, "cbid": 409, "correlation": 161165078 + } + }, + { + "ph": "f", "id": 161165078, "pid": 5714, "tid": 6744, "ts": 6300866066243.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066248.303, "dur": 0.760, + "args": { + "External id": 89037, "cbid": 135, "correlation": 161165081 + } + }, + { + "ph": "f", "id": 161165081, "pid": 5714, "tid": 6744, "ts": 6300866066248.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866066249.253, "dur": 0.950, + "args": { + "External id": 89037, "cbid": 147, "correlation": 161165082 + } + }, + { + "ph": "s", "id": 161165082, "pid": 5714, "tid": 6744, "ts": 6300866066249.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866154332.869, "dur": 4933.658, + "args": { + "External id": 89037, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161165084, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161165084, "pid": 0, "tid": 20, "ts": 6300866154332.869, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866066251.403, "dur": 10.400, + "args": { + "External id": 89037, "cbid": 430, "correlation": 161165084 + } + }, + { + "ph": "s", "id": 161165084, "pid": 5714, "tid": 6744, "ts": 6300866066251.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066262.953, "dur": 0.430, + "args": { + "External id": 89037, "cbid": 135, "correlation": 161165086 + } + }, + { + "ph": "f", "id": 161165086, "pid": 5714, "tid": 6744, "ts": 6300866066262.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866066263.513, "dur": 0.550, + "args": { + "External id": 89037, "cbid": 147, "correlation": 161165087 + } + }, + { + "ph": "s", "id": 161165087, "pid": 5714, "tid": 6744, "ts": 6300866066263.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066265.813, "dur": 0.820, + "args": { + "External id": 89037, "cbid": 135, "correlation": 161165090 + } + }, + { + "ph": "f", "id": 161165090, "pid": 5714, "tid": 6744, "ts": 6300866066265.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066274.513, "dur": 0.520, + "args": { + "External id": 
89037, "cbid": 135, "correlation": 161165097 + } + }, + { + "ph": "f", "id": 161165097, "pid": 5714, "tid": 6744, "ts": 6300866066274.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866066312.883, "dur": 1.190, + "args": { + "External id": 89039, "cbid": 147, "correlation": 161165102 + } + }, + { + "ph": "s", "id": 161165102, "pid": 5714, "tid": 6744, "ts": 6300866066312.883, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866066332.373, "dur": 1.080, + "args": { + "External id": 88986, "cbid": 135, "correlation": 161165117 + } + }, + { + "ph": "f", "id": 161165117, "pid": 5714, "tid": 6744, "ts": 6300866066332.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866146454.792, "dur": 2192.218, + "args": { + "External id": 89041, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165142, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165142, "pid": 0, "tid": 7, "ts": 6300866146454.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066486.073, "dur": 12.320, + "args": { + "External id": 89041, "cbid": 211, "correlation": 161165142 + } + }, + { + "ph": "s", "id": 161165142, "pid": 5714, "tid": 6744, "ts": 6300866066486.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866148647.650, "dur": 590.023, + "args": { + "External id": 89042, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165165, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161165165, "pid": 0, "tid": 7, "ts": 6300866148647.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066550.773, "dur": 6.689, + "args": { + "External id": 89042, "cbid": 307, "correlation": 161165165 + } + }, + { + "ph": "s", "id": 161165165, "pid": 5714, "tid": 6744, "ts": 6300866066550.773, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866066595.062, "dur": 0.550, + "args": { + "External id": 89043, "cbid": 200, "correlation": 161165188 + } + }, + { + "ph": "f", "id": 161165188, "pid": 5714, "tid": 6744, "ts": 6300866066595.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866149302.890, "dur": 71.329, + "args": { + "External id": 89043, "device": 0, "context": 1, "stream": 7, 
"correlation": 161165191, "bytes": 1536, "memory bandwidth (GB/s)": 0.02153401842167982 + } + }, + { + "ph": "f", "id": 161165191, "pid": 0, "tid": 7, "ts": 6300866149302.890, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866066597.512, "dur": 7.630, + "args": { + "External id": 89043, "cbid": 51, "correlation": 161165191 + } + }, + { + "ph": "s", "id": 161165191, "pid": 5714, "tid": 6744, "ts": 6300866066597.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866149425.739, "dur": 406.309, + "args": { + "External id": 89043, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165192, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165192, "pid": 0, "tid": 7, "ts": 6300866149425.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066605.372, "dur": 6.780, + "args": { + "External id": 89043, "cbid": 307, "correlation": 161165192 + } + }, + { + "ph": "s", "id": 161165192, "pid": 5714, "tid": 6744, "ts": 6300866066605.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866066641.742, "dur": 0.340, + "args": { + "External id": 89044, "cbid": 200, "correlation": 161165217 + } + }, + { + "ph": "f", "id": 161165217, "pid": 5714, "tid": 6744, "ts": 6300866066641.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866149833.040, "dur": 1.312, + "args": { + "External id": 89044, "device": 0, "context": 1, "stream": 7, "correlation": 161165220, "bytes": 1536, "memory bandwidth (GB/s)": 1.170731707317073 + } + }, + { + "ph": "f", "id": 161165220, "pid": 0, "tid": 7, "ts": 6300866149833.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866066643.262, "dur": 5.300, + "args": { + "External id": 89044, "cbid": 51, "correlation": 161165220 + } + }, + { + "ph": "s", "id": 161165220, "pid": 5714, "tid": 6744, "ts": 6300866066643.262, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866149836.016, "dur": 352.516, + "args": { + "External id": 89044, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165221, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165221, "pid": 0, "tid": 7, "ts": 6300866149836.016, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066648.712, "dur": 5.340, + "args": { + "External id": 89044, "cbid": 307, "correlation": 161165221 + } + }, + { + "ph": "s", "id": 161165221, "pid": 5714, "tid": 6744, "ts": 6300866066648.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866066679.052, "dur": 0.320, + "args": { + "External id": 89045, "cbid": 200, "correlation": 161165246 + } + }, + { + "ph": "f", "id": 161165246, "pid": 5714, "tid": 6744, "ts": 6300866066679.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866150189.172, "dur": 359.812, + "args": { + "External id": 89045, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165249, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165249, "pid": 0, "tid": 7, "ts": 6300866150189.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066680.652, "dur": 5.680, + "args": { + "External id": 89045, "cbid": 307, "correlation": 161165249 + } + }, + { + "ph": "s", "id": 161165249, "pid": 5714, "tid": 6744, "ts": 6300866066680.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866066709.192, "dur": 0.270, + "args": { + "External id": 89046, "cbid": 200, "correlation": 161165274 + } + }, + { + "ph": "f", "id": 161165274, "pid": 5714, "tid": 6744, "ts": 6300866066709.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866150550.425, "dur": 1.216, + "args": { + "External id": 89046, "device": 0, "context": 1, "stream": 7, "correlation": 161165277, "bytes": 1536, "memory bandwidth (GB/s)": 1.263157894736842 + } + }, + { + "ph": "f", "id": 161165277, "pid": 0, "tid": 7, "ts": 6300866150550.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866066710.502, "dur": 4.760, + "args": { + "External id": 89046, "cbid": 51, "correlation": 161165277 + } + }, + { + "ph": "s", "id": 161165277, "pid": 5714, "tid": 6744, "ts": 6300866066710.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866150553.017, "dur": 353.188, + "args": { + "External id": 89046, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165278, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165278, "pid": 0, "tid": 7, "ts": 6300866150553.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066715.442, "dur": 5.410, + "args": { + "External id": 89046, "cbid": 307, "correlation": 161165278 + } + }, + { + "ph": "s", "id": 161165278, "pid": 5714, "tid": 6744, "ts": 6300866066715.442, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866066745.372, "dur": 0.320, + "args": { + "External id": 89047, "cbid": 200, "correlation": 161165303 + } + }, + { + "ph": "f", "id": 161165303, "pid": 5714, "tid": 6744, "ts": 6300866066745.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866150906.877, "dur": 359.588, + "args": { + "External id": 89047, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165306, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165306, "pid": 0, "tid": 7, "ts": 6300866150906.877, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066746.802, "dur": 5.540, + "args": { + "External id": 89047, "cbid": 307, "correlation": 161165306 + } + }, + { + "ph": "s", "id": 161165306, "pid": 5714, "tid": 6744, "ts": 6300866066746.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866151267.105, "dur": 89.505, + "args": { + "External id": 89048, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165319, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165319, "pid": 0, "tid": 7, "ts": 6300866151267.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066792.252, "dur": 6.290, + "args": { + "External id": 89048, "cbid": 307, "correlation": 161165319 + } + }, + { + "ph": "s", "id": 161165319, "pid": 5714, "tid": 6744, "ts": 6300866066792.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866151357.250, "dur": 3.840, + "args": { + "External id": 89049, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165327, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161165327, "pid": 0, "tid": 7, "ts": 6300866151357.250, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066824.742, "dur": 5.330, + "args": { + "External id": 89049, "cbid": 307, "correlation": 161165327 + } + }, + { + "ph": "s", "id": 161165327, "pid": 5714, "tid": 6744, "ts": 6300866066824.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866151361.794, "dur": 114.434, + "args": { + "External id": 89050, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165335, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165335, "pid": 0, "tid": 7, "ts": 6300866151361.794, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866066857.662, "dur": 5.490, + "args": { + "External id": 89050, "cbid": 307, "correlation": 161165335 + } + }, + { + "ph": "s", "id": 161165335, "pid": 5714, "tid": 6744, "ts": 6300866066857.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067049.241, "dur": 0.550, + "args": { + "External id": 89069, "cbid": 200, "correlation": 161165381 + } + }, + { + "ph": "f", "id": 161165381, "pid": 5714, "tid": 6744, "ts": 6300866067049.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866151477.700, "dur": 1.024, + "args": { + "External id": 89069, "device": 0, "context": 1, "stream": 7, 
"correlation": 161165384, "bytes": 576, "memory bandwidth (GB/s)": 0.5625 + } + }, + { + "ph": "f", "id": 161165384, "pid": 0, "tid": 7, "ts": 6300866151477.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866067051.561, "dur": 7.680, + "args": { + "External id": 89069, "cbid": 51, "correlation": 161165384 + } + }, + { + "ph": "s", "id": 161165384, "pid": 5714, "tid": 6744, "ts": 6300866067051.561, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866151480.388, "dur": 170.689, + "args": { + "External id": 89069, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165385, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165385, "pid": 0, "tid": 7, "ts": 6300866151480.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067059.501, "dur": 8.760, + "args": { + "External id": 89069, "cbid": 307, "correlation": 161165385 + } + }, + { + "ph": "s", "id": 161165385, "pid": 5714, "tid": 6744, "ts": 6300866067059.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866151651.653, "dur": 339.492, + "args": { + "External id": 89070, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165407, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165407, "pid": 0, "tid": 7, "ts": 6300866151651.653, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067097.111, "dur": 6.380, + "args": { + "External id": 89070, "cbid": 211, "correlation": 161165407 + } + }, + { + "ph": "s", "id": 161165407, "pid": 5714, "tid": 6744, "ts": 6300866067097.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067180.231, "dur": 0.510, + "args": { + "External id": 89071, "cbid": 200, "correlation": 161165425 + } + }, + { + "ph": "f", "id": 161165425, "pid": 5714, "tid": 6744, "ts": 6300866067180.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067180.871, "dur": 0.230, + "args": { + "External id": 89071, "cbid": 200, "correlation": 161165426 + } + }, + { + "ph": "f", "id": 161165426, "pid": 5714, "tid": 6744, "ts": 6300866067180.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067202.061, "dur": 0.240, + "args": { + "External id": 89071, "cbid": 200, "correlation": 161165444 + } + }, + { + "ph": "f", "id": 161165444, "pid": 5714, "tid": 6744, "ts": 6300866067202.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866151991.849, "dur": 256.227, + "args": { + "External id": 89071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165445, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165445, "pid": 0, "tid": 7, "ts": 6300866151991.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067203.681, "dur": 9.850, + "args": { + "External id": 89071, "cbid": 211, "correlation": 161165445 + } + }, + { + "ph": "s", "id": 161165445, "pid": 5714, "tid": 6744, "ts": 6300866067203.681, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067214.381, "dur": 1.030, + "args": { + "External id": 89071, "cbid": 273, "correlation": 161165447 + } + }, + { + "ph": "f", "id": 161165447, "pid": 5714, "tid": 6744, "ts": 6300866067214.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866152249.868, "dur": 1592.883, + "args": { + "External id": 89071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165448, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161165448, "pid": 0, "tid": 7, "ts": 6300866152249.868, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067215.811, "dur": 4.850, + "args": { + "External id": 89071, "cbid": 211, "correlation": 161165448 + } + }, + { + "ph": "s", "id": 161165448, "pid": 5714, "tid": 6744, "ts": 6300866067215.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866153843.391, "dur": 115.458, + "args": { + "External id": 89071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165450, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161165450, "pid": 0, "tid": 7, "ts": 6300866153843.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067221.241, "dur": 4.060, + "args": { + "External id": 89071, "cbid": 211, "correlation": 161165450 + } + }, + { + "ph": "s", "id": 161165450, "pid": 5714, "tid": 6744, "ts": 6300866067221.241, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866153959.553, "dur": 64.832, + "args": { + "External id": 89082, "device": 0, "context": 1, "stream": 7, "correlation": 161165472, "bytes": 25165824, "memory bandwidth (GB/s)": 388.16979269496545 + } + }, + { + "ph": "f", "id": 161165472, "pid": 0, "tid": 7, "ts": 6300866153959.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866067382.280, "dur": 19.440, + "args": { + "External id": 89082, "cbid": 41, "correlation": 161165472 + } + }, + { + "ph": "s", "id": 161165472, "pid": 5714, "tid": 6744, "ts": 6300866067382.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866154024.993, "dur": 46.561, + "args": { + "External id": 89079, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165490, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165490, "pid": 0, "tid": 7, "ts": 6300866154024.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067506.830, "dur": 9.380, + "args": { + "External id": 89079, "cbid": 307, "correlation": 161165490 + } + }, + { + "ph": "s", "id": 161165490, "pid": 5714, "tid": 6744, "ts": 6300866067506.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866154072.226, "dur": 59.744, + "args": { + "External id": 89089, "device": 0, "context": 1, "stream": 7, "correlation": 161165505, "bytes": 25165824, "memory bandwidth (GB/s)": 421.22763792179967 + } + }, + { + "ph": "f", "id": 161165505, "pid": 0, "tid": 7, "ts": 6300866154072.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866067620.510, "dur": 15.750, + "args": { + "External id": 89089, "cbid": 41, "correlation": 161165505 + } + }, + { + "ph": "s", "id": 161165505, "pid": 5714, "tid": 6744, "ts": 6300866067620.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", 
"pid": 0, "tid": 7, + "ts": 6300866154132.642, "dur": 45.281, + "args": { + "External id": 89086, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165523, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165523, "pid": 0, "tid": 7, "ts": 6300866154132.642, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067734.800, "dur": 8.510, + "args": { + "External id": 89086, "cbid": 307, "correlation": 161165523 + } + }, + { + "ph": "s", "id": 161165523, "pid": 5714, "tid": 6744, "ts": 6300866067734.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067869.559, "dur": 0.560, + "args": { + "External id": 89094, "cbid": 200, "correlation": 161165553 + } + }, + { + "ph": "f", "id": 161165553, "pid": 5714, "tid": 6744, "ts": 6300866067869.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866154188.099, "dur": 7.008, + "args": { + "External id": 89094, "device": 0, "context": 1, "stream": 7, "correlation": 161165556, "bytes": 576, "memory bandwidth (GB/s)": 0.0821917808219178 + } + }, + { + "ph": "f", "id": 161165556, "pid": 0, "tid": 7, "ts": 6300866154188.099, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866067872.010, "dur": 8.169, + "args": { + "External id": 89094, "cbid": 51, "correlation": 161165556 + } + }, + { + "ph": "s", "id": 161165556, "pid": 5714, "tid": 6744, "ts": 6300866067872.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866154202.916, "dur": 144.577, + "args": { + "External id": 89094, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165557, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165557, "pid": 0, "tid": 7, "ts": 6300866154202.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067880.459, "dur": 8.700, + "args": { + "External id": 89094, "cbid": 307, "correlation": 161165557 + } + }, + { + "ph": "s", "id": 161165557, "pid": 5714, "tid": 6744, "ts": 6300866067880.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067917.439, "dur": 0.320, + "args": { + "External id": 89095, "cbid": 200, "correlation": 161165582 + } + }, + { + "ph": "f", "id": 161165582, "pid": 5714, "tid": 6744, "ts": 6300866067917.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866154349.157, "dur": 1.120, + "args": { + "External id": 89095, "device": 0, "context": 1, "stream": 7, "correlation": 161165585, "bytes": 576, "memory bandwidth (GB/s)": 0.5142857142857142 + } + }, + { + "ph": "f", "id": 161165585, "pid": 0, "tid": 7, "ts": 6300866154349.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866067918.879, "dur": 4.710, + "args": { + "External id": 89095, "cbid": 51, "correlation": 161165585 + } + }, + { + "ph": "s", "id": 161165585, "pid": 5714, "tid": 6744, "ts": 
6300866067918.879, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866154351.845, "dur": 539.398, + "args": { + "External id": 89095, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165586, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165586, "pid": 0, "tid": 7, "ts": 6300866154351.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067923.729, "dur": 5.780, + "args": { + "External id": 89095, "cbid": 307, "correlation": 161165586 + } + }, + { + "ph": "s", "id": 161165586, "pid": 5714, "tid": 6744, "ts": 6300866067923.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866067953.259, "dur": 0.350, + "args": { + "External id": 89096, "cbid": 200, "correlation": 161165611 + } + }, + { + "ph": "f", "id": 161165611, "pid": 5714, "tid": 6744, "ts": 6300866067953.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866154892.555, "dur": 1.281, + "args": { + "External id": 89096, "device": 0, "context": 1, "stream": 7, "correlation": 161165614, "bytes": 576, "memory bandwidth (GB/s)": 0.4496487119437939 + } + }, + { + "ph": "f", "id": 161165614, "pid": 0, "tid": 7, "ts": 6300866154892.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866067954.609, "dur": 4.470, + "args": { + "External id": 89096, "cbid": 51, 
"correlation": 161165614 + } + }, + { + "ph": "s", "id": 161165614, "pid": 5714, "tid": 6744, "ts": 6300866067954.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866154895.244, "dur": 141.921, + "args": { + "External id": 89096, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165615, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165615, "pid": 0, "tid": 7, "ts": 6300866154895.244, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067959.229, "dur": 4.820, + "args": { + "External id": 89096, "cbid": 307, "correlation": 161165615 + } + }, + { + "ph": "s", "id": 161165615, "pid": 5714, "tid": 6744, "ts": 6300866067959.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866155037.837, "dur": 141.250, + "args": { + "External id": 89097, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165637, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165637, "pid": 0, "tid": 7, "ts": 6300866155037.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866067990.269, "dur": 6.050, + "args": { + "External id": 89097, "cbid": 211, "correlation": 161165637 + } + }, + { + "ph": "s", "id": 161165637, "pid": 5714, "tid": 6744, "ts": 6300866067990.269, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866155179.759, "dur": 142.018, + "args": { + "External id": 89098, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165660, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165660, "pid": 0, "tid": 7, "ts": 6300866155179.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068015.869, "dur": 5.140, + "args": { + "External id": 89098, "cbid": 211, "correlation": 161165660 + } + }, + { + "ph": "s", "id": 161165660, "pid": 5714, "tid": 6744, "ts": 6300866068015.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866155322.449, "dur": 143.393, + "args": { + "External id": 89099, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165683, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161165683, "pid": 0, "tid": 7, "ts": 6300866155322.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068038.739, "dur": 4.800, + "args": { + "External id": 89099, "cbid": 211, "correlation": 161165683 + } + }, + { + "ph": "s", "id": 161165683, "pid": 5714, "tid": 6744, "ts": 6300866068038.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866155466.514, "dur": 78.753, + "args": { + "External id": 89100, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165691, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165691, "pid": 0, "tid": 7, "ts": 6300866155466.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068079.219, "dur": 5.950, + "args": { + "External id": 89100, "cbid": 307, "correlation": 161165691 + } + }, + { + "ph": "s", "id": 161165691, "pid": 5714, "tid": 6744, "ts": 6300866068079.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866155545.971, "dur": 47.137, + "args": { + "External id": 89115, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165720, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165720, "pid": 0, "tid": 7, "ts": 6300866155545.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068250.718, "dur": 9.940, + "args": { + "External id": 89115, "cbid": 307, "correlation": 161165720 + } + }, + { + "ph": "s", "id": 161165720, "pid": 5714, "tid": 6744, "ts": 6300866068250.718, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866155593.748, "dur": 4.256, + "args": { + "External id": 89116, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165728, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161165728, "pid": 0, "tid": 7, "ts": 6300866155593.748, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068287.938, "dur": 5.600, + "args": { + "External id": 89116, "cbid": 307, "correlation": 161165728 + } + }, + { + "ph": "s", "id": 161165728, "pid": 5714, "tid": 6744, "ts": 6300866068287.938, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866155598.676, "dur": 50.080, + "args": { + "External id": 89117, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165739, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165739, "pid": 0, "tid": 7, "ts": 6300866155598.676, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068334.838, "dur": 6.580, + "args": { + "External id": 89117, "cbid": 307, "correlation": 161165739 + } + }, + { + "ph": "s", "id": 161165739, "pid": 5714, "tid": 6744, "ts": 6300866068334.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866155649.364, "dur": 46.785, + "args": { + "External id": 89118, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165744, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165744, "pid": 0, "tid": 7, "ts": 6300866155649.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068381.888, "dur": 7.610, + "args": { + "External id": 89118, "cbid": 211, "correlation": 161165744 + } + }, + { + "ph": "s", "id": 161165744, "pid": 5714, "tid": 6744, "ts": 6300866068381.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866068566.748, "dur": 3.150, + "args": { + "External id": 89124, "cbid": 147, "correlation": 161165761 + } + }, + { + "ph": "s", "id": 161165761, "pid": 5714, "tid": 6744, "ts": 6300866068566.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866068699.277, "dur": 2.791, + "args": { + "External id": 89132, "cbid": 138, "correlation": 161165776 + } + }, + { + "ph": "f", "id": 161165776, "pid": 5714, "tid": 6744, "ts": 6300866068699.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866155826.135, "dur": 132.289, + "args": { + "External id": 89136, "device": 0, "context": 1, "stream": 7, "correlation": 161165787, "bytes": 28112, "memory bandwidth (GB/s)": 0.21250444103440194 + } + }, + { + "ph": "f", "id": 161165787, "pid": 0, "tid": 7, "ts": 6300866155826.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866068725.308, "dur": 12.969, + "args": { + "External id": 89136, "cbid": 41, "correlation": 161165787 + } + }, + { + "ph": "s", "id": 161165787, "pid": 5714, "tid": 6744, "ts": 6300866068725.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068742.857, "dur": 2.100, + "args": { + "External id": 89131, "cbid": 135, "correlation": 161165791 + } + }, + { + "ph": "f", "id": 161165791, "pid": 5714, "tid": 6744, "ts": 6300866068742.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866156158.106, "dur": 177.090, + "args": { + "External id": 89131, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165795, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165795, "pid": 0, "tid": 7, "ts": 6300866156158.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866068748.217, "dur": 10.890, + "args": { + "External id": 89131, "cbid": 211, "correlation": 161165795 + } + }, + { + "ph": "s", "id": 161165795, "pid": 5714, "tid": 6744, "ts": 6300866068748.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068801.737, "dur": 1.130, + "args": { + "External id": 89124, "cbid": 135, "correlation": 161165806 + } + }, + { + "ph": "f", "id": 161165806, "pid": 5714, "tid": 6744, "ts": 6300866068801.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866068804.957, "dur": 1.450, + "args": { + "External id": 89124, "cbid": 147, "correlation": 161165810 + } + }, + { + "ph": "s", "id": 161165810, "pid": 5714, "tid": 6744, "ts": 
6300866068804.957, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866068880.677, "dur": 1.160, + "args": { + "External id": 89140, "cbid": 317, "correlation": 161165830 + } + }, + { + "ph": "f", "id": 161165830, "pid": 5714, "tid": 6744, "ts": 6300866068880.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068883.797, "dur": 1.440, + "args": { + "External id": 89140, "cbid": 135, "correlation": 161165832 + } + }, + { + "ph": "f", "id": 161165832, "pid": 5714, "tid": 6744, "ts": 6300866068883.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866068886.817, "dur": 1.150, + "args": { + "External id": 89140, "cbid": 147, "correlation": 161165836 + } + }, + { + "ph": "s", "id": 161165836, "pid": 5714, "tid": 6744, "ts": 6300866068886.817, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866068903.267, "dur": 0.760, + "args": { + "External id": 89140, "cbid": 409, "correlation": 161165839 + } + }, + { + "ph": "f", "id": 161165839, "pid": 5714, "tid": 6744, "ts": 6300866068903.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068908.417, "dur": 0.830, + "args": { + "External id": 89140, "cbid": 135, "correlation": 161165842 + } + }, + { + "ph": "f", "id": 161165842, "pid": 5714, "tid": 6744, "ts": 6300866068908.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866068909.437, "dur": 0.870, + "args": { + "External id": 89140, "cbid": 
147, "correlation": 161165843 + } + }, + { + "ph": "s", "id": 161165843, "pid": 5714, "tid": 6744, "ts": 6300866068909.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866159267.679, "dur": 8514.404, + "args": { + "External id": 89140, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161165845, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161165845, "pid": 0, "tid": 20, "ts": 6300866159267.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866068911.497, "dur": 10.830, + "args": { + "External id": 89140, "cbid": 430, "correlation": 161165845 + } + }, + { + "ph": "s", "id": 161165845, "pid": 5714, "tid": 6744, "ts": 6300866068911.497, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068923.427, "dur": 0.440, + "args": { + "External id": 89140, "cbid": 135, "correlation": 161165847 + } + }, + { + "ph": "f", "id": 161165847, "pid": 5714, "tid": 6744, "ts": 6300866068923.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866068923.977, "dur": 0.570, + "args": { + "External id": 89140, "cbid": 147, "correlation": 161165848 + } + }, + { + "ph": 
"s", "id": 161165848, "pid": 5714, "tid": 6744, "ts": 6300866068923.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068926.107, "dur": 0.880, + "args": { + "External id": 89140, "cbid": 135, "correlation": 161165851 + } + }, + { + "ph": "f", "id": 161165851, "pid": 5714, "tid": 6744, "ts": 6300866068926.107, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068935.367, "dur": 0.460, + "args": { + "External id": 89140, "cbid": 135, "correlation": 161165858 + } + }, + { + "ph": "f", "id": 161165858, "pid": 5714, "tid": 6744, "ts": 6300866068935.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866068963.317, "dur": 1.050, + "args": { + "External id": 89142, "cbid": 147, "correlation": 161165863 + } + }, + { + "ph": "s", "id": 161165863, "pid": 5714, "tid": 6744, "ts": 6300866068963.317, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866068985.147, "dur": 1.070, + "args": { + "External id": 89124, "cbid": 135, "correlation": 161165878 + } + }, + { + "ph": "f", "id": 161165878, "pid": 5714, "tid": 6744, "ts": 6300866068985.147, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866069222.786, "dur": 1.510, + "args": { + "External id": 89124, "cbid": 135, "correlation": 161165891 + } + }, + { + "ph": "f", "id": 161165891, "pid": 5714, "tid": 6744, "ts": 6300866069222.786, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866069357.706, "dur": 3.610, + 
"args": { + "External id": 89152, "cbid": 147, "correlation": 161165902 + } + }, + { + "ph": "s", "id": 161165902, "pid": 5714, "tid": 6744, "ts": 6300866069357.706, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866069487.496, "dur": 1.510, + "args": { + "External id": 89166, "cbid": 317, "correlation": 161165943 + } + }, + { + "ph": "f", "id": 161165943, "pid": 5714, "tid": 6744, "ts": 6300866069487.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866069497.636, "dur": 2.670, + "args": { + "External id": 89167, "cbid": 138, "correlation": 161165946 + } + }, + { + "ph": "f", "id": 161165946, "pid": 5714, "tid": 6744, "ts": 6300866069497.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866159271.263, "dur": 2.432, + "args": { + "External id": 89171, "device": 0, "context": 1, "stream": 7, "correlation": 161165957, "bytes": 7224, "memory bandwidth (GB/s)": 2.9703947368421053 + } + }, + { + "ph": "f", "id": 161165957, "pid": 0, "tid": 7, "ts": 6300866159271.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866069524.636, "dur": 14.139, + "args": { + "External id": 89171, "cbid": 41, "correlation": 161165957 + } + }, + { + "ph": "s", "id": 161165957, "pid": 5714, "tid": 6744, "ts": 6300866069524.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866069543.995, "dur": 1.960, + "args": { + "External id": 89166, "cbid": 135, "correlation": 161165961 + } + }, + { + "ph": "f", "id": 161165961, "pid": 5714, "tid": 6744, "ts": 6300866069543.995, + "cat": 
"ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866159276.319, "dur": 12.224, + "args": { + "External id": 89166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161165965, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161165965, "pid": 0, "tid": 7, "ts": 6300866159276.319, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866069548.846, "dur": 11.940, + "args": { + "External id": 89166, "cbid": 211, "correlation": 161165965 + } + }, + { + "ph": "s", "id": 161165965, "pid": 5714, "tid": 6744, "ts": 6300866069548.846, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866069667.465, "dur": 1.680, + "args": { + "External id": 89152, "cbid": 135, "correlation": 161165976 + } + }, + { + "ph": "f", "id": 161165976, "pid": 5714, "tid": 6744, "ts": 6300866069667.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866069672.815, "dur": 1.390, + "args": { + "External id": 89152, "cbid": 147, "correlation": 161165980 + } + }, + { + "ph": "s", "id": 161165980, "pid": 5714, "tid": 6744, "ts": 6300866069672.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866069676.295, "dur": 0.890, + "args": { + "External id": 89152, "cbid": 147, "correlation": 161165984 + } + }, + { + "ph": "s", "id": 161165984, "pid": 5714, 
"tid": 6744, "ts": 6300866069676.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866159322.015, "dur": 28.097, + "args": { + "External id": 89185, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161166008, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161166008, "pid": 0, "tid": 17, "ts": 6300866159322.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866069856.355, "dur": 13.790, + "args": { + "External id": 89185, "cbid": 211, "correlation": 161166008 + } + }, + { + "ph": "s", "id": 161166008, "pid": 5714, "tid": 6744, "ts": 6300866069856.355, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6300866159360.576, "dur": 11.968, + "args": { + "External id": 89201, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 161166021, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161166021, "pid": 0, "tid": 17, "ts": 6300866159360.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866069990.885, "dur": 13.089, + "args": { + "External id": 89201, "cbid": 211, "correlation": 161166021 + } + }, + { + "ph": "s", "id": 161166021, "pid": 5714, "tid": 6744, "ts": 6300866069990.885, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070038.934, "dur": 2.240, + "args": { + "External id": 89152, "cbid": 135, "correlation": 161166031 + } + }, + { + "ph": "f", "id": 161166031, "pid": 5714, "tid": 6744, "ts": 6300866070038.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866070044.054, "dur": 1.840, + "args": { + "External id": 89152, "cbid": 147, "correlation": 161166035 + } + }, + { + "ph": "s", "id": 161166035, "pid": 5714, "tid": 6744, "ts": 6300866070044.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866070124.994, "dur": 1.540, + "args": { + "External id": 89203, "cbid": 317, "correlation": 161166048 + } + }, + { + "ph": "f", "id": 161166048, "pid": 5714, "tid": 6744, "ts": 6300866070124.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070129.494, "dur": 1.720, + "args": { + "External id": 89203, "cbid": 135, "correlation": 161166050 + } + }, + { + "ph": "f", "id": 161166050, "pid": 5714, "tid": 6744, "ts": 6300866070129.494, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866070133.474, "dur": 1.750, + "args": { + "External id": 89203, "cbid": 147, "correlation": 161166054 + } + }, + { + "ph": "s", "id": 161166054, "pid": 5714, "tid": 6744, "ts": 6300866070133.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866070156.664, "dur": 1.170, + "args": { + "External id": 89203, "cbid": 409, "correlation": 161166057 + } + }, + { + "ph": "f", "id": 161166057, "pid": 5714, "tid": 6744, "ts": 6300866070156.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070164.154, "dur": 1.120, + "args": { + "External id": 89203, "cbid": 135, "correlation": 161166060 + } + }, + { + "ph": "f", "id": 161166060, "pid": 5714, "tid": 6744, "ts": 6300866070164.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866070165.544, "dur": 1.400, + "args": { + "External id": 89203, "cbid": 147, "correlation": 161166061 + } + }, + { + "ph": "s", "id": 161166061, "pid": 5714, "tid": 6744, "ts": 6300866070165.544, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866167782.723, "dur": 5494.208, + "args": { + "External id": 89203, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161166063, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161166063, "pid": 0, "tid": 20, "ts": 6300866167782.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866070168.634, "dur": 15.030, + "args": { + "External id": 89203, "cbid": 430, "correlation": 161166063 + } + }, + { + "ph": "s", "id": 161166063, "pid": 5714, "tid": 6744, "ts": 6300866070168.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070185.054, "dur": 0.590, + "args": { + "External id": 89203, "cbid": 135, "correlation": 161166065 + } + }, + { + "ph": "f", "id": 161166065, "pid": 5714, "tid": 6744, "ts": 6300866070185.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866070185.794, "dur": 0.770, + "args": { + "External id": 89203, "cbid": 147, "correlation": 161166066 + } + }, + { + "ph": "s", "id": 161166066, "pid": 5714, "tid": 6744, "ts": 6300866070185.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070188.554, "dur": 1.170, + "args": { + "External id": 89203, "cbid": 135, "correlation": 161166069 + } + }, + { + "ph": "f", "id": 161166069, "pid": 5714, "tid": 6744, "ts": 6300866070188.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070200.094, "dur": 0.590, + "args": { + "External id": 
89203, "cbid": 135, "correlation": 161166076 + } + }, + { + "ph": "f", "id": 161166076, "pid": 5714, "tid": 6744, "ts": 6300866070200.094, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866070234.624, "dur": 1.360, + "args": { + "External id": 89205, "cbid": 147, "correlation": 161166081 + } + }, + { + "ph": "s", "id": 161166081, "pid": 5714, "tid": 6744, "ts": 6300866070234.624, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866070257.634, "dur": 1.150, + "args": { + "External id": 89152, "cbid": 135, "correlation": 161166096 + } + }, + { + "ph": "f", "id": 161166096, "pid": 5714, "tid": 6744, "ts": 6300866070257.634, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866159289.375, "dur": 2233.019, + "args": { + "External id": 89207, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166121, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166121, "pid": 0, "tid": 7, "ts": 6300866159289.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070465.033, "dur": 15.460, + "args": { + "External id": 89207, "cbid": 211, "correlation": 161166121 + } + }, + { + "ph": "s", "id": 161166121, "pid": 5714, "tid": 6744, "ts": 6300866070465.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866161523.066, "dur": 554.246, + "args": { + "External id": 89208, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166144, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161166144, "pid": 0, "tid": 7, "ts": 6300866161523.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070544.023, "dur": 8.030, + "args": { + "External id": 89208, "cbid": 307, "correlation": 161166144 + } + }, + { + "ph": "s", "id": 161166144, "pid": 5714, "tid": 6744, "ts": 6300866070544.023, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866070598.523, "dur": 0.740, + "args": { + "External id": 89209, "cbid": 200, "correlation": 161166167 + } + }, + { + "ph": "f", "id": 161166167, "pid": 5714, "tid": 6744, "ts": 6300866070598.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866162120.800, "dur": 24.513, + "args": { + "External id": 89209, "device": 0, "context": 1, "stream": 7, 
"correlation": 161166170, "bytes": 1536, "memory bandwidth (GB/s)": 0.06266062905397136 + } + }, + { + "ph": "f", "id": 161166170, "pid": 0, "tid": 7, "ts": 6300866162120.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866070601.603, "dur": 8.960, + "args": { + "External id": 89209, "cbid": 51, "correlation": 161166170 + } + }, + { + "ph": "s", "id": 161166170, "pid": 5714, "tid": 6744, "ts": 6300866070601.603, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866162232.706, "dur": 868.490, + "args": { + "External id": 89209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166171, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166171, "pid": 0, "tid": 7, "ts": 6300866162232.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070610.843, "dur": 7.810, + "args": { + "External id": 89209, "cbid": 307, "correlation": 161166171 + } + }, + { + "ph": "s", "id": 161166171, "pid": 5714, "tid": 6744, "ts": 6300866070610.843, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866070654.993, "dur": 0.390, + "args": { + "External id": 89210, "cbid": 200, "correlation": 161166196 + } + }, + { + "ph": "f", "id": 161166196, "pid": 5714, "tid": 6744, "ts": 6300866070654.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866163102.540, "dur": 1.152, + "args": { + "External id": 89210, "device": 0, "context": 1, "stream": 7, "correlation": 161166199, "bytes": 1536, "memory bandwidth (GB/s)": 1.3333333333333333 + } + }, + { + "ph": "f", "id": 161166199, "pid": 0, "tid": 7, "ts": 6300866163102.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866070656.783, "dur": 5.910, + "args": { + "External id": 89210, "cbid": 51, "correlation": 161166199 + } + }, + { + "ph": "s", "id": 161166199, "pid": 5714, "tid": 6744, "ts": 6300866070656.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866163105.356, "dur": 353.028, + "args": { + "External id": 89210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166200, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166200, "pid": 0, "tid": 7, "ts": 6300866163105.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070662.903, "dur": 6.850, + "args": { + "External id": 89210, "cbid": 307, "correlation": 161166200 + } + }, + { + "ph": "s", "id": 161166200, "pid": 5714, "tid": 6744, "ts": 6300866070662.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866070700.903, "dur": 0.370, + "args": { + "External id": 89211, "cbid": 200, "correlation": 161166225 + } + }, + { + "ph": "f", "id": 161166225, "pid": 5714, "tid": 6744, "ts": 6300866070700.903, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866163458.960, "dur": 356.996, + "args": { + "External id": 89211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166228, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166228, "pid": 0, "tid": 7, "ts": 6300866163458.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070702.803, "dur": 7.180, + "args": { + "External id": 89211, "cbid": 307, "correlation": 161166228 + } + }, + { + "ph": "s", "id": 161166228, "pid": 5714, "tid": 6744, "ts": 6300866070702.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866070739.253, "dur": 0.390, + "args": { + "External id": 89212, "cbid": 200, "correlation": 161166253 + } + }, + { + "ph": "f", "id": 161166253, "pid": 5714, "tid": 6744, "ts": 6300866070739.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866163817.204, "dur": 1.312, + "args": { + "External id": 89212, "device": 0, "context": 1, "stream": 7, "correlation": 161166256, "bytes": 1536, "memory bandwidth (GB/s)": 1.170731707317073 + } + }, + { + "ph": "f", "id": 161166256, "pid": 0, "tid": 7, "ts": 6300866163817.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866070740.933, "dur": 6.020, + "args": { + "External id": 89212, "cbid": 51, "correlation": 161166256 + } + }, + { + "ph": "s", "id": 161166256, "pid": 5714, "tid": 6744, "ts": 6300866070740.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866163820.020, "dur": 356.484, + "args": { + "External id": 89212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166257, "registers per thread": 92, "shared memory": 49152, "blocks 
per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166257, "pid": 0, "tid": 7, "ts": 6300866163820.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070747.143, "dur": 6.920, + "args": { + "External id": 89212, "cbid": 307, "correlation": 161166257 + } + }, + { + "ph": "s", "id": 161166257, "pid": 5714, "tid": 6744, "ts": 6300866070747.143, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866070784.803, "dur": 0.390, + "args": { + "External id": 89213, "cbid": 200, "correlation": 161166282 + } + }, + { + "ph": "f", "id": 161166282, "pid": 5714, "tid": 6744, "ts": 6300866070784.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866164177.176, "dur": 356.389, + "args": { + "External id": 89213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166285, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166285, "pid": 0, "tid": 7, "ts": 6300866164177.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070786.563, "dur": 7.250, + "args": { + "External id": 89213, "cbid": 307, "correlation": 161166285 + } + }, + { + "ph": "s", "id": 161166285, "pid": 5714, "tid": 6744, "ts": 6300866070786.563, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866164534.237, "dur": 85.633, + "args": { + "External id": 89214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166298, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166298, "pid": 0, "tid": 7, "ts": 6300866164534.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070841.853, "dur": 7.740, + "args": { + "External id": 89214, "cbid": 307, "correlation": 161166298 + } + }, + { + "ph": "s", "id": 161166298, "pid": 5714, "tid": 6744, "ts": 6300866070841.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866164620.510, "dur": 3.936, + "args": { + "External id": 89215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166306, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161166306, "pid": 0, "tid": 7, "ts": 6300866164620.510, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070882.823, "dur": 7.100, + "args": { + "External id": 89215, "cbid": 307, "correlation": 161166306 + } + }, + { + "ph": "s", "id": 161166306, "pid": 5714, "tid": 6744, "ts": 6300866070882.823, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866164625.150, "dur": 114.113, + "args": { + "External id": 89216, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166314, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166314, "pid": 0, "tid": 7, "ts": 6300866164625.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866070924.643, "dur": 6.620, + "args": { + "External id": 89216, "cbid": 307, "correlation": 161166314 + } + }, + { + "ph": "s", "id": 161166314, "pid": 5714, "tid": 6744, "ts": 6300866070924.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866071216.692, "dur": 0.780, + "args": { + "External id": 89235, "cbid": 200, "correlation": 161166360 + } + }, + { + "ph": "f", "id": 161166360, "pid": 5714, "tid": 6744, "ts": 6300866071216.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866164740.607, "dur": 1.216, + "args": { + "External id": 89235, "device": 0, "context": 1, "stream": 7, 
"correlation": 161166363, "bytes": 576, "memory bandwidth (GB/s)": 0.47368421052631576 + } + }, + { + "ph": "f", "id": 161166363, "pid": 0, "tid": 7, "ts": 6300866164740.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866071219.962, "dur": 10.890, + "args": { + "External id": 89235, "cbid": 51, "correlation": 161166363 + } + }, + { + "ph": "s", "id": 161166363, "pid": 5714, "tid": 6744, "ts": 6300866071219.962, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866164743.135, "dur": 143.426, + "args": { + "External id": 89235, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166364, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166364, "pid": 0, "tid": 7, "ts": 6300866164743.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866071231.242, "dur": 11.910, + "args": { + "External id": 89235, "cbid": 307, "correlation": 161166364 + } + }, + { + "ph": "s", "id": 161166364, "pid": 5714, "tid": 6744, "ts": 6300866071231.242, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866164887.169, "dur": 141.569, + "args": { + "External id": 89236, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166386, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166386, "pid": 0, "tid": 7, "ts": 6300866164887.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866071282.742, "dur": 8.740, + "args": { + "External id": 89236, "cbid": 211, "correlation": 161166386 + } + }, + { + "ph": "s", "id": 161166386, "pid": 5714, "tid": 6744, "ts": 6300866071282.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866071412.011, "dur": 0.710, + "args": { + "External id": 89237, "cbid": 200, "correlation": 161166404 + } + }, + { + "ph": "f", "id": 161166404, "pid": 5714, "tid": 6744, "ts": 6300866071412.011, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866071412.911, "dur": 0.310, + "args": { + "External id": 89237, "cbid": 200, "correlation": 161166405 + } + }, + { + "ph": "f", "id": 161166405, "pid": 5714, "tid": 6744, "ts": 6300866071412.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866071442.921, "dur": 0.320, + "args": { + "External id": 89237, "cbid": 200, "correlation": 161166423 + } + }, + { + "ph": "f", "id": 161166423, "pid": 5714, "tid": 6744, "ts": 6300866071442.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866165029.346, "dur": 92.866, + "args": { + "External id": 89237, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166424, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166424, "pid": 0, "tid": 7, "ts": 6300866165029.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866071445.101, "dur": 14.180, + "args": { + "External id": 89237, "cbid": 211, "correlation": 161166424 + } + }, + { + "ph": "s", "id": 161166424, "pid": 5714, "tid": 6744, "ts": 6300866071445.101, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866071460.441, "dur": 1.460, + "args": { + "External id": 89237, "cbid": 273, "correlation": 161166426 + } + }, + { + "ph": "f", "id": 161166426, "pid": 5714, "tid": 6744, "ts": 6300866071460.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866165122.852, "dur": 1094.221, + "args": { + "External id": 89237, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166427, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161166427, "pid": 0, "tid": 7, "ts": 6300866165122.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866071462.431, "dur": 6.060, + "args": { + "External id": 89237, "cbid": 211, "correlation": 161166427 + } + }, + { + "ph": "s", "id": 161166427, "pid": 5714, "tid": 6744, "ts": 6300866071462.431, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866166217.777, "dur": 74.112, + "args": { + "External id": 89237, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166429, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161166429, "pid": 0, "tid": 7, "ts": 6300866166217.777, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866071469.311, "dur": 5.370, + "args": { + "External id": 89237, "cbid": 211, "correlation": 161166429 + } + }, + { + "ph": "s", "id": 161166429, "pid": 5714, "tid": 6744, "ts": 6300866071469.311, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866166292.561, "dur": 48.033, + "args": { + "External id": 89248, "device": 0, "context": 1, "stream": 7, "correlation": 161166451, "bytes": 25165824, "memory bandwidth (GB/s)": 523.9277996377491 + } + }, + { + "ph": "f", "id": 161166451, "pid": 0, "tid": 7, "ts": 6300866166292.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 
6300866071671.721, "dur": 24.940, + "args": { + "External id": 89248, "cbid": 41, "correlation": 161166451 + } + }, + { + "ph": "s", "id": 161166451, "pid": 5714, "tid": 6744, "ts": 6300866071671.721, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866166341.298, "dur": 33.889, + "args": { + "External id": 89245, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166469, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166469, "pid": 0, "tid": 7, "ts": 6300866166341.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866071846.120, "dur": 12.570, + "args": { + "External id": 89245, "cbid": 307, "correlation": 161166469 + } + }, + { + "ph": "s", "id": 161166469, "pid": 5714, "tid": 6744, "ts": 6300866071846.120, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866166375.795, "dur": 39.808, + "args": { + "External id": 89255, "device": 0, "context": 1, "stream": 7, "correlation": 161166484, "bytes": 25165824, "memory bandwidth (GB/s)": 632.1800643086817 + } + }, + { + "ph": "f", "id": 161166484, "pid": 0, "tid": 7, "ts": 6300866166375.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866071949.170, "dur": 20.180, + "args": { + "External id": 89255, "cbid": 41, "correlation": 161166484 + } + }, + { + "ph": "s", "id": 161166484, "pid": 5714, "tid": 6744, "ts": 6300866071949.170, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 
0, "tid": 7, + "ts": 6300866166416.275, "dur": 28.448, + "args": { + "External id": 89252, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166502, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166502, "pid": 0, "tid": 7, "ts": 6300866166416.275, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072145.510, "dur": 15.770, + "args": { + "External id": 89252, "cbid": 307, "correlation": 161166502 + } + }, + { + "ph": "s", "id": 161166502, "pid": 5714, "tid": 6744, "ts": 6300866072145.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866072392.479, "dur": 0.920, + "args": { + "External id": 89260, "cbid": 200, "correlation": 161166532 + } + }, + { + "ph": "f", "id": 161166532, "pid": 5714, "tid": 6744, "ts": 6300866072392.479, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866166446.003, "dur": 0.800, + "args": { + "External id": 89260, "device": 0, "context": 1, "stream": 7, "correlation": 161166535, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 161166535, "pid": 0, "tid": 7, "ts": 6300866166446.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866072396.619, "dur": 13.400, + "args": { + "External id": 89260, "cbid": 51, "correlation": 161166535 + } + }, + { + "ph": "s", "id": 161166535, "pid": 5714, "tid": 6744, "ts": 6300866072396.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void 
cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866166447.987, "dur": 256.835, + "args": { + "External id": 89260, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166536, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166536, "pid": 0, "tid": 7, "ts": 6300866166447.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072410.459, "dur": 13.760, + "args": { + "External id": 89260, "cbid": 307, "correlation": 161166536 + } + }, + { + "ph": "s", "id": 161166536, "pid": 5714, "tid": 6744, "ts": 6300866072410.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866072471.809, "dur": 0.560, + "args": { + "External id": 89261, "cbid": 200, "correlation": 161166561 + } + }, + { + "ph": "f", "id": 161166561, "pid": 5714, "tid": 6744, "ts": 6300866072471.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866166779.863, "dur": 55.169, + "args": { + "External id": 89261, "device": 0, "context": 1, "stream": 7, "correlation": 161166564, "bytes": 576, "memory bandwidth (GB/s)": 0.010440646014972176 + } + }, + { + "ph": "f", "id": 161166564, "pid": 0, "tid": 7, "ts": 6300866166779.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866072474.199, "dur": 7.550, + "args": { + "External id": 89261, "cbid": 51, "correlation": 161166564 + } + }, + { + "ph": "s", "id": 161166564, "pid": 5714, "tid": 6744, "ts": 
6300866072474.199, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866166891.704, "dur": 549.415, + "args": { + "External id": 89261, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166565, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166565, "pid": 0, "tid": 7, "ts": 6300866166891.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072481.989, "dur": 8.530, + "args": { + "External id": 89261, "cbid": 307, "correlation": 161166565 + } + }, + { + "ph": "s", "id": 161166565, "pid": 5714, "tid": 6744, "ts": 6300866072481.989, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866072530.999, "dur": 0.530, + "args": { + "External id": 89262, "cbid": 200, "correlation": 161166590 + } + }, + { + "ph": "f", "id": 161166590, "pid": 5714, "tid": 6744, "ts": 6300866072530.999, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866167486.496, "dur": 15.776, + "args": { + "External id": 89262, "device": 0, "context": 1, "stream": 7, "correlation": 161166593, "bytes": 576, "memory bandwidth (GB/s)": 0.036511156186612576 + } + }, + { + "ph": "f", "id": 161166593, "pid": 0, "tid": 7, "ts": 6300866167486.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866072533.299, "dur": 7.060, + "args": { + "External id": 89262, "cbid": 51, 
"correlation": 161166593 + } + }, + { + "ph": "s", "id": 161166593, "pid": 5714, "tid": 6744, "ts": 6300866072533.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866167515.872, "dur": 156.290, + "args": { + "External id": 89262, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166594, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166594, "pid": 0, "tid": 7, "ts": 6300866167515.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072540.639, "dur": 8.290, + "args": { + "External id": 89262, "cbid": 307, "correlation": 161166594 + } + }, + { + "ph": "s", "id": 161166594, "pid": 5714, "tid": 6744, "ts": 6300866072540.639, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866167672.770, "dur": 141.953, + "args": { + "External id": 89263, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166616, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166616, "pid": 0, "tid": 7, "ts": 6300866167672.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072590.139, "dur": 9.630, + "args": { + "External id": 89263, "cbid": 211, "correlation": 161166616 + } + }, + { + "ph": "s", "id": 161166616, "pid": 5714, "tid": 6744, "ts": 6300866072590.139, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866167815.363, "dur": 142.146, + "args": { + "External id": 89264, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166639, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166639, "pid": 0, "tid": 7, "ts": 6300866167815.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072633.148, "dur": 7.840, + "args": { + "External id": 89264, "cbid": 211, "correlation": 161166639 + } + }, + { + "ph": "s", "id": 161166639, "pid": 5714, "tid": 6744, "ts": 6300866072633.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866167958.117, "dur": 559.046, + "args": { + "External id": 89265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166662, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166662, "pid": 0, "tid": 7, "ts": 6300866167958.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072672.508, "dur": 7.620, + "args": { + "External id": 89265, "cbid": 211, "correlation": 161166662 + } + }, + { + "ph": "s", "id": 161166662, "pid": 5714, "tid": 6744, "ts": 6300866072672.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866168517.835, "dur": 80.642, + "args": { + "External id": 89266, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166670, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166670, "pid": 0, "tid": 7, "ts": 6300866168517.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866072739.738, "dur": 9.280, + "args": { + "External id": 89266, "cbid": 307, "correlation": 161166670 + } + }, + { + "ph": "s", "id": 161166670, "pid": 5714, "tid": 6744, "ts": 6300866072739.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866168599.213, "dur": 46.144, + "args": { + "External id": 89281, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166699, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166699, "pid": 0, "tid": 7, "ts": 6300866168599.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866073035.588, "dur": 19.560, + "args": { + "External id": 89281, "cbid": 307, "correlation": 161166699 + } + }, + { + "ph": "s", "id": 161166699, "pid": 5714, "tid": 6744, "ts": 6300866073035.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866168646.029, "dur": 3.648, + "args": { + "External id": 89282, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166707, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161166707, "pid": 0, "tid": 7, "ts": 6300866168646.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866073122.838, "dur": 13.000, + "args": { + "External id": 89282, "cbid": 307, "correlation": 161166707 + } + }, + { + "ph": "s", "id": 161166707, "pid": 5714, "tid": 6744, "ts": 6300866073122.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866168650.413, "dur": 50.401, + "args": { + "External id": 89283, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166718, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166718, "pid": 0, "tid": 7, "ts": 6300866168650.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866073202.057, "dur": 10.380, + "args": { + "External id": 89283, "cbid": 307, "correlation": 161166718 + } + }, + { + "ph": "s", "id": 161166718, "pid": 5714, "tid": 6744, "ts": 6300866073202.057, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866168701.486, "dur": 46.624, + "args": { + "External id": 89284, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166723, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166723, "pid": 0, "tid": 7, "ts": 6300866168701.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866073291.337, "dur": 73.800, + "args": { + "External id": 89284, "cbid": 211, "correlation": 161166723 + } + }, + { + "ph": "s", "id": 161166723, "pid": 5714, "tid": 6744, "ts": 6300866073291.337, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866073734.026, "dur": 6.360, + "args": { + "External id": 89290, "cbid": 147, "correlation": 161166740 + } + }, + { + "ph": "s", "id": 161166740, "pid": 5714, "tid": 6744, "ts": 6300866073734.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866073976.816, "dur": 6.200, + "args": { + "External id": 89298, "cbid": 138, "correlation": 161166755 + } + }, + { + "ph": "f", "id": 161166755, "pid": 5714, "tid": 6744, "ts": 6300866073976.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866073983.785, "dur": 1.680, + "args": { + "External id": 89298, "cbid": 138, "correlation": 161166756 + } + }, + { + "ph": "f", "id": 161166756, "pid": 5714, "tid": 6744, "ts": 6300866073983.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866168757.614, "dur": 6.240, + "args": { + "External id": 89302, "device": 0, "context": 1, "stream": 7, "correlation": 161166767, "bytes": 28112, "memory bandwidth (GB/s)": 4.505128205128205 + } + }, + { + "ph": "f", "id": 161166767, "pid": 0, "tid": 7, "ts": 6300866168757.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866074031.176, "dur": 26.409, + "args": { + "External id": 89302, "cbid": 41, "correlation": 161166767 + } + }, + { + "ph": "s", "id": 161166767, "pid": 5714, "tid": 6744, "ts": 6300866074031.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074067.935, "dur": 4.680, + "args": { + "External id": 89297, "cbid": 135, "correlation": 161166771 + } + }, + { + "ph": "f", "id": 161166771, "pid": 5714, "tid": 6744, "ts": 6300866074067.935, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866168766.606, "dur": 49.761, + "args": { + "External id": 89297, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166775, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166775, "pid": 0, "tid": 7, "ts": 6300866168766.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866074081.095, "dur": 26.450, + "args": { + "External id": 89297, "cbid": 211, "correlation": 161166775 + } + }, + { + "ph": "s", "id": 161166775, "pid": 5714, "tid": 6744, "ts": 6300866074081.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074213.635, "dur": 2.600, + "args": { + "External id": 89290, "cbid": 135, "correlation": 161166786 + } + }, + { + "ph": "f", "id": 161166786, "pid": 5714, "tid": 6744, "ts": 6300866074213.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866074221.595, "dur": 3.470, + "args": { + "External id": 89290, "cbid": 147, "correlation": 161166790 + } + }, + { + "ph": "s", "id": 161166790, "pid": 5714, "tid": 6744, "ts": 6300866074221.595, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866074426.115, "dur": 3.140, + "args": { + "External id": 89306, "cbid": 317, "correlation": 161166810 + } + }, + { + "ph": "f", "id": 161166810, "pid": 5714, "tid": 6744, "ts": 6300866074426.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074434.315, "dur": 3.889, + "args": { + "External id": 89306, "cbid": 135, "correlation": 161166812 + } + }, + { + "ph": "f", "id": 161166812, "pid": 5714, "tid": 6744, "ts": 6300866074434.315, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, 
+ "ts": 6300866074441.844, "dur": 2.851, + "args": { + "External id": 89306, "cbid": 147, "correlation": 161166816 + } + }, + { + "ph": "s", "id": 161166816, "pid": 5714, "tid": 6744, "ts": 6300866074441.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866074481.734, "dur": 1.840, + "args": { + "External id": 89306, "cbid": 409, "correlation": 161166819 + } + }, + { + "ph": "f", "id": 161166819, "pid": 5714, "tid": 6744, "ts": 6300866074481.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074494.584, "dur": 2.030, + "args": { + "External id": 89306, "cbid": 135, "correlation": 161166822 + } + }, + { + "ph": "f", "id": 161166822, "pid": 5714, "tid": 6744, "ts": 6300866074494.584, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866074497.054, "dur": 2.370, + "args": { + "External id": 89306, "cbid": 147, "correlation": 161166823 + } + }, + { + "ph": "s", "id": 161166823, "pid": 5714, "tid": 6744, "ts": 6300866074497.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866173279.395, "dur": 8578.757, + "args": { + "External id": 89306, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161166825, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161166825, "pid": 0, "tid": 20, "ts": 6300866173279.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866074502.324, "dur": 26.070, + "args": { + "External id": 89306, "cbid": 430, "correlation": 161166825 + } + }, + { + "ph": "s", "id": 161166825, "pid": 5714, "tid": 6744, "ts": 6300866074502.324, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074531.084, "dur": 1.090, + "args": { + "External id": 89306, "cbid": 135, "correlation": 161166827 + } + }, + { + "ph": "f", "id": 161166827, "pid": 5714, "tid": 6744, "ts": 6300866074531.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866074532.474, "dur": 1.410, + "args": { + "External id": 89306, "cbid": 147, "correlation": 161166828 + } + }, + { + "ph": "s", "id": 161166828, "pid": 5714, "tid": 6744, "ts": 6300866074532.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074537.744, "dur": 1.860, + "args": { + "External id": 89306, "cbid": 135, "correlation": 161166831 + } + }, + { + "ph": "f", "id": 161166831, "pid": 5714, "tid": 6744, "ts": 6300866074537.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074559.914, "dur": 1.170, + "args": { + "External id": 
89306, "cbid": 135, "correlation": 161166838 + } + }, + { + "ph": "f", "id": 161166838, "pid": 5714, "tid": 6744, "ts": 6300866074559.914, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866074628.714, "dur": 2.550, + "args": { + "External id": 89308, "cbid": 147, "correlation": 161166843 + } + }, + { + "ph": "s", "id": 161166843, "pid": 5714, "tid": 6744, "ts": 6300866074628.714, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866074674.374, "dur": 2.130, + "args": { + "External id": 89290, "cbid": 135, "correlation": 161166858 + } + }, + { + "ph": "f", "id": 161166858, "pid": 5714, "tid": 6744, "ts": 6300866074674.374, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866075182.903, "dur": 4.090, + "args": { + "External id": 89290, "cbid": 135, "correlation": 161166871 + } + }, + { + "ph": "f", "id": 161166871, "pid": 5714, "tid": 6744, "ts": 6300866075182.903, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866075565.012, "dur": 10.430, + "args": { + "External id": 89318, "cbid": 147, "correlation": 161166882 + } + }, + { + "ph": "s", "id": 161166882, "pid": 5714, "tid": 6744, "ts": 6300866075565.012, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866075911.621, "dur": 4.110, + "args": { + "External id": 89332, "cbid": 317, "correlation": 161166923 + } + }, + { + "ph": "f", "id": 161166923, "pid": 5714, "tid": 6744, "ts": 6300866075911.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", 
"pid": 5714, "tid": 6744, + "ts": 6300866075939.431, "dur": 8.170, + "args": { + "External id": 89333, "cbid": 138, "correlation": 161166926 + } + }, + { + "ph": "f", "id": 161166926, "pid": 5714, "tid": 6744, "ts": 6300866075939.431, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866075948.711, "dur": 2.440, + "args": { + "External id": 89333, "cbid": 138, "correlation": 161166927 + } + }, + { + "ph": "f", "id": 161166927, "pid": 5714, "tid": 6744, "ts": 6300866075948.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866173279.139, "dur": 2.784, + "args": { + "External id": 89337, "device": 0, "context": 1, "stream": 7, "correlation": 161166938, "bytes": 7224, "memory bandwidth (GB/s)": 2.5948275862068964 + } + }, + { + "ph": "f", "id": 161166938, "pid": 0, "tid": 7, "ts": 6300866173279.139, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866076016.591, "dur": 39.630, + "args": { + "External id": 89337, "cbid": 41, "correlation": 161166938 + } + }, + { + "ph": "s", "id": 161166938, "pid": 5714, "tid": 6744, "ts": 6300866076016.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866076070.941, "dur": 5.900, + "args": { + "External id": 89332, "cbid": 135, "correlation": 161166942 + } + }, + { + "ph": "f", "id": 161166942, "pid": 5714, "tid": 6744, "ts": 6300866076070.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6300866173283.972, "dur": 12.960, + "args": 
{ + "External id": 89332, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161166946, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161166946, "pid": 0, "tid": 7, "ts": 6300866173283.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866076086.231, "dur": 45.560, + "args": { + "External id": 89332, "cbid": 211, "correlation": 161166946 + } + }, + { + "ph": "s", "id": 161166946, "pid": 5714, "tid": 6744, "ts": 6300866076086.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866076575.870, "dur": 6.509, + "args": { + "External id": 89318, "cbid": 135, "correlation": 161166957 + } + }, + { + "ph": "f", "id": 161166957, "pid": 5714, "tid": 6744, "ts": 6300866076575.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866076597.759, "dur": 6.071, + "args": { + "External id": 89318, "cbid": 147, "correlation": 161166961 + } + }, + { + "ph": "s", "id": 161166961, "pid": 5714, "tid": 6744, "ts": 6300866076597.759, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866076611.910, "dur": 3.800, + "args": { + "External id": 89318, "cbid": 147, "correlation": 161166965 + } + }, + { + "ph": "s", "id": 161166965, "pid": 5714, "tid": 6744, "ts": 6300866076611.910, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6300866173297.604, "dur": 2256.250, + "args": { + "External id": 89339, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 161166997, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161166997, "pid": 0, "tid": 7, "ts": 6300866173297.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866077188.518, "dur": 85.120, + "args": { + "External id": 89339, "cbid": 211, "correlation": 161166997 + } + }, + { + "ph": "s", "id": 161166997, "pid": 5714, "tid": 6744, "ts": 6300866077188.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6300866175554.558, "dur": 578.279, + "args": { + "External id": 89340, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167020, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161167020, "pid": 0, "tid": 7, "ts": 6300866175554.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866077691.897, "dur": 48.350, + "args": { + "External id": 89340, "cbid": 307, "correlation": 161167020 + } + }, + { + "ph": "s", "id": 161167020, "pid": 5714, "tid": 6744, "ts": 6300866077691.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866078012.006, "dur": 4.280, + "args": { + "External id": 89341, "cbid": 200, "correlation": 161167043 + } + }, + { + "ph": "f", "id": 161167043, "pid": 5714, "tid": 6744, "ts": 6300866078012.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866176208.486, "dur": 77.473, + "args": { + "External id": 89341, "device": 0, "context": 1, "stream": 7, "correlation": 161167046, "bytes": 1536, "memory bandwidth (GB/s)": 0.01982626205258606 + } + }, + { + "ph": "f", "id": 161167046, "pid": 0, "tid": 7, "ts": 6300866176208.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866078029.666, "dur": 47.770, + "args": { + "External id": 89341, "cbid": 51, "correlation": 161167046 + } + }, + { + "ph": "s", "id": 161167046, "pid": 5714, "tid": 6744, "ts": 6300866078029.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866176347.687, "dur": 775.754, + "args": { + "External id": 89341, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167047, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167047, "pid": 0, "tid": 7, "ts": 6300866176347.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866078079.176, "dur": 44.560, + "args": { + "External id": 89341, "cbid": 307, "correlation": 161167047 + } + }, + { + "ph": "s", "id": 161167047, "pid": 5714, "tid": 6744, "ts": 6300866078079.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866078339.066, "dur": 1.869, + "args": { + "External id": 89342, "cbid": 200, "correlation": 161167072 + } + }, + { + "ph": "f", "id": 161167072, "pid": 5714, "tid": 6744, "ts": 6300866078339.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866177124.817, "dur": 1.248, + "args": { + "External id": 89342, "device": 0, "context": 1, "stream": 7, "correlation": 161167075, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 161167075, "pid": 0, "tid": 7, "ts": 6300866177124.817, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866078347.555, "dur": 25.900, + "args": { + "External id": 89342, "cbid": 51, "correlation": 161167075 + } + }, + { + "ph": "s", "id": 161167075, "pid": 5714, "tid": 6744, "ts": 6300866078347.555, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866177127.569, "dur": 352.068, + "args": { + "External id": 89342, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 161167076, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167076, "pid": 0, "tid": 7, "ts": 6300866177127.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866078374.346, "dur": 30.400, + "args": { + "External id": 89342, "cbid": 307, "correlation": 161167076 + } + }, + { + "ph": "s", "id": 161167076, "pid": 5714, "tid": 6744, "ts": 6300866078374.346, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866078556.955, "dur": 1.530, + "args": { + "External id": 89343, "cbid": 200, "correlation": 161167101 + } + }, + { + "ph": "f", "id": 161167101, "pid": 5714, "tid": 6744, "ts": 6300866078556.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866177480.245, "dur": 357.188, + "args": { + "External id": 89343, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167104, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167104, "pid": 0, "tid": 7, "ts": 6300866177480.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866078566.075, "dur": 29.590, + "args": { + "External id": 89343, "cbid": 307, "correlation": 161167104 + } + }, + { + "ph": "s", "id": 161167104, "pid": 5714, "tid": 6744, "ts": 6300866078566.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866078727.305, "dur": 1.540, + "args": { + "External id": 89344, "cbid": 200, "correlation": 161167129 + } + }, + { + "ph": "f", "id": 161167129, "pid": 5714, "tid": 6744, "ts": 6300866078727.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866177838.905, "dur": 0.896, + "args": { + "External id": 89344, "device": 0, "context": 1, "stream": 7, "correlation": 161167132, "bytes": 1536, "memory bandwidth (GB/s)": 1.7142857142857142 + } + }, + { + "ph": "f", "id": 161167132, "pid": 0, "tid": 7, "ts": 6300866177838.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866078735.275, "dur": 24.250, + "args": { + "External id": 89344, "cbid": 51, "correlation": 161167132 + } + }, + { + "ph": "s", "id": 161167132, "pid": 5714, "tid": 6744, "ts": 6300866078735.275, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866177841.497, "dur": 358.212, + "args": { + "External id": 89344, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167133, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167133, "pid": 0, "tid": 7, "ts": 6300866177841.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866078760.265, "dur": 25.109, + "args": { + "External id": 89344, "cbid": 307, "correlation": 161167133 + } + }, + { + "ph": "s", "id": 161167133, "pid": 5714, "tid": 6744, "ts": 6300866078760.265, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866078919.104, "dur": 1.760, + "args": { + "External id": 89345, "cbid": 200, "correlation": 161167158 + } + }, + { + "ph": "f", "id": 161167158, "pid": 5714, "tid": 6744, "ts": 6300866078919.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866178200.317, "dur": 357.445, + "args": { + "External id": 89345, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167161, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167161, "pid": 0, "tid": 7, "ts": 6300866178200.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866078927.394, "dur": 26.300, + "args": { + "External id": 89345, "cbid": 307, "correlation": 161167161 + } + }, + { + "ph": "s", "id": 161167161, "pid": 5714, "tid": 6744, "ts": 6300866078927.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866178558.402, "dur": 89.793, + "args": { + "External id": 89346, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167174, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167174, "pid": 0, "tid": 7, "ts": 6300866178558.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866079176.164, "dur": 30.590, + "args": { + "External id": 89346, "cbid": 307, "correlation": 161167174 + } + }, + { + "ph": "s", "id": 161167174, "pid": 5714, "tid": 6744, "ts": 6300866079176.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6300866178648.867, "dur": 4.224, + "args": { + "External id": 89347, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167182, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161167182, "pid": 0, "tid": 7, "ts": 6300866178648.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866079384.653, "dur": 30.370, + "args": { + "External id": 89347, "cbid": 307, "correlation": 161167182 + } + }, + { + "ph": "s", "id": 161167182, "pid": 5714, "tid": 6744, "ts": 6300866079384.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6300866178653.699, "dur": 113.761, + "args": { + "External id": 89348, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167190, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167190, "pid": 0, "tid": 7, "ts": 6300866178653.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866079571.993, "dur": 28.260, + "args": { + "External id": 89348, "cbid": 307, "correlation": 161167190 + } + }, + { + "ph": "s", "id": 161167190, "pid": 5714, "tid": 6744, "ts": 6300866079571.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866081096.599, "dur": 4.410, + "args": { + "External id": 89367, "cbid": 200, "correlation": 161167236 + } + }, + { + "ph": "f", "id": 161167236, "pid": 5714, "tid": 6744, "ts": 6300866081096.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866178768.708, "dur": 0.864, + "args": { + "External id": 89367, "device": 0, "context": 1, "stream": 7, 
"correlation": 161167239, "bytes": 576, "memory bandwidth (GB/s)": 0.6666666666666666 + } + }, + { + "ph": "f", "id": 161167239, "pid": 0, "tid": 7, "ts": 6300866178768.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866081115.049, "dur": 59.700, + "args": { + "External id": 89367, "cbid": 51, "correlation": 161167239 + } + }, + { + "ph": "s", "id": 161167239, "pid": 5714, "tid": 6744, "ts": 6300866081115.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866178770.788, "dur": 143.330, + "args": { + "External id": 89367, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167240, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167240, "pid": 0, "tid": 7, "ts": 6300866178770.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866081176.999, "dur": 63.660, + "args": { + "External id": 89367, "cbid": 307, "correlation": 161167240 + } + }, + { + "ph": "s", "id": 161167240, "pid": 5714, "tid": 6744, "ts": 6300866081176.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866178914.790, "dur": 319.779, + "args": { + "External id": 89368, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167262, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167262, "pid": 0, "tid": 7, "ts": 6300866178914.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866081523.928, "dur": 51.630, + "args": { + "External id": 89368, "cbid": 211, "correlation": 161167262 + } + }, + { + "ph": "s", "id": 161167262, "pid": 5714, "tid": 6744, "ts": 6300866081523.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866082167.767, "dur": 3.940, + "args": { + "External id": 89369, "cbid": 200, "correlation": 161167280 + } + }, + { + "ph": "f", "id": 161167280, "pid": 5714, "tid": 6744, "ts": 6300866082167.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866082172.827, "dur": 1.770, + "args": { + "External id": 89369, "cbid": 200, "correlation": 161167281 + } + }, + { + "ph": "f", "id": 161167281, "pid": 5714, "tid": 6744, "ts": 6300866082172.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866082382.917, "dur": 2.120, + "args": { + "External id": 89369, "cbid": 200, "correlation": 161167299 + } + }, + { + "ph": "f", "id": 161167299, "pid": 5714, "tid": 6744, "ts": 6300866082382.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866179239.785, "dur": 246.339, + "args": { + "External id": 89369, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167300, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": 
[256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167300, "pid": 0, "tid": 7, "ts": 6300866179239.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866082394.966, "dur": 77.430, + "args": { + "External id": 89369, "cbid": 211, "correlation": 161167300 + } + }, + { + "ph": "s", "id": 161167300, "pid": 5714, "tid": 6744, "ts": 6300866082394.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866082479.006, "dur": 7.880, + "args": { + "External id": 89369, "cbid": 273, "correlation": 161167302 + } + }, + { + "ph": "f", "id": 161167302, "pid": 5714, "tid": 6744, "ts": 6300866082479.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6300866179486.828, "dur": 1363.952, + "args": { + "External id": 89369, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167303, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161167303, "pid": 0, "tid": 7, "ts": 6300866179486.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866082489.596, "dur": 31.080, + "args": { + "External id": 89369, "cbid": 211, "correlation": 161167303 + } + }, + { + "ph": "s", "id": 161167303, "pid": 5714, "tid": 6744, "ts": 6300866082489.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6300866180851.484, "dur": 223.715, + "args": { + "External id": 89369, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167305, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161167305, "pid": 0, "tid": 7, "ts": 6300866180851.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866082525.416, "dur": 27.770, + "args": { + "External id": 89369, "cbid": 211, "correlation": 161167305 + } + }, + { + "ph": "s", "id": 161167305, "pid": 5714, "tid": 6744, "ts": 6300866082525.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866181078.719, "dur": 246.723, + "args": { + "External id": 89380, "device": 0, "context": 1, "stream": 7, "correlation": 161167327, "bytes": 25165824, "memory bandwidth (GB/s)": 102.00031614401576 + } + }, + { + "ph": "f", "id": 161167327, "pid": 0, "tid": 7, "ts": 6300866181078.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6300866083685.494, "dur": 135.139, + "args": { + "External id": 89380, "cbid": 41, "correlation": 161167327 + } + }, + { + "ph": "s", "id": 161167327, "pid": 5714, "tid": 6744, "ts": 6300866083685.494, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866181326.050, "dur": 101.185, + "args": { + "External id": 89377, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167345, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167345, "pid": 0, "tid": 7, "ts": 6300866181326.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866084792.961, "dur": 71.280, + "args": { + "External id": 89377, "cbid": 307, "correlation": 161167345 + } + }, + { + "ph": "s", "id": 161167345, "pid": 5714, "tid": 6744, "ts": 6300866084792.961, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866181427.875, "dur": 54.273, + "args": { + "External id": 89387, "device": 0, "context": 1, "stream": 7, "correlation": 161167360, "bytes": 25165824, "memory bandwidth (GB/s)": 463.68956939914875 + } + }, + { + "ph": "f", "id": 161167360, "pid": 0, "tid": 7, "ts": 6300866181427.875, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866085436.199, "dur": 154.800, + "args": { + "External id": 89387, "cbid": 41, "correlation": 161167360 + } + }, + { + "ph": "s", "id": 161167360, "pid": 5714, "tid": 6744, "ts": 6300866085436.199, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6300866181482.788, "dur": 44.033, + "args": { + "External id": 89384, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167378, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167378, "pid": 0, "tid": 7, "ts": 6300866181482.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866086373.717, "dur": 62.030, + "args": { + "External id": 89384, "cbid": 307, "correlation": 161167378 + } + }, + { + "ph": "s", "id": 161167378, "pid": 5714, "tid": 6744, "ts": 6300866086373.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866087450.045, "dur": 4.340, + "args": { + "External id": 89392, "cbid": 200, "correlation": 161167408 + } + }, + { + "ph": "f", "id": 161167408, "pid": 5714, "tid": 6744, "ts": 6300866087450.045, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866181534.085, "dur": 118.433, + "args": { + "External id": 89392, "device": 0, "context": 1, "stream": 7, "correlation": 161167411, "bytes": 576, "memory bandwidth (GB/s)": 0.004863509325948004 + } + }, + { + "ph": "f", "id": 161167411, "pid": 0, "tid": 7, "ts": 6300866181534.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866087468.985, "dur": 59.170, + "args": { + "External id": 89392, "cbid": 51, "correlation": 161167411 + } + }, + { + "ph": "s", "id": 161167411, "pid": 5714, "tid": 6744, "ts": 6300866087468.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866181659.686, "dur": 157.058, + "args": { + "External id": 89392, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167412, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167412, "pid": 0, "tid": 7, "ts": 6300866181659.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866087530.145, "dur": 63.769, + "args": { + "External id": 89392, "cbid": 307, "correlation": 161167412 + } + }, + { + "ph": "s", "id": 161167412, "pid": 5714, "tid": 6744, "ts": 6300866087530.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866087820.314, "dur": 2.470, + "args": { + "External id": 89393, "cbid": 200, "correlation": 161167437 + } + }, + { + "ph": "f", "id": 161167437, "pid": 5714, "tid": 6744, "ts": 6300866087820.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866181820.136, "dur": 1.408, + "args": { + "External id": 89393, "device": 0, "context": 1, "stream": 7, "correlation": 161167440, "bytes": 576, "memory bandwidth (GB/s)": 0.4090909090909091 + } + }, + { + "ph": "f", "id": 161167440, "pid": 0, "tid": 7, "ts": 6300866181820.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866087831.044, "dur": 32.700, + "args": { + "External id": 89393, "cbid": 51, "correlation": 161167440 + } + }, + { + "ph": "s", "id": 161167440, 
"pid": 5714, "tid": 6744, "ts": 6300866087831.044, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866181823.368, "dur": 137.249, + "args": { + "External id": 89393, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167441, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167441, "pid": 0, "tid": 7, "ts": 6300866181823.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866087864.974, "dur": 38.020, + "args": { + "External id": 89393, "cbid": 307, "correlation": 161167441 + } + }, + { + "ph": "s", "id": 161167441, "pid": 5714, "tid": 6744, "ts": 6300866087864.974, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6300866088086.553, "dur": 2.400, + "args": { + "External id": 89394, "cbid": 200, "correlation": 161167466 + } + }, + { + "ph": "f", "id": 161167466, "pid": 5714, "tid": 6744, "ts": 6300866088086.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6300866181961.449, "dur": 0.800, + "args": { + "External id": 89394, "device": 0, "context": 1, "stream": 7, "correlation": 161167469, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 161167469, "pid": 0, "tid": 7, "ts": 6300866181961.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6300866088097.024, "dur": 30.869, + "args": { + "External id": 
89394, "cbid": 51, "correlation": 161167469 + } + }, + { + "ph": "s", "id": 161167469, "pid": 5714, "tid": 6744, "ts": 6300866088097.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6300866181963.433, "dur": 135.842, + "args": { + "External id": 89394, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167470, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167470, "pid": 0, "tid": 7, "ts": 6300866181963.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866088129.013, "dur": 35.130, + "args": { + "External id": 89394, "cbid": 307, "correlation": 161167470 + } + }, + { + "ph": "s", "id": 161167470, "pid": 5714, "tid": 6744, "ts": 6300866088129.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866182099.947, "dur": 121.505, + "args": { + "External id": 89395, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167492, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167492, "pid": 0, "tid": 7, "ts": 6300866182099.947, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866088384.013, "dur": 42.920, + "args": { + "External id": 89395, "cbid": 211, "correlation": 161167492 + } + }, + { + "ph": "s", "id": 161167492, "pid": 5714, "tid": 6744, "ts": 6300866088384.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866182222.092, "dur": 121.186, + "args": { + "External id": 89396, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167515, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167515, "pid": 0, "tid": 7, "ts": 6300866182222.092, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866088579.502, "dur": 33.190, + "args": { + "External id": 89396, "cbid": 211, "correlation": 161167515 + } + }, + { + "ph": "s", "id": 161167515, "pid": 5714, "tid": 6744, "ts": 6300866088579.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6300866182343.982, "dur": 122.817, + "args": { + "External id": 89397, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167538, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 161167538, "pid": 0, "tid": 7, "ts": 6300866182343.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866088739.882, "dur": 30.350, + "args": { + "External id": 89397, "cbid": 211, "correlation": 161167538 + } + }, + { + "ph": "s", "id": 161167538, "pid": 5714, "tid": 6744, "ts": 6300866088739.882, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6300866182467.407, "dur": 79.073, + "args": { + "External id": 89398, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167546, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167546, "pid": 0, "tid": 7, "ts": 6300866182467.407, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866089003.511, "dur": 36.020, + "args": { + "External id": 89398, "cbid": 307, "correlation": 161167546 + } + }, + { + "ph": "s", "id": 161167546, "pid": 5714, "tid": 6744, "ts": 6300866089003.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6300866182547.088, "dur": 42.785, + "args": { + "External id": 89413, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167575, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167575, "pid": 0, "tid": 7, "ts": 6300866182547.088, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866090268.188, "dur": 120.010, + "args": { + "External id": 89413, "cbid": 307, "correlation": 161167575 + } + }, + { + "ph": "s", "id": 161167575, "pid": 5714, "tid": 6744, "ts": 6300866090268.188, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6300866182590.513, "dur": 1.824, + "args": { + "External id": 89414, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167583, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 161167583, "pid": 0, "tid": 7, "ts": 6300866182590.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866090610.128, "dur": 40.900, + "args": { + "External id": 89414, "cbid": 307, "correlation": 161167583 + } + }, + { + "ph": "s", "id": 161167583, "pid": 5714, "tid": 6744, "ts": 6300866090610.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6300866182593.073, "dur": 52.289, + "args": { + "External id": 89415, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167594, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167594, "pid": 0, "tid": 7, "ts": 6300866182593.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866090872.807, "dur": 38.920, + "args": { + "External id": 89415, "cbid": 307, "correlation": 161167594 + } + }, + { + "ph": "s", "id": 161167594, "pid": 5714, "tid": 6744, "ts": 6300866090872.807, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866182646.002, "dur": 43.936, + "args": { + "External id": 89416, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167599, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167599, "pid": 0, "tid": 7, "ts": 6300866182646.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866091193.217, "dur": 51.859, + "args": { + "External id": 89416, "cbid": 211, "correlation": 161167599 + } + }, + { + "ph": "s", "id": 161167599, "pid": 5714, "tid": 6744, "ts": 6300866091193.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091796.805, "dur": 36.470, + "args": { + "cbid": 138, "correlation": 161167613 + } + }, + { + "ph": "f", "id": 161167613, "pid": 5714, "tid": 1822426688, "ts": 6300866091796.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091835.765, "dur": 5.520, + "args": { + "cbid": 138, "correlation": 161167614 + } + }, + { + "ph": "f", "id": 161167614, "pid": 5714, "tid": 1822426688, "ts": 6300866091835.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091864.165, "dur": 3.940, + "args": { + "cbid": 138, "correlation": 161167615 + } + }, + { + "ph": "f", "id": 161167615, "pid": 5714, "tid": 1822426688, "ts": 6300866091864.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091891.125, "dur": 5.460, + "args": { + "cbid": 138, "correlation": 161167616 + } + }, + { + "ph": "f", "id": 161167616, "pid": 5714, "tid": 1822426688, "ts": 6300866091891.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091897.535, "dur": 3.260, 
+ "args": { + "cbid": 138, "correlation": 161167617 + } + }, + { + "ph": "f", "id": 161167617, "pid": 5714, "tid": 1822426688, "ts": 6300866091897.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091904.685, "dur": 3.090, + "args": { + "cbid": 138, "correlation": 161167618 + } + }, + { + "ph": "f", "id": 161167618, "pid": 5714, "tid": 1822426688, "ts": 6300866091904.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091928.175, "dur": 5.460, + "args": { + "cbid": 138, "correlation": 161167619 + } + }, + { + "ph": "f", "id": 161167619, "pid": 5714, "tid": 1822426688, "ts": 6300866091928.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091934.475, "dur": 2.570, + "args": { + "cbid": 138, "correlation": 161167620 + } + }, + { + "ph": "f", "id": 161167620, "pid": 5714, "tid": 1822426688, "ts": 6300866091934.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091941.785, "dur": 2.780, + "args": { + "cbid": 138, "correlation": 161167621 + } + }, + { + "ph": "f", "id": 161167621, "pid": 5714, "tid": 1822426688, "ts": 6300866091941.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091953.365, "dur": 6.140, + "args": { + "cbid": 138, "correlation": 161167622 + } + }, + { + "ph": "f", "id": 161167622, "pid": 5714, "tid": 1822426688, "ts": 6300866091953.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + 
"ts": 6300866091960.395, "dur": 2.740, + "args": { + "cbid": 138, "correlation": 161167623 + } + }, + { + "ph": "f", "id": 161167623, "pid": 5714, "tid": 1822426688, "ts": 6300866091960.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091967.895, "dur": 2.820, + "args": { + "cbid": 138, "correlation": 161167624 + } + }, + { + "ph": "f", "id": 161167624, "pid": 5714, "tid": 1822426688, "ts": 6300866091967.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091979.135, "dur": 5.460, + "args": { + "cbid": 138, "correlation": 161167625 + } + }, + { + "ph": "f", "id": 161167625, "pid": 5714, "tid": 1822426688, "ts": 6300866091979.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091985.455, "dur": 2.529, + "args": { + "cbid": 138, "correlation": 161167626 + } + }, + { + "ph": "f", "id": 161167626, "pid": 5714, "tid": 1822426688, "ts": 6300866091985.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866091991.575, "dur": 2.900, + "args": { + "cbid": 138, "correlation": 161167627 + } + }, + { + "ph": "f", "id": 161167627, "pid": 5714, "tid": 1822426688, "ts": 6300866091991.575, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092001.624, "dur": 5.840, + "args": { + "cbid": 138, "correlation": 161167628 + } + }, + { + "ph": "f", "id": 161167628, "pid": 5714, "tid": 1822426688, "ts": 6300866092001.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", 
"pid": 5714, "tid": 1822426688, + "ts": 6300866092018.175, "dur": 3.489, + "args": { + "cbid": 138, "correlation": 161167630 + } + }, + { + "ph": "f", "id": 161167630, "pid": 5714, "tid": 1822426688, "ts": 6300866092018.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092028.255, "dur": 3.809, + "args": { + "cbid": 138, "correlation": 161167632 + } + }, + { + "ph": "f", "id": 161167632, "pid": 5714, "tid": 1822426688, "ts": 6300866092028.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092038.404, "dur": 3.800, + "args": { + "cbid": 138, "correlation": 161167634 + } + }, + { + "ph": "f", "id": 161167634, "pid": 5714, "tid": 1822426688, "ts": 6300866092038.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092049.144, "dur": 4.711, + "args": { + "cbid": 138, "correlation": 161167636 + } + }, + { + "ph": "f", "id": 161167636, "pid": 5714, "tid": 1822426688, "ts": 6300866092049.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092060.664, "dur": 4.471, + "args": { + "cbid": 138, "correlation": 161167638 + } + }, + { + "ph": "f", "id": 161167638, "pid": 5714, "tid": 1822426688, "ts": 6300866092060.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092072.284, "dur": 3.460, + "args": { + "cbid": 138, "correlation": 161167640 + } + }, + { + "ph": "f", "id": 161167640, "pid": 5714, "tid": 1822426688, "ts": 6300866092072.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092082.194, "dur": 4.150, + "args": { + "cbid": 138, "correlation": 161167642 + } + }, + { + "ph": "f", "id": 161167642, "pid": 5714, "tid": 1822426688, "ts": 6300866092082.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092092.974, "dur": 4.040, + "args": { + "cbid": 138, "correlation": 161167644 + } + }, + { + "ph": "f", "id": 161167644, "pid": 5714, "tid": 1822426688, "ts": 6300866092092.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092103.014, "dur": 4.040, + "args": { + "cbid": 138, "correlation": 161167646 + } + }, + { + "ph": "f", "id": 161167646, "pid": 5714, "tid": 1822426688, "ts": 6300866092103.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092113.824, "dur": 3.530, + "args": { + "cbid": 138, "correlation": 161167648 + } + }, + { + "ph": "f", "id": 161167648, "pid": 5714, "tid": 1822426688, "ts": 6300866092113.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092123.614, "dur": 3.530, + "args": { + "cbid": 138, "correlation": 161167650 + } + }, + { + "ph": "f", "id": 161167650, "pid": 5714, "tid": 1822426688, "ts": 6300866092123.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866092133.454, "dur": 3.910, + "args": { + "cbid": 138, "correlation": 161167652 + } + }, + { + "ph": "f", "id": 161167652, "pid": 5714, "tid": 1822426688, "ts": 6300866092133.454, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866092601.323, "dur": 17.160, + "args": { + "External id": 89422, "cbid": 147, "correlation": 161167657 + } + }, + { + "ph": "s", "id": 161167657, "pid": 5714, "tid": 6744, "ts": 6300866092601.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866093232.782, "dur": 18.280, + "args": { + "External id": 89430, "cbid": 138, "correlation": 161167672 + } + }, + { + "ph": "f", "id": 161167672, "pid": 5714, "tid": 6744, "ts": 6300866093232.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866093253.792, "dur": 4.190, + "args": { + "External id": 89430, "cbid": 138, "correlation": 161167673 + } + }, + { + "ph": "f", "id": 161167673, "pid": 5714, "tid": 6744, "ts": 6300866093253.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866093259.272, "dur": 4.620, + "args": { + "External id": 89430, "cbid": 138, "correlation": 161167674 + } + }, + { + "ph": "f", "id": 161167674, "pid": 5714, "tid": 6744, "ts": 6300866093259.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866182696.498, "dur": 5.856, + "args": { + "External id": 89434, "device": 0, "context": 1, "stream": 7, "correlation": 161167685, "bytes": 28112, "memory bandwidth (GB/s)": 4.800546448087432 + } + }, + { + "ph": "f", "id": 161167685, "pid": 0, "tid": 7, "ts": 6300866182696.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866093436.861, "dur": 77.820, + "args": { + 
"External id": 89434, "cbid": 41, "correlation": 161167685 + } + }, + { + "ph": "s", "id": 161167685, "pid": 5714, "tid": 6744, "ts": 6300866093436.861, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866093541.961, "dur": 11.340, + "args": { + "External id": 89429, "cbid": 135, "correlation": 161167689 + } + }, + { + "ph": "f", "id": 161167689, "pid": 5714, "tid": 6744, "ts": 6300866093541.961, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866182703.954, "dur": 49.217, + "args": { + "External id": 89429, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167693, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167693, "pid": 0, "tid": 7, "ts": 6300866182703.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866093573.001, "dur": 65.620, + "args": { + "External id": 89429, "cbid": 211, "correlation": 161167693 + } + }, + { + "ph": "s", "id": 161167693, "pid": 5714, "tid": 6744, "ts": 6300866093573.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866093904.470, "dur": 6.820, + "args": { + "External id": 89422, "cbid": 135, "correlation": 161167704 + } + }, + { + "ph": "f", "id": 161167704, "pid": 5714, "tid": 6744, "ts": 6300866093904.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866093924.710, "dur": 9.530, + "args": { + "External id": 89422, "cbid": 147, "correlation": 161167708 + } + }, + { + "ph": "s", "id": 161167708, "pid": 5714, "tid": 6744, "ts": 6300866093924.710, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866094425.489, "dur": 7.470, + "args": { + "External id": 89438, "cbid": 317, "correlation": 161167728 + } + }, + { + "ph": "f", "id": 161167728, "pid": 5714, "tid": 6744, "ts": 6300866094425.489, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866094446.239, "dur": 10.560, + "args": { + "External id": 89438, "cbid": 135, "correlation": 161167730 + } + }, + { + "ph": "f", "id": 161167730, "pid": 5714, "tid": 6744, "ts": 6300866094446.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6300866094466.509, "dur": 8.560, + "args": { + "External id": 89438, "cbid": 147, "correlation": 161167734 + } + }, + { + "ph": "s", "id": 161167734, "pid": 5714, "tid": 6744, "ts": 6300866094466.509, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866094572.589, "dur": 5.330, + "args": { + "External id": 89438, "cbid": 409, "correlation": 161167737 + } + }, + { + "ph": "f", "id": 161167737, "pid": 5714, "tid": 6744, "ts": 6300866094572.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866094605.959, "dur": 5.300, + "args": { + "External id": 89438, "cbid": 135, "correlation": 161167740 + } + }, + { + "ph": "f", "id": 161167740, "pid": 5714, "tid": 6744, "ts": 6300866094605.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866094612.459, "dur": 5.920, + "args": { + "External id": 89438, "cbid": 147, "correlation": 161167741 + } + }, + { + "ph": "s", "id": 161167741, "pid": 5714, "tid": 6744, "ts": 6300866094612.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866182846.580, "dur": 10893.279, + "args": { + "External id": 89438, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161167743, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161167743, "pid": 0, "tid": 20, "ts": 6300866182846.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866094625.669, "dur": 67.420, + "args": { + "External id": 89438, "cbid": 430, "correlation": 161167743 + } + }, + { + "ph": "s", "id": 161167743, "pid": 5714, "tid": 6744, "ts": 6300866094625.669, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866094699.758, "dur": 2.760, + "args": { + "External id": 89438, "cbid": 135, "correlation": 161167745 + } + }, + { + "ph": "f", "id": 161167745, "pid": 5714, "tid": 6744, "ts": 6300866094699.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866094703.338, "dur": 3.820, + "args": { + "External id": 89438, "cbid": 147, "correlation": 161167746 + } + }, + { + "ph": "s", "id": 161167746, "pid": 5714, "tid": 6744, "ts": 6300866094703.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866094717.438, "dur": 3.940, + "args": { + "External id": 89438, "cbid": 135, "correlation": 161167749 + } + }, + { + "ph": "f", "id": 161167749, "pid": 5714, "tid": 6744, "ts": 6300866094717.438, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866094773.998, "dur": 3.170, + "args": { + "External id": 
89438, "cbid": 135, "correlation": 161167756 + } + }, + { + "ph": "f", "id": 161167756, "pid": 5714, "tid": 6744, "ts": 6300866094773.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866094949.518, "dur": 6.370, + "args": { + "External id": 89440, "cbid": 147, "correlation": 161167761 + } + }, + { + "ph": "s", "id": 161167761, "pid": 5714, "tid": 6744, "ts": 6300866094949.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866095067.837, "dur": 5.520, + "args": { + "External id": 89422, "cbid": 135, "correlation": 161167776 + } + }, + { + "ph": "f", "id": 161167776, "pid": 5714, "tid": 6744, "ts": 6300866095067.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866096544.424, "dur": 11.180, + "args": { + "External id": 89422, "cbid": 135, "correlation": 161167789 + } + }, + { + "ph": "f", "id": 161167789, "pid": 5714, "tid": 6744, "ts": 6300866096544.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_0", "pid": 0, "tid": 7, + "ts": 6300866182753.875, "dur": 95.617, + "args": { + "External id": 89452, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167805, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 3000.000000, "grid": [48000, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167805, "pid": 0, "tid": 7, "ts": 6300866182753.875, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866097703.742, "dur": 91.360, + "args": { + "External id": 89452, "cbid": 307, "correlation": 161167805 + } + }, + { + "ph": "s", "id": 161167805, "pid": 5714, "tid": 6744, "ts": 6300866097703.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_1", "pid": 0, "tid": 7, + "ts": 6300866182850.100, "dur": 891.530, + "args": { + "External id": 89453, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167810, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167810, "pid": 0, "tid": 7, "ts": 6300866182850.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866097970.051, "dur": 38.790, + "args": { + "External id": 89453, "cbid": 307, "correlation": 161167810 + } + }, + { + "ph": "s", "id": 161167810, "pid": 5714, "tid": 6744, "ts": 6300866097970.051, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_2", "pid": 0, "tid": 7, + "ts": 6300866183844.256, "dur": 887.626, + "args": { + "External id": 89454, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167818, "registers per thread": 20, "shared memory": 0, "blocks per SM": 187.500000, "warps per SM": 750.000000, "grid": [24000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167818, "pid": 0, "tid": 7, "ts": 6300866183844.256, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866098201.261, "dur": 41.529, + "args": { + "External id": 89454, "cbid": 307, "correlation": 161167818 + } + }, + { + "ph": "s", "id": 161167818, "pid": 5714, "tid": 6744, "ts": 6300866098201.261, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866184793.003, "dur": 269.507, + "args": { + "External id": 89455, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167823, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167823, "pid": 0, "tid": 7, "ts": 6300866184793.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866098553.450, "dur": 55.950, + "args": { + "External id": 89455, "cbid": 211, "correlation": 161167823 + } + }, + { + "ph": "s", "id": 161167823, "pid": 5714, "tid": 6744, "ts": 6300866098553.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866100918.944, "dur": 23.340, + "args": { + "cbid": 147, "correlation": 161167838 + } + }, + { + "ph": "s", "id": 161167838, "pid": 5714, "tid": 6744, "ts": 6300866100918.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866102371.701, "dur": 21.590, + "args": { + "External id": 89468, "cbid": 138, "correlation": 161167853 + } + }, + { + "ph": "f", "id": 161167853, "pid": 5714, "tid": 6744, "ts": 6300866102371.701, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6300866102395.991, "dur": 5.480, + "args": { + "External id": 89468, "cbid": 138, "correlation": 161167854 + } + }, + { + "ph": "f", "id": 161167854, "pid": 5714, "tid": 6744, "ts": 6300866102395.991, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6300866193741.555, "dur": 27.201, + "args": { + "External id": 89472, "device": 0, "context": 1, "stream": 7, "correlation": 161167865, "bytes": 208504, "memory bandwidth (GB/s)": 7.665306422557994 + } + }, + { + "ph": "f", "id": 161167865, "pid": 0, "tid": 7, "ts": 6300866193741.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6300866102623.051, "dur": 94.669, + "args": { + "External id": 89472, "cbid": 41, "correlation": 161167865 + } + }, + { + "ph": "s", "id": 161167865, "pid": 5714, "tid": 6744, "ts": 6300866102623.051, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866102751.060, "dur": 13.640, + "args": { + "External id": 89467, "cbid": 135, "correlation": 161167869 + } + }, + { + "ph": "f", "id": 161167869, "pid": 5714, "tid": 6744, "ts": 6300866102751.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6300866193770.388, "dur": 335.396, + "args": { + "External id": 89467, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161167873, "registers per thread": 32, "shared memory": 0, "blocks per SM": 807.281250, "warps per SM": 3229.125000, "grid": [25833, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 161167873, "pid": 0, "tid": 7, "ts": 6300866193770.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6300866102788.510, "dur": 81.630, + "args": { + "External id": 89467, "cbid": 211, "correlation": 161167873 + } + }, + { + "ph": "s", "id": 161167873, "pid": 5714, "tid": 6744, "ts": 6300866102788.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866103364.389, "dur": 8.500, + "args": { + "cbid": 135, "correlation": 161167884 + } + }, + { + "ph": "f", "id": 161167884, "pid": 5714, "tid": 6744, "ts": 6300866103364.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866103390.089, "dur": 11.070, + "args": { + "cbid": 147, "correlation": 161167888 + } + }, + { + "ph": "s", "id": 161167888, "pid": 5714, "tid": 6744, "ts": 6300866103390.089, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6300866103941.037, "dur": 8.320, + "args": { + "External id": 89476, "cbid": 317, "correlation": 161167908 + } + }, + { + "ph": "f", "id": 161167908, "pid": 5714, "tid": 6744, "ts": 6300866103941.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866103964.577, "dur": 10.220, + "args": { + "External id": 89476, "cbid": 135, "correlation": 161167910 + } + }, + { + "ph": "f", "id": 161167910, "pid": 5714, "tid": 6744, "ts": 6300866103964.577, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866103986.477, "dur": 8.820, 
+ "args": { + "External id": 89476, "cbid": 147, "correlation": 161167914 + } + }, + { + "ph": "s", "id": 161167914, "pid": 5714, "tid": 6744, "ts": 6300866103986.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6300866104108.687, "dur": 5.950, + "args": { + "External id": 89476, "cbid": 409, "correlation": 161167917 + } + }, + { + "ph": "f", "id": 161167917, "pid": 5714, "tid": 6744, "ts": 6300866104108.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866104148.137, "dur": 6.320, + "args": { + "External id": 89476, "cbid": 135, "correlation": 161167920 + } + }, + { + "ph": "f", "id": 161167920, "pid": 5714, "tid": 6744, "ts": 6300866104148.137, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866104155.987, "dur": 6.940, + "args": { + "External id": 89476, "cbid": 147, "correlation": 161167921 + } + }, + { + "ph": "s", "id": 161167921, "pid": 5714, "tid": 6744, "ts": 6300866104155.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866194109.016, "dur": 67807.100, + "args": { + "External id": 89476, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161167923, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 52894464, "Out msg nelems": 13223616, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161167923, "pid": 0, "tid": 20, "ts": 6300866194109.016, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6300866104171.757, "dur": 80.210, + "args": { + "External id": 89476, "cbid": 430, "correlation": 161167923 + } + }, + { + "ph": "s", "id": 161167923, "pid": 5714, "tid": 6744, "ts": 6300866104171.757, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866104260.497, "dur": 3.310, + "args": { + "External id": 89476, "cbid": 135, "correlation": 161167925 + } + }, + { + "ph": "f", "id": 161167925, "pid": 5714, "tid": 6744, "ts": 6300866104260.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866104264.797, "dur": 4.430, + "args": { + "External id": 89476, "cbid": 147, "correlation": 161167926 + } + }, + { + "ph": "s", "id": 161167926, "pid": 5714, "tid": 6744, "ts": 6300866104264.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866104281.337, "dur": 5.080, + "args": { + "External id": 89476, "cbid": 135, "correlation": 161167929 + } + }, + { + "ph": "f", "id": 161167929, "pid": 5714, "tid": 6744, "ts": 6300866104281.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866104391.116, "dur": 4.051, + "args": { + "External 
id": 89476, "cbid": 135, "correlation": 161167936 + } + }, + { + "ph": "f", "id": 161167936, "pid": 5714, "tid": 6744, "ts": 6300866104391.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866104604.126, "dur": 8.820, + "args": { + "External id": 89478, "cbid": 147, "correlation": 161167941 + } + }, + { + "ph": "s", "id": 161167941, "pid": 5714, "tid": 6744, "ts": 6300866104604.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866104744.746, "dur": 6.929, + "args": { + "cbid": 135, "correlation": 161167956 + } + }, + { + "ph": "f", "id": 161167956, "pid": 5714, "tid": 6744, "ts": 6300866104744.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6300866108569.507, "dur": 8.800, + "args": { + "cbid": 135, "correlation": 161167969 + } + }, + { + "ph": "f", "id": 161167969, "pid": 5714, "tid": 6744, "ts": 6300866108569.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866108943.366, "dur": 15.680, + "args": { + "cbid": 147, "correlation": 161167976 + } + }, + { + "ph": "s", "id": 161167976, "pid": 5714, "tid": 6744, "ts": 6300866108943.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109059.276, "dur": 8.500, + "args": { + "cbid": 147, "correlation": 161167986 + } + }, + { + "ph": "s", "id": 161167986, "pid": 5714, "tid": 6744, "ts": 6300866109059.276, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109139.066, "dur": 6.099, + "args": { + "cbid": 147, 
"correlation": 161167996 + } + }, + { + "ph": "s", "id": 161167996, "pid": 5714, "tid": 6744, "ts": 6300866109139.066, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109205.545, "dur": 5.511, + "args": { + "cbid": 147, "correlation": 161168006 + } + }, + { + "ph": "s", "id": 161168006, "pid": 5714, "tid": 6744, "ts": 6300866109205.545, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109273.245, "dur": 6.110, + "args": { + "cbid": 147, "correlation": 161168016 + } + }, + { + "ph": "s", "id": 161168016, "pid": 5714, "tid": 6744, "ts": 6300866109273.245, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109386.175, "dur": 7.200, + "args": { + "cbid": 147, "correlation": 161168026 + } + }, + { + "ph": "s", "id": 161168026, "pid": 5714, "tid": 6744, "ts": 6300866109386.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109459.045, "dur": 5.610, + "args": { + "cbid": 147, "correlation": 161168036 + } + }, + { + "ph": "s", "id": 161168036, "pid": 5714, "tid": 6744, "ts": 6300866109459.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109528.015, "dur": 5.510, + "args": { + "cbid": 147, "correlation": 161168046 + } + }, + { + "ph": "s", "id": 161168046, "pid": 5714, "tid": 6744, "ts": 6300866109528.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109598.715, "dur": 5.620, + "args": { + "cbid": 147, "correlation": 161168056 + } + }, + { + "ph": "s", "id": 161168056, 
"pid": 5714, "tid": 6744, "ts": 6300866109598.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109666.564, "dur": 5.191, + "args": { + "cbid": 147, "correlation": 161168066 + } + }, + { + "ph": "s", "id": 161168066, "pid": 5714, "tid": 6744, "ts": 6300866109666.564, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109732.484, "dur": 5.140, + "args": { + "cbid": 147, "correlation": 161168076 + } + }, + { + "ph": "s", "id": 161168076, "pid": 5714, "tid": 6744, "ts": 6300866109732.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6300866109797.074, "dur": 6.720, + "args": { + "cbid": 147, "correlation": 161168086 + } + }, + { + "ph": "s", "id": 161168086, "pid": 5714, "tid": 6744, "ts": 6300866109797.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866261917.780, "dur": 1.504, + "args": { + "External id": 85626, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168099, "pid": 0, "tid": 7, "ts": 6300866261917.780, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866110505.942, "dur": 159.780, + "args": { + "External id": 85626, "cbid": 211, "correlation": 161168099 + } + }, + { + "ph": "s", "id": 161168099, "pid": 5714, "tid": 5714, "ts": 6300866110505.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866261919.924, "dur": 1.312, + "args": { + "External id": 85627, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168109, "pid": 0, "tid": 7, "ts": 6300866261919.924, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866111011.431, "dur": 41.940, + "args": { + "External id": 85627, "cbid": 211, "correlation": 161168109 + } + }, + { + "ph": "s", "id": 161168109, "pid": 5714, "tid": 5714, "ts": 6300866111011.431, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866261921.908, "dur": 1.120, + "args": { + "External id": 85628, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168119, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168119, "pid": 0, "tid": 7, "ts": 6300866261921.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866111186.561, "dur": 37.180, + "args": { + "External id": 85628, "cbid": 211, "correlation": 161168119 + } + }, + { + "ph": "s", "id": 161168119, "pid": 5714, "tid": 5714, "ts": 6300866111186.561, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866261923.764, "dur": 1.088, + "args": { + "External id": 85629, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168129, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168129, "pid": 0, "tid": 7, "ts": 6300866261923.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866113056.927, "dur": 57.980, + "args": { + "External id": 85629, "cbid": 211, "correlation": 161168129 + } + }, + { + "ph": "s", "id": 161168129, "pid": 5714, "tid": 5714, "ts": 6300866113056.927, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866261925.492, "dur": 1.088, + "args": { + "External id": 85630, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168139, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168139, "pid": 0, "tid": 7, "ts": 6300866261925.492, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866113230.816, "dur": 34.110, + "args": { + "External id": 85630, "cbid": 211, "correlation": 161168139 + } + }, + { + "ph": "s", "id": 161168139, "pid": 5714, "tid": 5714, "ts": 6300866113230.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866261927.188, "dur": 0.864, + "args": { + "External id": 85636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168154, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.187500, "warps per SM": 0.750000, "grid": [24, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 161168154, "pid": 0, "tid": 7, "ts": 6300866261927.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866121205.978, "dur": 66.300, + "args": { + "External id": 85636, "cbid": 211, "correlation": 161168154 + } + }, + { + "ph": "s", "id": 161168154, "pid": 5714, "tid": 5714, "ts": 6300866121205.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::LpNormFunctor, float*, int>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::LpNormFunctor, float*, int)", "pid": 0, "tid": 7, + "ts": 6300866261928.725, "dur": 89.152, + "args": { + "External id": 85632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168673, "registers per thread": 28, "shared memory": 2048, "blocks per SM": 2.500000, "warps per SM": 40.000000, "grid": [320, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161168673, "pid": 0, "tid": 7, "ts": 6300866261928.725, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866122894.404, "dur": 40.300, + "args": { + "External id": 85632, "cbid": 211, "correlation": 161168673 + } + }, + { + "ph": "s", "id": 161168673, "pid": 5714, "tid": 5714, "ts": 6300866122894.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::LpNormFunctor, float*, int>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::LpNormFunctor, float*, int)", "pid": 0, "tid": 7, + "ts": 6300866262018.581, "dur": 61.025, + "args": { + "External id": 85632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168676, "registers per thread": 28, "shared memory": 2048, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 161168676, "pid": 0, "tid": 7, "ts": 6300866262018.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866122947.214, "dur": 19.180, + "args": { + "External id": 85632, "cbid": 211, "correlation": 161168676 + } + }, + { + "ph": "s", "id": 161168676, "pid": 5714, "tid": 5714, "ts": 6300866122947.214, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::lpnorm_cleanup(float const*, at::native::TensorListAddresses, int)", "pid": 0, "tid": 7, + "ts": 6300866262080.278, "dur": 1.728, + "args": { + "External id": 85632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168681, "registers per thread": 16, "shared memory": 2048, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161168681, "pid": 0, "tid": 7, "ts": 6300866262080.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866122988.474, "dur": 23.930, + "args": { + "External id": 85632, "cbid": 211, "correlation": 161168681 + } + }, + { + "ph": "s", "id": 161168681, "pid": 5714, "tid": 5714, "ts": 6300866122988.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy_aligned16_contig, unsigned int, 1, 128, 1>(at::native::(anonymous namespace)::OpaqueType<4u>*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, unsigned int, 128, 1>, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 0, "tid": 7, + "ts": 6300866262082.678, "dur": 1.568, + "args": { + "External id": 89607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168694, 
"registers per thread": 30, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 4.000000, "grid": [1, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 161168694, "pid": 0, "tid": 7, "ts": 6300866262082.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866135402.676, "dur": 69.760, + "args": { + "External id": 89607, "cbid": 211, "correlation": 161168694 + } + }, + { + "ph": "s", "id": 161168694, "pid": 5714, "tid": 5714, "ts": 6300866135402.676, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6300866262084.982, "dur": 2.304, + "args": { + "External id": 89609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168708, "registers per thread": 32, "shared memory": 528, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168708, "pid": 0, "tid": 7, "ts": 6300866262084.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866136112.074, "dur": 60.400, + "args": { + "External id": 89609, "cbid": 211, "correlation": 161168708 + } + }, + { + "ph": "s", "id": 161168708, "pid": 5714, "tid": 5714, "ts": 6300866136112.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262087.958, "dur": 1.152, + "args": { + "External id": 89612, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168718, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168718, "pid": 0, "tid": 7, "ts": 6300866262087.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866137365.521, "dur": 59.350, + "args": { + "External id": 89612, "cbid": 211, "correlation": 161168718 + } + }, + { + "ph": "s", "id": 161168718, "pid": 5714, "tid": 5714, "ts": 6300866137365.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6300866262089.814, "dur": 0.992, + "args": { + "External id": 89619, "device": 0, "context": 1, "stream": 7, "correlation": 161168730, "bytes": 4, "memory bandwidth (GB/s)": 0.004032258064516129 + } + }, + { + "ph": "f", "id": 161168730, "pid": 0, "tid": 7, "ts": 6300866262089.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866137716.151, "dur": 73.800, + "args": { + "External id": 89619, "cbid": 41, "correlation": 161168730 + } + }, + { + "ph": "s", "id": 161168730, "pid": 5714, "tid": 5714, "ts": 6300866137716.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300866137961.580, "dur": 5.480, + "args": { + "External id": 89621, "cbid": 317, "correlation": 161168736 + } + }, + { + "ph": "f", "id": 161168736, "pid": 5714, "tid": 5714, "ts": 6300866137961.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300866137980.750, "dur": 10.770, + "args": { + "External id": 89621, "cbid": 135, "correlation": 161168738 + } + }, + { + "ph": "f", "id": 161168738, "pid": 5714, "tid": 5714, "ts": 6300866137980.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300866138000.120, "dur": 8.540, + "args": { + "External id": 89621, "cbid": 147, "correlation": 161168742 + } + }, + { + "ph": "s", "id": 161168742, "pid": 5714, "tid": 5714, "ts": 6300866138000.120, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6300866138112.590, "dur": 5.270, + "args": { + "External id": 89621, "cbid": 409, "correlation": 161168745 + } + }, + { + "ph": "f", "id": 161168745, "pid": 5714, "tid": 5714, "ts": 6300866138112.590, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300866138146.510, "dur": 7.970, + "args": { + "External id": 89621, "cbid": 135, "correlation": 161168748 + } + }, + { + "ph": "f", "id": 161168748, "pid": 5714, "tid": 5714, "ts": 6300866138146.510, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300866138155.550, "dur": 6.660, + "args": { + "External id": 89621, "cbid": 147, "correlation": 161168749 + } + }, + { + "ph": "s", "id": 161168749, "pid": 5714, "tid": 5714, "ts": 6300866138155.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllReduce_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6300866262093.174, "dur": 384.325, + "args": { + "External id": 89621, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 161168751, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.007812, "warps per SM": 0.023438, "grid": [1, 1, 1], "block": [96, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "allreduce", "In msg nelems": 1, "Out msg nelems": 1, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 161168751, "pid": 0, "tid": 20, "ts": 6300866262093.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6300866138169.350, "dur": 55.589, + "args": { + "External id": 89621, "cbid": 430, "correlation": 161168751 + } + }, + { + "ph": "s", "id": 161168751, "pid": 5714, "tid": 5714, "ts": 6300866138169.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300866138232.019, "dur": 3.060, + "args": { + "External id": 89621, "cbid": 135, "correlation": 161168753 + } + }, + { + "ph": "f", "id": 161168753, "pid": 5714, "tid": 5714, "ts": 6300866138232.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300866138235.870, "dur": 3.829, + "args": { + "External id": 89621, "cbid": 147, "correlation": 161168754 + } + }, + { + "ph": "s", "id": 161168754, "pid": 5714, "tid": 5714, "ts": 6300866138235.870, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300866138250.839, "dur": 4.851, + "args": { + "External id": 89621, "cbid": 135, "correlation": 161168757 + } + }, + { + "ph": "f", "id": 161168757, "pid": 5714, "tid": 5714, "ts": 6300866138250.839, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6300866138358.549, "dur": 3.550, + "args": { + "External id": 89621, "cbid": 135, 
"correlation": 161168764 + } + }, + { + "ph": "f", "id": 161168764, "pid": 5714, "tid": 5714, "ts": 6300866138358.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6300866139169.967, "dur": 17.170, + "args": { + "External id": 89625, "cbid": 147, "correlation": 161168769 + } + }, + { + "ph": "s", "id": 161168769, "pid": 5714, "tid": 5714, "ts": 6300866139169.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262478.235, "dur": 1.120, + "args": { + "External id": 89626, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168785, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168785, "pid": 0, "tid": 7, "ts": 6300866262478.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866139673.346, "dur": 92.010, + "args": { + "External id": 89626, "cbid": 211, "correlation": 161168785 + } + }, + { + "ph": "s", "id": 161168785, "pid": 5714, "tid": 5714, "ts": 6300866139673.346, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262479.931, "dur": 0.992, + "args": { + "External id": 89632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168795, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168795, "pid": 0, "tid": 7, "ts": 6300866262479.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866141349.503, "dur": 84.419, + "args": { + "External id": 89632, "cbid": 211, "correlation": 161168795 + } + }, + { + "ph": "s", "id": 161168795, "pid": 5714, "tid": 5714, "ts": 6300866141349.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262481.499, "dur": 1.120, + "args": { + "External id": 89633, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168805, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168805, "pid": 0, "tid": 7, "ts": 6300866262481.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866141561.852, "dur": 42.280, + "args": { + "External id": 89633, "cbid": 211, "correlation": 161168805 + } + }, + { + "ph": "s", "id": 161168805, "pid": 5714, "tid": 5714, "ts": 6300866141561.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262483.227, "dur": 0.992, + "args": { + "External id": 89634, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168815, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168815, "pid": 0, "tid": 7, "ts": 6300866262483.227, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866141714.791, "dur": 40.540, + "args": { + "External id": 89634, "cbid": 211, "correlation": 161168815 + } + }, + { + "ph": "s", "id": 161168815, "pid": 5714, "tid": 5714, "ts": 6300866141714.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262484.955, "dur": 1.056, + "args": { + "External id": 89635, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168825, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168825, "pid": 0, "tid": 7, "ts": 6300866262484.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866141916.301, "dur": 43.350, + "args": { + "External id": 89635, "cbid": 211, "correlation": 161168825 + } + }, + { + "ph": "s", "id": 161168825, "pid": 5714, "tid": 5714, "ts": 6300866141916.301, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float)", "pid": 0, "tid": 7, + "ts": 6300866262486.619, "dur": 116.001, + "args": { + "External id": 89639, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168831, "registers per thread": 28, "shared memory": 0, "blocks per SM": 2.500000, "warps per SM": 40.000000, "grid": [320, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 161168831, "pid": 0, "tid": 7, "ts": 6300866262486.619, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866144775.975, "dur": 70.000, + "args": { + "External id": 89639, "cbid": 211, "correlation": 161168831 + } + }, + { + "ph": "s", "id": 161168831, "pid": 5714, "tid": 5714, "ts": 6300866144775.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float)", "pid": 0, "tid": 7, + "ts": 6300866262603.356, "dur": 111.809, + "args": { + "External id": 89639, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168834, "registers per thread": 28, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 161168834, "pid": 0, "tid": 7, "ts": 6300866262603.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866144860.755, "dur": 26.439, + "args": { + "External id": 89639, "cbid": 211, "correlation": 161168834 + } + }, + { + "ph": "s", "id": 161168834, "pid": 5714, "tid": 5714, "ts": 6300866144860.755, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866262715.837, "dur": 1.376, + "args": { + "External id": 89641, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168844, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168844, "pid": 0, "tid": 7, "ts": 6300866262715.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866145342.353, "dur": 62.320, + "args": { + "External id": 89641, "cbid": 211, "correlation": 161168844 + } + }, + { + "ph": "s", "id": 161168844, "pid": 5714, "tid": 5714, "ts": 6300866145342.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866145490.613, "dur": 20.860, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168849 + } + }, + { + "ph": "f", "id": 161168849, "pid": 5714, "tid": 5714, "ts": 6300866145490.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866145517.193, "dur": 5.490, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168850 + } + }, + { + "ph": "f", "id": 161168850, "pid": 5714, "tid": 5714, "ts": 6300866145517.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866145525.753, "dur": 4.440, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168851 + } + }, + { + "ph": "f", "id": 161168851, "pid": 5714, "tid": 5714, "ts": 6300866145525.753, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866145532.733, "dur": 4.130, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168852 + } + }, + { + "ph": "f", "id": 161168852, "pid": 5714, "tid": 5714, "ts": 6300866145532.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 
6300866145539.373, "dur": 3.860, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168853 + } + }, + { + "ph": "f", "id": 161168853, "pid": 5714, "tid": 5714, "ts": 6300866145539.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866145545.833, "dur": 3.990, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168854 + } + }, + { + "ph": "f", "id": 161168854, "pid": 5714, "tid": 5714, "ts": 6300866145545.833, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866145552.423, "dur": 5.020, + "args": { + "External id": 89644, "cbid": 138, "correlation": 161168855 + } + }, + { + "ph": "f", "id": 161168855, "pid": 5714, "tid": 5714, "ts": 6300866145552.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300866262725.150, "dur": 0.992, + "args": { + "External id": 89644, "device": 0, "context": 1, "stream": 7, "correlation": 161168858, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 161168858, "pid": 0, "tid": 7, "ts": 6300866262725.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866145574.593, "dur": 77.200, + "args": { + "External id": 89644, "cbid": 41, "correlation": 161168858 + } + }, + { + "ph": "s", "id": 161168858, "pid": 5714, "tid": 5714, "ts": 6300866145574.593, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192239.057, "dur": 13.980, + "args": { + "cbid": 138, "correlation": 161168860 + } + }, + { + "ph": "f", "id": 161168860, "pid": 5714, "tid": 1822426688, "ts": 
6300866192239.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192254.177, "dur": 2.230, + "args": { + "cbid": 138, "correlation": 161168861 + } + }, + { + "ph": "f", "id": 161168861, "pid": 5714, "tid": 1822426688, "ts": 6300866192254.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192268.007, "dur": 2.190, + "args": { + "cbid": 138, "correlation": 161168862 + } + }, + { + "ph": "f", "id": 161168862, "pid": 5714, "tid": 1822426688, "ts": 6300866192268.007, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192281.077, "dur": 2.590, + "args": { + "cbid": 138, "correlation": 161168863 + } + }, + { + "ph": "f", "id": 161168863, "pid": 5714, "tid": 1822426688, "ts": 6300866192281.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192284.267, "dur": 1.950, + "args": { + "cbid": 138, "correlation": 161168864 + } + }, + { + "ph": "f", "id": 161168864, "pid": 5714, "tid": 1822426688, "ts": 6300866192284.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192290.097, "dur": 1.860, + "args": { + "cbid": 138, "correlation": 161168865 + } + }, + { + "ph": "f", "id": 161168865, "pid": 5714, "tid": 1822426688, "ts": 6300866192290.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192325.056, "dur": 3.080, + "args": { + "cbid": 138, "correlation": 161168866 + } + }, + { + "ph": "f", "id": 161168866, 
"pid": 5714, "tid": 1822426688, "ts": 6300866192325.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192328.916, "dur": 1.740, + "args": { + "cbid": 138, "correlation": 161168867 + } + }, + { + "ph": "f", "id": 161168867, "pid": 5714, "tid": 1822426688, "ts": 6300866192328.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192333.807, "dur": 1.840, + "args": { + "cbid": 138, "correlation": 161168868 + } + }, + { + "ph": "f", "id": 161168868, "pid": 5714, "tid": 1822426688, "ts": 6300866192333.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192340.996, "dur": 2.540, + "args": { + "cbid": 138, "correlation": 161168869 + } + }, + { + "ph": "f", "id": 161168869, "pid": 5714, "tid": 1822426688, "ts": 6300866192340.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192344.176, "dur": 1.731, + "args": { + "cbid": 138, "correlation": 161168870 + } + }, + { + "ph": "f", "id": 161168870, "pid": 5714, "tid": 1822426688, "ts": 6300866192344.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192349.216, "dur": 1.820, + "args": { + "cbid": 138, "correlation": 161168871 + } + }, + { + "ph": "f", "id": 161168871, "pid": 5714, "tid": 1822426688, "ts": 6300866192349.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192356.356, "dur": 3.320, + "args": { + "cbid": 138, "correlation": 161168872 + } + }, 
+ { + "ph": "f", "id": 161168872, "pid": 5714, "tid": 1822426688, "ts": 6300866192356.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192360.316, "dur": 1.631, + "args": { + "cbid": 138, "correlation": 161168873 + } + }, + { + "ph": "f", "id": 161168873, "pid": 5714, "tid": 1822426688, "ts": 6300866192360.316, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192364.476, "dur": 1.791, + "args": { + "cbid": 138, "correlation": 161168874 + } + }, + { + "ph": "f", "id": 161168874, "pid": 5714, "tid": 1822426688, "ts": 6300866192364.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192372.367, "dur": 3.109, + "args": { + "cbid": 138, "correlation": 161168875 + } + }, + { + "ph": "f", "id": 161168875, "pid": 5714, "tid": 1822426688, "ts": 6300866192372.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192376.056, "dur": 1.771, + "args": { + "cbid": 138, "correlation": 161168876 + } + }, + { + "ph": "f", "id": 161168876, "pid": 5714, "tid": 1822426688, "ts": 6300866192376.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192381.847, "dur": 1.789, + "args": { + "cbid": 138, "correlation": 161168877 + } + }, + { + "ph": "f", "id": 161168877, "pid": 5714, "tid": 1822426688, "ts": 6300866192381.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192389.587, "dur": 3.189, + "args": { + "cbid": 138, 
"correlation": 161168878 + } + }, + { + "ph": "f", "id": 161168878, "pid": 5714, "tid": 1822426688, "ts": 6300866192389.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192393.336, "dur": 1.700, + "args": { + "cbid": 138, "correlation": 161168879 + } + }, + { + "ph": "f", "id": 161168879, "pid": 5714, "tid": 1822426688, "ts": 6300866192393.336, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192397.876, "dur": 1.751, + "args": { + "cbid": 138, "correlation": 161168880 + } + }, + { + "ph": "f", "id": 161168880, "pid": 5714, "tid": 1822426688, "ts": 6300866192397.876, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192407.347, "dur": 2.679, + "args": { + "cbid": 138, "correlation": 161168881 + } + }, + { + "ph": "f", "id": 161168881, "pid": 5714, "tid": 1822426688, "ts": 6300866192407.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192410.576, "dur": 1.710, + "args": { + "cbid": 138, "correlation": 161168882 + } + }, + { + "ph": "f", "id": 161168882, "pid": 5714, "tid": 1822426688, "ts": 6300866192410.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192415.156, "dur": 1.850, + "args": { + "cbid": 138, "correlation": 161168883 + } + }, + { + "ph": "f", "id": 161168883, "pid": 5714, "tid": 1822426688, "ts": 6300866192415.156, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192422.456, "dur": 
2.340, + "args": { + "cbid": 138, "correlation": 161168884 + } + }, + { + "ph": "f", "id": 161168884, "pid": 5714, "tid": 1822426688, "ts": 6300866192422.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192425.356, "dur": 1.640, + "args": { + "cbid": 138, "correlation": 161168885 + } + }, + { + "ph": "f", "id": 161168885, "pid": 5714, "tid": 1822426688, "ts": 6300866192425.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192429.866, "dur": 1.790, + "args": { + "cbid": 138, "correlation": 161168886 + } + }, + { + "ph": "f", "id": 161168886, "pid": 5714, "tid": 1822426688, "ts": 6300866192429.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192436.936, "dur": 3.010, + "args": { + "cbid": 138, "correlation": 161168887 + } + }, + { + "ph": "f", "id": 161168887, "pid": 5714, "tid": 1822426688, "ts": 6300866192436.936, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192440.496, "dur": 1.660, + "args": { + "cbid": 138, "correlation": 161168888 + } + }, + { + "ph": "f", "id": 161168888, "pid": 5714, "tid": 1822426688, "ts": 6300866192440.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192444.146, "dur": 1.780, + "args": { + "cbid": 138, "correlation": 161168889 + } + }, + { + "ph": "f", "id": 161168889, "pid": 5714, "tid": 1822426688, "ts": 6300866192444.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, 
+ "ts": 6300866192450.776, "dur": 2.290, + "args": { + "cbid": 138, "correlation": 161168890 + } + }, + { + "ph": "f", "id": 161168890, "pid": 5714, "tid": 1822426688, "ts": 6300866192450.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192453.636, "dur": 1.640, + "args": { + "cbid": 138, "correlation": 161168891 + } + }, + { + "ph": "f", "id": 161168891, "pid": 5714, "tid": 1822426688, "ts": 6300866192453.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192457.736, "dur": 1.780, + "args": { + "cbid": 138, "correlation": 161168892 + } + }, + { + "ph": "f", "id": 161168892, "pid": 5714, "tid": 1822426688, "ts": 6300866192457.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192464.996, "dur": 2.380, + "args": { + "cbid": 138, "correlation": 161168893 + } + }, + { + "ph": "f", "id": 161168893, "pid": 5714, "tid": 1822426688, "ts": 6300866192464.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192467.936, "dur": 1.640, + "args": { + "cbid": 138, "correlation": 161168894 + } + }, + { + "ph": "f", "id": 161168894, "pid": 5714, "tid": 1822426688, "ts": 6300866192467.936, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192471.956, "dur": 1.860, + "args": { + "cbid": 138, "correlation": 161168895 + } + }, + { + "ph": "f", "id": 161168895, "pid": 5714, "tid": 1822426688, "ts": 6300866192471.956, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192482.226, "dur": 3.020, + "args": { + "cbid": 138, "correlation": 161168896 + } + }, + { + "ph": "f", "id": 161168896, "pid": 5714, "tid": 1822426688, "ts": 6300866192482.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192485.816, "dur": 1.670, + "args": { + "cbid": 138, "correlation": 161168897 + } + }, + { + "ph": "f", "id": 161168897, "pid": 5714, "tid": 1822426688, "ts": 6300866192485.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192490.116, "dur": 1.820, + "args": { + "cbid": 138, "correlation": 161168898 + } + }, + { + "ph": "f", "id": 161168898, "pid": 5714, "tid": 1822426688, "ts": 6300866192490.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192497.066, "dur": 3.400, + "args": { + "cbid": 138, "correlation": 161168899 + } + }, + { + "ph": "f", "id": 161168899, "pid": 5714, "tid": 1822426688, "ts": 6300866192497.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192506.326, "dur": 3.220, + "args": { + "cbid": 138, "correlation": 161168901 + } + }, + { + "ph": "f", "id": 161168901, "pid": 5714, "tid": 1822426688, "ts": 6300866192506.326, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866192515.746, "dur": 3.310, + "args": { + "cbid": 138, "correlation": 161168903 + } + }, + { + "ph": "f", "id": 161168903, "pid": 5714, "tid": 1822426688, "ts": 6300866192515.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300866145654.383, "dur": 117094.564, + "args": { + "External id": 89644, "cbid": 131, "correlation": 161168859 + } + }, + { + "ph": "s", "id": 161168859, "pid": 5714, "tid": 5714, "ts": 6300866145654.383, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6300866263123.554, "dur": 0.992, + "args": { + "External id": 89648, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168920, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168920, "pid": 0, "tid": 7, "ts": 6300866263123.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866263049.806, "dur": 90.140, + "args": { + "External id": 89648, "cbid": 211, "correlation": 161168920 + } + }, + { + "ph": "s", "id": 161168920, "pid": 5714, "tid": 5714, "ts": 6300866263049.806, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6300866263280.740, "dur": 1.024, + "args": { + "External id": 89650, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168930, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 161168930, "pid": 0, "tid": 7, "ts": 6300866263280.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866263248.715, "dur": 40.170, + "args": { + "External id": 89650, "cbid": 211, "correlation": 161168930 + } + }, + { + "ph": "s", "id": 161168930, "pid": 5714, "tid": 5714, "ts": 6300866263248.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866263414.645, "dur": 13.280, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168935 + } + }, + { + "ph": "f", "id": 161168935, "pid": 5714, "tid": 5714, "ts": 6300866263414.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866263431.565, "dur": 5.810, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168936 + } + }, + { + "ph": "f", "id": 161168936, "pid": 5714, "tid": 5714, "ts": 6300866263431.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866263440.155, "dur": 4.190, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168937 + } + }, + { + "ph": "f", "id": 161168937, "pid": 5714, "tid": 5714, "ts": 6300866263440.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866263446.555, "dur": 5.030, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168938 + } + }, + { + "ph": "f", "id": 161168938, "pid": 5714, "tid": 5714, "ts": 6300866263446.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 
6300866263453.845, "dur": 3.770, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168939 + } + }, + { + "ph": "f", "id": 161168939, "pid": 5714, "tid": 5714, "ts": 6300866263453.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866263462.305, "dur": 4.610, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168940 + } + }, + { + "ph": "f", "id": 161168940, "pid": 5714, "tid": 5714, "ts": 6300866263462.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6300866263469.535, "dur": 4.470, + "args": { + "External id": 89653, "cbid": 138, "correlation": 161168941 + } + }, + { + "ph": "f", "id": 161168941, "pid": 5714, "tid": 5714, "ts": 6300866263469.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6300866263552.231, "dur": 0.896, + "args": { + "External id": 89653, "device": 0, "context": 1, "stream": 7, "correlation": 161168943, "bytes": 1, "memory bandwidth (GB/s)": 0.0011160714285714285 + } + }, + { + "ph": "f", "id": 161168943, "pid": 0, "tid": 7, "ts": 6300866263552.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6300866263489.015, "dur": 71.190, + "args": { + "External id": 89653, "cbid": 41, "correlation": 161168943 + } + }, + { + "ph": "s", "id": 161168943, "pid": 5714, "tid": 5714, "ts": 6300866263489.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300866263562.355, "dur": 13.490, + "args": { + "External id": 89653, "cbid": 131, "correlation": 161168944 + } + }, + { + "ph": "s", "id": 161168944, "pid": 5714, 
"tid": 5714, "ts": 6300866263562.355, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6300866264095.313, "dur": 7.600, + "args": { + "cbid": 317, "correlation": 161168950 + } + }, + { + "ph": "f", "id": 161168950, "pid": 5714, "tid": 5714, "ts": 6300866264095.313, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float)", "pid": 0, "tid": 7, + "ts": 6300866266510.858, "dur": 1.824, + "args": { + "External id": 89656, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168953, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.859375, "warps per SM": 13.750000, "grid": [110, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 29 + } + }, + { + "ph": "f", "id": 161168953, "pid": 0, "tid": 7, "ts": 6300866266510.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866266481.668, "dur": 36.570, + "args": { + "External id": 89656, "cbid": 211, "correlation": 161168953 + } + }, + { + "ph": "s", "id": 161168953, "pid": 5714, "tid": 5714, "ts": 6300866266481.668, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float)", "pid": 0, "tid": 7, + "ts": 6300866266535.306, "dur": 1.568, + "args": { + "External id": 89656, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168956, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.140625, "warps per SM": 2.250000, "grid": [18, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 5 + } + }, + { + "ph": "f", "id": 161168956, "pid": 0, "tid": 7, "ts": 6300866266535.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866266523.838, "dur": 13.320, + "args": { + "External id": 89656, "cbid": 211, "correlation": 161168956 + } + }, + { + "ph": "s", "id": 161168956, "pid": 5714, "tid": 5714, "ts": 6300866266523.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300866270323.671, "dur": 513.798, + "args": { + "External id": 89786, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168962, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.687500, "warps per SM": 27.000000, "grid": [216, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 56 + } + }, + { + "ph": "f", "id": 161168962, "pid": 0, "tid": 7, "ts": 6300866270323.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866270306.619, "dur": 19.460, + "args": { + "External id": 89786, "cbid": 211, "correlation": 161168962 + } + }, + { + "ph": "s", "id": 161168962, "pid": 5714, "tid": 5714, "ts": 6300866270306.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300866270838.333, "dur": 260.387, + "args": { + "External id": 89786, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168965, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161168965, "pid": 0, "tid": 7, "ts": 6300866270838.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866270330.869, "dur": 6.740, + "args": { + "External id": 89786, "cbid": 211, "correlation": 161168965 + } + }, + { + "ph": "s", "id": 161168965, "pid": 5714, "tid": 5714, "ts": 6300866270330.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300866271099.360, "dur": 260.483, + "args": { + "External id": 89786, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168968, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 161168968, "pid": 0, "tid": 7, "ts": 6300866271099.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866270341.689, "dur": 5.440, + "args": { + "External id": 89786, "cbid": 211, "correlation": 161168968 + } + }, + { + "ph": "s", "id": 161168968, "pid": 5714, "tid": 5714, "ts": 6300866270341.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6300866271360.547, "dur": 258.627, + "args": { + "External id": 89786, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 161168971, "registers per thread": 64, "shared memory": 0, "blocks per SM": 0.554688, "warps per SM": 8.875000, "grid": [71, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 18 + } + }, + { + "ph": "f", "id": 161168971, "pid": 0, "tid": 7, "ts": 6300866271360.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6300866270350.299, "dur": 5.220, + "args": { + "External id": 89786, "cbid": 211, "correlation": 161168971 + } + }, + { + "ph": "s", "id": 161168971, "pid": 5714, "tid": 5714, "ts": 6300866270350.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 5714, "tid": 5714, + "ts": 6300866270658.609, "dur": 963.167, + "args": { + "cbid": 165, "correlation": 161168977 + } + }, + { + "ph": "s", "id": 161168977, "pid": 5714, "tid": 5714, "ts": 6300866270658.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292589.269, "dur": 5.510, + "args": { + "cbid": 138, "correlation": 161168979 + } + }, + { + "ph": "f", "id": 161168979, "pid": 5714, "tid": 1822426688, "ts": 6300866292589.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292595.379, "dur": 0.580, + "args": { + "cbid": 138, "correlation": 161168980 + } + }, + { + "ph": "f", "id": 161168980, "pid": 5714, "tid": 1822426688, "ts": 6300866292595.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292599.599, "dur": 0.550, + "args": { + "cbid": 138, "correlation": 161168981 + } + }, + { + "ph": "f", "id": 161168981, "pid": 5714, "tid": 1822426688, "ts": 6300866292599.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292603.639, "dur": 1.050, + "args": { + 
"cbid": 138, "correlation": 161168982 + } + }, + { + "ph": "f", "id": 161168982, "pid": 5714, "tid": 1822426688, "ts": 6300866292603.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292604.929, "dur": 0.430, + "args": { + "cbid": 138, "correlation": 161168983 + } + }, + { + "ph": "f", "id": 161168983, "pid": 5714, "tid": 1822426688, "ts": 6300866292604.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292606.499, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 161168984 + } + }, + { + "ph": "f", "id": 161168984, "pid": 5714, "tid": 1822426688, "ts": 6300866292606.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292609.919, "dur": 0.990, + "args": { + "cbid": 138, "correlation": 161168985 + } + }, + { + "ph": "f", "id": 161168985, "pid": 5714, "tid": 1822426688, "ts": 6300866292609.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292611.049, "dur": 0.420, + "args": { + "cbid": 138, "correlation": 161168986 + } + }, + { + "ph": "f", "id": 161168986, "pid": 5714, "tid": 1822426688, "ts": 6300866292611.049, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6300866292612.399, "dur": 0.500, + "args": { + "cbid": 138, "correlation": 161168987 + } + }, + { + "ph": "f", "id": 161168987, "pid": 5714, "tid": 1822426688, "ts": 6300866292612.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "Optimizer.step#AdamW.step", "pid": 0, "tid": 7, + "ts": 
6300866266510.857, "dur": 5108.318, + "args": { + "External id": 89655 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce", "pid": 0, "tid": 7, + "ts": 6300866193741.554, "dur": 364.231, + "args": { + "External id": 89463 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.0)", "pid": 0, "tid": 7, + "ts": 6300866182696.497, "dur": 56.675, + "args": { + "External id": 89425 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 0, "tid": 7, + "ts": 6300866173279.138, "dur": 17.795, + "args": { + "External id": 89320 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 0, "tid": 7, + "ts": 6300865914145.439, "dur": 746.859, + "args": { + "External id": 83138 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 0, "tid": 7, + "ts": 6300866131129.651, "dur": 16.130, + "args": { + "External id": 88822 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 0, "tid": 7, + "ts": 6300865909378.791, "dur": 103.555, + "args": { + "External id": 83015 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 0, "tid": 7, + "ts": 6300865904439.117, "dur": 19.746, + "args": { + "External id": 82892 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.2)", "pid": 0, "tid": 7, + "ts": 6300866155826.134, "dur": 509.063, + "args": { + "External id": 89127 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 0, "tid": 7, + "ts": 6300865899600.660, "dur": 458.856, + "args": { + "External id": 82769 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": 
"FSDP::all_gather_copy_out (model.layers.4)", "pid": 0, "tid": 7, + "ts": 6300866116881.292, "dur": 16.290, + "args": { + "External id": 88656 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 0, "tid": 7, + "ts": 6300865893549.773, "dur": 469.448, + "args": { + "External id": 82646 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "ProfilerStep#5631", "pid": 0, "tid": 7, + "ts": 6300865686198.058, "dur": 577355.070, + "args": { + "External id": 81921 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 0, "tid": 7, + "ts": 6300865874408.045, "dur": 16.770, + "args": { + "External id": 82400 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 0, "tid": 7, + "ts": 6300866103090.890, "dur": 17.218, + "args": { + "External id": 88490 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 0, "tid": 7, + "ts": 6300865867929.600, "dur": 24.195, + "args": { + "External id": 82277 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.9)", "pid": 0, "tid": 7, + "ts": 6300866063222.006, "dur": 38.530, + "args": { + "External id": 87965 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 0, "tid": 7, + "ts": 6300866159271.262, "dur": 17.282, + "args": { + "External id": 89154 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.1)", "pid": 0, "tid": 7, + "ts": 6300866168757.613, "dur": 58.755, + "args": { + "External id": 89293 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 0, "tid": 7, + "ts": 6300865863632.846, "dur": 17.218, + "args": { + "External id": 82154 + } + }, + 
{ + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.4)", "pid": 0, "tid": 7, + "ts": 6300866126357.563, "dur": 190.820, + "args": { + "External id": 88795 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 0, "tid": 7, + "ts": 6300865887486.598, "dur": 627.657, + "args": { + "External id": 82523 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.3)", "pid": 0, "tid": 7, + "ts": 6300866140600.355, "dur": 189.860, + "args": { + "External id": 88961 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out", "pid": 0, "tid": 7, + "ts": 6300865860430.760, "dur": 1047.695, + "args": { + "External id": 82068 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.9)", "pid": 0, "tid": 7, + "ts": 6300865918837.334, "dur": 17.859, + "args": { + "External id": 83261 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 0, "tid": 7, + "ts": 6300866075518.342, "dur": 16.322, + "args": { + "External id": 88158 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 0, "tid": 7, + "ts": 6300866063264.118, "dur": 328.102, + "args": { + "External id": 87992 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.8)", "pid": 0, "tid": 7, + "ts": 6300866072461.987, "dur": 255.908, + "args": { + "External id": 88131 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.7)", "pid": 0, "tid": 7, + "ts": 6300866084757.843, "dur": 44.514, + "args": { + "External id": 88297 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 0, "tid": 7, + "ts": 
6300866089484.842, "dur": 15.042, + "args": { + "External id": 88324 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.6)", "pid": 0, "tid": 7, + "ts": 6300866099285.406, "dur": 47.106, + "args": { + "External id": 88463 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.5)", "pid": 0, "tid": 7, + "ts": 6300866113053.407, "dur": 47.939, + "args": { + "External id": 88629 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 0, "tid": 7, + "ts": 6300866145852.736, "dur": 476.392, + "args": { + "External id": 88988 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 0, "tid": 17, + "ts": 6300866103141.483, "dur": 1013.294, + "args": { + "External id": 88509 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 0, "tid": 17, + "ts": 6300866089533.643, "dur": 558.505, + "args": { + "External id": 88343 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 0, "tid": 17, + "ts": 6300866075568.711, "dur": 50.466, + "args": { + "External id": 88177 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather", "pid": 0, "tid": 17, + "ts": 6300865687419.833, "dur": 301.317, + "args": { + "External id": 81977 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 0, "tid": 17, + "ts": 6300865863651.822, "dur": 84.547, + "args": { + "External id": 82367 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 0, "tid": 17, + "ts": 6300866116931.917, "dur": 49.442, + "args": { + "External id": 88675 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 0, "tid": 17, + 
"ts": 6300865689520.465, "dur": 566.761, + "args": { + "External id": 82121 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 0, "tid": 17, + "ts": 6300865861535.253, "dur": 457.992, + "args": { + "External id": 82244 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 0, "tid": 17, + "ts": 6300866131179.508, "dur": 49.955, + "args": { + "External id": 88841 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 0, "tid": 17, + "ts": 6300865867956.705, "dur": 74.050, + "args": { + "External id": 82490 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 0, "tid": 17, + "ts": 6300865874428.653, "dur": 923.724, + "args": { + "External id": 82613 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 0, "tid": 17, + "ts": 6300865888399.313, "dur": 46.882, + "args": { + "External id": 82736 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 0, "tid": 17, + "ts": 6300866039286.493, "dur": 195.236, + "args": { + "External id": 87845 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 0, "tid": 17, + "ts": 6300866147190.992, "dur": 821.708, + "args": { + "External id": 89007 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 0, "tid": 17, + "ts": 6300865894315.094, "dur": 200.965, + "args": { + "External id": 82859 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 0, "tid": 17, + "ts": 6300865900133.243, "dur": 436.583, + "args": { + "External id": 82982 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 0, "tid": 17, + 
"ts": 6300865904628.207, "dur": 761.163, + "args": { + "External id": 83105 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 0, "tid": 17, + "ts": 6300866159322.014, "dur": 50.531, + "args": { + "External id": 89173 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.9)", "pid": 0, "tid": 17, + "ts": 6300865909784.716, "dur": 546.953, + "args": { + "External id": 83228 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 0, "tid": 17, + "ts": 6300866063645.019, "dur": 1290.257, + "args": { + "External id": 88011 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:all_reduce", "pid": 0, "tid": 20, + "ts": 6300866262093.173, "dur": 384.327, + "args": { + "External id": 89622 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866194109.015, "dur": 67807.102, + "args": { + "External id": 89477 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866167782.722, "dur": 5494.210, + "args": { + "External id": 89204 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865909376.423, "dur": 4762.874, + "args": { + "External id": 83136 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865914140.159, "dur": 4693.305, + "args": { + "External id": 83259 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866063262.262, "dur": 7716.189, + "args": { + "External id": 87979 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865904437.901, "dur": 4936.924, + "args": { + "External id": 
83013 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866182846.579, "dur": 10893.281, + "args": { + "External id": 89439 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865899596.692, "dur": 4839.707, + "args": { + "External id": 82890 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866154332.868, "dur": 4933.660, + "args": { + "External id": 89038 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866089482.538, "dur": 8511.782, + "args": { + "External id": 88311 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865893545.037, "dur": 6049.065, + "args": { + "External id": 82767 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865887481.542, "dur": 6061.321, + "args": { + "External id": 82644 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865867931.008, "dur": 6474.895, + "args": { + "External id": 82398 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865860426.408, "dur": 3205.640, + "args": { + "External id": 82152 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865863633.678, "dur": 4295.348, + "args": { + "External id": 82275 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866126241.850, "dur": 4885.147, + "args": { + "External id": 88706 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", 
"pid": 0, "tid": 20, + "ts": 6300865874408.141, "dur": 13071.067, + "args": { + "External id": 82521 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866140472.833, "dur": 5374.241, + "args": { + "External id": 88872 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300865687899.454, "dur": 172525.292, + "args": { + "External id": 82066 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866103088.106, "dur": 8676.520, + "args": { + "External id": 88477 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866075517.158, "dur": 8599.687, + "args": { + "External id": 88145 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866039684.065, "dur": 3667.469, + "args": { + "External id": 87876 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866116881.644, "dur": 9344.944, + "args": { + "External id": 88643 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866159267.678, "dur": 8514.406, + "args": { + "External id": 89141 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866098077.743, "dur": 5008.765, + "args": { + "External id": 88374 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866071101.587, "dur": 4414.517, + "args": { + "External id": 88042 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866131129.683, "dur": 9336.688, + "args": { + 
"External id": 88809 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866173279.394, "dur": 8578.759, + "args": { + "External id": 89307 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866111815.313, "dur": 5064.093, + "args": { + "External id": 88540 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6300866084194.124, "dur": 5286.688, + "args": { + "External id": 88208 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6300866145848.768, "dur": 8407.301, + "args": { + "External id": 88975 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 0, + "args": { + "labels": "CPU" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 0, + "args": { + "sort_index": 5714 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 0, + "args": { + "labels": "GPU 0" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 0, + "args": { + "sort_index": 5000000 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 1, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 1, "tid": 0, + "args": { + "labels": "GPU 1" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 1, "tid": 0, + "args": { + "sort_index": 5000001 + } + }, + { + "name": 
"process_name", "ph": "M", "ts": 6300865683362.830, "pid": 2, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 2, "tid": 0, + "args": { + "labels": "GPU 2" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 2, "tid": 0, + "args": { + "sort_index": 5000002 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 3, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 3, "tid": 0, + "args": { + "labels": "GPU 3" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 3, "tid": 0, + "args": { + "sort_index": 5000003 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 4, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 4, "tid": 0, + "args": { + "labels": "GPU 4" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 4, "tid": 0, + "args": { + "sort_index": 5000004 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 5, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 5, "tid": 0, + "args": { + "labels": "GPU 5" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 5, "tid": 0, + "args": { + "sort_index": 5000005 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 6, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 6, "tid": 0, + "args": { + "labels": "GPU 6" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 6, "tid": 0, + "args": { + "sort_index": 5000006 + } + }, + { + "name": "process_name", "ph": "M", "ts": 
6300865683362.830, "pid": 7, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 7, "tid": 0, + "args": { + "labels": "GPU 7" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 7, "tid": 0, + "args": { + "sort_index": 5000007 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 8, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 8, "tid": 0, + "args": { + "labels": "GPU 8" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 8, "tid": 0, + "args": { + "sort_index": 5000008 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 9, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 9, "tid": 0, + "args": { + "labels": "GPU 9" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 9, "tid": 0, + "args": { + "sort_index": 5000009 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 10, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 10, "tid": 0, + "args": { + "labels": "GPU 10" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 10, "tid": 0, + "args": { + "sort_index": 5000010 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 11, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 11, "tid": 0, + "args": { + "labels": "GPU 11" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 11, "tid": 0, + "args": { + "sort_index": 5000011 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 
12, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 12, "tid": 0, + "args": { + "labels": "GPU 12" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 12, "tid": 0, + "args": { + "sort_index": 5000012 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 13, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 13, "tid": 0, + "args": { + "labels": "GPU 13" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 13, "tid": 0, + "args": { + "sort_index": 5000013 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 14, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 14, "tid": 0, + "args": { + "labels": "GPU 14" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 14, "tid": 0, + "args": { + "sort_index": 5000014 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6300865683362.830, "pid": 15, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6300865683362.830, "pid": 15, "tid": 0, + "args": { + "labels": "GPU 15" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 15, "tid": 0, + "args": { + "sort_index": 5000015 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 7, + "args": { + "name": "stream 7 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 7, + "args": { + "sort_index": 7 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 17, + "args": { + "name": "stream 17 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 17, + 
"args": { + "sort_index": 17 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 20, + "args": { + "name": "stream 20 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 0, "tid": 20, + "args": { + "sort_index": 20 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 5714, + "args": { + "name": "thread 5714 (python3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 5714, + "args": { + "sort_index": 5714 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 6744, + "args": { + "name": "thread 6744 (python3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 6744, + "args": { + "sort_index": 6744 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 6744, + "args": { + "name": "thread 6744 (pt_autograd_0)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6300865683362.830, "pid": 5714, "tid": 6744, + "args": { + "sort_index": 6744 + } + }, + { + "ph": "X", "cat": "Trace", "ts": 6300865683275.381, "dur": 588357.532, + "pid": "Spans", "tid": "PyTorch Profiler", + "name": "PyTorch Profiler (0)", + "args": { + "Op count": 0 + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6300865683275.381, + "pid": "Spans", "tid": 0, + "args": { + "sort_index": 536870912 + } + }, + { + "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", + "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6300865683275.381 + }, + { + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": 6300866295336.230 + } + ], + "traceName": "exp/mtp.120M.batch8.seqlen2048.context2048.warmup1000.update1.steps15000.nft4.lr5e-4.cosine/profile_trace/iteration_5632/rank0_trace.json", + "displayTimeUnit": "ms", + "baseTimeNanoseconds": 1743521598000000000 +} \ 
No newline at end of file diff --git a/profile_trace/iteration_8192/rank3_trace.json b/profile_trace/iteration_8192/rank3_trace.json new file mode 100644 index 0000000000000000000000000000000000000000..d47b79a25fb3472a77f6e9530fcbd5c2d7bd1c60 --- /dev/null +++ b/profile_trace/iteration_8192/rank3_trace.json @@ -0,0 +1,108942 @@ + +{ + "schemaVersion": 1, + "deviceProperties": [ + { + "id": 0, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 1, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 2, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 3, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + } + ], + "cupti_version": 22, + "cuda_runtime_version": 12040, + 
"cuda_driver_version": 12040, + "distributedInfo": {"backend": "nccl", "rank": 3, "world_size": 4, "pg_count": 1, "pg_config": [{"pg_name": "0", "pg_desc": "default_pg", "backend_config": "cuda:nccl", "pg_size": 4, "ranks": [0, 1, 2, 3]}], "nccl_version": "2.21.5"}, + "record_shapes": 1, + "trace_id": "F36ADCCE357B4A088F59E30D7F4976CA", + "traceEvents": [ + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: DivBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685346597.950, "dur": 120.120, + "args": { + "External id": 126977,"Record function id": 0, "Sequence number": 2576068, "Fwd thread id": 1, "Ev Idx": 0 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "DivBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685346615.400, "dur": 91.590, + "args": { + "External id": 126978,"Sequence number": 2576068, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 1 + } + }, + { + "ph": "f", "id": 1, "pid": 5717, "tid": 6759, "ts": 6302685346615.400, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685346625.240, "dur": 78.890, + "args": { + "External id": 126979,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 2 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685346730.030, "dur": 286.599, + "args": { + "External id": 126980,"Record function id": 0, "Ev Idx": 3 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward", "pid": 5717, "tid": 6759, + "ts": 6302685346805.600, "dur": 124.109, + "args": { + "External id": 126981,"Record function id": 0, "Ev Idx": 4 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::backward_prefetch for model.layers.9", "pid": 5717, "tid": 6759, + "ts": 6302685346848.019, "dur": 64.720, + "args": { + "External id": 126982,"Record function id": 0, "Ev Idx": 5 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685346936.819, "dur": 2.630, + "args": { + "External id": 126983,"Sequence number": 2576067, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 6 + } + }, + { + "ph": "f", "id": 2, "pid": 5717, "tid": 6759, "ts": 6302685346936.819, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685346944.529, "dur": 64.600, + "args": { + "External id": 126984,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685346955.099, "dur": 53.340, + "args": { + "External id": 126985,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 8 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685346969.729, "dur": 3.080, + "args": { + "External id": 126986,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 9 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685347031.329, "dur": 15190.256, + "args": { + "External id": 126987,"Record function id": 0, 
"Sequence number": 2576065, "Fwd thread id": 1, "Ev Idx": 10 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685347034.259, "dur": 15170.766, + "args": { + "External id": 126988,"Sequence number": 2576065, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 11 + } + }, + { + "ph": "f", "id": 3, "pid": 5717, "tid": 6759, "ts": 6302685347034.259, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685347101.449, "dur": 6.730, + "args": { + "External id": 126989,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 12 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685347112.499, "dur": 14977.856, + "args": { + "External id": 126990,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 13 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685347116.189, "dur": 14973.346, + "args": { + "External id": 126991,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 14 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685347119.419, "dur": 8.180, + "args": { + "External id": 126992,"Record 
function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 15 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685347129.829, "dur": 14957.696, + "args": { + "External id": 126993,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 16 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5717, "tid": 6759, + "ts": 6302685362095.455, "dur": 0.390, + "args": { + "External id": 126994,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 17 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5717, "tid": 6759, + "ts": 6302685362098.615, "dur": 3.220, + "args": { + "External id": 126995,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 18 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5717, "tid": 6759, + "ts": 6302685362100.445, "dur": 1.240, + "args": { + "External id": 126996,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 19 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 6759, + "ts": 6302685362109.035, "dur": 36.500, + "args": { + "External id": 126997,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 20 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5717, "tid": 6759, + "ts": 6302685362155.445, "dur": 36.760, + "args": { + "External id": 
126998,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 21 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 6759, + "ts": 6302685362157.775, "dur": 34.230, + "args": { + "External id": 126999,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 22 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 6759, + "ts": 6302685362160.465, "dur": 31.140, + "args": { + "External id": 127000,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 23 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362239.994, "dur": 22.160, + "args": { + "External id": 127001,"Record function id": 0, "Sequence number": 2576064, "Fwd thread id": 1, "Ev Idx": 24 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362242.354, "dur": 15.711, + "args": { + "External id": 127002,"Sequence number": 2576064, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 25 + } + }, + { + "ph": "f", "id": 4, "pid": 5717, "tid": 6759, "ts": 6302685362242.354, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685362248.465, "dur": 9.240, + "args": { + "External id": 127003,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 26 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 
6759, + "ts": 6302685362250.774, "dur": 6.691, + "args": { + "External id": 127004,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 27 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362266.614, "dur": 149.450, + "args": { + "External id": 127005,"Record function id": 0, "Sequence number": 2576063, "Fwd thread id": 1, "Ev Idx": 28 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362267.865, "dur": 134.559, + "args": { + "External id": 127006,"Sequence number": 2576063, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 29 + } + }, + { + "ph": "f", "id": 5, "pid": 5717, "tid": 6759, "ts": 6302685362267.865, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685362271.445, "dur": 129.129, + "args": { + "External id": 127007,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 30 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685362281.385, "dur": 54.879, + "args": { + "External id": 127008,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 31 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685362286.274, "dur": 20.180, + "args": { + "External id": 127009,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 32 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685362308.734, "dur": 27.130, + "args": { + "External id": 127010,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 33 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685362313.684, "dur": 21.220, + "args": { + "External id": 127011,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 34 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685362341.184, "dur": 7.180, + "args": { + "External id": 127012,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 35 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685362345.014, "dur": 1.380, + "args": { + "External id": 127013,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 36 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685362351.434, "dur": 48.140, + "args": { + "External id": 127014,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 37 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362430.554, "dur": 76.640, + "args": { + "External id": 127015,"Record function id": 0, "Sequence number": 2576062, "Fwd thread id": 1, "Ev Idx": 38 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362433.724, "dur": 68.180, + "args": { + "External id": 127016,"Sequence number": 2576062, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 39 + } + }, + { + "ph": "f", "id": 6, "pid": 5717, "tid": 6759, "ts": 6302685362433.724, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5717, "tid": 6759, + "ts": 6302685362439.834, "dur": 61.690, + "args": { + "External id": 127017,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "3"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 40 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685362444.034, "dur": 23.180, + "args": { + "External id": 127018,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], 
[], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 41 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685362445.424, "dur": 6.910, + "args": { + "External id": 127019,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 42 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685362453.314, "dur": 13.580, + "args": { + "External id": 127020,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 43 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685362454.444, "dur": 11.680, + "args": { + "External id": 127021,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 44 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685362471.084, "dur": 9.510, + "args": { + "External id": 127022,"Record function id": 0, "Concrete Inputs": ["", "2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 45 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685362477.594, "dur": 1.590, + "args": { + "External id": 127023,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": 
[[8, 2048, 4, 768], [], [], []], "Ev Idx": 46 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685362481.384, "dur": 19.520, + "args": { + "External id": 127024,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 47 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362514.504, "dur": 68.420, + "args": { + "External id": 127025,"Record function id": 0, "Sequence number": 2576061, "Fwd thread id": 1, "Ev Idx": 48 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362516.184, "dur": 61.840, + "args": { + "External id": 127026,"Sequence number": 2576061, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 49 + } + }, + { + "ph": "f", "id": 7, "pid": 5717, "tid": 6759, "ts": 6302685362516.184, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685362518.434, "dur": 59.180, + "args": { + "External id": 127027,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 50 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685362521.464, "dur": 20.290, + "args": { + "External id": 127028,"Record function id": 0, "Concrete 
Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 51 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685362522.694, "dur": 5.410, + "args": { + "External id": 127029,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 52 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685362528.904, "dur": 12.560, + "args": { + "External id": 127030,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 53 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685362530.864, "dur": 9.840, + "args": { + "External id": 127031,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 54 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685362544.434, "dur": 3.100, + "args": { + "External id": 127032,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 55 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685362546.254, "dur": 0.920, + "args": { + "External id": 127033,"Record function id": 0, "Concrete Inputs": 
["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 56 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685362548.324, "dur": 28.520, + "args": { + "External id": 127034,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 57 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362591.584, "dur": 51.280, + "args": { + "External id": 127035,"Record function id": 0, "Sequence number": 2576060, "Fwd thread id": 1, "Ev Idx": 58 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362593.124, "dur": 45.309, + "args": { + "External id": 127036,"Sequence number": 2576060, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 59 + } + }, + { + "ph": "f", "id": 8, "pid": 5717, "tid": 6759, "ts": 6302685362593.124, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685362594.934, "dur": 43.090, + "args": { + "External id": 127037,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], 
"Ev Idx": 60 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685362596.754, "dur": 19.130, + "args": { + "External id": 127038,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 61 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685362597.924, "dur": 4.980, + "args": { + "External id": 127039,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 62 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685362603.754, "dur": 11.870, + "args": { + "External id": 127040,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 63 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685362604.774, "dur": 10.010, + "args": { + "External id": 127041,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 64 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685362618.164, "dur": 5.730, + "args": { + "External id": 127042,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 65 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685362621.574, "dur": 1.990, + "args": { + "External id": 127043,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 66 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685362624.644, "dur": 12.689, + "args": { + "External id": 127044,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 67 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362650.173, "dur": 44.111, + "args": { + "External id": 127045,"Record function id": 0, "Sequence number": 2576059, "Fwd thread id": 1, "Ev Idx": 68 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685362652.153, "dur": 1.171, + "args": { + "External id": 127046,"Sequence number": 2576059, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 69 + } + }, + { + "ph": "f", "id": 9, "pid": 5717, "tid": 6759, "ts": 6302685362652.153, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685362656.453, "dur": 33.200, + "args": { + "External id": 127047,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 70 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685362659.353, "dur": 29.871, + "args": { + "External id": 127048,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 71 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685362667.873, "dur": 0.600, + "args": { + "External id": 127049,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 72 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685362701.533, "dur": 704.099, + "args": { + "External id": 127050,"Record function id": 0, "Sequence number": 2576057, "Fwd thread id": 1, "Ev Idx": 73 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685362703.393, "dur": 652.619, + "args": { + "External id": 127051,"Sequence number": 2576057, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 74 + } + }, + { + "ph": "f", "id": 10, "pid": 5717, "tid": 6759, "ts": 6302685362703.393, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685362745.293, "dur": 3.460, + "args": { + "External id": 127052,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 75 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685362751.193, "dur": 521.989, + "args": { + "External id": 127053,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 76 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685362754.423, "dur": 517.209, + "args": { + "External id": 127054,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 77 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685362757.283, "dur": 6.240, + "args": { + "External id": 127055,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 78 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685362764.583, "dur": 506.309, + "args": { + "External id": 127056,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 79 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5717, "tid": 6759, + "ts": 6302685363277.282, "dur": 0.210, + "args": { + "External id": 127057,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 80 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5717, "tid": 6759, + 
"ts": 6302685363278.722, "dur": 3.180, + "args": { + "External id": 127058,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 81 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5717, "tid": 6759, + "ts": 6302685363280.872, "dur": 0.830, + "args": { + "External id": 127059,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 82 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 6759, + "ts": 6302685363287.712, "dur": 32.750, + "args": { + "External id": 127060,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 83 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5717, "tid": 6759, + "ts": 6302685363326.992, "dur": 21.810, + "args": { + "External id": 127061,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 84 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 6759, + "ts": 6302685363328.062, "dur": 20.510, + "args": { + "External id": 127062,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 85 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 6759, + "ts": 6302685363329.092, "dur": 19.020, + "args": { + "External id": 127063,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 86 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685363373.982, "dur": 27.440, + "args": { + "External id": 127064,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 87 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363417.502, "dur": 13.380, + "args": { + "External id": 127065,"Record function id": 0, "Sequence number": 2576056, "Fwd thread id": 1, "Ev Idx": 88 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363419.692, "dur": 7.760, + "args": { + "External id": 127066,"Sequence number": 2576056, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 89 + } + }, + { + "ph": "f", "id": 11, "pid": 5717, "tid": 6759, "ts": 6302685363419.692, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685363422.462, "dur": 4.770, + "args": { + "External id": 127067,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 90 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685363423.932, "dur": 3.050, + "args": { + "External id": 127068,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 91 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363434.592, "dur": 62.020, + "args": { + "External id": 127069,"Record function id": 0, "Sequence number": 2576055, "Fwd thread id": 1, "Ev Idx": 92 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363435.862, "dur": 53.490, + "args": { + "External id": 127070,"Sequence number": 2576055, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 93 + } + }, + { + "ph": "f", "id": 12, "pid": 5717, "tid": 6759, "ts": 6302685363435.862, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685363437.912, "dur": 50.990, + "args": { + "External id": 127071,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 94 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685363440.222, "dur": 23.080, + "args": { + "External id": 127072,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 95 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685363441.782, "dur": 5.480, + "args": { + "External id": 127073,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 96 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685363448.212, "dur": 14.690, + "args": { + "External id": 127074,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 97 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685363449.732, "dur": 12.380, + "args": { + "External id": 127075,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 98 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685363466.092, "dur": 3.710, + "args": { + "External id": 127076,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 99 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685363468.422, "dur": 0.950, + "args": { + "External id": 127077,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685363470.822, "dur": 17.270, + "args": { + "External id": 127078,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363504.442, "dur": 52.020, + "args": { + "External id": 
127079,"Record function id": 0, "Sequence number": 2576054, "Fwd thread id": 1, "Ev Idx": 102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363506.362, "dur": 45.529, + "args": { + "External id": 127080,"Sequence number": 2576054, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 103 + } + }, + { + "ph": "f", "id": 13, "pid": 5717, "tid": 6759, "ts": 6302685363506.362, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5717, "tid": 6759, + "ts": 6302685363508.522, "dur": 43.060, + "args": { + "External id": 127081,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "2"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685363510.212, "dur": 20.879, + "args": { + "External id": 127082,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685363511.402, "dur": 6.000, + "args": { + "External id": 127083,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685363518.222, "dur": 
12.600, + "args": { + "External id": 127084,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685363519.351, "dur": 10.731, + "args": { + "External id": 127085,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685363533.662, "dur": 4.989, + "args": { + "External id": 127086,"Record function id": 0, "Concrete Inputs": ["", "2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685363537.002, "dur": 1.069, + "args": { + "External id": 127087,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685363539.511, "dur": 11.511, + "args": { + "External id": 127088,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + 
"ts": 6302685363563.222, "dur": 56.829, + "args": { + "External id": 127089,"Record function id": 0, "Sequence number": 2576053, "Fwd thread id": 1, "Ev Idx": 112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363564.802, "dur": 50.879, + "args": { + "External id": 127090,"Sequence number": 2576053, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 113 + } + }, + { + "ph": "f", "id": 14, "pid": 5717, "tid": 6759, "ts": 6302685363564.802, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685363566.882, "dur": 48.399, + "args": { + "External id": 127091,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685363568.551, "dur": 20.080, + "args": { + "External id": 127092,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685363569.822, "dur": 5.229, + "args": { + "External id": 127093,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], 
[]], "Ev Idx": 116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685363575.771, "dur": 12.551, + "args": { + "External id": 127094,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685363577.842, "dur": 9.649, + "args": { + "External id": 127095,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685363591.002, "dur": 4.080, + "args": { + "External id": 127096,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685363593.882, "dur": 0.809, + "args": { + "External id": 127097,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685363595.962, "dur": 18.659, + "args": { + "External id": 127098,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 
3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363627.101, "dur": 59.730, + "args": { + "External id": 127099,"Record function id": 0, "Sequence number": 2576052, "Fwd thread id": 1, "Ev Idx": 122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363628.841, "dur": 40.480, + "args": { + "External id": 127100,"Sequence number": 2576052, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 123 + } + }, + { + "ph": "f", "id": 15, "pid": 5717, "tid": 6759, "ts": 6302685363628.841, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685363630.771, "dur": 38.150, + "args": { + "External id": 127101,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685363632.341, "dur": 18.330, + "args": { + "External id": 127102,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685363633.371, "dur": 4.390, + "args": { + "External id": 
127103,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685363638.651, "dur": 11.760, + "args": { + "External id": 127104,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685363639.841, "dur": 9.870, + "args": { + "External id": 127105,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685363651.721, "dur": 2.790, + "args": { + "External id": 127106,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685363653.421, "dur": 0.780, + "args": { + "External id": 127107,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 
6302685363656.471, "dur": 11.780, + "args": { + "External id": 127108,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685363674.231, "dur": 10.260, + "args": { + "External id": 127109,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363694.811, "dur": 35.740, + "args": { + "External id": 127110,"Record function id": 0, "Sequence number": 2576051, "Fwd thread id": 1, "Ev Idx": 133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685363696.571, "dur": 1.230, + "args": { + "External id": 127111,"Sequence number": 2576051, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 134 + } + }, + { + "ph": "f", "id": 16, "pid": 5717, "tid": 6759, "ts": 6302685363696.571, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685363699.801, "dur": 23.630, + "args": { + "External id": 127112,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685363701.341, "dur": 21.640, + 
"args": { + "External id": 127113,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685363708.801, "dur": 0.670, + "args": { + "External id": 127114,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685363738.381, "dur": 1109.747, + "args": { + "External id": 127115,"Record function id": 0, "Sequence number": 2576049, "Fwd thread id": 1, "Ev Idx": 138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685363741.401, "dur": 1079.908, + "args": { + "External id": 127116,"Sequence number": 2576049, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 139 + } + }, + { + "ph": "f", "id": 17, "pid": 5717, "tid": 6759, "ts": 6302685363741.401, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685363767.471, "dur": 2.420, + "args": { + "External id": 127117,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685363771.841, "dur": 983.068, + 
"args": { + "External id": 127118,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685363773.161, "dur": 981.348, + "args": { + "External id": 127119,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685363775.471, "dur": 6.900, + "args": { + "External id": 127120,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685363783.461, "dur": 970.188, + "args": { + "External id": 127121,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5717, "tid": 6759, + "ts": 6302685364757.839, "dur": 0.190, + "args": { + "External id": 127122,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5717, "tid": 6759, + "ts": 6302685364759.139, "dur": 2.980, + "args": { + "External id": 127123,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5717, "tid": 6759, + "ts": 6302685364761.189, "dur": 0.740, + "args": { + "External id": 127124,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 6759, + "ts": 6302685364765.449, "dur": 22.750, + "args": { + "External id": 127125,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5717, "tid": 6759, + "ts": 6302685364793.849, "dur": 20.600, + "args": { + "External id": 127126,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 6759, + "ts": 6302685364794.799, "dur": 19.430, + "args": { + "External id": 127127,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 6759, + "ts": 6302685364795.979, "dur": 17.810, + "args": { + "External id": 127128,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685364830.179, "dur": 13.840, + "args": { + "External id": 127129,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], 
[]], "Ev Idx": 152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685364860.228, "dur": 14.091, + "args": { + "External id": 127130,"Record function id": 0, "Sequence number": 2576048, "Fwd thread id": 1, "Ev Idx": 153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685364862.419, "dur": 8.809, + "args": { + "External id": 127131,"Sequence number": 2576048, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 154 + } + }, + { + "ph": "f", "id": 18, "pid": 5717, "tid": 6759, "ts": 6302685364862.419, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685364865.488, "dur": 5.511, + "args": { + "External id": 127132,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685364867.888, "dur": 2.891, + "args": { + "External id": 127133,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685364878.048, "dur": 68.710, + "args": { + "External id": 127134,"Record function id": 0, "Sequence number": 2576047, "Fwd thread id": 1, "Ev Idx": 157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685364879.479, "dur": 57.899, + "args": { + 
"External id": 127135,"Sequence number": 2576047, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 158 + } + }, + { + "ph": "f", "id": 19, "pid": 5717, "tid": 6759, "ts": 6302685364879.479, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685364881.379, "dur": 55.549, + "args": { + "External id": 127136,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685364883.819, "dur": 23.000, + "args": { + "External id": 127137,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685364885.428, "dur": 5.520, + "args": { + "External id": 127138,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685364892.088, "dur": 14.440, + "args": { + "External id": 127139,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 
768]], "Ev Idx": 162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685364893.499, "dur": 12.229, + "args": { + "External id": 127140,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685364908.168, "dur": 3.840, + "args": { + "External id": 127141,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685364910.608, "dur": 1.011, + "args": { + "External id": 127142,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685364912.999, "dur": 21.740, + "args": { + "External id": 127143,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685364960.978, "dur": 59.720, + "args": { + "External id": 127144,"Record function id": 0, "Sequence number": 2576046, "Fwd thread id": 1, "Ev Idx": 167 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685364963.888, "dur": 52.130, + "args": { + "External id": 127145,"Sequence number": 2576046, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 168 + } + }, + { + "ph": "f", "id": 20, "pid": 5717, "tid": 6759, "ts": 6302685364963.888, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5717, "tid": 6759, + "ts": 6302685364967.208, "dur": 48.530, + "args": { + "External id": 127146,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685364970.228, "dur": 25.150, + "args": { + "External id": 127147,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685364972.678, "dur": 8.510, + "args": { + "External id": 127148,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685364982.128, "dur": 12.990, + "args": { + "External id": 127149,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685364983.288, "dur": 11.100, + "args": { + "External id": 127150,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685364996.358, "dur": 6.340, + "args": { + "External id": 127151,"Record function id": 0, "Concrete Inputs": ["", "2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685365000.978, "dur": 1.120, + "args": { + "External id": 127152,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685365003.558, "dur": 11.570, + "args": { + "External id": 127153,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685365027.688, "dur": 47.460, + "args": { + "External id": 127154,"Record function id": 0, 
"Sequence number": 2576045, "Fwd thread id": 1, "Ev Idx": 177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685365029.388, "dur": 41.590, + "args": { + "External id": 127155,"Sequence number": 2576045, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 178 + } + }, + { + "ph": "f", "id": 21, "pid": 5717, "tid": 6759, "ts": 6302685365029.388, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685365031.658, "dur": 38.900, + "args": { + "External id": 127156,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685365033.218, "dur": 17.390, + "args": { + "External id": 127157,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685365034.358, "dur": 4.290, + "args": { + "External id": 127158,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, 
"tid": 6759, + "ts": 6302685365039.288, "dur": 11.080, + "args": { + "External id": 127159,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685365040.508, "dur": 9.130, + "args": { + "External id": 127160,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685365051.878, "dur": 3.420, + "args": { + "External id": 127161,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685365053.778, "dur": 1.170, + "args": { + "External id": 127162,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685365056.078, "dur": 13.770, + "args": { + "External id": 127163,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 186 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685365083.598, "dur": 73.120, + "args": { + "External id": 127164,"Record function id": 0, "Sequence number": 2576044, "Fwd thread id": 1, "Ev Idx": 187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685365085.538, "dur": 52.700, + "args": { + "External id": 127165,"Sequence number": 2576044, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 188 + } + }, + { + "ph": "f", "id": 22, "pid": 5717, "tid": 6759, "ts": 6302685365085.538, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685365088.358, "dur": 49.430, + "args": { + "External id": 127166,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685365090.038, "dur": 27.840, + "args": { + "External id": 127167,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685365091.138, "dur": 8.310, + "args": { + "External id": 127168,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input 
type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685365100.598, "dur": 15.730, + "args": { + "External id": 127169,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685365102.778, "dur": 12.760, + "args": { + "External id": 127170,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685365119.248, "dur": 3.310, + "args": { + "External id": 127171,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685365121.258, "dur": 0.990, + "args": { + "External id": 127172,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685365123.368, "dur": 13.740, + "args": { + "External id": 127173,"Record function id": 0, "Concrete 
Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685365143.738, "dur": 10.740, + "args": { + "External id": 127174,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685365165.038, "dur": 35.480, + "args": { + "External id": 127175,"Record function id": 0, "Sequence number": 2576043, "Fwd thread id": 1, "Ev Idx": 198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685365168.158, "dur": 1.310, + "args": { + "External id": 127176,"Sequence number": 2576043, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 199 + } + }, + { + "ph": "f", "id": 23, "pid": 5717, "tid": 6759, "ts": 6302685365168.158, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685365171.008, "dur": 23.210, + "args": { + "External id": 127177,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685365172.718, "dur": 21.060, + "args": { + "External id": 127178,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], 
"Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685365179.248, "dur": 0.660, + "args": { + "External id": 127179,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685365207.228, "dur": 1082.407, + "args": { + "External id": 127180,"Record function id": 0, "Sequence number": 2576042, "Fwd thread id": 1, "Ev Idx": 203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685365219.778, "dur": 1042.397, + "args": { + "External id": 127181,"Sequence number": 2576042, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 204 + } + }, + { + "ph": "f", "id": 24, "pid": 5717, "tid": 6759, "ts": 6302685365219.778, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685365243.788, "dur": 2.590, + "args": { + "External id": 127182,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685365248.378, "dur": 952.247, + "args": { + "External id": 127183,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", 
"False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685365249.488, "dur": 950.728, + "args": { + "External id": 127184,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685365251.408, "dur": 5.580, + "args": { + "External id": 127185,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685365258.138, "dur": 941.198, + "args": { + "External id": 127186,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5717, "tid": 6759, + "ts": 6302685366203.676, "dur": 0.149, + "args": { + "External id": 127187,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5717, "tid": 6759, + "ts": 6302685366205.136, "dur": 2.089, + "args": { + "External id": 127188,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 211 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5717, "tid": 6759, + "ts": 6302685366206.285, "dur": 0.740, + "args": { + "External id": 127189,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 6759, + "ts": 6302685366210.605, "dur": 20.131, + "args": { + "External id": 127190,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5717, "tid": 6759, + "ts": 6302685366235.485, "dur": 20.351, + "args": { + "External id": 127191,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 6759, + "ts": 6302685366237.605, "dur": 17.991, + "args": { + "External id": 127192,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 6759, + "ts": 6302685366238.625, "dur": 16.560, + "args": { + "External id": 127193,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685366271.435, "dur": 13.780, + "args": { + "External id": 127194,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366305.325, "dur": 13.390, + "args": { + "External id": 127195,"Record function id": 0, "Sequence number": 2576041, "Fwd thread id": 1, "Ev Idx": 218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366307.745, "dur": 7.770, + "args": { + "External id": 127196,"Sequence number": 2576041, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 219 + } + }, + { + "ph": "f", "id": 25, "pid": 5717, "tid": 6759, "ts": 6302685366307.745, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685366311.005, "dur": 4.280, + "args": { + "External id": 127197,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685366312.125, "dur": 2.880, + "args": { + "External id": 127198,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366322.865, "dur": 63.640, + "args": { + "External id": 127199,"Record function id": 0, "Sequence number": 2576040, "Fwd thread id": 1, "Ev Idx": 222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366324.205, "dur": 54.910, + "args": { + "External id": 127200,"Sequence number": 2576040, "Fwd thread id": 1, 
"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 223 + } + }, + { + "ph": "f", "id": 26, "pid": 5717, "tid": 6759, "ts": 6302685366324.205, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685366326.535, "dur": 52.130, + "args": { + "External id": 127201,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685366328.995, "dur": 24.290, + "args": { + "External id": 127202,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685366331.665, "dur": 5.420, + "args": { + "External id": 127203,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685366338.165, "dur": 14.810, + "args": { + "External id": 127204,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685366339.575, "dur": 12.640, + "args": { + "External id": 127205,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685366354.755, "dur": 3.910, + "args": { + "External id": 127206,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685366357.315, "dur": 0.910, + "args": { + "External id": 127207,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685366359.695, "dur": 18.130, + "args": { + "External id": 127208,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366395.485, "dur": 60.510, + "args": { + "External id": 127209,"Record function id": 0, "Sequence number": 2576039, "Fwd thread id": 1, "Ev Idx": 232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 
5717, "tid": 6759, + "ts": 6302685366397.365, "dur": 53.970, + "args": { + "External id": 127210,"Sequence number": 2576039, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 233 + } + }, + { + "ph": "f", "id": 27, "pid": 5717, "tid": 6759, "ts": 6302685366397.365, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5717, "tid": 6759, + "ts": 6302685366399.545, "dur": 51.360, + "args": { + "External id": 127211,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685366401.455, "dur": 29.940, + "args": { + "External id": 127212,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685366404.445, "dur": 8.770, + "args": { + "External id": 127213,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685366416.915, "dur": 14.140, + "args": { + "External id": 127214,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], 
"Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685366418.235, "dur": 12.080, + "args": { + "External id": 127215,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685366432.605, "dur": 5.290, + "args": { + "External id": 127216,"Record function id": 0, "Concrete Inputs": ["", "2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685366436.115, "dur": 1.210, + "args": { + "External id": 127217,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685366438.795, "dur": 11.490, + "args": { + "External id": 127218,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366463.235, "dur": 67.730, + "args": { + "External id": 127219,"Record function id": 0, "Sequence number": 2576038, "Fwd thread id": 1, "Ev Idx": 242 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366465.065, "dur": 61.520, + "args": { + "External id": 127220,"Sequence number": 2576038, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 243 + } + }, + { + "ph": "f", "id": 28, "pid": 5717, "tid": 6759, "ts": 6302685366465.065, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685366467.135, "dur": 58.920, + "args": { + "External id": 127221,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685366468.895, "dur": 22.220, + "args": { + "External id": 127222,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685366470.095, "dur": 4.790, + "args": { + "External id": 127223,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685366476.545, "dur": 14.200, + "args": { + "External 
id": 127224,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685366477.775, "dur": 12.090, + "args": { + "External id": 127225,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685366493.895, "dur": 3.380, + "args": { + "External id": 127226,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685366495.915, "dur": 0.940, + "args": { + "External id": 127227,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685366498.235, "dur": 27.000, + "args": { + "External id": 127228,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366540.585, "dur": 77.410, + "args": { + "External id": 127229,"Record function id": 0, "Sequence number": 2576037, "Fwd thread id": 1, "Ev Idx": 252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685366543.075, "dur": 48.260, + "args": { + "External id": 127230,"Sequence number": 2576037, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 253 + } + }, + { + "ph": "f", "id": 29, "pid": 5717, "tid": 6759, "ts": 6302685366543.075, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5717, "tid": 6759, + "ts": 6302685366545.385, "dur": 45.550, + "args": { + "External id": 127231,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 6759, + "ts": 6302685366547.435, "dur": 24.430, + "args": { + "External id": 127232,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685366549.765, "dur": 5.230, + "args": { + "External id": 127233,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 6759, + "ts": 6302685366555.915, "dur": 15.650, + "args": { + "External id": 127234,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 6759, + "ts": 6302685366559.425, "dur": 11.420, + "args": { + "External id": 127235,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685366573.085, "dur": 3.070, + "args": { + "External id": 127236,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685366574.825, "dur": 1.040, + "args": { + "External id": 127237,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685366577.085, "dur": 13.080, + "args": { + "External id": 127238,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685366599.905, "dur": 14.759, + "args": { + "External id": 127239,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685366631.855, "dur": 379.209, + "args": { + "External id": 127240,"Record function id": 0, "Sequence number": 2576036, "Fwd thread id": 1, "Ev Idx": 263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685366634.204, "dur": 367.630, + "args": { + "External id": 127241,"Sequence number": 2576036, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 264 + } + }, + { + "ph": "f", "id": 30, "pid": 5717, "tid": 6759, "ts": 6302685366634.204, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685366807.654, "dur": 38.610, + "args": { + "External id": 127242,"kernel_hash": "claezs3y243gqb7p27czfjeb6dpkfa64yia3acm4avk75qpmejcr", "grid": "grid(131328,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "8", "2048", "4", "131328", "384"], "kernel_file": "/tmp/torchinductor_root/la/claezs3y243gqb7p27czfjeb6dpkfa64yia3acm4avk75qpmejcr.py", "kernel_backend": "triton", 
"Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], [8192, 4, 1, 1], [131328, 131328, 131328, 1, 768], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], [8, 2048, 4, 1], [1, 1, 1, 768, 171], [], [], [], [], []], "Ev Idx": 265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685366870.874, "dur": 23.080, + "args": { + "External id": 127243,"kernel_hash": "c2lykxf44rmprhs7blf2vlasz5aag5ltqmm65boxvwgrm34bivxu", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "171"], "kernel_file": "/tmp/torchinductor_root/2l/c2lykxf44rmprhs7blf2vlasz5aag5ltqmm65boxvwgrm34bivxu.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[131328, 131328, 131328, 1, 768], [768, 768, 768, 1], [], []], "Input Dims": [[1, 1, 1, 768, 171], [1, 1, 1, 768], [], []], "Ev Idx": 266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685366919.624, "dur": 38.090, + "args": { + "External id": 127244,"kernel_hash": "czjcllx7i5w3k4f6xkhehxhp74vfmvhv7fivz4vxnhvev6uf35bz", "grid": "grid(65536,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "65536", "768"], "kernel_file": "/tmp/torchinductor_root/zj/czjcllx7i5w3k4f6xkhehxhp74vfmvhv7fivz4vxnhvev6uf35bz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [1], [6291456, 3072, 768, 1], [8192, 4, 1, 1], [6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [768], [8, 2048, 4, 768], [8, 2048, 4, 1], [8, 2048, 4, 768], [], []], "Ev Idx": 267 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367026.844, "dur": 16.510, + "args": { + "External id": 127245,"Record function id": 0, "Ev Idx": 268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367030.374, "dur": 10.250, + "args": { + "External id": 127246,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685367034.914, "dur": 4.850, + "args": { + "External id": 127247,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685367035.904, "dur": 3.660, + "args": { + "External id": 127248,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: StackBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367048.814, "dur": 31.549, + "args": { + "External id": 127249,"Record function id": 0, "Sequence number": 2576035, "Fwd thread id": 1, "Ev Idx": 272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "StackBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367050.054, "dur": 21.600, + "args": { + "External id": 127250,"Sequence number": 2576035, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 273 + } + }, + { + "ph": "f", "id": 31, "pid": 5717, "tid": 6759, "ts": 6302685367050.054, + "cat": 
"fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685367051.963, "dur": 8.711, + "args": { + "External id": 127251,"Record function id": 0, "Concrete Inputs": ["", "-2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367057.674, "dur": 1.489, + "args": { + "External id": 127252,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685367061.414, "dur": 3.749, + "args": { + "External id": 127253,"Record function id": 0, "Concrete Inputs": ["", "-2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367063.123, "dur": 1.480, + "args": { + "External id": 127254,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685367065.723, "dur": 2.420, + "args": { + "External id": 127255,"Record function id": 0, "Concrete Inputs": ["", "-2", "2"], "Input type": ["c10::BFloat16", 
"Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367067.374, "dur": 0.289, + "args": { + "External id": 127256,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 6759, + "ts": 6302685367068.674, "dur": 2.360, + "args": { + "External id": 127257,"Record function id": 0, "Concrete Inputs": ["", "-2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367070.323, "dur": 0.240, + "args": { + "External id": 127258,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367084.634, "dur": 5.440, + "args": { + "External id": 127259,"Record function id": 0, "Sequence number": 2576034, "Fwd thread id": 1, "Ev Idx": 282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367085.854, "dur": 0.869, + "args": { + "External id": 127260,"Sequence number": 2576034, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 283 + } + }, + { + "ph": "f", "id": 32, "pid": 5717, "tid": 6759, "ts": 6302685367085.854, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685367094.783, "dur": 488.279, + "args": { + "External id": 127261,"Record function id": 0, "Sequence number": 2576033, "Fwd thread id": 1, "Ev Idx": 284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685367096.383, "dur": 474.139, + "args": { + "External id": 127262,"Sequence number": 2576033, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 285 + } + }, + { + "ph": "f", "id": 33, "pid": 5717, "tid": 6759, "ts": 6302685367096.383, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367144.043, "dur": 12.130, + "args": { + "External id": 127263,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685367150.393, "dur": 5.160, + "args": { + "External id": 127264,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 
6302685367160.633, "dur": 8.920, + "args": { + "External id": 127265,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367164.533, "dur": 4.240, + "args": { + "External id": 127266,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367167.833, "dur": 0.640, + "args": { + "External id": 127267,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 6759, + "ts": 6302685367174.533, "dur": 86.860, + "args": { + "External id": 127268,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367175.653, "dur": 5.860, + "args": { + "External id": 127269,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367178.633, "dur": 2.260, + "args": { + "External id": 127270,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", 
"Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367180.273, "dur": 0.420, + "args": { + "External id": 127271,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 6759, + "ts": 6302685367183.553, "dur": 77.000, + "args": { + "External id": 127272,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685367186.613, "dur": 73.140, + "args": { + "External id": 127273,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 6759, + "ts": 6302685367268.603, "dur": 4.490, + "args": { + "External id": 127274,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367270.623, "dur": 2.280, + "args": { + "External id": 127275,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 
2048], []], "Ev Idx": 298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685367326.053, "dur": 10.110, + "args": { + "External id": 127276,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685367337.603, "dur": 2.980, + "args": { + "External id": 127277,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685367341.683, "dur": 3.230, + "args": { + "External id": 127278,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367386.573, "dur": 4.820, + "args": { + "External id": 127279,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367388.543, "dur": 2.550, + "args": { + "External id": 127280,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 
2048], []], "Ev Idx": 303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5717, "tid": 6759, + "ts": 6302685367416.813, "dur": 135.089, + "args": { + "External id": 127281,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685367425.013, "dur": 8.180, + "args": { + "External id": 127282,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367430.013, "dur": 1.260, + "args": { + "External id": 127283,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685367435.343, "dur": 4.840, + "args": { + "External id": 127284,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367439.033, "dur": 0.390, + "args": { + "External id": 127285,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 308 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685367442.583, "dur": 2.040, + "args": { + "External id": 127286,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367443.933, "dur": 0.310, + "args": { + "External id": 127287,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685367445.583, "dur": 2.250, + "args": { + "External id": 127288,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367447.193, "dur": 0.250, + "args": { + "External id": 127289,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685367455.253, "dur": 3.250, + "args": { + "External id": 127290,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367456.633, "dur": 1.540, + "args": { + "External id": 127291,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367459.653, "dur": 4.760, + "args": { + "External id": 127292,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685367462.733, "dur": 1.460, + "args": { + "External id": 127293,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685367465.393, "dur": 2.070, + "args": { + "External id": 127294,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367466.933, "dur": 0.220, + "args": { + "External id": 127295,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 318 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367468.313, "dur": 1.930, + "args": { + "External id": 127296,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367469.063, "dur": 1.050, + "args": { + "External id": 127297,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685367472.113, "dur": 66.509, + "args": { + "External id": 127298,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367541.402, "dur": 1.931, + "args": { + "External id": 127299,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685367544.402, "dur": 3.060, + "args": { + "External id": 127300,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367546.502, 
"dur": 0.460, + "args": { + "External id": 127301,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367549.802, "dur": 0.651, + "args": { + "External id": 127302,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367595.552, "dur": 10.380, + "args": { + "External id": 127303,"Record function id": 0, "Ev Idx": 326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367598.442, "dur": 6.170, + "args": { + "External id": 127304,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685367600.662, "dur": 3.150, + "args": { + "External id": 127305,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685367601.512, "dur": 2.070, + "args": { + "External id": 127306,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367610.362, "dur": 9.580, + "args": { + "External id": 127307,"Record function id": 0, "Sequence number": 2576032, "Fwd thread id": 1, "Ev Idx": 330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367611.932, "dur": 4.820, + "args": { + "External id": 127308,"Sequence number": 2576032, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 331 + } + }, + { + "ph": "f", "id": 34, "pid": 5717, "tid": 6759, "ts": 6302685367611.932, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367613.832, "dur": 2.670, + "args": { + "External id": 127309,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367614.852, "dur": 1.440, + "args": { + "External id": 127310,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367624.942, "dur": 107.450, + "args": { + "External id": 127311,"Record function id": 0, "Sequence number": 2576031, "Fwd thread id": 1, "Ev Idx": 334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367626.312, "dur": 96.230, + "args": { + "External id": 
127312,"Sequence number": 2576031, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 335 + } + }, + { + "ph": "f", "id": 35, "pid": 5717, "tid": 6759, "ts": 6302685367626.312, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367630.372, "dur": 4.920, + "args": { + "External id": 127313,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367631.762, "dur": 2.900, + "args": { + "External id": 127314,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367633.402, "dur": 0.940, + "args": { + "External id": 127315,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685367636.922, "dur": 40.910, + "args": { + "External id": 127316,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367680.112, "dur": 6.910, + "args": { + "External id": 
127317,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367682.002, "dur": 4.260, + "args": { + "External id": 127318,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367683.922, "dur": 2.070, + "args": { + "External id": 127319,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367689.222, "dur": 3.100, + "args": { + "External id": 127320,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367690.022, "dur": 1.800, + "args": { + "External id": 127321,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367691.412, "dur": 0.250, + "args": { + "External id": 127322,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], 
[], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685367693.032, "dur": 28.470, + "args": { + "External id": 127323,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367741.522, "dur": 11.730, + "args": { + "External id": 127324,"Record function id": 0, "Sequence number": 2576030, "Fwd thread id": 1, "Ev Idx": 347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367743.402, "dur": 7.550, + "args": { + "External id": 127325,"Sequence number": 2576030, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 348 + } + }, + { + "ph": "f", "id": 36, "pid": 5717, "tid": 6759, "ts": 6302685367743.402, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367745.842, "dur": 4.900, + "args": { + "External id": 127326,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367748.082, "dur": 2.450, + "args": { + "External id": 127327,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 350 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367757.162, "dur": 9.090, + "args": { + "External id": 127328,"Record function id": 0, "Sequence number": 2576029, "Fwd thread id": 1, "Ev Idx": 351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367758.422, "dur": 5.490, + "args": { + "External id": 127329,"Sequence number": 2576029, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 352 + } + }, + { + "ph": "f", "id": 37, "pid": 5717, "tid": 6759, "ts": 6302685367758.422, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367759.532, "dur": 4.150, + "args": { + "External id": 127330,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367760.552, "dur": 2.580, + "args": { + "External id": 127331,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367762.042, "dur": 0.820, + "args": { + "External id": 127332,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 
5717, "tid": 6759, + "ts": 6302685367771.122, "dur": 8.060, + "args": { + "External id": 127333,"Record function id": 0, "Ev Idx": 356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367772.932, "dur": 5.150, + "args": { + "External id": 127334,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685367774.412, "dur": 3.270, + "args": { + "External id": 127335,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685367776.112, "dur": 1.370, + "args": { + "External id": 127336,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367783.052, "dur": 8.420, + "args": { + "External id": 127337,"Record function id": 0, "Sequence number": 2576028, "Fwd thread id": 1, "Ev Idx": 360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367784.882, "dur": 3.180, + "args": { + "External id": 127338,"Sequence number": 2576028, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 361 + } + }, + { + "ph": "f", "id": 38, "pid": 5717, "tid": 6759, "ts": 6302685367784.882, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367786.102, "dur": 1.830, + "args": { + "External id": 127339,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367786.872, "dur": 0.900, + "args": { + "External id": 127340,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367794.842, "dur": 98.060, + "args": { + "External id": 127341,"Record function id": 0, "Sequence number": 2576027, "Fwd thread id": 1, "Ev Idx": 364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367796.062, "dur": 86.820, + "args": { + "External id": 127342,"Sequence number": 2576027, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 365 + } + }, + { + "ph": "f", "id": 39, "pid": 5717, "tid": 6759, "ts": 6302685367796.062, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367798.572, "dur": 4.840, + "args": { + "External id": 127343,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367799.262, "dur": 3.710, + "args": { + "External id": 
127344,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367802.092, "dur": 0.640, + "args": { + "External id": 127345,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685367804.342, "dur": 34.710, + "args": { + "External id": 127346,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367840.902, "dur": 5.690, + "args": { + "External id": 127347,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367842.002, "dur": 3.800, + "args": { + "External id": 127348,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367843.662, "dur": 1.930, + "args": { + "External id": 127349,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367851.012, "dur": 4.540, + "args": { + "External id": 127350,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367853.082, "dur": 2.060, + "args": { + "External id": 127351,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367854.312, "dur": 0.650, + "args": { + "External id": 127352,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685367856.452, "dur": 25.370, + "args": { + "External id": 127353,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367903.102, "dur": 36.250, + "args": { + "External id": 127354,"Record function id": 0, "Sequence number": 2576026, "Fwd thread id": 1, "Ev Idx": 377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 
6302685367904.992, "dur": 6.090, + "args": { + "External id": 127355,"Sequence number": 2576026, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 378 + } + }, + { + "ph": "f", "id": 40, "pid": 5717, "tid": 6759, "ts": 6302685367904.992, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685367907.052, "dur": 3.860, + "args": { + "External id": 127356,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685367908.152, "dur": 2.560, + "args": { + "External id": 127357,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685367915.062, "dur": 20.170, + "args": { + "External id": 127358,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367950.012, "dur": 20.940, + "args": { + "External id": 127359,"Record function id": 0, "Sequence number": 2576025, "Fwd thread id": 1, "Ev Idx": 382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367953.061, "dur": 11.960, + "args": { + "External 
id": 127360,"Sequence number": 2576025, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 383 + } + }, + { + "ph": "f", "id": 41, "pid": 5717, "tid": 6759, "ts": 6302685367953.061, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685367956.841, "dur": 7.920, + "args": { + "External id": 127361,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685367958.332, "dur": 5.409, + "args": { + "External id": 127362,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685367961.241, "dur": 2.131, + "args": { + "External id": 127363,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367978.812, "dur": 7.869, + "args": { + "External id": 127364,"Record function id": 0, "Ev Idx": 387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685367980.832, "dur": 4.709, + "args": { + "External id": 127365,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685367982.592, "dur": 2.549, + "args": { + "External id": 127366,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685367983.441, "dur": 1.540, + "args": { + "External id": 127367,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367990.612, "dur": 95.039, + "args": { + "External id": 127368,"Record function id": 0, "Sequence number": 2576024, "Fwd thread id": 1, "Ev Idx": 391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685367991.912, "dur": 44.719, + "args": { + "External id": 127369,"Sequence number": 2576024, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 392 + } + }, + { + "ph": "f", "id": 42, "pid": 5717, "tid": 6759, "ts": 6302685367991.912, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685367994.612, "dur": 25.229, + "args": { + "External id": 127370,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 
6759, + "ts": 6302685368021.461, "dur": 14.720, + "args": { + "External id": 127371,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685368040.041, "dur": 32.360, + "args": { + "External id": 127372,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685368077.501, "dur": 2.040, + "args": { + "External id": 127373,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685368094.481, "dur": 7.820, + "args": { + "External id": 127374,"Record function id": 0, "Ev Idx": 397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685368096.991, "dur": 4.080, + "args": { + "External id": 127375,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685368098.571, "dur": 2.030, + "args": { + "External id": 127376,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 399 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685368099.261, "dur": 1.170, + "args": { + "External id": 127377,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368106.311, "dur": 43.780, + "args": { + "External id": 127378,"Record function id": 0, "Sequence number": 2576023, "Fwd thread id": 1, "Ev Idx": 401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368107.731, "dur": 35.420, + "args": { + "External id": 127379,"Sequence number": 2576023, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 402 + } + }, + { + "ph": "f", "id": 43, "pid": 5717, "tid": 6759, "ts": 6302685368107.731, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685368111.881, "dur": 30.910, + "args": { + "External id": 127380,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685368113.541, "dur": 28.990, + "args": { + "External id": 127381,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], 
[], [], [], [], []], "Ev Idx": 404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368116.971, "dur": 5.540, + "args": { + "External id": 127382,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685368123.741, "dur": 18.050, + "args": { + "External id": 127383,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368156.891, "dur": 99.130, + "args": { + "External id": 127384,"Record function id": 0, "Sequence number": 2576022, "Fwd thread id": 1, "Ev Idx": 407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368158.571, "dur": 61.740, + "args": { + "External id": 127385,"Sequence number": 2576022, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 408 + } + }, + { + "ph": "f", "id": 44, "pid": 5717, "tid": 6759, "ts": 6302685368158.571, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368161.571, "dur": 29.420, + "args": { + "External id": 127386,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 
1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368194.011, "dur": 24.600, + "args": { + "External id": 127387,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685368225.291, "dur": 25.580, + "args": { + "External id": 127388,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368263.631, "dur": 126.709, + "args": { + "External id": 127389,"Record function id": 0, "Sequence number": 2576021, "Fwd thread id": 1, "Ev Idx": 412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368266.391, "dur": 113.740, + "args": { + "External id": 127390,"Sequence number": 2576021, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 413 + } + }, + { + "ph": "f", "id": 45, "pid": 5717, "tid": 6759, "ts": 6302685368266.391, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685368270.491, "dur": 44.880, + "args": { + "External id": 127391,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 414 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685368278.181, "dur": 0.720, + "args": { + "External id": 127392,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685368280.831, "dur": 0.570, + "args": { + "External id": 127393,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368319.131, "dur": 39.310, + "args": { + "External id": 127394,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368326.281, "dur": 29.990, + "args": { + "External id": 127395,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368359.411, "dur": 17.129, + "args": { + "External id": 127396,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685368402.251, "dur": 4.700, + "args": { + 
"External id": 127397,"Record function id": 0, "Sequence number": 2576020, "Fwd thread id": 1, "Ev Idx": 420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685368404.180, "dur": 0.631, + "args": { + "External id": 127398,"Sequence number": 2576020, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 421 + } + }, + { + "ph": "f", "id": 46, "pid": 5717, "tid": 6759, "ts": 6302685368404.180, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685368410.511, "dur": 50.680, + "args": { + "External id": 127399,"Record function id": 0, "Sequence number": 2576019, "Fwd thread id": 1, "Ev Idx": 422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685368413.080, "dur": 43.751, + "args": { + "External id": 127400,"Sequence number": 2576019, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 423 + } + }, + { + "ph": "f", "id": 47, "pid": 5717, "tid": 6759, "ts": 6302685368413.080, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + "ts": 6302685368417.511, "dur": 8.060, + "args": { + "External id": 127401,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368422.291, "dur": 1.489, + "args": { + "External id": 127402,"Record function id": 0, 
"Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685368427.691, "dur": 28.500, + "args": { + "External id": 127403,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685368432.871, "dur": 22.340, + "args": { + "External id": 127404,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368468.460, "dur": 88.530, + "args": { + "External id": 127405,"Record function id": 0, "Sequence number": 2576018, "Fwd thread id": 1, "Ev Idx": 428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368470.190, "dur": 64.930, + "args": { + "External id": 127406,"Sequence number": 2576018, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 429 + } + }, + { + "ph": "f", "id": 48, "pid": 5717, "tid": 6759, "ts": 6302685368470.190, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685368472.340, "dur": 31.200, + "args": { + "External id": 127407,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": 
[[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685368473.950, "dur": 0.430, + "args": { + "External id": 127408,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685368475.200, "dur": 0.200, + "args": { + "External id": 127409,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685368481.070, "dur": 21.280, + "args": { + "External id": 127410,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368504.660, "dur": 16.950, + "args": { + "External id": 127411,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368507.210, "dur": 13.530, + "args": { + "External id": 127412,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 435 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685368522.560, "dur": 9.370, + "args": { + "External id": 127413,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685368541.090, "dur": 12.490, + "args": { + "External id": 127414,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368566.390, "dur": 56.610, + "args": { + "External id": 127415,"Record function id": 0, "Sequence number": 2576017, "Fwd thread id": 1, "Ev Idx": 438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368568.230, "dur": 28.220, + "args": { + "External id": 127416,"Sequence number": 2576017, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 439 + } + }, + { + "ph": "f", "id": 49, "pid": 5717, "tid": 6759, "ts": 6302685368568.230, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685368569.940, "dur": 26.200, + "args": { + "External id": 127417,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], 
"Ev Idx": 440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685368571.030, "dur": 24.880, + "args": { + "External id": 127418,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368574.540, "dur": 4.940, + "args": { + "External id": 127419,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685368580.770, "dur": 14.620, + "args": { + "External id": 127420,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685368600.930, "dur": 16.610, + "args": { + "External id": 127421,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368630.620, "dur": 7.510, + "args": { + "External id": 127422,"Record function id": 0, "Sequence 
number": 2576016, "Fwd thread id": 1, "Ev Idx": 445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368632.370, "dur": 1.340, + "args": { + "External id": 127423,"Sequence number": 2576016, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 446 + } + }, + { + "ph": "f", "id": 50, "pid": 5717, "tid": 6759, "ts": 6302685368632.370, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368641.750, "dur": 13.640, + "args": { + "External id": 127424,"Record function id": 0, "Sequence number": 2576015, "Fwd thread id": 1, "Ev Idx": 447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368645.630, "dur": 7.540, + "args": { + "External id": 127425,"Sequence number": 2576015, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 448 + } + }, + { + "ph": "f", "id": 51, "pid": 5717, "tid": 6759, "ts": 6302685368645.630, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685368647.690, "dur": 5.310, + "args": { + "External id": 127426,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685368650.010, "dur": 2.750, + "args": { + "External id": 127427,"Record function id": 0, "Concrete Inputs": ["", "[16384, 
768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368658.880, "dur": 101.900, + "args": { + "External id": 127428,"Record function id": 0, "Sequence number": 2576014, "Fwd thread id": 1, "Ev Idx": 451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368660.100, "dur": 95.080, + "args": { + "External id": 127429,"Sequence number": 2576014, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 452 + } + }, + { + "ph": "f", "id": 52, "pid": 5717, "tid": 6759, "ts": 6302685368660.100, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685368662.670, "dur": 5.400, + "args": { + "External id": 127430,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685368664.150, "dur": 3.130, + "args": { + "External id": 127431,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368665.890, "dur": 1.090, + "args": { + "External id": 127432,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], 
[]], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685368670.380, "dur": 42.250, + "args": { + "External id": 127433,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685368714.540, "dur": 6.240, + "args": { + "External id": 127434,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685368716.410, "dur": 3.590, + "args": { + "External id": 127435,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368718.100, "dur": 1.700, + "args": { + "External id": 127436,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685368722.750, "dur": 4.190, + "args": { + "External id": 127437,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685368723.860, "dur": 2.740, + 
"args": { + "External id": 127438,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368726.110, "dur": 0.350, + "args": { + "External id": 127439,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685368727.860, "dur": 26.380, + "args": { + "External id": 127440,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368768.920, "dur": 11.130, + "args": { + "External id": 127441,"Record function id": 0, "Sequence number": 2576013, "Fwd thread id": 1, "Ev Idx": 464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368771.070, "dur": 5.600, + "args": { + "External id": 127442,"Sequence number": 2576013, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 465 + } + }, + { + "ph": "f", "id": 53, "pid": 5717, "tid": 6759, "ts": 6302685368771.070, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685368773.000, "dur": 3.500, + "args": { + "External id": 127443,"Record 
function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685368774.180, "dur": 2.100, + "args": { + "External id": 127444,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368783.600, "dur": 10.670, + "args": { + "External id": 127445,"Record function id": 0, "Sequence number": 2576012, "Fwd thread id": 1, "Ev Idx": 468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368784.800, "dur": 6.380, + "args": { + "External id": 127446,"Sequence number": 2576012, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 469 + } + }, + { + "ph": "f", "id": 54, "pid": 5717, "tid": 6759, "ts": 6302685368784.800, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685368785.740, "dur": 5.190, + "args": { + "External id": 127447,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685368786.740, "dur": 3.690, + "args": { + "External id": 127448,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input 
Dims": [[768, 768], [], []], "Ev Idx": 471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368789.530, "dur": 0.630, + "args": { + "External id": 127449,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685368799.850, "dur": 8.170, + "args": { + "External id": 127450,"Record function id": 0, "Ev Idx": 473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685368801.960, "dur": 4.840, + "args": { + "External id": 127451,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685368803.720, "dur": 2.670, + "args": { + "External id": 127452,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685368804.470, "dur": 1.740, + "args": { + "External id": 127453,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368811.450, "dur": 8.480, + "args": { + "External id": 127454,"Record function id": 0, "Sequence number": 2576011, "Fwd thread id": 1, "Ev Idx": 
477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685368812.839, "dur": 3.931, + "args": { + "External id": 127455,"Sequence number": 2576011, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 478 + } + }, + { + "ph": "f", "id": 55, "pid": 5717, "tid": 6759, "ts": 6302685368812.839, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685368814.179, "dur": 2.451, + "args": { + "External id": 127456,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685368815.179, "dur": 1.271, + "args": { + "External id": 127457,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685368824.959, "dur": 350.270, + "args": { + "External id": 127458,"Record function id": 0, "Sequence number": 2576010, "Fwd thread id": 1, "Ev Idx": 481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685368826.719, "dur": 318.580, + "args": { + "External id": 127459,"Sequence number": 2576010, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 482 + } + }, + { 
+ "ph": "f", "id": 56, "pid": 5717, "tid": 6759, "ts": 6302685368826.719, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685368845.519, "dur": 8.731, + "args": { + "External id": 127460,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368847.879, "dur": 5.740, + "args": { + "External id": 127461,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685368856.650, "dur": 5.820, + "args": { + "External id": 127462,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368857.819, "dur": 4.391, + "args": { + "External id": 127463,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685368864.090, "dur": 5.600, + "args": { + "External id": 127464,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685368865.310, "dur": 4.160, + "args": { + "External id": 127465,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685368894.470, "dur": 199.109, + "args": { + "External id": 127466,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 
5717, "tid": 6759, + "ts": 6302685368993.359, "dur": 5.550, + "args": { + "External id": 127467,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685369000.949, "dur": 3.450, + "args": { + "External id": 127468,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685369117.989, "dur": 8.800, + "args": { + "External id": 127469,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685369133.989, "dur": 0.680, + "args": { + "External id": 127470,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685369140.039, "dur": 0.690, + "args": { + "External id": 127471,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685369186.379, "dur": 331.759, + "args": { + 
"External id": 127472,"Record function id": 0, "Sequence number": 2576009, "Fwd thread id": 1, "Ev Idx": 495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685369189.259, "dur": 318.979, + "args": { + "External id": 127473,"Sequence number": 2576009, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 496 + } + }, + { + "ph": "f", "id": 57, "pid": 5717, "tid": 6759, "ts": 6302685369189.259, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685369208.959, "dur": 41.090, + "args": { + "External id": 127474,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369213.349, "dur": 6.710, + "args": { + "External id": 127475,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685369221.109, "dur": 28.090, + "args": { + "External id": 127476,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, 
+ "ts": 6302685369261.358, "dur": 11.311, + "args": { + "External id": 127477,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369264.378, "dur": 7.800, + "args": { + "External id": 127478,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685369530.228, "dur": 163.490, + "args": { + "External id": 127479,"Record function id": 0, "Sequence number": 2576008, "Fwd thread id": 1, "Ev Idx": 502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685369533.108, "dur": 152.180, + "args": { + "External id": 127480,"Sequence number": 2576008, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 503 + } + }, + { + "ph": "f", "id": 58, "pid": 5717, "tid": 6759, "ts": 6302685369533.108, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685369548.028, "dur": 32.690, + "args": { + "External id": 127481,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 
2048, 12, 64], []], "Ev Idx": 504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369551.558, "dur": 6.320, + "args": { + "External id": 127482,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685369559.048, "dur": 21.100, + "args": { + "External id": 127483,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685369588.958, "dur": 6.940, + "args": { + "External id": 127484,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369590.708, "dur": 4.810, + "args": { + "External id": 127485,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369704.708, "dur": 
24.620, + "args": { + "External id": 127486,"Record function id": 0, "Sequence number": 2576007, "Fwd thread id": 1, "Ev Idx": 509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369707.048, "dur": 18.549, + "args": { + "External id": 127487,"Sequence number": 2576007, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 510 + } + }, + { + "ph": "f", "id": 59, "pid": 5717, "tid": 6759, "ts": 6302685369707.048, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685369710.837, "dur": 14.480, + "args": { + "External id": 127488,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685369721.197, "dur": 3.840, + "args": { + "External id": 127489,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369733.597, "dur": 7.171, + "args": { + "External id": 127490,"Record function id": 0, "Sequence number": 2576006, "Fwd thread id": 1, "Ev Idx": 513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369734.797, "dur": 3.400, + "args": { + "External id": 127491,"Sequence number": 2576006, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 514 + } + }, + { + "ph": "f", "id": 60, "pid": 5717, "tid": 6759, "ts": 6302685369734.797, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685369736.237, "dur": 1.811, + "args": { + "External id": 127492,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685369736.988, "dur": 0.880, + "args": { + "External id": 127493,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369744.148, "dur": 6.580, + "args": { + "External id": 127494,"Record function id": 0, "Sequence number": 2576005, "Fwd thread id": 1, "Ev Idx": 517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369745.397, "dur": 3.080, + "args": { + "External id": 127495,"Sequence number": 2576005, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 518 + } + }, + { + "ph": "f", "id": 61, "pid": 5717, "tid": 6759, "ts": 6302685369745.397, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685369746.628, "dur": 1.689, + "args": { + "External id": 
127496,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685369747.377, "dur": 0.780, + "args": { + "External id": 127497,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369754.248, "dur": 9.220, + "args": { + "External id": 127498,"Record function id": 0, "Sequence number": 2576004, "Fwd thread id": 1, "Ev Idx": 521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369755.628, "dur": 4.300, + "args": { + "External id": 127499,"Sequence number": 2576004, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 522 + } + }, + { + "ph": "f", "id": 62, "pid": 5717, "tid": 6759, "ts": 6302685369755.628, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685369757.788, "dur": 1.980, + "args": { + "External id": 127500,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685369758.788, "dur": 0.820, + "args": { + "External id": 127501,"Record function id": 0, 
"Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369766.797, "dur": 103.620, + "args": { + "External id": 127502,"Record function id": 0, "Sequence number": 2576003, "Fwd thread id": 1, "Ev Idx": 525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369767.897, "dur": 92.860, + "args": { + "External id": 127503,"Sequence number": 2576003, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 526 + } + }, + { + "ph": "f", "id": 63, "pid": 5717, "tid": 6759, "ts": 6302685369767.897, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685369771.377, "dur": 7.460, + "args": { + "External id": 127504,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685369772.688, "dur": 5.589, + "args": { + "External id": 127505,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369775.657, "dur": 2.271, + "args": { + "External id": 127506,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], 
"Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685369779.857, "dur": 43.360, + "args": { + "External id": 127507,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685369825.147, "dur": 5.820, + "args": { + "External id": 127508,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685369826.207, "dur": 3.880, + "args": { + "External id": 127509,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369827.917, "dur": 1.910, + "args": { + "External id": 127510,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685369832.917, "dur": 4.140, + "args": { + "External id": 127511,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 
6302685369833.777, "dur": 2.860, + "args": { + "External id": 127512,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369836.117, "dur": 0.380, + "args": { + "External id": 127513,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685369837.747, "dur": 22.110, + "args": { + "External id": 127514,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369879.217, "dur": 14.420, + "args": { + "External id": 127515,"Record function id": 0, "Sequence number": 2576002, "Fwd thread id": 1, "Ev Idx": 538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369881.117, "dur": 8.990, + "args": { + "External id": 127516,"Sequence number": 2576002, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 539 + } + }, + { + "ph": "f", "id": 64, "pid": 5717, "tid": 6759, "ts": 6302685369881.117, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685369885.697, "dur": 4.160, + "args": { + 
"External id": 127517,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685369887.177, "dur": 2.460, + "args": { + "External id": 127518,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369897.697, "dur": 11.180, + "args": { + "External id": 127519,"Record function id": 0, "Sequence number": 2576001, "Fwd thread id": 1, "Ev Idx": 542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369899.187, "dur": 6.690, + "args": { + "External id": 127520,"Sequence number": 2576001, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 543 + } + }, + { + "ph": "f", "id": 65, "pid": 5717, "tid": 6759, "ts": 6302685369899.187, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685369901.347, "dur": 4.290, + "args": { + "External id": 127521,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685369902.307, "dur": 2.750, + "args": { + "External id": 127522,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": 
[[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369904.017, "dur": 0.770, + "args": { + "External id": 127523,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685369915.857, "dur": 11.240, + "args": { + "External id": 127524,"Record function id": 0, "Ev Idx": 547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685369919.267, "dur": 6.500, + "args": { + "External id": 127525,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685369921.107, "dur": 4.150, + "args": { + "External id": 127526,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685369921.787, "dur": 3.250, + "args": { + "External id": 127527,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369934.547, "dur": 12.030, + "args": { + "External id": 127528,"Record function id": 0, "Sequence number": 
2576000, "Fwd thread id": 1, "Ev Idx": 551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369936.147, "dur": 6.390, + "args": { + "External id": 127529,"Sequence number": 2576000, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 552 + } + }, + { + "ph": "f", "id": 66, "pid": 5717, "tid": 6759, "ts": 6302685369936.147, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685369939.087, "dur": 3.260, + "args": { + "External id": 127530,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685369940.897, "dur": 1.260, + "args": { + "External id": 127531,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369951.457, "dur": 104.190, + "args": { + "External id": 127532,"Record function id": 0, "Sequence number": 2575999, "Fwd thread id": 1, "Ev Idx": 555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685369952.707, "dur": 93.370, + "args": { + "External id": 127533,"Sequence number": 2575999, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 556 + } + }, + { + 
"ph": "f", "id": 67, "pid": 5717, "tid": 6759, "ts": 6302685369952.707, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685369956.677, "dur": 5.970, + "args": { + "External id": 127534,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685369957.317, "dur": 4.870, + "args": { + "External id": 127535,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685369959.927, "dur": 1.990, + "args": { + "External id": 127536,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685369964.947, "dur": 45.250, + "args": { + "External id": 127537,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370012.107, "dur": 5.900, + "args": { + "External id": 127538,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, 
"tid": 6759, + "ts": 6302685370014.207, "dur": 3.020, + "args": { + "External id": 127539,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370016.207, "dur": 0.820, + "args": { + "External id": 127540,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370019.777, "dur": 2.990, + "args": { + "External id": 127541,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370020.647, "dur": 1.530, + "args": { + "External id": 127542,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370021.717, "dur": 0.310, + "args": { + "External id": 127543,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685370023.717, "dur": 21.460, + "args": { + "External id": 127544,"Record function id": 
0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370064.307, "dur": 33.770, + "args": { + "External id": 127545,"Record function id": 0, "Sequence number": 2575998, "Fwd thread id": 1, "Ev Idx": 568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370066.137, "dur": 7.000, + "args": { + "External id": 127546,"Sequence number": 2575998, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 569 + } + }, + { + "ph": "f", "id": 68, "pid": 5717, "tid": 6759, "ts": 6302685370066.137, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685370069.337, "dur": 3.660, + "args": { + "External id": 127547,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685370070.577, "dur": 2.200, + "args": { + "External id": 127548,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685370075.847, "dur": 18.680, + "args": { + "External id": 127549,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], 
"Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370104.517, "dur": 12.300, + "args": { + "External id": 127550,"Record function id": 0, "Sequence number": 2575997, "Fwd thread id": 1, "Ev Idx": 573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370106.327, "dur": 7.820, + "args": { + "External id": 127551,"Sequence number": 2575997, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 574 + } + }, + { + "ph": "f", "id": 69, "pid": 5717, "tid": 6759, "ts": 6302685370106.327, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370107.387, "dur": 6.520, + "args": { + "External id": 127552,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370108.717, "dur": 4.340, + "args": { + "External id": 127553,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370111.797, "dur": 0.930, + "args": { + "External id": 127554,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], 
[], [], []], "Ev Idx": 577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685370121.957, "dur": 8.440, + "args": { + "External id": 127555,"Record function id": 0, "Ev Idx": 578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685370123.867, "dur": 5.350, + "args": { + "External id": 127556,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685370125.377, "dur": 3.400, + "args": { + "External id": 127557,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685370126.047, "dur": 2.530, + "args": { + "External id": 127558,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370134.177, "dur": 7.859, + "args": { + "External id": 127559,"Record function id": 0, "Sequence number": 2575996, "Fwd thread id": 1, "Ev Idx": 582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370135.677, "dur": 3.659, + "args": { + "External id": 127560,"Sequence number": 2575996, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 583 + } + }, + { + "ph": "f", 
"id": 70, "pid": 5717, "tid": 6759, "ts": 6302685370135.677, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685370136.796, "dur": 2.371, + "args": { + "External id": 127561,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685370137.576, "dur": 1.400, + "args": { + "External id": 127562,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370145.447, "dur": 91.689, + "args": { + "External id": 127563,"Record function id": 0, "Sequence number": 2575995, "Fwd thread id": 1, "Ev Idx": 586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370147.707, "dur": 79.249, + "args": { + "External id": 127564,"Sequence number": 2575995, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 587 + } + }, + { + "ph": "f", "id": 71, "pid": 5717, "tid": 6759, "ts": 6302685370147.707, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370150.287, "dur": 4.969, + "args": { + "External id": 127565,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 588 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370150.956, "dur": 3.900, + "args": { + "External id": 127566,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370153.127, "dur": 1.500, + "args": { + "External id": 127567,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685370156.327, "dur": 37.080, + "args": { + "External id": 127568,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370195.256, "dur": 5.431, + "args": { + "External id": 127569,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370196.147, "dur": 3.809, + "args": { + "External id": 127570,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370198.927, "dur": 0.809, + "args": { + "External id": 
127571,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370202.427, "dur": 2.649, + "args": { + "External id": 127572,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370203.336, "dur": 1.400, + "args": { + "External id": 127573,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370204.316, "dur": 0.280, + "args": { + "External id": 127574,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685370205.716, "dur": 20.420, + "args": { + "External id": 127575,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370245.736, "dur": 29.180, + "args": { + "External id": 127576,"Record function id": 0, "Sequence number": 2575994, "Fwd thread id": 
1, "Ev Idx": 599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370247.526, "dur": 6.710, + "args": { + "External id": 127577,"Sequence number": 2575994, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 600 + } + }, + { + "ph": "f", "id": 72, "pid": 5717, "tid": 6759, "ts": 6302685370247.526, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685370249.806, "dur": 4.260, + "args": { + "External id": 127578,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685370251.686, "dur": 2.130, + "args": { + "External id": 127579,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685370256.956, "dur": 14.510, + "args": { + "External id": 127580,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370280.776, "dur": 9.980, + "args": { + "External id": 127581,"Record function id": 0, "Sequence number": 2575993, "Fwd thread id": 1, "Ev Idx": 604 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370282.316, "dur": 6.250, + "args": { + "External id": 127582,"Sequence number": 2575993, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 605 + } + }, + { + "ph": "f", "id": 73, "pid": 5717, "tid": 6759, "ts": 6302685370282.316, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370283.286, "dur": 5.040, + "args": { + "External id": 127583,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370284.436, "dur": 3.020, + "args": { + "External id": 127584,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370286.156, "dur": 0.990, + "args": { + "External id": 127585,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685370295.746, "dur": 18.880, + "args": { + "External id": 127586,"Record function id": 0, "Ev Idx": 609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685370306.266, "dur": 6.940, + 
"args": { + "External id": 127587,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685370307.806, "dur": 4.920, + "args": { + "External id": 127588,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685370309.536, "dur": 2.940, + "args": { + "External id": 127589,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370318.766, "dur": 75.280, + "args": { + "External id": 127590,"Record function id": 0, "Sequence number": 2575992, "Fwd thread id": 1, "Ev Idx": 613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370319.986, "dur": 35.550, + "args": { + "External id": 127591,"Sequence number": 2575992, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 614 + } + }, + { + "ph": "f", "id": 74, "pid": 5717, "tid": 6759, "ts": 6302685370319.986, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370322.226, "dur": 19.380, + "args": { + "External id": 127592,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 
768], [8, 2048, 768]], "Ev Idx": 615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370343.286, "dur": 11.830, + "args": { + "External id": 127593,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685370358.446, "dur": 25.250, + "args": { + "External id": 127594,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685370386.576, "dur": 2.110, + "args": { + "External id": 127595,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685370402.766, "dur": 8.170, + "args": { + "External id": 127596,"Record function id": 0, "Ev Idx": 619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685370405.056, "dur": 4.620, + "args": { + "External id": 127597,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685370406.996, "dur": 2.190, + "args": { + "External id": 127598,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685370407.716, "dur": 1.290, + "args": { + "External id": 127599,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370415.636, "dur": 34.700, + "args": { + "External id": 127600,"Record function id": 0, "Sequence number": 2575991, "Fwd thread id": 1, "Ev Idx": 623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370417.986, "dur": 28.350, + "args": { + "External id": 127601,"Sequence number": 2575991, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 624 + } + }, + { + "ph": "f", "id": 75, "pid": 5717, "tid": 6759, "ts": 6302685370417.986, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685370419.486, "dur": 26.490, + "args": { + "External id": 127602,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685370420.476, "dur": 25.220, + "args": { + "External id": 127603,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", 
"", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370423.836, "dur": 5.270, + "args": { + "External id": 127604,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685370430.266, "dur": 14.720, + "args": { + "External id": 127605,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370456.456, "dur": 65.590, + "args": { + "External id": 127606,"Record function id": 0, "Sequence number": 2575990, "Fwd thread id": 1, "Ev Idx": 629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370457.896, "dur": 40.390, + "args": { + "External id": 127607,"Sequence number": 2575990, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 630 + } + }, + { + "ph": "f", "id": 76, "pid": 5717, "tid": 6759, "ts": 6302685370457.896, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370460.246, "dur": 24.660, + "args": { + "External id": 
127608,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370486.366, "dur": 11.480, + "args": { + "External id": 127609,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685370501.686, "dur": 15.480, + "args": { + "External id": 127610,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370530.016, "dur": 57.559, + "args": { + "External id": 127611,"Record function id": 0, "Sequence number": 2575989, "Fwd thread id": 1, "Ev Idx": 634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370531.676, "dur": 50.859, + "args": { + "External id": 127612,"Sequence number": 2575989, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 635 + } + }, + { + "ph": "f", "id": 77, "pid": 5717, "tid": 6759, "ts": 6302685370531.676, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685370534.146, "dur": 19.270, + "args": { + "External id": 127613,"Record function id": 0, "Concrete Inputs": ["", 
"3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685370536.676, "dur": 0.450, + "args": { + "External id": 127614,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685370538.086, "dur": 0.290, + "args": { + "External id": 127615,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370554.626, "dur": 15.460, + "args": { + "External id": 127616,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370557.696, "dur": 11.550, + "args": { + "External id": 127617,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370570.926, "dur": 9.689, + "args": { + "External id": 127618,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685370594.826, "dur": 5.460, + "args": { + "External id": 127619,"Record function id": 0, "Sequence number": 2575988, "Fwd thread id": 1, "Ev Idx": 642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685370597.746, "dur": 0.360, + "args": { + "External id": 127620,"Sequence number": 2575988, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 643 + } + }, + { + "ph": "f", "id": 78, "pid": 5717, "tid": 6759, "ts": 6302685370597.746, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685370603.666, "dur": 35.469, + "args": { + "External id": 127621,"Record function id": 0, "Sequence number": 2575987, "Fwd thread id": 1, "Ev Idx": 644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685370604.855, "dur": 29.871, + "args": { + "External id": 127622,"Sequence number": 2575987, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 645 + } + }, + { + "ph": "f", "id": 79, "pid": 5717, "tid": 6759, "ts": 6302685370604.855, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + "ts": 6302685370607.195, "dur": 5.680, + "args": { + "External id": 127623,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370610.026, "dur": 1.389, + "args": { + "External id": 127624,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685370613.726, "dur": 20.389, + "args": { + "External id": 127625,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685370616.255, "dur": 16.851, + "args": { + "External id": 127626,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370645.935, "dur": 83.790, + "args": { + "External id": 127627,"Record function id": 0, "Sequence number": 2575986, "Fwd thread id": 1, "Ev Idx": 650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370647.515, "dur": 63.410, + "args": { + "External id": 127628,"Sequence number": 2575986, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 651 + } + }, + { + "ph": "f", "id": 80, "pid": 5717, "tid": 6759, "ts": 6302685370647.515, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685370649.286, "dur": 29.239, + 
"args": { + "External id": 127629,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685370650.615, "dur": 0.311, + "args": { + "External id": 127630,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685370651.786, "dur": 0.180, + "args": { + "External id": 127631,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685370658.866, "dur": 18.509, + "args": { + "External id": 127632,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370679.575, "dur": 17.550, + "args": { + "External id": 127633,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370683.105, "dur": 13.140, + "args": { + "External id": 127634,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", 
"double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685370699.035, "dur": 10.080, + "args": { + "External id": 127635,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685370716.505, "dur": 9.410, + "args": { + "External id": 127636,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370738.075, "dur": 41.630, + "args": { + "External id": 127637,"Record function id": 0, "Sequence number": 2575985, "Fwd thread id": 1, "Ev Idx": 660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370739.845, "dur": 24.350, + "args": { + "External id": 127638,"Sequence number": 2575985, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 661 + } + }, + { + "ph": "f", "id": 81, "pid": 5717, "tid": 6759, "ts": 6302685370739.845, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685370741.375, "dur": 22.510, + "args": { + "External id": 127639,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", 
"Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685370742.255, "dur": 21.370, + "args": { + "External id": 127640,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370745.645, "dur": 4.860, + "args": { + "External id": 127641,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685370751.615, "dur": 11.410, + "args": { + "External id": 127642,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685370768.525, "dur": 8.920, + "args": { + "External id": 127643,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370787.385, "dur": 7.480, + "args": { + "External id": 127644,"Record function id": 0, "Sequence number": 2575984, "Fwd thread id": 1, "Ev Idx": 667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685370790.455, "dur": 1.000, + "args": { + "External id": 127645,"Sequence number": 2575984, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 668 + } + }, + { + "ph": "f", "id": 82, "pid": 5717, "tid": 6759, "ts": 6302685370790.455, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685370799.535, "dur": 299.959, + "args": { + "External id": 127646,"Record function id": 0, "Sequence number": 2575983, "Fwd thread id": 1, "Ev Idx": 669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685370801.025, "dur": 286.809, + "args": { + "External id": 127647,"Sequence number": 2575983, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 670 + } + }, + { + "ph": "f", "id": 83, "pid": 5717, "tid": 6759, "ts": 6302685370801.025, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685370826.615, "dur": 7.210, + "args": { + "External id": 127648,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685370830.125, "dur": 3.260, + "args": { + "External id": 127649,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370836.075, "dur": 6.160, + "args": { + "External id": 127650,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370837.365, "dur": 4.130, + "args": { + "External id": 127651,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370840.375, "dur": 0.840, + "args": { + "External id": 127652,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 6759, + "ts": 6302685370844.625, "dur": 40.670, + "args": { + "External id": 127653,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685370845.165, "dur": 3.450, 
+ "args": { + "External id": 127654,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685370845.765, "dur": 2.480, + "args": { + "External id": 127655,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370846.835, "dur": 1.200, + "args": { + "External id": 127656,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 6759, + "ts": 6302685370849.555, "dur": 34.960, + "args": { + "External id": 127657,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685370850.625, "dur": 33.090, + "args": { + "External id": 127658,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 6759, + "ts": 6302685370890.895, "dur": 3.670, + "args": { + "External id": 127659,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685370892.325, "dur": 2.040, + "args": { + "External id": 127660,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685370916.135, "dur": 6.010, + "args": { + "External id": 127661,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685370923.595, "dur": 4.070, + "args": { + "External id": 127662,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685370929.025, "dur": 4.050, + "args": { + "External id": 127663,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685370960.345, "dur": 4.170, + "args": { + "External id": 127664,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685370961.235, "dur": 2.940, + "args": { + "External id": 127665,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5717, "tid": 6759, + "ts": 6302685370977.015, "dur": 95.290, + "args": { + "External id": 127666,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685370980.245, "dur": 4.810, + "args": { + "External id": 127667,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370983.035, "dur": 1.090, + "args": { + "External id": 127668,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685370986.285, "dur": 3.430, + "args": { + "External id": 127669,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": 
[[16384, 768, 1], []], "Ev Idx": 692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370988.815, "dur": 0.370, + "args": { + "External id": 127670,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685370990.945, "dur": 2.020, + "args": { + "External id": 127671,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370992.345, "dur": 0.270, + "args": { + "External id": 127672,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685370993.925, "dur": 2.080, + "args": { + "External id": 127673,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685370995.345, "dur": 0.350, + "args": { + "External id": 127674,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], 
"Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685370998.885, "dur": 3.030, + "args": { + "External id": 127675,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371001.195, "dur": 0.390, + "args": { + "External id": 127676,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685371003.855, "dur": 4.900, + "args": { + "External id": 127677,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685371007.215, "dur": 1.330, + "args": { + "External id": 127678,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685371009.675, "dur": 3.170, + "args": { + "External id": 127679,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 
2048, 16384], []], "Ev Idx": 702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371011.195, "dur": 1.340, + "args": { + "External id": 127680,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685371013.685, "dur": 2.890, + "args": { + "External id": 127681,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371014.645, "dur": 1.800, + "args": { + "External id": 127682,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685371017.615, "dur": 42.110, + "args": { + "External id": 127683,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371062.714, "dur": 1.611, + "args": { + "External id": 127684,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 707 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685371065.425, "dur": 3.140, + "args": { + "External id": 127685,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371067.474, "dur": 0.520, + "args": { + "External id": 127686,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371070.705, "dur": 0.729, + "args": { + "External id": 127687,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371111.874, "dur": 15.500, + "args": { + "External id": 127688,"Record function id": 0, "Ev Idx": 711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371118.324, "dur": 7.740, + "args": { + "External id": 127689,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685371122.094, "dur": 3.070, + "args": { + "External id": 127690,"Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685371122.964, "dur": 2.000, + "args": { + "External id": 127691,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371131.574, "dur": 8.640, + "args": { + "External id": 127692,"Record function id": 0, "Sequence number": 2575982, "Fwd thread id": 1, "Ev Idx": 715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371133.144, "dur": 4.390, + "args": { + "External id": 127693,"Sequence number": 2575982, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 716 + } + }, + { + "ph": "f", "id": 84, "pid": 5717, "tid": 6759, "ts": 6302685371133.144, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685371134.904, "dur": 2.390, + "args": { + "External id": 127694,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371135.794, "dur": 1.320, + "args": { + "External id": 127695,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev 
Idx": 718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371143.694, "dur": 97.460, + "args": { + "External id": 127696,"Record function id": 0, "Sequence number": 2575981, "Fwd thread id": 1, "Ev Idx": 719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371145.834, "dur": 86.390, + "args": { + "External id": 127697,"Sequence number": 2575981, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 720 + } + }, + { + "ph": "f", "id": 85, "pid": 5717, "tid": 6759, "ts": 6302685371145.834, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371148.664, "dur": 5.010, + "args": { + "External id": 127698,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371149.884, "dur": 3.110, + "args": { + "External id": 127699,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371151.714, "dur": 0.960, + "args": { + "External id": 127700,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", 
"pid": 5717, "tid": 6759, + "ts": 6302685371154.894, "dur": 39.000, + "args": { + "External id": 127701,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371195.854, "dur": 5.950, + "args": { + "External id": 127702,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371196.694, "dur": 4.000, + "args": { + "External id": 127703,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371199.554, "dur": 0.880, + "args": { + "External id": 127704,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371203.634, "dur": 2.680, + "args": { + "External id": 127705,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371204.464, "dur": 1.370, + "args": { + "External id": 127706,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371205.454, "dur": 0.220, + "args": { + "External id": 127707,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685371207.074, "dur": 24.070, + "args": { + "External id": 127708,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371249.884, "dur": 9.260, + "args": { + "External id": 127709,"Record function id": 0, "Sequence number": 2575980, "Fwd thread id": 1, "Ev Idx": 732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371251.824, "dur": 5.390, + "args": { + "External id": 127710,"Sequence number": 2575980, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 733 + } + }, + { + "ph": "f", "id": 86, "pid": 5717, "tid": 6759, "ts": 6302685371251.824, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685371253.714, "dur": 3.280, + "args": { + "External id": 127711,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], 
"Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371254.664, "dur": 2.130, + "args": { + "External id": 127712,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371263.794, "dur": 9.150, + "args": { + "External id": 127713,"Record function id": 0, "Sequence number": 2575979, "Fwd thread id": 1, "Ev Idx": 736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371265.014, "dur": 5.100, + "args": { + "External id": 127714,"Sequence number": 2575979, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 737 + } + }, + { + "ph": "f", "id": 87, "pid": 5717, "tid": 6759, "ts": 6302685371265.014, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371265.764, "dur": 4.130, + "args": { + "External id": 127715,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371266.754, "dur": 2.630, + "args": { + "External id": 127716,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371268.374, "dur": 0.760, + "args": { + "External id": 127717,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371277.754, "dur": 7.610, + "args": { + "External id": 127718,"Record function id": 0, "Ev Idx": 741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371280.024, "dur": 4.180, + "args": { + "External id": 127719,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685371281.514, "dur": 2.280, + "args": { + "External id": 127720,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685371282.184, "dur": 1.410, + "args": { + "External id": 127721,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371289.014, "dur": 15.260, + "args": { + "External id": 127722,"Record function id": 0, "Sequence number": 2575978, "Fwd thread id": 1, "Ev Idx": 745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", 
"pid": 5717, "tid": 6759, + "ts": 6302685371290.454, "dur": 4.090, + "args": { + "External id": 127723,"Sequence number": 2575978, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 746 + } + }, + { + "ph": "f", "id": 88, "pid": 5717, "tid": 6759, "ts": 6302685371290.454, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685371292.694, "dur": 1.710, + "args": { + "External id": 127724,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371293.384, "dur": 0.860, + "args": { + "External id": 127725,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371308.104, "dur": 92.400, + "args": { + "External id": 127726,"Record function id": 0, "Sequence number": 2575977, "Fwd thread id": 1, "Ev Idx": 749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371309.324, "dur": 81.370, + "args": { + "External id": 127727,"Sequence number": 2575977, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 750 + } + }, + { + "ph": "f", "id": 89, "pid": 5717, "tid": 6759, "ts": 6302685371309.324, + "cat": "fwdbwd", "name": "fwdbwd", 
"bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371311.494, "dur": 4.660, + "args": { + "External id": 127728,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371312.274, "dur": 3.400, + "args": { + "External id": 127729,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371314.624, "dur": 0.820, + "args": { + "External id": 127730,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685371317.104, "dur": 36.010, + "args": { + "External id": 127731,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371354.894, "dur": 4.090, + "args": { + "External id": 127732,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371355.724, "dur": 2.560, + "args": { + "External id": 127733,"Record 
function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371357.384, "dur": 0.700, + "args": { + "External id": 127734,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371360.744, "dur": 4.870, + "args": { + "External id": 127735,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371361.574, "dur": 3.580, + "args": { + "External id": 127736,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371363.734, "dur": 1.270, + "args": { + "External id": 127737,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685371366.394, "dur": 23.420, + "args": { + "External id": 127738,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371409.004, "dur": 33.040, + "args": { + "External id": 127739,"Record function id": 0, "Sequence number": 2575976, "Fwd thread id": 1, "Ev Idx": 762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371410.774, "dur": 5.830, + "args": { + "External id": 127740,"Sequence number": 2575976, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 763 + } + }, + { + "ph": "f", "id": 90, "pid": 5717, "tid": 6759, "ts": 6302685371410.774, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685371413.064, "dur": 3.410, + "args": { + "External id": 127741,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371414.024, "dur": 2.260, + "args": { + "External id": 127742,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685371419.434, "dur": 18.560, + "args": { + "External id": 127743,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 
768], [8, 2048, 768], []], "Ev Idx": 766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371448.404, "dur": 11.970, + "args": { + "External id": 127744,"Record function id": 0, "Sequence number": 2575975, "Fwd thread id": 1, "Ev Idx": 767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371450.334, "dur": 7.450, + "args": { + "External id": 127745,"Sequence number": 2575975, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 768 + } + }, + { + "ph": "f", "id": 91, "pid": 5717, "tid": 6759, "ts": 6302685371450.334, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685371452.424, "dur": 5.110, + "args": { + "External id": 127746,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685371453.524, "dur": 3.050, + "args": { + "External id": 127747,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371455.284, "dur": 0.970, + "args": { + "External id": 127748,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371465.573, "dur": 7.300, + "args": { + "External id": 127749,"Record function id": 0, "Ev Idx": 772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371467.304, "dur": 4.380, + "args": { + "External id": 127750,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685371468.924, "dur": 2.289, + "args": { + "External id": 127751,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685371469.593, "dur": 1.451, + "args": { + "External id": 127752,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371477.564, "dur": 182.539, + "args": { + "External id": 127753,"Record function id": 0, "Sequence number": 2575974, "Fwd thread id": 1, "Ev Idx": 776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371478.764, "dur": 137.349, + "args": { + "External id": 127754,"Sequence number": 2575974, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 777 + } + }, + { + "ph": "f", "id": 92, "pid": 5717, "tid": 6759, "ts": 6302685371478.764, + "cat": "fwdbwd", "name": 
"fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371480.573, "dur": 121.980, + "args": { + "External id": 127755,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371604.083, "dur": 11.570, + "args": { + "External id": 127756,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685371619.453, "dur": 26.980, + "args": { + "External id": 127757,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685371651.163, "dur": 2.920, + "args": { + "External id": 127758,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371669.013, "dur": 7.250, + "args": { + "External id": 127759,"Record function id": 0, "Ev Idx": 782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685371671.033, "dur": 4.000, + "args": { + "External id": 
127760,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685371672.603, "dur": 1.950, + "args": { + "External id": 127761,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685371673.173, "dur": 1.210, + "args": { + "External id": 127762,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371679.823, "dur": 33.890, + "args": { + "External id": 127763,"Record function id": 0, "Sequence number": 2575973, "Fwd thread id": 1, "Ev Idx": 786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371681.063, "dur": 28.360, + "args": { + "External id": 127764,"Sequence number": 2575973, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 787 + } + }, + { + "ph": "f", "id": 93, "pid": 5717, "tid": 6759, "ts": 6302685371681.063, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685371682.473, "dur": 26.580, + "args": { + "External id": 127765,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], 
"Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685371684.413, "dur": 24.370, + "args": { + "External id": 127766,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371687.193, "dur": 5.210, + "args": { + "External id": 127767,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685371693.423, "dur": 14.700, + "args": { + "External id": 127768,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371719.813, "dur": 52.280, + "args": { + "External id": 127769,"Record function id": 0, "Sequence number": 2575972, "Fwd thread id": 1, "Ev Idx": 792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371721.243, "dur": 29.370, + "args": { + "External id": 127770,"Sequence number": 2575972, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], 
"Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 793 + } + }, + { + "ph": "f", "id": 94, "pid": 5717, "tid": 6759, "ts": 6302685371721.243, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371723.343, "dur": 14.300, + "args": { + "External id": 127771,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371738.973, "dur": 11.210, + "args": { + "External id": 127772,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685371753.713, "dur": 13.780, + "args": { + "External id": 127773,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371778.913, "dur": 90.460, + "args": { + "External id": 127774,"Record function id": 0, "Sequence number": 2575971, "Fwd thread id": 1, "Ev Idx": 797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371780.703, "dur": 81.070, + "args": { + "External id": 127775,"Sequence number": 2575971, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 
2048, 1]], "Ev Idx": 798 + } + }, + { + "ph": "f", "id": 95, "pid": 5717, "tid": 6759, "ts": 6302685371780.703, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685371783.993, "dur": 26.370, + "args": { + "External id": 127776,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685371785.523, "dur": 0.420, + "args": { + "External id": 127777,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685371786.823, "dur": 0.270, + "args": { + "External id": 127778,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371813.323, "dur": 28.330, + "args": { + "External id": 127779,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371817.693, "dur": 21.960, + "args": { + "External id": 127780,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371842.673, "dur": 15.990, + "args": { + "External id": 127781,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685371881.413, "dur": 4.530, + "args": { + "External id": 127782,"Record function id": 0, "Sequence number": 2575970, "Fwd thread id": 1, "Ev Idx": 805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685371882.913, "dur": 0.450, + "args": { + "External id": 127783,"Sequence number": 2575970, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 806 + } + }, + { + "ph": "f", "id": 96, "pid": 5717, "tid": 6759, "ts": 6302685371882.913, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685371889.403, "dur": 36.209, + "args": { + "External id": 127784,"Record function id": 0, "Sequence number": 2575969, "Fwd thread id": 1, "Ev Idx": 807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685371890.533, "dur": 31.150, + "args": { + "External id": 127785,"Sequence number": 2575969, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 808 + } + }, + { + "ph": "f", "id": 97, "pid": 5717, "tid": 6759, "ts": 6302685371890.533, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + 
"ts": 6302685371893.983, "dur": 5.250, + "args": { + "External id": 127786,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685371896.313, "dur": 1.480, + "args": { + "External id": 127787,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685371900.193, "dur": 20.879, + "args": { + "External id": 127788,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685371902.843, "dur": 17.329, + "args": { + "External id": 127789,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371932.203, "dur": 86.709, + "args": { + "External id": 127790,"Record function id": 0, "Sequence number": 2575968, "Fwd thread id": 1, "Ev Idx": 813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685371933.772, "dur": 66.720, + "args": { + "External id": 127791,"Sequence number": 2575968, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], 
"Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 814 + } + }, + { + "ph": "f", "id": 98, "pid": 5717, "tid": 6759, "ts": 6302685371933.772, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685371935.443, "dur": 29.189, + "args": { + "External id": 127792,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685371936.723, "dur": 0.309, + "args": { + "External id": 127793,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685371937.952, "dur": 0.200, + "args": { + "External id": 127794,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685371943.323, "dur": 20.200, + "args": { + "External id": 127795,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371965.743, "dur": 22.060, + "args": { + "External id": 127796,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", 
"Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371969.472, "dur": 17.440, + "args": { + "External id": 127797,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685371988.652, "dur": 10.060, + "args": { + "External id": 127798,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685372005.942, "dur": 9.610, + "args": { + "External id": 127799,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372027.232, "dur": 50.370, + "args": { + "External id": 127800,"Record function id": 0, "Sequence number": 2575967, "Fwd thread id": 1, "Ev Idx": 823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372029.062, "dur": 24.820, + "args": { + "External id": 127801,"Sequence number": 2575967, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 824 + } + }, + { + "ph": "f", "id": 99, "pid": 5717, "tid": 6759, "ts": 6302685372029.062, + "cat": 
"fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685372030.492, "dur": 23.020, + "args": { + "External id": 127802,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685372031.522, "dur": 21.720, + "args": { + "External id": 127803,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372035.102, "dur": 5.250, + "args": { + "External id": 127804,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685372041.362, "dur": 11.350, + "args": { + "External id": 127805,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685372058.692, "dur": 13.520, + "args": { + "External id": 
127806,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372086.052, "dur": 19.020, + "args": { + "External id": 127807,"Record function id": 0, "Sequence number": 2575966, "Fwd thread id": 1, "Ev Idx": 830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372087.842, "dur": 1.200, + "args": { + "External id": 127808,"Sequence number": 2575966, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 831 + } + }, + { + "ph": "f", "id": 100, "pid": 5717, "tid": 6759, "ts": 6302685372087.842, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685372091.872, "dur": 10.420, + "args": { + "External id": 127809,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372110.492, "dur": 10.770, + "args": { + "External id": 127810,"Record function id": 0, "Sequence number": 2575965, "Fwd thread id": 1, "Ev Idx": 833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372112.012, "dur": 6.820, + "args": { + "External id": 127811,"Sequence number": 2575965, "Fwd thread 
id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 834 + } + }, + { + "ph": "f", "id": 101, "pid": 5717, "tid": 6759, "ts": 6302685372112.012, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372113.832, "dur": 4.770, + "args": { + "External id": 127812,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372115.762, "dur": 2.620, + "args": { + "External id": 127813,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372124.742, "dur": 97.760, + "args": { + "External id": 127814,"Record function id": 0, "Sequence number": 2575964, "Fwd thread id": 1, "Ev Idx": 837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372125.932, "dur": 89.840, + "args": { + "External id": 127815,"Sequence number": 2575964, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 838 + } + }, + { + "ph": "f", "id": 102, "pid": 5717, "tid": 6759, "ts": 6302685372125.932, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685372128.912, "dur": 5.270, + "args": { + 
"External id": 127816,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685372130.212, "dur": 3.180, + "args": { + "External id": 127817,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372131.892, "dur": 1.140, + "args": { + "External id": 127818,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685372135.262, "dur": 40.580, + "args": { + "External id": 127819,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685372178.692, "dur": 4.930, + "args": { + "External id": 127820,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685372179.622, "dur": 3.290, + "args": { + "External id": 127821,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": 
[[768, 768], [], []], "Ev Idx": 844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372181.282, "dur": 1.410, + "args": { + "External id": 127822,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685372185.522, "dur": 3.020, + "args": { + "External id": 127823,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685372186.352, "dur": 1.740, + "args": { + "External id": 127824,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372187.662, "dur": 0.290, + "args": { + "External id": 127825,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685372190.222, "dur": 24.110, + "args": { + "External id": 127826,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372231.162, "dur": 11.720, + "args": { + "External id": 127827,"Record function id": 0, "Sequence number": 2575963, "Fwd thread id": 1, "Ev Idx": 850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372233.052, "dur": 5.820, + "args": { + "External id": 127828,"Sequence number": 2575963, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 851 + } + }, + { + "ph": "f", "id": 103, "pid": 5717, "tid": 6759, "ts": 6302685372233.052, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372235.072, "dur": 3.640, + "args": { + "External id": 127829,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372236.242, "dur": 2.290, + "args": { + "External id": 127830,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372247.712, "dur": 12.300, + "args": { + "External id": 127831,"Record function id": 0, "Sequence number": 2575962, "Fwd thread id": 1, "Ev Idx": 854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372248.942, "dur": 7.520, + "args": { + "External id": 127832,"Sequence number": 2575962, "Fwd thread id": 1, "Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 855 + } + }, + { + "ph": "f", "id": 104, "pid": 5717, "tid": 6759, "ts": 6302685372248.942, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685372249.702, "dur": 6.550, + "args": { + "External id": 127833,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685372251.002, "dur": 4.630, + "args": { + "External id": 127834,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372254.572, "dur": 0.770, + "args": { + "External id": 127835,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685372266.432, "dur": 10.170, + "args": { + "External id": 127836,"Record function id": 0, "Ev Idx": 859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685372269.432, "dur": 5.940, + "args": { + "External id": 127837,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 860 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685372271.172, "dur": 3.740, + "args": { + "External id": 127838,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685372271.822, "dur": 2.890, + "args": { + "External id": 127839,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372281.392, "dur": 9.790, + "args": { + "External id": 127840,"Record function id": 0, "Sequence number": 2575961, "Fwd thread id": 1, "Ev Idx": 863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372282.562, "dur": 4.940, + "args": { + "External id": 127841,"Sequence number": 2575961, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 864 + } + }, + { + "ph": "f", "id": 105, "pid": 5717, "tid": 6759, "ts": 6302685372282.562, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372283.822, "dur": 3.490, + "args": { + "External id": 127842,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372285.752, "dur": 1.360, + "args": { + "External id": 
127843,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685372303.842, "dur": 245.689, + "args": { + "External id": 127844,"Record function id": 0, "Sequence number": 2575960, "Fwd thread id": 1, "Ev Idx": 867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685372306.712, "dur": 225.299, + "args": { + "External id": 127845,"Sequence number": 2575960, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 868 + } + }, + { + "ph": "f", "id": 106, "pid": 5717, "tid": 6759, "ts": 6302685372306.712, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685372321.272, "dur": 11.200, + "args": { + "External id": 127846,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372324.402, "dur": 7.380, + "args": { + "External id": 127847,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 870 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685372336.122, "dur": 7.760, + "args": { + "External id": 127848,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372337.592, "dur": 6.000, + "args": { + "External id": 127849,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685372347.891, "dur": 6.671, + "args": { + "External id": 127850,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372348.771, "dur": 5.491, + "args": { + "External id": 127851,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685372373.862, "dur": 132.899, + "args": { + "External id": 127852,"Record function id": 0, 
"Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685372434.891, "dur": 5.600, + "args": { + "External id": 127853,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685372442.071, "dur": 3.690, + "args": { + "External id": 127854,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685372518.261, "dur": 3.210, + "args": { + "External id": 127855,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 
2048, 12, 64]], "Ev Idx": 878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685372525.511, "dur": 0.630, + "args": { + "External id": 127856,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685372528.801, "dur": 0.520, + "args": { + "External id": 127857,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685372559.951, "dur": 172.630, + "args": { + "External id": 127858,"Record function id": 0, "Sequence number": 2575959, "Fwd thread id": 1, "Ev Idx": 881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685372562.201, "dur": 162.100, + "args": { + "External id": 127859,"Sequence number": 2575959, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 882 + } + }, + { + "ph": "f", "id": 107, "pid": 5717, "tid": 6759, "ts": 6302685372562.201, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685372577.161, "dur": 35.090, + "args": { + "External id": 127860,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372580.381, "dur": 6.370, + "args": { + "External id": 127861,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685372587.921, "dur": 23.720, + "args": { + "External id": 127862,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685372620.291, "dur": 6.630, + "args": { + "External id": 127863,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372622.131, "dur": 4.410, + "args": { + "External id": 127864,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685372743.681, "dur": 156.449, + "args": { + "External id": 127865,"Record function id": 0, 
"Sequence number": 2575958, "Fwd thread id": 1, "Ev Idx": 888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685372746.311, "dur": 145.539, + "args": { + "External id": 127866,"Sequence number": 2575958, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 889 + } + }, + { + "ph": "f", "id": 108, "pid": 5717, "tid": 6759, "ts": 6302685372746.311, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685372759.751, "dur": 31.499, + "args": { + "External id": 127867,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372763.151, "dur": 6.020, + "args": { + "External id": 127868,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685372770.271, "dur": 20.430, + "args": { + "External id": 127869,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685372799.161, "dur": 6.640, + 
"args": { + "External id": 127870,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372800.981, "dur": 4.369, + "args": { + "External id": 127871,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372910.940, "dur": 12.940, + "args": { + "External id": 127872,"Record function id": 0, "Sequence number": 2575957, "Fwd thread id": 1, "Ev Idx": 895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372913.120, "dur": 7.880, + "args": { + "External id": 127873,"Sequence number": 2575957, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 896 + } + }, + { + "ph": "f", "id": 109, "pid": 5717, "tid": 6759, "ts": 6302685372913.120, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372915.810, "dur": 4.920, + "args": { + "External id": 127874,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 897 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372916.950, "dur": 3.510, + "args": { + "External id": 127875,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372927.730, "dur": 7.890, + "args": { + "External id": 127876,"Record function id": 0, "Sequence number": 2575956, "Fwd thread id": 1, "Ev Idx": 899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372928.910, "dur": 4.190, + "args": { + "External id": 127877,"Sequence number": 2575956, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 900 + } + }, + { + "ph": "f", "id": 110, "pid": 5717, "tid": 6759, "ts": 6302685372928.910, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372930.150, "dur": 2.750, + "args": { + "External id": 127878,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372931.830, "dur": 0.900, + "args": { + "External id": 127879,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372938.960, "dur": 6.610, + "args": { + "External id": 127880,"Record function id": 0, "Sequence number": 2575955, "Fwd thread id": 1, "Ev Idx": 903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372941.040, "dur": 2.590, + "args": { + "External id": 127881,"Sequence number": 2575955, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 904 + } + }, + { + "ph": "f", "id": 111, "pid": 5717, "tid": 6759, "ts": 6302685372941.040, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372941.990, "dur": 1.530, + "args": { + "External id": 127882,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372942.590, "dur": 0.770, + "args": { + "External id": 127883,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372949.250, "dur": 5.960, + "args": { + "External id": 127884,"Record function id": 0, "Sequence number": 2575954, "Fwd thread id": 1, "Ev Idx": 907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372950.530, "dur": 2.670, + "args": { + 
"External id": 127885,"Sequence number": 2575954, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 908 + } + }, + { + "ph": "f", "id": 112, "pid": 5717, "tid": 6759, "ts": 6302685372950.530, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685372951.440, "dur": 1.650, + "args": { + "External id": 127886,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685372952.170, "dur": 0.760, + "args": { + "External id": 127887,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372958.660, "dur": 116.540, + "args": { + "External id": 127888,"Record function id": 0, "Sequence number": 2575953, "Fwd thread id": 1, "Ev Idx": 911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685372959.830, "dur": 102.160, + "args": { + "External id": 127889,"Sequence number": 2575953, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 912 + } + }, + { + "ph": "f", "id": 113, "pid": 5717, "tid": 6759, "ts": 6302685372959.830, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 
6759, + "ts": 6302685372963.820, "dur": 5.970, + "args": { + "External id": 127890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685372965.110, "dur": 4.100, + "args": { + "External id": 127891,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685372966.740, "dur": 2.140, + "args": { + "External id": 127892,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685372970.980, "dur": 45.800, + "args": { + "External id": 127893,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373019.220, "dur": 6.350, + "args": { + "External id": 127894,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373020.290, "dur": 4.370, + "args": { + "External id": 127895,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", 
"Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373023.660, "dur": 0.790, + "args": { + "External id": 127896,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373027.570, "dur": 3.210, + "args": { + "External id": 127897,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373028.500, "dur": 1.730, + "args": { + "External id": 127898,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373029.750, "dur": 0.340, + "args": { + "External id": 127899,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685373031.670, "dur": 29.270, + "args": { + "External id": 127900,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 923 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373090.010, "dur": 15.940, + "args": { + "External id": 127901,"Record function id": 0, "Sequence number": 2575952, "Fwd thread id": 1, "Ev Idx": 924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373093.250, "dur": 9.430, + "args": { + "External id": 127902,"Sequence number": 2575952, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 925 + } + }, + { + "ph": "f", "id": 114, "pid": 5717, "tid": 6759, "ts": 6302685373093.250, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685373096.700, "dur": 5.790, + "args": { + "External id": 127903,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685373097.740, "dur": 3.460, + "args": { + "External id": 127904,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373113.390, "dur": 13.400, + "args": { + "External id": 127905,"Record function id": 0, "Sequence number": 2575951, "Fwd thread id": 1, "Ev Idx": 928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373115.740, "dur": 7.540, + "args": { + "External id": 127906,"Sequence number": 
2575951, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 929 + } + }, + { + "ph": "f", "id": 115, "pid": 5717, "tid": 6759, "ts": 6302685373115.740, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373116.560, "dur": 6.540, + "args": { + "External id": 127907,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373118.720, "dur": 3.740, + "args": { + "External id": 127908,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373120.290, "dur": 1.890, + "args": { + "External id": 127909,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373135.630, "dur": 14.980, + "args": { + "External id": 127910,"Record function id": 0, "Ev Idx": 933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373138.910, "dur": 9.350, + "args": { + "External id": 127911,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[768, 768]], "Ev Idx": 934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685373141.860, "dur": 5.890, + "args": { + "External id": 127912,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685373142.660, "dur": 4.910, + "args": { + "External id": 127913,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373155.610, "dur": 12.580, + "args": { + "External id": 127914,"Record function id": 0, "Sequence number": 2575950, "Fwd thread id": 1, "Ev Idx": 937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373158.120, "dur": 6.650, + "args": { + "External id": 127915,"Sequence number": 2575950, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 938 + } + }, + { + "ph": "f", "id": 116, "pid": 5717, "tid": 6759, "ts": 6302685373158.120, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685373161.400, "dur": 3.160, + "args": { + "External id": 127916,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685373163.170, 
"dur": 1.180, + "args": { + "External id": 127917,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373174.070, "dur": 147.390, + "args": { + "External id": 127918,"Record function id": 0, "Sequence number": 2575949, "Fwd thread id": 1, "Ev Idx": 941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373176.430, "dur": 130.430, + "args": { + "External id": 127919,"Sequence number": 2575949, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 942 + } + }, + { + "ph": "f", "id": 117, "pid": 5717, "tid": 6759, "ts": 6302685373176.430, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373180.420, "dur": 6.440, + "args": { + "External id": 127920,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373182.160, "dur": 4.250, + "args": { + "External id": 127921,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373184.260, "dur": 1.900, + "args": { + "External id": 127922,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 
768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685373188.890, "dur": 55.350, + "args": { + "External id": 127923,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373247.340, "dur": 6.800, + "args": { + "External id": 127924,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373248.329, "dur": 4.960, + "args": { + "External id": 127925,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373251.140, "dur": 1.909, + "args": { + "External id": 127926,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373257.069, "dur": 5.840, + "args": { + "External id": 127927,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 950 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373257.920, "dur": 4.549, + "args": { + "External id": 127928,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373260.869, "dur": 1.400, + "args": { + "External id": 127929,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685373263.649, "dur": 42.040, + "args": { + "External id": 127930,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373333.619, "dur": 34.500, + "args": { + "External id": 127931,"Record function id": 0, "Sequence number": 2575948, "Fwd thread id": 1, "Ev Idx": 954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373335.859, "dur": 6.210, + "args": { + "External id": 127932,"Sequence number": 2575948, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 955 + } + }, + { + "ph": "f", "id": 118, "pid": 5717, "tid": 6759, "ts": 6302685373335.859, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 
5717, "tid": 6759, + "ts": 6302685373338.119, "dur": 3.730, + "args": { + "External id": 127933,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685373339.189, "dur": 2.460, + "args": { + "External id": 127934,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685373344.979, "dur": 19.680, + "args": { + "External id": 127935,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373375.619, "dur": 14.040, + "args": { + "External id": 127936,"Record function id": 0, "Sequence number": 2575947, "Fwd thread id": 1, "Ev Idx": 959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373377.349, "dur": 9.590, + "args": { + "External id": 127937,"Sequence number": 2575947, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 960 + } + }, + { + "ph": "f", "id": 119, "pid": 5717, "tid": 6759, "ts": 6302685373377.349, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373378.359, "dur": 8.330, + 
"args": { + "External id": 127938,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373379.469, "dur": 6.320, + "args": { + "External id": 127939,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373383.689, "dur": 1.790, + "args": { + "External id": 127940,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373394.899, "dur": 8.610, + "args": { + "External id": 127941,"Record function id": 0, "Ev Idx": 964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373396.829, "dur": 5.530, + "args": { + "External id": 127942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685373398.549, "dur": 3.350, + "args": { + "External id": 127943,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + 
"ts": 6302685373400.209, "dur": 1.490, + "args": { + "External id": 127944,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373407.269, "dur": 7.520, + "args": { + "External id": 127945,"Record function id": 0, "Sequence number": 2575946, "Fwd thread id": 1, "Ev Idx": 968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373408.559, "dur": 3.830, + "args": { + "External id": 127946,"Sequence number": 2575946, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 969 + } + }, + { + "ph": "f", "id": 120, "pid": 5717, "tid": 6759, "ts": 6302685373408.559, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685373409.799, "dur": 2.440, + "args": { + "External id": 127947,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685373410.579, "dur": 1.480, + "args": { + "External id": 127948,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373418.229, "dur": 88.560, + "args": { + "External 
id": 127949,"Record function id": 0, "Sequence number": 2575945, "Fwd thread id": 1, "Ev Idx": 972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373420.039, "dur": 77.480, + "args": { + "External id": 127950,"Sequence number": 2575945, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 973 + } + }, + { + "ph": "f", "id": 121, "pid": 5717, "tid": 6759, "ts": 6302685373420.039, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373423.159, "dur": 3.930, + "args": { + "External id": 127951,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373423.789, "dur": 2.860, + "args": { + "External id": 127952,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373425.739, "dur": 0.710, + "args": { + "External id": 127953,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685373427.999, "dur": 35.900, + "args": { + "External id": 127954,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373465.599, "dur": 4.280, + "args": { + "External id": 127955,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373466.499, "dur": 2.640, + "args": { + "External id": 127956,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373468.069, "dur": 0.880, + "args": { + "External id": 127957,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373471.669, "dur": 4.070, + "args": { + "External id": 127958,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373473.599, "dur": 1.770, + "args": { + "External id": 127959,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 
6302685373474.949, "dur": 0.280, + "args": { + "External id": 127960,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685373476.389, "dur": 20.320, + "args": { + "External id": 127961,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373515.539, "dur": 28.190, + "args": { + "External id": 127962,"Record function id": 0, "Sequence number": 2575944, "Fwd thread id": 1, "Ev Idx": 985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373517.429, "dur": 5.710, + "args": { + "External id": 127963,"Sequence number": 2575944, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 986 + } + }, + { + "ph": "f", "id": 122, "pid": 5717, "tid": 6759, "ts": 6302685373517.429, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685373519.599, "dur": 3.410, + "args": { + "External id": 127964,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685373520.659, "dur": 2.170, + "args": { + "External 
id": 127965,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685373525.769, "dur": 14.540, + "args": { + "External id": 127966,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373549.489, "dur": 11.190, + "args": { + "External id": 127967,"Record function id": 0, "Sequence number": 2575943, "Fwd thread id": 1, "Ev Idx": 990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373550.979, "dur": 7.280, + "args": { + "External id": 127968,"Sequence number": 2575943, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 991 + } + }, + { + "ph": "f", "id": 123, "pid": 5717, "tid": 6759, "ts": 6302685373550.979, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685373553.249, "dur": 4.790, + "args": { + "External id": 127969,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685373554.309, "dur": 2.910, + "args": { + "External id": 127970,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373556.039, "dur": 0.890, + "args": { + "External id": 127971,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373565.769, "dur": 7.750, + "args": { + "External id": 127972,"Record function id": 0, "Ev Idx": 995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373567.469, "dur": 4.960, + "args": { + "External id": 127973,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685373568.829, "dur": 3.120, + "args": { + "External id": 127974,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685373569.439, "dur": 2.340, + "args": { + "External id": 127975,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373577.039, "dur": 73.140, + "args": { + "External id": 
127976,"Record function id": 0, "Sequence number": 2575942, "Fwd thread id": 1, "Ev Idx": 999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373578.249, "dur": 32.620, + "args": { + "External id": 127977,"Sequence number": 2575942, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1000 + } + }, + { + "ph": "f", "id": 124, "pid": 5717, "tid": 6759, "ts": 6302685373578.249, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373580.169, "dur": 17.370, + "args": { + "External id": 127978,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373599.249, "dur": 11.190, + "args": { + "External id": 127979,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685373613.819, "dur": 25.820, + "args": { + "External id": 127980,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685373643.419, "dur": 2.010, + "args": { + "External id": 127981,"Record function id": 0, "Concrete Inputs": ["", 
"[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373658.909, "dur": 7.630, + "args": { + "External id": 127982,"Record function id": 0, "Ev Idx": 1005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685373661.139, "dur": 4.190, + "args": { + "External id": 127983,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685373662.759, "dur": 2.060, + "args": { + "External id": 127984,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685373663.409, "dur": 1.230, + "args": { + "External id": 127985,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373670.208, "dur": 33.851, + "args": { + "External id": 127986,"Record function id": 0, "Sequence number": 2575941, "Fwd thread id": 1, "Ev Idx": 1009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373671.439, "dur": 28.720, + "args": { + "External id": 127987,"Sequence number": 2575941, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 
768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1010 + } + }, + { + "ph": "f", "id": 125, "pid": 5717, "tid": 6759, "ts": 6302685373671.439, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685373673.008, "dur": 26.751, + "args": { + "External id": 127988,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685373674.948, "dur": 24.531, + "args": { + "External id": 127989,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373677.948, "dur": 5.311, + "args": { + "External id": 127990,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685373684.319, "dur": 14.600, + "args": { + "External id": 127991,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev 
Idx": 1014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373710.288, "dur": 54.900, + "args": { + "External id": 127992,"Record function id": 0, "Sequence number": 2575940, "Fwd thread id": 1, "Ev Idx": 1015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373711.648, "dur": 29.760, + "args": { + "External id": 127993,"Sequence number": 2575940, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1016 + } + }, + { + "ph": "f", "id": 126, "pid": 5717, "tid": 6759, "ts": 6302685373711.648, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373713.799, "dur": 14.769, + "args": { + "External id": 127994,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373730.168, "dur": 10.820, + "args": { + "External id": 127995,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685373744.368, "dur": 15.671, + "args": { + "External id": 127996,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1019 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373772.178, "dur": 66.660, + "args": { + "External id": 127997,"Record function id": 0, "Sequence number": 2575939, "Fwd thread id": 1, "Ev Idx": 1020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373773.718, "dur": 60.460, + "args": { + "External id": 127998,"Sequence number": 2575939, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1021 + } + }, + { + "ph": "f", "id": 127, "pid": 5717, "tid": 6759, "ts": 6302685373773.718, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685373776.138, "dur": 28.500, + "args": { + "External id": 127999,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685373778.068, "dur": 0.410, + "args": { + "External id": 128000,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685373788.308, "dur": 0.280, + "args": { + "External id": 128001,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373806.058, 
"dur": 16.020, + "args": { + "External id": 128002,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373809.158, "dur": 12.000, + "args": { + "External id": 128003,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373822.918, "dur": 9.540, + "args": { + "External id": 128004,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685373846.208, "dur": 3.990, + "args": { + "External id": 128005,"Record function id": 0, "Sequence number": 2575938, "Fwd thread id": 1, "Ev Idx": 1028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685373847.768, "dur": 0.410, + "args": { + "External id": 128006,"Sequence number": 2575938, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1029 + } + }, + { + "ph": "f", "id": 128, "pid": 5717, "tid": 6759, "ts": 6302685373847.768, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685373853.628, "dur": 50.100, + "args": { + "External id": 128007,"Record function id": 0, "Sequence number": 2575937, "Fwd 
thread id": 1, "Ev Idx": 1030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685373854.878, "dur": 41.830, + "args": { + "External id": 128008,"Sequence number": 2575937, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1031 + } + }, + { + "ph": "f", "id": 129, "pid": 5717, "tid": 6759, "ts": 6302685373854.878, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + "ts": 6302685373858.378, "dur": 8.510, + "args": { + "External id": 128009,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685373863.848, "dur": 1.510, + "args": { + "External id": 128010,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685373868.048, "dur": 27.940, + "args": { + "External id": 128011,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685373870.878, "dur": 22.910, + "args": { + "External id": 128012,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input 
Dims": [[8, 2048, 768], []], "Ev Idx": 1035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373915.638, "dur": 103.660, + "args": { + "External id": 128013,"Record function id": 0, "Sequence number": 2575936, "Fwd thread id": 1, "Ev Idx": 1036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685373918.408, "dur": 82.440, + "args": { + "External id": 128014,"Sequence number": 2575936, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1037 + } + }, + { + "ph": "f", "id": 130, "pid": 5717, "tid": 6759, "ts": 6302685373918.408, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685373921.598, "dur": 41.600, + "args": { + "External id": 128015,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685373923.038, "dur": 0.310, + "args": { + "External id": 128016,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685373925.458, "dur": 0.190, + "args": { + "External id": 128017,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1040 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685373934.828, "dur": 25.770, + "args": { + "External id": 128018,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373964.568, "dur": 22.670, + "args": { + "External id": 128019,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373969.678, "dur": 16.540, + "args": { + "External id": 128020,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685373989.098, "dur": 9.730, + "args": { + "External id": 128021,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685374006.498, "dur": 9.470, + "args": { + "External id": 128022,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, 
+ "ts": 6302685374028.078, "dur": 41.920, + "args": { + "External id": 128023,"Record function id": 0, "Sequence number": 2575935, "Fwd thread id": 1, "Ev Idx": 1046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374029.778, "dur": 24.900, + "args": { + "External id": 128024,"Sequence number": 2575935, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1047 + } + }, + { + "ph": "f", "id": 131, "pid": 5717, "tid": 6759, "ts": 6302685374029.778, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685374031.248, "dur": 23.070, + "args": { + "External id": 128025,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685374032.348, "dur": 21.710, + "args": { + "External id": 128026,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374035.798, "dur": 5.130, + "args": { + "External id": 128027,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], 
[], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685374041.888, "dur": 11.570, + "args": { + "External id": 128028,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685374058.888, "dur": 8.960, + "args": { + "External id": 128029,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374076.738, "dur": 5.920, + "args": { + "External id": 128030,"Record function id": 0, "Sequence number": 2575934, "Fwd thread id": 1, "Ev Idx": 1053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374078.528, "dur": 0.920, + "args": { + "External id": 128031,"Sequence number": 2575934, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1054 + } + }, + { + "ph": "f", "id": 132, "pid": 5717, "tid": 6759, "ts": 6302685374078.528, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685374086.998, "dur": 346.589, + "args": { + "External id": 128032,"Record function id": 0, "Sequence number": 2575933, "Fwd 
thread id": 1, "Ev Idx": 1055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685374088.518, "dur": 328.719, + "args": { + "External id": 128033,"Sequence number": 2575933, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1056 + } + }, + { + "ph": "f", "id": 133, "pid": 5717, "tid": 6759, "ts": 6302685374088.518, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374115.147, "dur": 7.120, + "args": { + "External id": 128034,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685374118.658, "dur": 3.200, + "args": { + "External id": 128035,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 1058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374124.518, "dur": 5.929, + "args": { + "External id": 128036,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374125.798, "dur": 3.940, + "args": { + "External id": 128037,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], 
[], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374127.738, "dur": 1.709, + "args": { + "External id": 128038,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 6759, + "ts": 6302685374132.707, "dur": 40.180, + "args": { + "External id": 128039,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 1062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374134.258, "dur": 2.569, + "args": { + "External id": 128040,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 1063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374134.787, "dur": 1.691, + "args": { + "External id": 128041,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374135.978, "dur": 0.329, + "args": { + "External id": 128042,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1065 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 6759, + "ts": 6302685374137.667, "dur": 34.400, + "args": { + "External id": 128043,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685374138.538, "dur": 32.740, + "args": { + "External id": 128044,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 6759, + "ts": 6302685374177.058, "dur": 3.709, + "args": { + "External id": 128045,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 1068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374178.527, "dur": 2.040, + "args": { + "External id": 128046,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685374202.458, "dur": 5.749, + "args": { + "External id": 128047,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + 
"ts": 6302685374209.577, "dur": 3.750, + "args": { + "External id": 128048,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685374214.287, "dur": 2.670, + "args": { + "External id": 128049,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374243.707, "dur": 3.930, + "args": { + "External id": 128050,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374245.237, "dur": 2.090, + "args": { + "External id": 128051,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5717, "tid": 6759, + "ts": 6302685374260.077, "dur": 130.990, + "args": { + "External id": 128052,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 1075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 
6302685374263.247, "dur": 5.580, + "args": { + "External id": 128053,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374266.097, "dur": 1.860, + "args": { + "External id": 128054,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685374270.417, "dur": 3.110, + "args": { + "External id": 128055,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 1078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374272.617, "dur": 0.370, + "args": { + "External id": 128056,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 1079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685374274.807, "dur": 5.100, + "args": { + "External id": 128057,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374278.827, "dur": 0.430, + "args": { + "External 
id": 128058,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685374280.937, "dur": 2.120, + "args": { + "External id": 128059,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374282.377, "dur": 0.360, + "args": { + "External id": 128060,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 1083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685374285.877, "dur": 2.320, + "args": { + "External id": 128061,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 1084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374287.507, "dur": 0.350, + "args": { + "External id": 128062,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 1085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374289.317, 
"dur": 4.800, + "args": { + "External id": 128063,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 1086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685374292.397, "dur": 1.480, + "args": { + "External id": 128064,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 1087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685374304.787, "dur": 3.720, + "args": { + "External id": 128065,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 1088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374306.407, "dur": 1.620, + "args": { + "External id": 128066,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 1089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374311.517, "dur": 4.020, + "args": { + "External id": 128067,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374312.327, "dur": 3.040, + 
"args": { + "External id": 128068,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685374316.597, "dur": 52.310, + "args": { + "External id": 128069,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 1092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374373.417, "dur": 3.060, + "args": { + "External id": 128070,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 1093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685374378.527, "dur": 6.130, + "args": { + "External id": 128071,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 1094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374382.247, "dur": 0.660, + "args": { + "External id": 128072,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 1095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374388.347, "dur": 0.730, + "args": { + "External id": 128073,"Record 
function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 1096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685374450.377, "dur": 10.270, + "args": { + "External id": 128074,"Record function id": 0, "Ev Idx": 1097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685374453.057, "dur": 6.250, + "args": { + "External id": 128075,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685374455.347, "dur": 3.070, + "args": { + "External id": 128076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685374456.187, "dur": 2.040, + "args": { + "External id": 128077,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374465.207, "dur": 10.560, + "args": { + "External id": 128078,"Record function id": 0, "Sequence number": 2575932, "Fwd thread id": 1, "Ev Idx": 1101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374466.647, "dur": 4.400, + "args": { + "External id": 128079,"Sequence number": 2575932, "Fwd thread id": 1, "Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1102 + } + }, + { + "ph": "f", "id": 134, "pid": 5717, "tid": 6759, "ts": 6302685374466.647, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374468.457, "dur": 2.370, + "args": { + "External id": 128080,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374469.237, "dur": 1.420, + "args": { + "External id": 128081,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374480.227, "dur": 137.330, + "args": { + "External id": 128082,"Record function id": 0, "Sequence number": 2575931, "Fwd thread id": 1, "Ev Idx": 1105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374481.547, "dur": 128.199, + "args": { + "External id": 128083,"Sequence number": 2575931, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1106 + } + }, + { + "ph": "f", "id": 135, "pid": 5717, "tid": 6759, "ts": 6302685374481.547, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374484.537, "dur": 6.520, + "args": { + 
"External id": 128084,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374486.897, "dur": 3.410, + "args": { + "External id": 128085,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374488.897, "dur": 1.100, + "args": { + "External id": 128086,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685374492.107, "dur": 57.320, + "args": { + "External id": 128087,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374552.817, "dur": 7.240, + "args": { + "External id": 128088,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374553.866, "dur": 5.291, + "args": { + "External id": 128089,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], 
"Input Dims": [[2048, 768], [], []], "Ev Idx": 1112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374556.706, "dur": 2.171, + "args": { + "External id": 128090,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374565.077, "dur": 4.000, + "args": { + "External id": 128091,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374565.886, "dur": 2.700, + "args": { + "External id": 128092,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374568.017, "dur": 0.369, + "args": { + "External id": 128093,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685374569.797, "dur": 37.700, + "args": { + "External id": 128094,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1117 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374626.666, "dur": 10.631, + "args": { + "External id": 128095,"Record function id": 0, "Sequence number": 2575930, "Fwd thread id": 1, "Ev Idx": 1118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374628.537, "dur": 6.709, + "args": { + "External id": 128096,"Sequence number": 2575930, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1119 + } + }, + { + "ph": "f", "id": 136, "pid": 5717, "tid": 6759, "ts": 6302685374628.537, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374630.726, "dur": 4.371, + "args": { + "External id": 128097,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374631.766, "dur": 3.140, + "args": { + "External id": 128098,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374640.877, "dur": 9.769, + "args": { + "External id": 128099,"Record function id": 0, "Sequence number": 2575929, "Fwd thread id": 1, "Ev Idx": 1122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374642.926, "dur": 5.220, + "args": { + "External id": 128100,"Sequence number": 2575929, "Fwd thread id": 
1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1123 + } + }, + { + "ph": "f", "id": 137, "pid": 5717, "tid": 6759, "ts": 6302685374642.926, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374643.646, "dur": 4.320, + "args": { + "External id": 128101,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374644.666, "dur": 2.740, + "args": { + "External id": 128102,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374646.226, "dur": 0.900, + "args": { + "External id": 128103,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685374655.686, "dur": 6.760, + "args": { + "External id": 128104,"Record function id": 0, "Ev Idx": 1127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685374657.396, "dur": 3.990, + "args": { + "External id": 128105,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], 
"Ev Idx": 1128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685374658.816, "dur": 2.170, + "args": { + "External id": 128106,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685374659.386, "dur": 1.400, + "args": { + "External id": 128107,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374666.076, "dur": 10.160, + "args": { + "External id": 128108,"Record function id": 0, "Sequence number": 2575928, "Fwd thread id": 1, "Ev Idx": 1131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374667.336, "dur": 4.120, + "args": { + "External id": 128109,"Sequence number": 2575928, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1132 + } + }, + { + "ph": "f", "id": 138, "pid": 5717, "tid": 6759, "ts": 6302685374667.336, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374668.606, "dur": 2.700, + "args": { + "External id": 128110,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374670.206, "dur": 
0.950, + "args": { + "External id": 128111,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374680.366, "dur": 118.980, + "args": { + "External id": 128112,"Record function id": 0, "Sequence number": 2575927, "Fwd thread id": 1, "Ev Idx": 1135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374681.686, "dur": 107.700, + "args": { + "External id": 128113,"Sequence number": 2575927, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1136 + } + }, + { + "ph": "f", "id": 139, "pid": 5717, "tid": 6759, "ts": 6302685374681.686, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374683.856, "dur": 3.390, + "args": { + "External id": 128114,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374684.656, "dur": 2.140, + "args": { + "External id": 128115,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374685.846, "dur": 0.740, + "args": { + "External id": 128116,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", 
"[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685374688.936, "dur": 54.680, + "args": { + "External id": 128117,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374746.736, "dur": 6.880, + "args": { + "External id": 128118,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374747.776, "dur": 5.010, + "args": { + "External id": 128119,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374750.566, "dur": 1.960, + "args": { + "External id": 128120,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685374756.516, "dur": 5.950, + "args": { + "External id": 128121,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1144 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374757.356, "dur": 3.520, + "args": { + "External id": 128122,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374760.356, "dur": 0.310, + "args": { + "External id": 128123,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685374763.246, "dur": 25.000, + "args": { + "External id": 128124,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374811.456, "dur": 39.690, + "args": { + "External id": 128125,"Record function id": 0, "Sequence number": 2575926, "Fwd thread id": 1, "Ev Idx": 1148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374813.366, "dur": 5.970, + "args": { + "External id": 128126,"Sequence number": 2575926, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1149 + } + }, + { + "ph": "f", "id": 140, "pid": 5717, "tid": 6759, "ts": 6302685374813.366, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685374815.496, "dur": 3.640, + "args": { + "External id": 128127,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685374816.506, "dur": 2.420, + "args": { + "External id": 128128,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685374822.236, "dur": 22.320, + "args": { + "External id": 128129,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374861.056, "dur": 19.400, + "args": { + "External id": 128130,"Record function id": 0, "Sequence number": 2575925, "Fwd thread id": 1, "Ev Idx": 1153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374864.156, "dur": 12.500, + "args": { + "External id": 128131,"Sequence number": 2575925, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1154 + } + }, + { + "ph": "f", "id": 141, "pid": 5717, "tid": 6759, "ts": 6302685374864.156, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 
6302685374867.216, "dur": 9.160, + "args": { + "External id": 128132,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685374869.706, "dur": 4.550, + "args": { + "External id": 128133,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685374871.776, "dur": 2.120, + "args": { + "External id": 128134,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685374889.256, "dur": 10.830, + "args": { + "External id": 128135,"Record function id": 0, "Ev Idx": 1158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685374892.206, "dur": 6.710, + "args": { + "External id": 128136,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685374893.626, "dur": 3.660, + "args": { + "External id": 128137,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1160 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685374895.456, "dur": 1.650, + "args": { + "External id": 128138,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374906.186, "dur": 106.059, + "args": { + "External id": 128139,"Record function id": 0, "Sequence number": 2575924, "Fwd thread id": 1, "Ev Idx": 1162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685374908.576, "dur": 51.220, + "args": { + "External id": 128140,"Sequence number": 2575924, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1163 + } + }, + { + "ph": "f", "id": 142, "pid": 5717, "tid": 6759, "ts": 6302685374908.576, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685374911.466, "dur": 26.230, + "args": { + "External id": 128141,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685374940.456, "dur": 18.840, + "args": { + "External id": 128142,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685374964.066, "dur": 36.599, + "args": { + 
"External id": 128143,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685375004.545, "dur": 2.900, + "args": { + "External id": 128144,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685375021.085, "dur": 7.531, + "args": { + "External id": 128145,"Record function id": 0, "Ev Idx": 1168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685375023.336, "dur": 4.089, + "args": { + "External id": 128146,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685375024.905, "dur": 2.080, + "args": { + "External id": 128147,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685375025.536, "dur": 1.280, + "args": { + "External id": 128148,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 
6302685375032.305, "dur": 33.520, + "args": { + "External id": 128149,"Record function id": 0, "Sequence number": 2575923, "Fwd thread id": 1, "Ev Idx": 1172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375033.496, "dur": 28.409, + "args": { + "External id": 128150,"Sequence number": 2575923, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1173 + } + }, + { + "ph": "f", "id": 143, "pid": 5717, "tid": 6759, "ts": 6302685375033.496, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685375034.865, "dur": 26.651, + "args": { + "External id": 128151,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685375036.745, "dur": 24.420, + "args": { + "External id": 128152,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375039.825, "dur": 5.440, + "args": { + "External id": 128153,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685375046.236, "dur": 14.200, + "args": { + "External id": 128154,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375071.825, "dur": 78.080, + "args": { + "External id": 128155,"Record function id": 0, "Sequence number": 2575922, "Fwd thread id": 1, "Ev Idx": 1178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375073.165, "dur": 40.340, + "args": { + "External id": 128156,"Sequence number": 2575922, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1179 + } + }, + { + "ph": "f", "id": 144, "pid": 5717, "tid": 6759, "ts": 6302685375073.165, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375075.225, "dur": 17.990, + "args": { + "External id": 128157,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375094.995, "dur": 16.930, + "args": { + "External id": 128158,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], 
"Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685375118.295, "dur": 22.950, + "args": { + "External id": 128159,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375161.795, "dur": 63.220, + "args": { + "External id": 128160,"Record function id": 0, "Sequence number": 2575921, "Fwd thread id": 1, "Ev Idx": 1183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375163.615, "dur": 56.510, + "args": { + "External id": 128161,"Sequence number": 2575921, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1184 + } + }, + { + "ph": "f", "id": 145, "pid": 5717, "tid": 6759, "ts": 6302685375163.615, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685375169.285, "dur": 19.250, + "args": { + "External id": 128162,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685375171.145, "dur": 0.420, + "args": { + "External id": 128163,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1186 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685375172.595, "dur": 0.260, + "args": { + "External id": 128164,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375189.855, "dur": 18.050, + "args": { + "External id": 128165,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375192.815, "dur": 14.180, + "args": { + "External id": 128166,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375208.775, "dur": 9.660, + "args": { + "External id": 128167,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685375232.305, "dur": 4.830, + "args": { + "External id": 128168,"Record function id": 0, "Sequence number": 2575920, "Fwd thread id": 1, "Ev Idx": 1191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685375234.845, "dur": 0.440, + "args": { + "External id": 128169,"Sequence number": 2575920, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1192 + } + }, + { + "ph": "f", "id": 146, "pid": 5717, "tid": 6759, "ts": 6302685375234.845, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685375240.465, "dur": 34.130, + "args": { + "External id": 128170,"Record function id": 0, "Sequence number": 2575919, "Fwd thread id": 1, "Ev Idx": 1193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685375241.565, "dur": 29.100, + "args": { + "External id": 128171,"Sequence number": 2575919, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1194 + } + }, + { + "ph": "f", "id": 147, "pid": 5717, "tid": 6759, "ts": 6302685375241.565, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + "ts": 6302685375243.775, "dur": 5.130, + "args": { + "External id": 128172,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375246.045, "dur": 1.470, + "args": { + "External id": 128173,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685375249.875, "dur": 20.200, + "args": { + "External id": 
128174,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685375252.355, "dur": 16.760, + "args": { + "External id": 128175,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375281.285, "dur": 93.710, + "args": { + "External id": 128176,"Record function id": 0, "Sequence number": 2575918, "Fwd thread id": 1, "Ev Idx": 1199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375282.795, "dur": 73.900, + "args": { + "External id": 128177,"Sequence number": 2575918, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1200 + } + }, + { + "ph": "f", "id": 148, "pid": 5717, "tid": 6759, "ts": 6302685375282.795, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685375284.435, "dur": 39.620, + "args": { + "External id": 128178,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685375285.565, "dur": 0.320, + "args": { + "External id": 128179,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], 
"Input Dims": [[8, 2048, 768], []], "Ev Idx": 1202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685375286.775, "dur": 0.190, + "args": { + "External id": 128180,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685375292.355, "dur": 30.330, + "args": { + "External id": 128181,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375326.215, "dur": 17.900, + "args": { + "External id": 128182,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375328.795, "dur": 14.370, + "args": { + "External id": 128183,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685375344.985, "dur": 9.570, + "args": { + "External id": 128184,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1207 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685375362.145, "dur": 9.410, + "args": { + "External id": 128185,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375383.735, "dur": 61.080, + "args": { + "External id": 128186,"Record function id": 0, "Sequence number": 2575917, "Fwd thread id": 1, "Ev Idx": 1209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375385.625, "dur": 29.120, + "args": { + "External id": 128187,"Sequence number": 2575917, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1210 + } + }, + { + "ph": "f", "id": 149, "pid": 5717, "tid": 6759, "ts": 6302685375385.625, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685375387.165, "dur": 27.100, + "args": { + "External id": 128188,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685375388.115, "dur": 25.790, + "args": { + "External id": 128189,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 
1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375391.265, "dur": 5.200, + "args": { + "External id": 128190,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685375400.075, "dur": 13.090, + "args": { + "External id": 128191,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685375419.895, "dur": 16.909, + "args": { + "External id": 128192,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375452.935, "dur": 21.169, + "args": { + "External id": 128193,"Record function id": 0, "Sequence number": 2575916, "Fwd thread id": 1, "Ev Idx": 1216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375454.655, "dur": 1.060, + "args": { + "External id": 128194,"Sequence number": 2575916, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1217 + } + }, + { + "ph": "f", "id": 150, "pid": 5717, "tid": 6759, "ts": 6302685375454.655, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685375458.515, "dur": 13.209, + "args": { + "External id": 128195,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375479.495, "dur": 10.729, + "args": { + "External id": 128196,"Record function id": 0, "Sequence number": 2575915, "Fwd thread id": 1, "Ev Idx": 1219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375480.824, "dur": 6.960, + "args": { + "External id": 128197,"Sequence number": 2575915, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1220 + } + }, + { + "ph": "f", "id": 151, "pid": 5717, "tid": 6759, "ts": 6302685375480.824, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685375482.824, "dur": 4.751, + "args": { + "External id": 128198,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685375484.615, "dur": 2.729, + "args": { + "External id": 
128199,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375493.775, "dur": 94.799, + "args": { + "External id": 128200,"Record function id": 0, "Sequence number": 2575914, "Fwd thread id": 1, "Ev Idx": 1223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375494.924, "dur": 85.520, + "args": { + "External id": 128201,"Sequence number": 2575914, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1224 + } + }, + { + "ph": "f", "id": 152, "pid": 5717, "tid": 6759, "ts": 6302685375494.924, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685375497.844, "dur": 5.060, + "args": { + "External id": 128202,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685375499.155, "dur": 2.940, + "args": { + "External id": 128203,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375500.624, "dur": 1.120, + "args": { + "External id": 128204,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685375504.964, "dur": 41.410, + "args": { + "External id": 128205,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685375548.324, "dur": 4.280, + "args": { + "External id": 128206,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685375549.254, "dur": 2.620, + "args": { + "External id": 128207,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375550.844, "dur": 0.820, + "args": { + "External id": 128208,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685375554.384, "dur": 3.890, + "args": { + "External id": 128209,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", 
"pid": 5717, "tid": 6759, + "ts": 6302685375555.194, "dur": 2.470, + "args": { + "External id": 128210,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375557.234, "dur": 0.290, + "args": { + "External id": 128211,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685375558.964, "dur": 20.590, + "args": { + "External id": 128212,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375598.044, "dur": 10.780, + "args": { + "External id": 128213,"Record function id": 0, "Sequence number": 2575913, "Fwd thread id": 1, "Ev Idx": 1236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375599.944, "dur": 6.650, + "args": { + "External id": 128214,"Sequence number": 2575913, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1237 + } + }, + { + "ph": "f", "id": 153, "pid": 5717, "tid": 6759, "ts": 6302685375599.944, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 
6302685375601.904, "dur": 4.520, + "args": { + "External id": 128215,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685375603.944, "dur": 2.290, + "args": { + "External id": 128216,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375612.414, "dur": 14.520, + "args": { + "External id": 128217,"Record function id": 0, "Sequence number": 2575912, "Fwd thread id": 1, "Ev Idx": 1240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375613.564, "dur": 9.930, + "args": { + "External id": 128218,"Sequence number": 2575912, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1241 + } + }, + { + "ph": "f", "id": 154, "pid": 5717, "tid": 6759, "ts": 6302685375613.564, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685375615.824, "dur": 7.480, + "args": { + "External id": 128219,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685375618.954, "dur": 3.520, + "args": { + "External id": 128220,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375620.344, "dur": 1.810, + "args": { + "External id": 128221,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685375635.584, "dur": 11.310, + "args": { + "External id": 128222,"Record function id": 0, "Ev Idx": 1245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685375638.534, "dur": 7.170, + "args": { + "External id": 128223,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685375641.384, "dur": 3.870, + "args": { + "External id": 128224,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685375642.104, "dur": 2.970, + "args": { + "External id": 128225,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375650.394, "dur": 7.020, + "args": { + "External id": 
128226,"Record function id": 0, "Sequence number": 2575911, "Fwd thread id": 1, "Ev Idx": 1249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685375651.664, "dur": 3.560, + "args": { + "External id": 128227,"Sequence number": 2575911, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1250 + } + }, + { + "ph": "f", "id": 155, "pid": 5717, "tid": 6759, "ts": 6302685375651.664, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685375652.784, "dur": 2.300, + "args": { + "External id": 128228,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685375653.554, "dur": 1.330, + "args": { + "External id": 128229,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685375664.674, "dur": 237.669, + "args": { + "External id": 128230,"Record function id": 0, "Sequence number": 2575910, "Fwd thread id": 1, "Ev Idx": 1253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685375666.394, "dur": 219.489, + "args": { + "External id": 128231,"Sequence number": 2575910, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1254 + } + }, + { + "ph": "f", "id": 156, "pid": 5717, "tid": 6759, "ts": 6302685375666.394, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685375680.604, "dur": 14.390, + "args": { + "External id": 128232,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375683.864, "dur": 10.490, + "args": { + "External id": 128233,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685375698.394, "dur": 8.720, + "args": { + "External id": 128234,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375700.854, "dur": 5.980, + "args": { + "External id": 128235,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], []], "Ev Idx": 1258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685375709.744, "dur": 7.160, + "args": { + "External id": 128236,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375710.944, "dur": 5.660, + "args": { + "External id": 128237,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685375737.584, "dur": 123.340, + "args": { + "External id": 128238,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], 
[2]], "Ev Idx": 1261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685375789.794, "dur": 6.060, + "args": { + "External id": 128239,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685375797.674, "dur": 3.560, + "args": { + "External id": 128240,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685375872.474, "dur": 3.240, + "args": { + "External id": 128241,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685375879.694, "dur": 0.609, + "args": { + "External id": 128242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685375882.823, "dur": 0.491, + "args": { + "External id": 128243,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685375912.363, "dur": 216.010, + "args": { + "External id": 128244,"Record function id": 0, "Sequence number": 2575909, "Fwd thread id": 1, "Ev Idx": 1267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685375914.283, "dur": 200.540, + "args": { + "External id": 128245,"Sequence number": 2575909, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1268 + } + }, + { + "ph": "f", "id": 157, "pid": 5717, "tid": 6759, "ts": 6302685375914.283, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685375929.114, "dur": 33.500, + "args": { + "External id": 128246,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375932.754, "dur": 6.429, + "args": { + "External id": 128247,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685375940.354, "dur": 21.720, + "args": { + "External id": 128248,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 
12, 64], []], "Ev Idx": 1271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685375974.563, "dur": 8.070, + "args": { + "External id": 128249,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685375976.713, "dur": 5.420, + "args": { + "External id": 128250,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685376145.413, "dur": 195.209, + "args": { + "External id": 128251,"Record function id": 0, "Sequence number": 2575908, "Fwd thread id": 1, "Ev Idx": 1274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685376150.293, "dur": 181.209, + "args": { + "External id": 128252,"Sequence number": 2575908, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1275 + } + }, + { + "ph": "f", "id": 158, "pid": 5717, "tid": 6759, "ts": 6302685376150.293, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685376174.013, "dur": 42.060, + "args": { + "External id": 128253,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376179.773, "dur": 9.560, + "args": { + "External id": 128254,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685376190.503, "dur": 23.310, + "args": { + "External id": 128255,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685376228.253, "dur": 6.990, + "args": { + "External id": 128256,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376230.033, "dur": 4.760, + "args": { + "External id": 128257,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1280 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376352.173, "dur": 13.809, + "args": { + "External id": 128258,"Record function id": 0, "Sequence number": 2575907, "Fwd thread id": 1, "Ev Idx": 1281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376354.422, "dur": 8.531, + "args": { + "External id": 128259,"Sequence number": 2575907, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1282 + } + }, + { + "ph": "f", "id": 159, "pid": 5717, "tid": 6759, "ts": 6302685376354.422, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376357.182, "dur": 5.491, + "args": { + "External id": 128260,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376358.333, "dur": 4.109, + "args": { + "External id": 128261,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376372.573, "dur": 8.400, + "args": { + "External id": 128262,"Record function id": 0, "Sequence number": 2575906, "Fwd thread id": 1, "Ev Idx": 1285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376374.393, "dur": 4.220, + 
"args": { + "External id": 128263,"Sequence number": 2575906, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1286 + } + }, + { + "ph": "f", "id": 160, "pid": 5717, "tid": 6759, "ts": 6302685376374.393, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376375.642, "dur": 2.851, + "args": { + "External id": 128264,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376377.322, "dur": 0.991, + "args": { + "External id": 128265,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376384.673, "dur": 5.820, + "args": { + "External id": 128266,"Record function id": 0, "Sequence number": 2575905, "Fwd thread id": 1, "Ev Idx": 1289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376385.873, "dur": 2.720, + "args": { + "External id": 128267,"Sequence number": 2575905, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1290 + } + }, + { + "ph": "f", "id": 161, "pid": 5717, "tid": 6759, "ts": 6302685376385.873, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376386.842, "dur": 1.591, + "args": { + "External id": 128268,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376387.422, "dur": 0.860, + "args": { + "External id": 128269,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376395.322, "dur": 9.980, + "args": { + "External id": 128270,"Record function id": 0, "Sequence number": 2575904, "Fwd thread id": 1, "Ev Idx": 1293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376396.573, "dur": 5.020, + "args": { + "External id": 128271,"Sequence number": 2575904, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1294 + } + }, + { + "ph": "f", "id": 162, "pid": 5717, "tid": 6759, "ts": 6302685376396.573, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376398.582, "dur": 1.671, + "args": { + "External id": 128272,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1295 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376399.182, "dur": 0.911, + "args": { + "External id": 128273,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376411.233, "dur": 141.159, + "args": { + "External id": 128274,"Record function id": 0, "Sequence number": 2575903, "Fwd thread id": 1, "Ev Idx": 1297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376414.482, "dur": 129.320, + "args": { + "External id": 128275,"Sequence number": 2575903, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1298 + } + }, + { + "ph": "f", "id": 163, "pid": 5717, "tid": 6759, "ts": 6302685376414.482, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376418.562, "dur": 9.020, + "args": { + "External id": 128276,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376421.012, "dur": 4.970, + "args": { + "External id": 128277,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376422.642, "dur": 2.980, + "args": { + "External id": 
128278,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685376428.672, "dur": 62.870, + "args": { + "External id": 128279,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376493.652, "dur": 7.680, + "args": { + "External id": 128280,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376496.492, "dur": 4.010, + "args": { + "External id": 128281,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376499.382, "dur": 0.850, + "args": { + "External id": 128282,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376503.172, "dur": 2.660, + "args": { + "External id": 128283,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 
768]], "Input Dims": [[768, 768]], "Ev Idx": 1306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376504.152, "dur": 1.310, + "args": { + "External id": 128284,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376505.032, "dur": 0.290, + "args": { + "External id": 128285,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685376506.572, "dur": 36.150, + "args": { + "External id": 128286,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376564.082, "dur": 11.120, + "args": { + "External id": 128287,"Record function id": 0, "Sequence number": 2575902, "Fwd thread id": 1, "Ev Idx": 1310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376566.552, "dur": 6.740, + "args": { + "External id": 128288,"Sequence number": 2575902, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1311 + } + }, + { + "ph": "f", "id": 164, "pid": 5717, "tid": 6759, "ts": 6302685376566.552, + "cat": "fwdbwd", "name": 
"fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376568.662, "dur": 4.500, + "args": { + "External id": 128289,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376570.572, "dur": 2.380, + "args": { + "External id": 128290,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376579.682, "dur": 12.150, + "args": { + "External id": 128291,"Record function id": 0, "Sequence number": 2575901, "Fwd thread id": 1, "Ev Idx": 1314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376580.832, "dur": 7.510, + "args": { + "External id": 128292,"Sequence number": 2575901, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1315 + } + }, + { + "ph": "f", "id": 165, "pid": 5717, "tid": 6759, "ts": 6302685376580.832, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376581.552, "dur": 6.580, + "args": { + "External id": 128293,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376582.592, "dur": 
3.800, + "args": { + "External id": 128294,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376585.222, "dur": 0.860, + "args": { + "External id": 128295,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685376599.372, "dur": 15.940, + "args": { + "External id": 128296,"Record function id": 0, "Ev Idx": 1319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685376602.502, "dur": 10.520, + "args": { + "External id": 128297,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685376605.402, "dur": 7.130, + "args": { + "External id": 128298,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685376608.322, "dur": 4.000, + "args": { + "External id": 128299,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376621.502, "dur": 10.820, + "args": { + "External id": 128300,"Record function id": 0, "Sequence number": 2575900, "Fwd thread id": 1, "Ev Idx": 1323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376622.822, "dur": 5.760, + "args": { + "External id": 128301,"Sequence number": 2575900, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1324 + } + }, + { + "ph": "f", "id": 166, "pid": 5717, "tid": 6759, "ts": 6302685376622.822, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376625.142, "dur": 3.260, + "args": { + "External id": 128302,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376625.692, "dur": 2.510, + "args": { + "External id": 128303,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376638.172, "dur": 102.100, + "args": { + "External id": 128304,"Record function id": 0, "Sequence number": 2575899, "Fwd thread id": 1, "Ev Idx": 1327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376640.412, "dur": 88.880, + "args": { + "External id": 
128305,"Sequence number": 2575899, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1328 + } + }, + { + "ph": "f", "id": 167, "pid": 5717, "tid": 6759, "ts": 6302685376640.412, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376643.772, "dur": 6.020, + "args": { + "External id": 128306,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376644.422, "dur": 3.830, + "args": { + "External id": 128307,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376647.302, "dur": 0.670, + "args": { + "External id": 128308,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685376650.832, "dur": 44.720, + "args": { + "External id": 128309,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376697.412, "dur": 4.250, + "args": { + "External id": 
128310,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376698.442, "dur": 2.500, + "args": { + "External id": 128311,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376699.882, "dur": 0.880, + "args": { + "External id": 128312,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376703.462, "dur": 3.600, + "args": { + "External id": 128313,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376704.272, "dur": 2.420, + "args": { + "External id": 128314,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376706.092, "dur": 0.460, + "args": { + "External id": 128315,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], 
[]], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685376707.782, "dur": 20.590, + "args": { + "External id": 128316,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376749.542, "dur": 37.490, + "args": { + "External id": 128317,"Record function id": 0, "Sequence number": 2575898, "Fwd thread id": 1, "Ev Idx": 1340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376751.362, "dur": 5.980, + "args": { + "External id": 128318,"Sequence number": 2575898, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1341 + } + }, + { + "ph": "f", "id": 168, "pid": 5717, "tid": 6759, "ts": 6302685376751.362, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376753.682, "dur": 3.520, + "args": { + "External id": 128319,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376754.602, "dur": 2.360, + "args": { + "External id": 128320,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1343 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685376759.992, "dur": 21.160, + "args": { + "External id": 128321,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376797.632, "dur": 19.620, + "args": { + "External id": 128322,"Record function id": 0, "Sequence number": 2575897, "Fwd thread id": 1, "Ev Idx": 1345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376802.701, "dur": 10.960, + "args": { + "External id": 128323,"Sequence number": 2575897, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1346 + } + }, + { + "ph": "f", "id": 169, "pid": 5717, "tid": 6759, "ts": 6302685376802.701, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376803.921, "dur": 9.500, + "args": { + "External id": 128324,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376806.321, "dur": 6.160, + "args": { + "External id": 128325,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376809.232, 
"dur": 1.820, + "args": { + "External id": 128326,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685376825.841, "dur": 11.971, + "args": { + "External id": 128327,"Record function id": 0, "Ev Idx": 1350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685376828.752, "dur": 6.780, + "args": { + "External id": 128328,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685376831.332, "dur": 3.669, + "args": { + "External id": 128329,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685376832.092, "dur": 2.720, + "args": { + "External id": 128330,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376842.661, "dur": 14.420, + "args": { + "External id": 128331,"Record function id": 0, "Sequence number": 2575896, "Fwd thread id": 1, "Ev Idx": 1354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376845.172, "dur": 8.449, + "args": 
{ + "External id": 128332,"Sequence number": 2575896, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1355 + } + }, + { + "ph": "f", "id": 170, "pid": 5717, "tid": 6759, "ts": 6302685376845.172, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376847.821, "dur": 4.511, + "args": { + "External id": 128333,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376850.461, "dur": 1.660, + "args": { + "External id": 128334,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376861.851, "dur": 112.920, + "args": { + "External id": 128335,"Record function id": 0, "Sequence number": 2575895, "Fwd thread id": 1, "Ev Idx": 1358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376864.151, "dur": 101.490, + "args": { + "External id": 128336,"Sequence number": 2575895, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1359 + } + }, + { + "ph": "f", "id": 171, "pid": 5717, "tid": 6759, "ts": 6302685376864.151, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 
5717, "tid": 6759, + "ts": 6302685376867.501, "dur": 5.230, + "args": { + "External id": 128337,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376868.161, "dur": 3.020, + "args": { + "External id": 128338,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376870.281, "dur": 0.640, + "args": { + "External id": 128339,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685376874.631, "dur": 58.320, + "args": { + "External id": 128340,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376934.711, "dur": 4.450, + "args": { + "External id": 128341,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376935.631, "dur": 2.690, + "args": { + "External id": 128342,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376937.161, "dur": 0.950, + "args": { + "External id": 128343,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685376940.811, "dur": 2.650, + "args": { + "External id": 128344,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685376941.701, "dur": 1.320, + "args": { + "External id": 128345,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685376942.551, "dur": 0.320, + "args": { + "External id": 128346,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685376944.201, "dur": 20.480, + "args": { + "External id": 128347,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 
768]], "Ev Idx": 1370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376983.721, "dur": 30.840, + "args": { + "External id": 128348,"Record function id": 0, "Sequence number": 2575894, "Fwd thread id": 1, "Ev Idx": 1371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685376985.611, "dur": 6.080, + "args": { + "External id": 128349,"Sequence number": 2575894, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1372 + } + }, + { + "ph": "f", "id": 172, "pid": 5717, "tid": 6759, "ts": 6302685376985.611, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685376988.431, "dur": 3.120, + "args": { + "External id": 128350,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685376989.261, "dur": 2.080, + "args": { + "External id": 128351,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685376994.231, "dur": 14.520, + "args": { + "External id": 128352,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1375 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377021.331, "dur": 10.600, + "args": { + "External id": 128353,"Record function id": 0, "Sequence number": 2575893, "Fwd thread id": 1, "Ev Idx": 1376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377022.881, "dur": 6.430, + "args": { + "External id": 128354,"Sequence number": 2575893, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1377 + } + }, + { + "ph": "f", "id": 173, "pid": 5717, "tid": 6759, "ts": 6302685377022.881, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685377023.911, "dur": 5.150, + "args": { + "External id": 128355,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685377025.091, "dur": 3.070, + "args": { + "External id": 128356,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377026.851, "dur": 1.010, + "args": { + "External id": 128357,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685377039.711, "dur": 15.350, + "args": { + "External id": 128358,"Record function id": 0, "Ev Idx": 1381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685377042.891, "dur": 9.810, + "args": { + "External id": 128359,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685377045.551, "dur": 6.640, + "args": { + "External id": 128360,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685377048.441, "dur": 3.560, + "args": { + "External id": 128361,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377061.181, "dur": 102.490, + "args": { + "External id": 128362,"Record function id": 0, "Sequence number": 2575892, "Fwd thread id": 1, "Ev Idx": 1385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377062.461, "dur": 54.820, + "args": { + "External id": 128363,"Sequence number": 2575892, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1386 + } + }, + { + "ph": "f", "id": 174, "pid": 5717, "tid": 6759, "ts": 6302685377062.461, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377065.361, "dur": 29.570, + "args": { + "External id": 128364,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377097.791, "dur": 18.970, + "args": { + "External id": 128365,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685377122.981, "dur": 30.260, + "args": { + "External id": 128366,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685377156.281, "dur": 2.150, + "args": { + "External id": 128367,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685377172.701, "dur": 11.950, + "args": { + "External id": 128368,"Record function id": 0, "Ev Idx": 1391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685377174.841, "dur": 7.950, + "args": { + "External id": 128369,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685377178.681, "dur": 3.570, + "args": { + "External id": 128370,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685377179.981, "dur": 2.070, + "args": { + "External id": 128371,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377188.851, "dur": 34.100, + "args": { + "External id": 128372,"Record function id": 0, "Sequence number": 2575891, "Fwd thread id": 1, "Ev Idx": 1395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377190.111, "dur": 29.189, + "args": { + "External id": 128373,"Sequence number": 2575891, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1396 + } + }, + { + "ph": "f", "id": 175, "pid": 5717, "tid": 6759, "ts": 6302685377190.111, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685377191.551, "dur": 27.349, + "args": { + "External id": 128374,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 
768], [], [], [], [], [], [], []], "Ev Idx": 1397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685377192.621, "dur": 25.999, + "args": { + "External id": 128375,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377196.171, "dur": 5.470, + "args": { + "External id": 128376,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685377202.831, "dur": 15.220, + "args": { + "External id": 128377,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377228.960, "dur": 92.680, + "args": { + "External id": 128378,"Record function id": 0, "Sequence number": 2575890, "Fwd thread id": 1, "Ev Idx": 1401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377230.400, "dur": 45.711, + "args": { + "External id": 128379,"Sequence number": 2575890, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": 
[[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1402 + } + }, + { + "ph": "f", "id": 176, "pid": 5717, "tid": 6759, "ts": 6302685377230.400, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377232.431, "dur": 18.289, + "args": { + "External id": 128380,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377254.451, "dur": 21.180, + "args": { + "External id": 128381,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685377281.720, "dur": 30.780, + "args": { + "External id": 128382,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377334.180, "dur": 86.490, + "args": { + "External id": 128383,"Record function id": 0, "Sequence number": 2575889, "Fwd thread id": 1, "Ev Idx": 1406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377337.090, "dur": 78.290, + "args": { + "External id": 128384,"Sequence number": 2575889, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], 
"Ev Idx": 1407 + } + }, + { + "ph": "f", "id": 177, "pid": 5717, "tid": 6759, "ts": 6302685377337.090, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685377340.880, "dur": 31.900, + "args": { + "External id": 128385,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685377343.810, "dur": 0.510, + "args": { + "External id": 128386,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685377345.260, "dur": 1.390, + "args": { + "External id": 128387,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377374.160, "dur": 25.780, + "args": { + "External id": 128388,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377379.500, "dur": 19.540, + "args": { + "External id": 128389,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377401.990, "dur": 11.580, + "args": { + "External id": 128390,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685377428.270, "dur": 4.090, + "args": { + "External id": 128391,"Record function id": 0, "Sequence number": 2575888, "Fwd thread id": 1, "Ev Idx": 1414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685377429.980, "dur": 0.480, + "args": { + "External id": 128392,"Sequence number": 2575888, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1415 + } + }, + { + "ph": "f", "id": 178, "pid": 5717, "tid": 6759, "ts": 6302685377429.980, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685377435.680, "dur": 35.760, + "args": { + "External id": 128393,"Record function id": 0, "Sequence number": 2575887, "Fwd thread id": 1, "Ev Idx": 1416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685377436.900, "dur": 30.580, + "args": { + "External id": 128394,"Sequence number": 2575887, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1417 + } + }, + { + "ph": "f", "id": 179, "pid": 5717, "tid": 6759, "ts": 6302685377436.900, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 
6759, + "ts": 6302685377439.390, "dur": 5.370, + "args": { + "External id": 128395,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377441.860, "dur": 1.420, + "args": { + "External id": 128396,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685377445.680, "dur": 21.190, + "args": { + "External id": 128397,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685377449.080, "dur": 16.890, + "args": { + "External id": 128398,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377478.280, "dur": 80.770, + "args": { + "External id": 128399,"Record function id": 0, "Sequence number": 2575886, "Fwd thread id": 1, "Ev Idx": 1422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377479.770, "dur": 60.240, + "args": { + "External id": 128400,"Sequence number": 2575886, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1423 + } + }, + { + "ph": "f", "id": 180, "pid": 5717, "tid": 6759, "ts": 6302685377479.770, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685377481.580, "dur": 27.240, + "args": { + "External id": 128401,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685377482.670, "dur": 0.320, + "args": { + "External id": 128402,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685377483.740, "dur": 0.210, + "args": { + "External id": 128403,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685377489.270, "dur": 18.400, + "args": { + "External id": 128404,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377509.810, "dur": 16.810, + "args": { + "External id": 128405,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": 
["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377512.310, "dur": 13.440, + "args": { + "External id": 128406,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685377527.510, "dur": 10.620, + "args": { + "External id": 128407,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685377545.830, "dur": 10.090, + "args": { + "External id": 128408,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377567.290, "dur": 42.340, + "args": { + "External id": 128409,"Record function id": 0, "Sequence number": 2575885, "Fwd thread id": 1, "Ev Idx": 1432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377569.100, "dur": 24.840, + "args": { + "External id": 128410,"Sequence number": 2575885, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1433 + } + }, + { + "ph": "f", "id": 181, "pid": 5717, "tid": 6759, "ts": 
6302685377569.100, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685377571.340, "dur": 22.260, + "args": { + "External id": 128411,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685377572.260, "dur": 21.110, + "args": { + "External id": 128412,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377575.530, "dur": 5.050, + "args": { + "External id": 128413,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685377581.560, "dur": 11.240, + "args": { + "External id": 128414,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685377598.360, "dur": 8.910, 
+ "args": { + "External id": 128415,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377616.640, "dur": 5.440, + "args": { + "External id": 128416,"Record function id": 0, "Sequence number": 2575884, "Fwd thread id": 1, "Ev Idx": 1439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685377618.280, "dur": 0.960, + "args": { + "External id": 128417,"Sequence number": 2575884, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1440 + } + }, + { + "ph": "f", "id": 182, "pid": 5717, "tid": 6759, "ts": 6302685377618.280, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685377626.490, "dur": 368.179, + "args": { + "External id": 128418,"Record function id": 0, "Sequence number": 2575883, "Fwd thread id": 1, "Ev Idx": 1441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685377628.050, "dur": 356.499, + "args": { + "External id": 128419,"Sequence number": 2575883, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1442 + } + }, + { + "ph": "f", "id": 183, "pid": 5717, "tid": 6759, "ts": 6302685377628.050, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685377653.559, "dur": 7.031, + "args": { + "External id": 128420,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685377657.050, "dur": 3.120, + "args": { + "External id": 128421,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 1444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685377664.110, "dur": 4.609, + "args": { + "External id": 128422,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685377665.470, "dur": 2.520, + "args": { + "External id": 128423,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377666.910, "dur": 0.780, + "args": { + "External id": 128424,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 6759, + "ts": 6302685377670.919, "dur": 39.660, + 
"args": { + "External id": 128425,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 1448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685377671.459, "dur": 3.171, + "args": { + "External id": 128426,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 1449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685377671.950, "dur": 2.320, + "args": { + "External id": 128427,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377673.850, "dur": 0.229, + "args": { + "External id": 128428,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 6759, + "ts": 6302685377675.419, "dur": 34.400, + "args": { + "External id": 128429,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685377676.330, "dur": 32.729, + "args": { + "External id": 128430,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 6759, + "ts": 6302685377714.739, "dur": 3.560, + "args": { + "External id": 128431,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 1454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685377716.059, "dur": 2.051, + "args": { + "External id": 128432,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685377743.769, "dur": 10.410, + "args": { + "External id": 128433,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685377755.509, "dur": 6.530, + "args": { + "External id": 128434,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685377764.089, "dur": 4.230, + "args": { + "External id": 128435,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", 
"Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685377814.079, "dur": 5.690, + "args": { + "External id": 128436,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685377816.089, "dur": 3.340, + "args": { + "External id": 128437,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5717, "tid": 6759, + "ts": 6302685377838.109, "dur": 124.550, + "args": { + "External id": 128438,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 1461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685377843.189, "dur": 4.650, + "args": { + "External id": 128439,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377845.779, "dur": 1.140, + "args": { + "External id": 128440,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], 
"Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685377849.049, "dur": 3.560, + "args": { + "External id": 128441,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 1464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377851.589, "dur": 0.450, + "args": { + "External id": 128442,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 1465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 6759, + "ts": 6302685377854.199, "dur": 1.730, + "args": { + "External id": 128443,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377855.299, "dur": 0.290, + "args": { + "External id": 128444,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685377856.689, "dur": 2.590, + "args": { + "External id": 128445,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], 
"Ev Idx": 1468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377858.629, "dur": 0.340, + "args": { + "External id": 128446,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 1469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685377862.579, "dur": 2.170, + "args": { + "External id": 128447,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 1470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377864.059, "dur": 0.360, + "args": { + "External id": 128448,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 1471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685377865.859, "dur": 7.830, + "args": { + "External id": 128449,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 1472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5717, "tid": 6759, + "ts": 6302685377871.079, "dur": 2.310, + "args": { + "External id": 128450,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], 
"Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 1473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685377874.699, "dur": 2.020, + "args": { + "External id": 128451,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 1474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377876.109, "dur": 0.270, + "args": { + "External id": 128452,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 1475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685377877.539, "dur": 2.640, + "args": { + "External id": 128453,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685377878.329, "dur": 1.680, + "args": { + "External id": 128454,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685377881.469, "dur": 61.100, + "args": { + "External id": 128455,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev 
Idx": 1478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685377946.919, "dur": 2.750, + "args": { + "External id": 128456,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 1479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5717, "tid": 6759, + "ts": 6302685377951.809, "dur": 4.340, + "args": { + "External id": 128457,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 1480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685377954.829, "dur": 0.650, + "args": { + "External id": 128458,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 1481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685377959.879, "dur": 1.820, + "args": { + "External id": 128459,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 1482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378009.099, "dur": 11.980, + "args": { + "External id": 128460,"Record function id": 0, "Ev Idx": 1483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378012.599, "dur": 7.080, + "args": { 
+ "External id": 128461,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685378014.929, "dur": 3.660, + "args": { + "External id": 128462,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685378016.019, "dur": 2.360, + "args": { + "External id": 128463,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378026.929, "dur": 17.570, + "args": { + "External id": 128464,"Record function id": 0, "Sequence number": 2575882, "Fwd thread id": 1, "Ev Idx": 1487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378028.499, "dur": 12.040, + "args": { + "External id": 128465,"Sequence number": 2575882, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1488 + } + }, + { + "ph": "f", "id": 184, "pid": 5717, "tid": 6759, "ts": 6302685378028.499, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685378035.489, "dur": 4.770, + "args": { + "External id": 128466,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input 
Dims": [[8, 2048, 2048], []], "Ev Idx": 1489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685378037.339, "dur": 2.650, + "args": { + "External id": 128467,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378050.689, "dur": 132.489, + "args": { + "External id": 128468,"Record function id": 0, "Sequence number": 2575881, "Fwd thread id": 1, "Ev Idx": 1491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378053.079, "dur": 121.410, + "args": { + "External id": 128469,"Sequence number": 2575881, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1492 + } + }, + { + "ph": "f", "id": 185, "pid": 5717, "tid": 6759, "ts": 6302685378053.079, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378057.959, "dur": 8.110, + "args": { + "External id": 128470,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378060.419, "dur": 3.880, + "args": { + "External id": 128471,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5717, "tid": 6759, + "ts": 6302685378063.069, "dur": 0.920, + "args": { + "External id": 128472,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685378067.099, "dur": 55.890, + "args": { + "External id": 128473,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378126.318, "dur": 9.800, + "args": { + "External id": 128474,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378127.689, "dur": 6.349, + "args": { + "External id": 128475,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378132.669, "dur": 1.080, + "args": { + "External id": 128476,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378139.138, "dur": 6.340, + "args": { + "External id": 
128477,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378140.258, "dur": 3.571, + "args": { + "External id": 128478,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378142.369, "dur": 1.249, + "args": { + "External id": 128479,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685378146.218, "dur": 27.220, + "args": { + "External id": 128480,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378192.508, "dur": 10.040, + "args": { + "External id": 128481,"Record function id": 0, "Sequence number": 2575880, "Fwd thread id": 1, "Ev Idx": 1504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378194.398, "dur": 6.030, + "args": { + "External id": 128482,"Sequence number": 2575880, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1505 + 
} + }, + { + "ph": "f", "id": 186, "pid": 5717, "tid": 6759, "ts": 6302685378194.398, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685378196.658, "dur": 3.590, + "args": { + "External id": 128483,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685378197.588, "dur": 2.470, + "args": { + "External id": 128484,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378206.088, "dur": 8.040, + "args": { + "External id": 128485,"Record function id": 0, "Sequence number": 2575879, "Fwd thread id": 1, "Ev Idx": 1508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378207.198, "dur": 4.900, + "args": { + "External id": 128486,"Sequence number": 2575879, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1509 + } + }, + { + "ph": "f", "id": 187, "pid": 5717, "tid": 6759, "ts": 6302685378207.198, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378208.078, "dur": 3.850, + "args": { + "External id": 128487,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1510 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378209.008, "dur": 2.320, + "args": { + "External id": 128488,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378210.408, "dur": 0.630, + "args": { + "External id": 128489,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378218.948, "dur": 7.160, + "args": { + "External id": 128490,"Record function id": 0, "Ev Idx": 1513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378220.838, "dur": 4.150, + "args": { + "External id": 128491,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685378222.328, "dur": 2.240, + "args": { + "External id": 128492,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685378222.928, "dur": 1.470, + "args": { + "External id": 128493,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": 
[[2048, 768]], "Ev Idx": 1516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378229.688, "dur": 7.220, + "args": { + "External id": 128494,"Record function id": 0, "Sequence number": 2575878, "Fwd thread id": 1, "Ev Idx": 1517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378230.898, "dur": 4.150, + "args": { + "External id": 128495,"Sequence number": 2575878, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1518 + } + }, + { + "ph": "f", "id": 188, "pid": 5717, "tid": 6759, "ts": 6302685378230.898, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685378233.188, "dur": 1.710, + "args": { + "External id": 128496,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685378233.808, "dur": 0.940, + "args": { + "External id": 128497,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378242.838, "dur": 136.080, + "args": { + "External id": 128498,"Record function id": 0, "Sequence number": 2575877, "Fwd thread id": 1, "Ev Idx": 1521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 
6759, + "ts": 6302685378244.678, "dur": 124.940, + "args": { + "External id": 128499,"Sequence number": 2575877, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1522 + } + }, + { + "ph": "f", "id": 189, "pid": 5717, "tid": 6759, "ts": 6302685378244.678, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378246.988, "dur": 4.080, + "args": { + "External id": 128500,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378247.808, "dur": 2.800, + "args": { + "External id": 128501,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378249.688, "dur": 0.680, + "args": { + "External id": 128502,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685378251.988, "dur": 60.560, + "args": { + "External id": 128503,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, 
"tid": 6759, + "ts": 6302685378315.858, "dur": 8.410, + "args": { + "External id": 128504,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378317.978, "dur": 4.220, + "args": { + "External id": 128505,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378320.978, "dur": 0.960, + "args": { + "External id": 128506,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378327.128, "dur": 4.740, + "args": { + "External id": 128507,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378327.878, "dur": 3.560, + "args": { + "External id": 128508,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378329.998, "dur": 1.240, + "args": { + "External id": 128509,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685378333.628, "dur": 34.850, + "args": { + "External id": 128510,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378388.388, "dur": 42.490, + "args": { + "External id": 128511,"Record function id": 0, "Sequence number": 2575876, "Fwd thread id": 1, "Ev Idx": 1534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378390.208, "dur": 14.490, + "args": { + "External id": 128512,"Sequence number": 2575876, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1535 + } + }, + { + "ph": "f", "id": 190, "pid": 5717, "tid": 6759, "ts": 6302685378390.208, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685378400.968, "dur": 3.530, + "args": { + "External id": 128513,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685378401.908, "dur": 2.390, + "args": { + "External id": 128514,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685378407.588, "dur": 19.050, + "args": { + "External id": 128515,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378437.438, "dur": 14.480, + "args": { + "External id": 128516,"Record function id": 0, "Sequence number": 2575875, "Fwd thread id": 1, "Ev Idx": 1539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378439.268, "dur": 9.550, + "args": { + "External id": 128517,"Sequence number": 2575875, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1540 + } + }, + { + "ph": "f", "id": 191, "pid": 5717, "tid": 6759, "ts": 6302685378439.268, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685378440.378, "dur": 8.070, + "args": { + "External id": 128518,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685378443.688, "dur": 3.740, + "args": { + "External id": 128519,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1542 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378445.968, "dur": 1.150, + "args": { + "External id": 128520,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378457.498, "dur": 8.340, + "args": { + "External id": 128521,"Record function id": 0, "Ev Idx": 1544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378459.178, "dur": 5.470, + "args": { + "External id": 128522,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685378460.708, "dur": 3.450, + "args": { + "External id": 128523,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685378462.328, "dur": 1.650, + "args": { + "External id": 128524,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378469.448, "dur": 74.260, + "args": { + "External id": 128525,"Record function id": 0, "Sequence number": 2575874, "Fwd thread id": 1, "Ev Idx": 1548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378470.648, "dur": 33.590, + "args": { + "External id": 128526,"Sequence number": 2575874, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1549 + } + }, + { + "ph": "f", "id": 192, "pid": 5717, "tid": 6759, "ts": 6302685378470.648, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378472.458, "dur": 18.170, + "args": { + "External id": 128527,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378492.088, "dur": 11.750, + "args": { + "External id": 128528,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685378507.598, "dur": 24.899, + "args": { + "External id": 128529,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685378535.548, "dur": 3.289, + "args": { + "External id": 128530,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1553 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378552.437, "dur": 7.600, + "args": { + "External id": 128531,"Record function id": 0, "Ev Idx": 1554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685378554.497, "dur": 4.340, + "args": { + "External id": 128532,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685378555.997, "dur": 2.371, + "args": { + "External id": 128533,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685378556.837, "dur": 1.340, + "args": { + "External id": 128534,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378563.808, "dur": 41.280, + "args": { + "External id": 128535,"Record function id": 0, "Sequence number": 2575873, "Fwd thread id": 1, "Ev Idx": 1558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378565.077, "dur": 32.100, + "args": { + "External id": 128536,"Sequence number": 2575873, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1559 + } + }, + { + "ph": "f", "id": 193, "pid": 5717, "tid": 6759, "ts": 6302685378565.077, 
+ "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685378566.528, "dur": 30.300, + "args": { + "External id": 128537,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685378567.537, "dur": 29.000, + "args": { + "External id": 128538,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378570.637, "dur": 5.480, + "args": { + "External id": 128539,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685378577.988, "dur": 17.800, + "args": { + "External id": 128540,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 
6302685378615.097, "dur": 86.260, + "args": { + "External id": 128541,"Record function id": 0, "Sequence number": 2575872, "Fwd thread id": 1, "Ev Idx": 1564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378617.848, "dur": 48.639, + "args": { + "External id": 128542,"Sequence number": 2575872, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1565 + } + }, + { + "ph": "f", "id": 194, "pid": 5717, "tid": 6759, "ts": 6302685378617.848, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378621.448, "dur": 23.189, + "args": { + "External id": 128543,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378647.237, "dur": 18.830, + "args": { + "External id": 128544,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685378671.707, "dur": 22.940, + "args": { + "External id": 128545,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378713.567, "dur": 83.160, + "args": { + 
"External id": 128546,"Record function id": 0, "Sequence number": 2575871, "Fwd thread id": 1, "Ev Idx": 1569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378715.197, "dur": 76.570, + "args": { + "External id": 128547,"Sequence number": 2575871, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1570 + } + }, + { + "ph": "f", "id": 195, "pid": 5717, "tid": 6759, "ts": 6302685378715.197, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685378718.757, "dur": 29.240, + "args": { + "External id": 128548,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685378721.437, "dur": 0.440, + "args": { + "External id": 128549,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685378723.977, "dur": 0.220, + "args": { + "External id": 128550,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378750.797, "dur": 26.800, + "args": { + "External id": 128551,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": 
[[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378755.037, "dur": 21.630, + "args": { + "External id": 128552,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378779.657, "dur": 10.080, + "args": { + "External id": 128553,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685378804.127, "dur": 3.580, + "args": { + "External id": 128554,"Record function id": 0, "Sequence number": 2575870, "Fwd thread id": 1, "Ev Idx": 1577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685378805.547, "dur": 0.460, + "args": { + "External id": 128555,"Sequence number": 2575870, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1578 + } + }, + { + "ph": "f", "id": 196, "pid": 5717, "tid": 6759, "ts": 6302685378805.547, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685378811.217, "dur": 34.960, + "args": { + "External id": 128556,"Record function id": 0, "Sequence number": 2575869, "Fwd thread id": 1, "Ev Idx": 1579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685378812.337, "dur": 
30.060, + "args": { + "External id": 128557,"Sequence number": 2575869, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1580 + } + }, + { + "ph": "f", "id": 197, "pid": 5717, "tid": 6759, "ts": 6302685378812.337, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + "ts": 6302685378814.617, "dur": 5.230, + "args": { + "External id": 128558,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378817.077, "dur": 1.390, + "args": { + "External id": 128559,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685378820.797, "dur": 20.990, + "args": { + "External id": 128560,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685378823.387, "dur": 17.380, + "args": { + "External id": 128561,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, 
"tid": 6759, + "ts": 6302685378852.767, "dur": 82.350, + "args": { + "External id": 128562,"Record function id": 0, "Sequence number": 2575868, "Fwd thread id": 1, "Ev Idx": 1585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378855.007, "dur": 61.750, + "args": { + "External id": 128563,"Sequence number": 2575868, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1586 + } + }, + { + "ph": "f", "id": 198, "pid": 5717, "tid": 6759, "ts": 6302685378855.007, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685378856.547, "dur": 28.930, + "args": { + "External id": 128564,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685378857.727, "dur": 0.320, + "args": { + "External id": 128565,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685378858.837, "dur": 0.180, + "args": { + "External id": 128566,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685378864.427, "dur": 19.820, + "args": { + "External id": 128567,"Record 
function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378886.517, "dur": 16.860, + "args": { + "External id": 128568,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378888.947, "dur": 13.580, + "args": { + "External id": 128569,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685378904.247, "dur": 10.710, + "args": { + "External id": 128570,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685378922.087, "dur": 9.580, + "args": { + "External id": 128571,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378943.247, "dur": 50.309, + "args": { + "External id": 128572,"Record function id": 0, "Sequence number": 2575867, "Fwd thread id": 1, "Ev Idx": 
1595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685378945.037, "dur": 24.370, + "args": { + "External id": 128573,"Sequence number": 2575867, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1596 + } + }, + { + "ph": "f", "id": 199, "pid": 5717, "tid": 6759, "ts": 6302685378945.037, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685378947.387, "dur": 21.669, + "args": { + "External id": 128574,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685378948.287, "dur": 20.529, + "args": { + "External id": 128575,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685378951.467, "dur": 4.900, + "args": { + "External id": 128576,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 
6759, + "ts": 6302685378957.327, "dur": 10.920, + "args": { + "External id": 128577,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685378973.896, "dur": 13.420, + "args": { + "External id": 128578,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379000.996, "dur": 18.320, + "args": { + "External id": 128579,"Record function id": 0, "Sequence number": 2575866, "Fwd thread id": 1, "Ev Idx": 1602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379002.747, "dur": 0.969, + "args": { + "External id": 128580,"Sequence number": 2575866, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1603 + } + }, + { + "ph": "f", "id": 200, "pid": 5717, "tid": 6759, "ts": 6302685379002.747, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685379006.067, "dur": 10.720, + "args": { + "External id": 128581,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1604 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379024.136, "dur": 9.551, + "args": { + "External id": 128582,"Record function id": 0, "Sequence number": 2575865, "Fwd thread id": 1, "Ev Idx": 1605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379025.547, "dur": 6.020, + "args": { + "External id": 128583,"Sequence number": 2575865, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1606 + } + }, + { + "ph": "f", "id": 201, "pid": 5717, "tid": 6759, "ts": 6302685379025.547, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379027.367, "dur": 3.989, + "args": { + "External id": 128584,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379028.516, "dur": 2.620, + "args": { + "External id": 128585,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379037.436, "dur": 90.380, + "args": { + "External id": 128586,"Record function id": 0, "Sequence number": 2575864, "Fwd thread id": 1, "Ev Idx": 1609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379038.527, "dur": 83.089, + "args": { + 
"External id": 128587,"Sequence number": 2575864, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1610 + } + }, + { + "ph": "f", "id": 202, "pid": 5717, "tid": 6759, "ts": 6302685379038.527, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379042.076, "dur": 5.031, + "args": { + "External id": 128588,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379043.407, "dur": 2.900, + "args": { + "External id": 128589,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379044.856, "dur": 1.111, + "args": { + "External id": 128590,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685379048.067, "dur": 39.639, + "args": { + "External id": 128591,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379089.576, "dur": 5.200, + "args": { + "External 
id": 128592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379090.396, "dur": 3.600, + "args": { + "External id": 128593,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379092.906, "dur": 0.880, + "args": { + "External id": 128594,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379096.536, "dur": 2.620, + "args": { + "External id": 128595,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379097.356, "dur": 1.210, + "args": { + "External id": 128596,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379098.136, "dur": 0.270, + "args": { + "External id": 128597,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], 
[], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685379099.846, "dur": 20.880, + "args": { + "External id": 128598,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379136.406, "dur": 9.390, + "args": { + "External id": 128599,"Record function id": 0, "Sequence number": 2575863, "Fwd thread id": 1, "Ev Idx": 1622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379138.226, "dur": 5.310, + "args": { + "External id": 128600,"Sequence number": 2575863, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1623 + } + }, + { + "ph": "f", "id": 203, "pid": 5717, "tid": 6759, "ts": 6302685379138.226, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379140.146, "dur": 3.170, + "args": { + "External id": 128601,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379141.076, "dur": 2.040, + "args": { + "External id": 128602,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1625 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379149.186, "dur": 8.940, + "args": { + "External id": 128603,"Record function id": 0, "Sequence number": 2575862, "Fwd thread id": 1, "Ev Idx": 1626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379150.326, "dur": 5.850, + "args": { + "External id": 128604,"Sequence number": 2575862, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1627 + } + }, + { + "ph": "f", "id": 204, "pid": 5717, "tid": 6759, "ts": 6302685379150.326, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379151.176, "dur": 4.810, + "args": { + "External id": 128605,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379153.286, "dur": 2.110, + "args": { + "External id": 128606,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379154.566, "dur": 0.540, + "args": { + "External id": 128607,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685379163.246, "dur": 7.540, + "args": { + "External id": 128608,"Record function id": 0, "Ev Idx": 1631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685379164.906, "dur": 4.740, + "args": { + "External id": 128609,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685379166.506, "dur": 2.680, + "args": { + "External id": 128610,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685379167.146, "dur": 1.880, + "args": { + "External id": 128611,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379174.206, "dur": 7.410, + "args": { + "External id": 128612,"Record function id": 0, "Sequence number": 2575861, "Fwd thread id": 1, "Ev Idx": 1635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379175.486, "dur": 4.110, + "args": { + "External id": 128613,"Sequence number": 2575861, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1636 + } + }, + { + "ph": "f", "id": 205, "pid": 5717, "tid": 6759, "ts": 6302685379175.486, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379176.496, "dur": 2.960, + "args": { + "External id": 128614,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379177.246, "dur": 2.020, + "args": { + "External id": 128615,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685379186.006, "dur": 225.420, + "args": { + "External id": 128616,"Record function id": 0, "Sequence number": 2575860, "Fwd thread id": 1, "Ev Idx": 1639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5717, "tid": 6759, + "ts": 6302685379187.476, "dur": 205.170, + "args": { + "External id": 128617,"Sequence number": 2575860, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1640 + } + }, + { + "ph": "f", "id": 206, "pid": 5717, "tid": 6759, "ts": 6302685379187.476, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685379199.236, "dur": 8.820, + "args": { + "External id": 128618,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], 
[], []], "Ev Idx": 1641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379201.116, "dur": 6.390, + "args": { + "External id": 128619,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685379210.166, "dur": 4.840, + "args": { + "External id": 128620,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379211.416, "dur": 3.360, + "args": { + "External id": 128621,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685379216.416, "dur": 4.500, + "args": { + "External id": 128622,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379217.516, "dur": 3.180, + "args": { + 
"External id": 128623,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685379236.546, "dur": 130.640, + "args": { + "External id": 128624,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685379287.536, "dur": 5.620, + "args": { + "External id": 128625,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685379294.986, "dur": 11.280, + "args": { + "External id": 128626,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685379378.766, "dur": 3.120, + "args": { + "External id": 128627,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685379386.116, "dur": 0.660, + "args": { + "External id": 128628,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 6759, + "ts": 6302685379389.606, "dur": 0.510, + "args": { + "External id": 128629,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685379421.686, "dur": 172.859, + "args": { + "External id": 128630,"Record function id": 0, "Sequence number": 2575859, "Fwd thread id": 1, "Ev Idx": 1653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685379423.855, "dur": 162.770, + "args": { + "External id": 128631,"Sequence number": 2575859, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1654 + } + }, + { 
+ "ph": "f", "id": 207, "pid": 5717, "tid": 6759, "ts": 6302685379423.855, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685379438.766, "dur": 37.069, + "args": { + "External id": 128632,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379442.275, "dur": 6.480, + "args": { + "External id": 128633,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685379449.995, "dur": 25.280, + "args": { + "External id": 128634,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685379483.646, "dur": 6.880, + "args": { + "External id": 128635,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379485.506, "dur": 4.620, + "args": { + "External id": 128636,"Record 
function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685379605.555, "dur": 153.650, + "args": { + "External id": 128637,"Record function id": 0, "Sequence number": 2575858, "Fwd thread id": 1, "Ev Idx": 1660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685379608.015, "dur": 143.280, + "args": { + "External id": 128638,"Sequence number": 2575858, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1661 + } + }, + { + "ph": "f", "id": 208, "pid": 5717, "tid": 6759, "ts": 6302685379608.015, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685379621.015, "dur": 29.760, + "args": { + "External id": 128639,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379624.315, "dur": 6.180, + "args": { + "External id": 128640,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1663 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685379631.475, "dur": 18.770, + "args": { + "External id": 128641,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685379658.685, "dur": 7.110, + "args": { + "External id": 128642,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379660.755, "dur": 4.580, + "args": { + "External id": 128643,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379770.215, "dur": 13.010, + "args": { + "External id": 128644,"Record function id": 0, "Sequence number": 2575857, "Fwd thread id": 1, "Ev Idx": 1667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379772.645, "dur": 7.780, + "args": { + "External id": 128645,"Sequence number": 2575857, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 
1668 + } + }, + { + "ph": "f", "id": 209, "pid": 5717, "tid": 6759, "ts": 6302685379772.645, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379775.225, "dur": 4.920, + "args": { + "External id": 128646,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379776.285, "dur": 3.600, + "args": { + "External id": 128647,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379787.285, "dur": 6.960, + "args": { + "External id": 128648,"Record function id": 0, "Sequence number": 2575856, "Fwd thread id": 1, "Ev Idx": 1671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379788.595, "dur": 3.210, + "args": { + "External id": 128649,"Sequence number": 2575856, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1672 + } + }, + { + "ph": "f", "id": 210, "pid": 5717, "tid": 6759, "ts": 6302685379788.595, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379789.635, "dur": 2.000, + "args": { + "External id": 128650,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379790.595, "dur": 0.890, + "args": { + "External id": 128651,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379797.565, "dur": 5.830, + "args": { + "External id": 128652,"Record function id": 0, "Sequence number": 2575855, "Fwd thread id": 1, "Ev Idx": 1675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379798.695, "dur": 2.580, + "args": { + "External id": 128653,"Sequence number": 2575855, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1676 + } + }, + { + "ph": "f", "id": 211, "pid": 5717, "tid": 6759, "ts": 6302685379798.695, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379799.615, "dur": 1.540, + "args": { + "External id": 128654,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379800.235, "dur": 0.770, + "args": { + "External id": 128655,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379806.895, "dur": 6.110, + "args": { + "External id": 128656,"Record function id": 0, "Sequence number": 2575854, "Fwd thread id": 1, "Ev Idx": 1679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379808.025, "dur": 2.830, + "args": { + "External id": 128657,"Sequence number": 2575854, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1680 + } + }, + { + "ph": "f", "id": 212, "pid": 5717, "tid": 6759, "ts": 6302685379808.025, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379809.045, "dur": 1.660, + "args": { + "External id": 128658,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379809.745, "dur": 0.810, + "args": { + "External id": 128659,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379816.425, "dur": 96.980, + "args": { + "External id": 128660,"Record function id": 0, "Sequence number": 2575853, "Fwd thread id": 1, "Ev Idx": 1683 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379817.655, "dur": 87.270, + "args": { + "External id": 128661,"Sequence number": 2575853, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1684 + } + }, + { + "ph": "f", "id": 213, "pid": 5717, "tid": 6759, "ts": 6302685379817.655, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379820.375, "dur": 5.840, + "args": { + "External id": 128662,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379821.625, "dur": 3.980, + "args": { + "External id": 128663,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379823.135, "dur": 2.110, + "args": { + "External id": 128664,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685379827.785, "dur": 41.849, + "args": { + "External id": 128665,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1688 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379871.685, "dur": 4.169, + "args": { + "External id": 128666,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379872.565, "dur": 2.569, + "args": { + "External id": 128667,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379874.134, "dur": 0.791, + "args": { + "External id": 128668,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379879.305, "dur": 3.480, + "args": { + "External id": 128669,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379880.114, "dur": 2.220, + "args": { + "External id": 128670,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379881.894, "dur": 0.291, + "args": { + "External id": 128671,"Record function id": 0, "Concrete Inputs": ["", "[768, 
768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685379883.465, "dur": 20.489, + "args": { + "External id": 128672,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379922.274, "dur": 9.451, + "args": { + "External id": 128673,"Record function id": 0, "Sequence number": 2575852, "Fwd thread id": 1, "Ev Idx": 1696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379924.185, "dur": 5.620, + "args": { + "External id": 128674,"Sequence number": 2575852, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1697 + } + }, + { + "ph": "f", "id": 214, "pid": 5717, "tid": 6759, "ts": 6302685379924.185, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379926.425, "dur": 3.229, + "args": { + "External id": 128675,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379927.265, "dur": 2.149, + "args": { + "External id": 128676,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379935.425, "dur": 8.980, + "args": { + "External id": 128677,"Record function id": 0, "Sequence number": 2575851, "Fwd thread id": 1, "Ev Idx": 1700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379936.585, "dur": 5.600, + "args": { + "External id": 128678,"Sequence number": 2575851, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1701 + } + }, + { + "ph": "f", "id": 215, "pid": 5717, "tid": 6759, "ts": 6302685379936.585, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379937.385, "dur": 4.600, + "args": { + "External id": 128679,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379938.214, "dur": 3.200, + "args": { + "External id": 128680,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379940.425, "dur": 0.689, + "args": { + "External id": 128681,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 
1704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685379949.474, "dur": 8.670, + "args": { + "External id": 128682,"Record function id": 0, "Ev Idx": 1705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685379951.464, "dur": 5.490, + "args": { + "External id": 128683,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685379953.164, "dur": 3.350, + "args": { + "External id": 128684,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685379953.814, "dur": 2.510, + "args": { + "External id": 128685,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379961.774, "dur": 6.600, + "args": { + "External id": 128686,"Record function id": 0, "Sequence number": 2575850, "Fwd thread id": 1, "Ev Idx": 1709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379962.964, "dur": 2.960, + "args": { + "External id": 128687,"Sequence number": 2575850, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1710 + } + }, + { + "ph": "f", "id": 216, "pid": 
5717, "tid": 6759, "ts": 6302685379962.964, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685379964.024, "dur": 1.770, + "args": { + "External id": 128688,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685379964.604, "dur": 1.020, + "args": { + "External id": 128689,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379972.194, "dur": 81.230, + "args": { + "External id": 128690,"Record function id": 0, "Sequence number": 2575849, "Fwd thread id": 1, "Ev Idx": 1713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685379974.134, "dur": 72.380, + "args": { + "External id": 128691,"Sequence number": 2575849, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1714 + } + }, + { + "ph": "f", "id": 217, "pid": 5717, "tid": 6759, "ts": 6302685379974.134, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685379976.194, "dur": 2.860, + "args": { + "External id": 128692,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1715 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685379976.824, "dur": 1.790, + "args": { + "External id": 128693,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685379977.804, "dur": 0.620, + "args": { + "External id": 128694,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685379979.834, "dur": 33.970, + "args": { + "External id": 128695,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380015.534, "dur": 5.020, + "args": { + "External id": 128696,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380016.374, "dur": 3.470, + "args": { + "External id": 128697,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380018.874, "dur": 0.780, + "args": { + "External id": 
128698,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380022.094, "dur": 2.800, + "args": { + "External id": 128699,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380022.904, "dur": 1.430, + "args": { + "External id": 128700,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380023.904, "dur": 0.290, + "args": { + "External id": 128701,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685380025.574, "dur": 20.150, + "args": { + "External id": 128702,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380061.934, "dur": 40.510, + "args": { + "External id": 128703,"Record function id": 0, "Sequence number": 2575848, "Fwd thread 
id": 1, "Ev Idx": 1726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380063.814, "dur": 5.180, + "args": { + "External id": 128704,"Sequence number": 2575848, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1727 + } + }, + { + "ph": "f", "id": 218, "pid": 5717, "tid": 6759, "ts": 6302685380063.814, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685380065.614, "dur": 3.220, + "args": { + "External id": 128705,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685380066.444, "dur": 2.190, + "args": { + "External id": 128706,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 6759, + "ts": 6302685380071.524, "dur": 26.630, + "args": { + "External id": 128707,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380109.654, "dur": 13.800, + "args": { + "External id": 128708,"Record function id": 0, "Sequence number": 2575847, "Fwd thread id": 1, "Ev Idx": 1731 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380112.984, "dur": 7.730, + "args": { + "External id": 128709,"Sequence number": 2575847, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1732 + } + }, + { + "ph": "f", "id": 219, "pid": 5717, "tid": 6759, "ts": 6302685380112.984, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380113.964, "dur": 6.510, + "args": { + "External id": 128710,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380116.534, "dur": 3.020, + "args": { + "External id": 128711,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380118.424, "dur": 0.850, + "args": { + "External id": 128712,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685380128.584, "dur": 8.890, + "args": { + "External id": 128713,"Record function id": 0, "Ev Idx": 1736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685380131.604, "dur": 
4.670, + "args": { + "External id": 128714,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685380133.164, "dur": 2.620, + "args": { + "External id": 128715,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685380133.854, "dur": 1.730, + "args": { + "External id": 128716,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380141.154, "dur": 8.300, + "args": { + "External id": 128717,"Record function id": 0, "Sequence number": 2575846, "Fwd thread id": 1, "Ev Idx": 1740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380142.584, "dur": 4.810, + "args": { + "External id": 128718,"Sequence number": 2575846, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1741 + } + }, + { + "ph": "f", "id": 220, "pid": 5717, "tid": 6759, "ts": 6302685380142.584, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685380143.834, "dur": 3.390, + "args": { + "External id": 128719,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], 
"Input Dims": [[8, 2048, 768], []], "Ev Idx": 1742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685380145.654, "dur": 1.370, + "args": { + "External id": 128720,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380153.084, "dur": 87.620, + "args": { + "External id": 128721,"Record function id": 0, "Sequence number": 2575845, "Fwd thread id": 1, "Ev Idx": 1744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380154.224, "dur": 76.860, + "args": { + "External id": 128722,"Sequence number": 2575845, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1745 + } + }, + { + "ph": "f", "id": 221, "pid": 5717, "tid": 6759, "ts": 6302685380154.224, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380156.194, "dur": 2.830, + "args": { + "External id": 128723,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380156.814, "dur": 1.810, + "args": { + "External id": 128724,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5717, "tid": 6759, + "ts": 6302685380157.804, "dur": 0.620, + "args": { + "External id": 128725,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685380159.914, "dur": 36.970, + "args": { + "External id": 128726,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380199.644, "dur": 4.230, + "args": { + "External id": 128727,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380200.604, "dur": 2.560, + "args": { + "External id": 128728,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380202.154, "dur": 0.810, + "args": { + "External id": 128729,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380205.644, "dur": 2.970, + "args": { + "External id": 128730,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380206.704, "dur": 1.440, + "args": { + "External id": 128731,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380207.554, "dur": 0.440, + "args": { + "External id": 128732,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685380209.314, "dur": 20.870, + "args": { + "External id": 128733,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380249.254, "dur": 28.310, + "args": { + "External id": 128734,"Record function id": 0, "Sequence number": 2575844, "Fwd thread id": 1, "Ev Idx": 1757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380251.014, "dur": 6.050, + "args": { + "External id": 128735,"Sequence number": 2575844, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1758 + } + }, + { + "ph": 
"f", "id": 222, "pid": 5717, "tid": 6759, "ts": 6302685380251.014, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 6759, + "ts": 6302685380252.974, "dur": 3.930, + "args": { + "External id": 128736,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685380254.614, "dur": 2.080, + "args": { + "External id": 128737,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685380259.704, "dur": 14.320, + "args": { + "External id": 128738,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380283.224, "dur": 10.109, + "args": { + "External id": 128739,"Record function id": 0, "Sequence number": 2575843, "Fwd thread id": 1, "Ev Idx": 1762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380284.874, "dur": 6.000, + "args": { + "External id": 128740,"Sequence number": 2575843, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1763 + } + }, + { + "ph": "f", "id": 223, "pid": 5717, "tid": 6759, "ts": 
6302685380284.874, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 6759, + "ts": 6302685380285.714, "dur": 4.930, + "args": { + "External id": 128741,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 6759, + "ts": 6302685380286.774, "dur": 2.980, + "args": { + "External id": 128742,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380288.454, "dur": 0.980, + "args": { + "External id": 128743,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685380318.293, "dur": 8.051, + "args": { + "External id": 128744,"Record function id": 0, "Ev Idx": 1767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685380320.333, "dur": 4.851, + "args": { + "External id": 128745,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685380321.944, "dur": 2.769, + "args": { + "External id": 128746,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685380322.744, "dur": 1.729, + "args": { + "External id": 128747,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380330.333, "dur": 76.520, + "args": { + "External id": 128748,"Record function id": 0, "Sequence number": 2575842, "Fwd thread id": 1, "Ev Idx": 1771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380332.464, "dur": 35.100, + "args": { + "External id": 128749,"Sequence number": 2575842, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1772 + } + }, + { + "ph": "f", "id": 224, "pid": 5717, "tid": 6759, "ts": 6302685380332.464, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380334.393, "dur": 19.040, + "args": { + "External id": 128750,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380355.113, "dur": 11.951, + "args": { + "External id": 128751,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 
1774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685380370.904, "dur": 25.189, + "args": { + "External id": 128752,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685380398.993, "dur": 2.110, + "args": { + "External id": 128753,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685380415.643, "dur": 7.730, + "args": { + "External id": 128754,"Record function id": 0, "Ev Idx": 1777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685380417.763, "dur": 4.470, + "args": { + "External id": 128755,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685380419.263, "dur": 2.490, + "args": { + "External id": 128756,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685380420.063, "dur": 1.480, + "args": { + "External id": 128757,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1780 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380427.043, "dur": 32.650, + "args": { + "External id": 128758,"Record function id": 0, "Sequence number": 2575841, "Fwd thread id": 1, "Ev Idx": 1781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380428.283, "dur": 27.980, + "args": { + "External id": 128759,"Sequence number": 2575841, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1782 + } + }, + { + "ph": "f", "id": 225, "pid": 5717, "tid": 6759, "ts": 6302685380428.283, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685380429.803, "dur": 26.040, + "args": { + "External id": 128760,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685380430.813, "dur": 24.780, + "args": { + "External id": 128761,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380433.853, "dur": 5.280, + "args": { + "External id": 128762,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 
768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685380440.353, "dur": 14.580, + "args": { + "External id": 128763,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380465.533, "dur": 63.360, + "args": { + "External id": 128764,"Record function id": 0, "Sequence number": 2575840, "Fwd thread id": 1, "Ev Idx": 1787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380467.113, "dur": 40.610, + "args": { + "External id": 128765,"Sequence number": 2575840, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1788 + } + }, + { + "ph": "f", "id": 226, "pid": 5717, "tid": 6759, "ts": 6302685380467.113, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380469.353, "dur": 14.320, + "args": { + "External id": 128766,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380485.223, "dur": 22.080, + "args": { + "External id": 128767,"Record 
function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 6759, + "ts": 6302685380510.743, "dur": 13.890, + "args": { + "External id": 128768,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380535.683, "dur": 55.830, + "args": { + "External id": 128769,"Record function id": 0, "Sequence number": 2575839, "Fwd thread id": 1, "Ev Idx": 1792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380537.253, "dur": 49.690, + "args": { + "External id": 128770,"Sequence number": 2575839, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1793 + } + }, + { + "ph": "f", "id": 227, "pid": 5717, "tid": 6759, "ts": 6302685380537.253, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685380539.613, "dur": 17.850, + "args": { + "External id": 128771,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685380541.323, "dur": 0.430, + "args": { + "External id": 128772,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", 
"Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685380542.763, "dur": 0.250, + "args": { + "External id": 128773,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380558.773, "dur": 16.420, + "args": { + "External id": 128774,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380561.953, "dur": 12.400, + "args": { + "External id": 128775,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380576.053, "dur": 9.170, + "args": { + "External id": 128776,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685380598.343, "dur": 4.100, + "args": { + "External id": 128777,"Record function id": 0, "Sequence number": 2575838, "Fwd thread id": 1, "Ev Idx": 1800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685380599.923, "dur": 0.460, + "args": 
{ + "External id": 128778,"Sequence number": 2575838, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1801 + } + }, + { + "ph": "f", "id": 228, "pid": 5717, "tid": 6759, "ts": 6302685380599.923, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685380605.833, "dur": 34.690, + "args": { + "External id": 128779,"Record function id": 0, "Sequence number": 2575837, "Fwd thread id": 1, "Ev Idx": 1802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5717, "tid": 6759, + "ts": 6302685380607.993, "dur": 28.970, + "args": { + "External id": 128780,"Sequence number": 2575837, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1803 + } + }, + { + "ph": "f", "id": 229, "pid": 5717, "tid": 6759, "ts": 6302685380607.993, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 6759, + "ts": 6302685380610.153, "dur": 5.220, + "args": { + "External id": 128781,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380612.603, "dur": 1.350, + "args": { + "External id": 128782,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1805 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685380616.273, "dur": 20.060, + "args": { + "External id": 128783,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 6759, + "ts": 6302685380618.843, "dur": 16.460, + "args": { + "External id": 128784,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380647.133, "dur": 85.300, + "args": { + "External id": 128785,"Record function id": 0, "Sequence number": 2575836, "Fwd thread id": 1, "Ev Idx": 1808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380648.613, "dur": 61.300, + "args": { + "External id": 128786,"Sequence number": 2575836, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1809 + } + }, + { + "ph": "f", "id": 230, "pid": 5717, "tid": 6759, "ts": 6302685380648.613, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 6759, + "ts": 6302685380650.233, "dur": 27.970, + "args": { + "External id": 128787,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 6759, + "ts": 6302685380651.303, "dur": 0.280, + "args": { + "External id": 
128788,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685380652.423, "dur": 0.150, + "args": { + "External id": 128789,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685380657.743, "dur": 19.240, + "args": { + "External id": 128790,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380679.193, "dur": 18.500, + "args": { + "External id": 128791,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380682.853, "dur": 13.990, + "args": { + "External id": 128792,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 6759, + "ts": 6302685380698.563, "dur": 9.810, + "args": { + "External id": 128793,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 
768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685380715.253, "dur": 13.130, + "args": { + "External id": 128794,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380740.812, "dur": 72.520, + "args": { + "External id": 128795,"Record function id": 0, "Sequence number": 2575835, "Fwd thread id": 1, "Ev Idx": 1818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5717, "tid": 6759, + "ts": 6302685380744.063, "dur": 39.789, + "args": { + "External id": 128796,"Sequence number": 2575835, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1819 + } + }, + { + "ph": "f", "id": 231, "pid": 5717, "tid": 6759, "ts": 6302685380744.063, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685380746.743, "dur": 36.769, + "args": { + "External id": 128797,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685380747.783, "dur": 35.480, + "args": { + "External id": 128798,"Record function id": 0, "Concrete Inputs": 
["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685380753.363, "dur": 9.429, + "args": { + "External id": 128799,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685380763.943, "dur": 18.709, + "args": { + "External id": 128800,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685380793.352, "dur": 14.431, + "args": { + "External id": 128801,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685380824.992, "dur": 1404.767, + "args": { + "External id": 128802,"Record function id": 0, "Ev Idx": 1825 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.9)", "pid": 5717, "tid": 6759, + "ts": 6302685380862.962, "dur": 811.228, + "args": { + "External id": 
128803,"Record function id": 0, "Ev Idx": 1826 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.8", "pid": 5717, "tid": 6759, + "ts": 6302685380897.152, "dur": 765.229, + "args": { + "External id": 128804,"Record function id": 0, "Ev Idx": 1827 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 5717, "tid": 6759, + "ts": 6302685380920.942, "dur": 721.408, + "args": { + "External id": 128805,"Record function id": 0, "Ev Idx": 1828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685381045.602, "dur": 7.850, + "args": { + "External id": 128806,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685381069.742, "dur": 23.730, + "args": { + "External id": 128807,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381074.632, "dur": 1.120, + "args": { + "External id": 128808,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381077.962, "dur": 0.260, + "args": { + "External id": 128809,"Record function 
id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381079.702, "dur": 0.260, + "args": { + "External id": 128810,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381081.122, "dur": 0.250, + "args": { + "External id": 128811,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381082.432, "dur": 0.360, + "args": { + "External id": 128812,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381084.102, "dur": 0.250, + "args": { + "External id": 128813,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381085.512, 
"dur": 0.900, + "args": { + "External id": 128814,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381087.882, "dur": 0.290, + "args": { + "External id": 128815,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381089.302, "dur": 0.250, + "args": { + "External id": 128816,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685381104.822, "dur": 30.020, + "args": { + "External id": 128817,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 1840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685381184.562, "dur": 103.009, + "args": { + "External id": 128818,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 
147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 1841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685381196.151, "dur": 6.520, + "args": { + "External id": 128819,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685381208.702, "dur": 11.080, + "args": { + "External id": 128820,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 1843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685381212.602, "dur": 6.749, + "args": { + "External id": 128821,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 1844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381216.802, "dur": 0.689, + "args": { + "External id": 128822,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[7079424], [], [], []], "Ev Idx": 1845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685381227.591, "dur": 18.031, + "args": { + "External id": 128823,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381229.451, "dur": 0.311, + "args": { + "External id": 128824,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381231.151, "dur": 0.280, + "args": { + "External id": 128825,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381232.711, "dur": 0.160, + "args": { + "External id": 128826,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381234.071, "dur": 1.191, + "args": { + "External id": 128827,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381236.471, "dur": 0.251, + "args": { + "External id": 128828,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381237.862, "dur": 0.260, + "args": { + "External id": 128829,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381239.331, "dur": 0.260, + "args": { + "External id": 128830,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381240.602, "dur": 0.249, + "args": { + "External id": 128831,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685381242.002, "dur": 0.249, + "args": { + "External id": 128832,"Record 
function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685381259.922, "dur": 19.379, + "args": { + "External id": 128833,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 1856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685381381.471, "dur": 169.210, + "args": { + "External id": 128834,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 1857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685381419.491, "dur": 125.240, + "args": { + "External id": 128835,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 
1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 1858, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685381439.261, "dur": 96.630, + "args": { + "External id": 128836,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 1859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685381569.911, "dur": 4.110, + "args": { + "External id": 128837,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 1860, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685381679.841, "dur": 533.768, + "args": { + "External id": 128838,"Sequence number": 2575834, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1861 + } + }, + { + "ph": "f", "id": 232, "pid": 5717, "tid": 6759, "ts": 6302685381679.841, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685381779.480, "dur": 34.980, + "args": { + "External id": 128839,"Record function id": 0, "Concrete Inputs": ["", "", ""], 
"Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 1862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685381848.880, "dur": 28.320, + "args": { + "External id": 128840,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 1863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685381895.670, "dur": 35.780, + "args": { + "External id": 128841,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 1864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685381944.510, "dur": 25.380, + "args": { + "External id": 128842,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 1865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685381980.100, "dur": 19.670, + "args": { + "External id": 128843,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 1866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685382011.080, "dur": 23.270, + "args": { + "External id": 128844,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 1867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685382045.160, "dur": 18.629, + "args": { + "External id": 128845,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 1868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685382087.640, "dur": 25.840, + "args": { + "External id": 128846,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 
768, 128], [], []], "Ev Idx": 1869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685382131.569, "dur": 16.951, + "args": { + "External id": 128847,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 1870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685382166.379, "dur": 22.360, + "args": { + "External id": 128848,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 1871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382246.159, "dur": 21.710, + "args": { + "External id": 128849,"Record function id": 0, "Ev Idx": 1872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, 
"tid": 6759, + "ts": 6302685382251.809, "dur": 13.040, + "args": { + "External id": 128850,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685382255.619, "dur": 8.260, + "args": { + "External id": 128851,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685382257.859, "dur": 5.790, + "args": { + "External id": 128852,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382274.189, "dur": 10.670, + "args": { + "External id": 128853,"Record function id": 0, "Ev Idx": 1876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382278.139, "dur": 5.600, + "args": { + "External id": 128854,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685382279.979, "dur": 3.180, + "args": { + "External id": 128855,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685382280.659, "dur": 1.190, + "args": { + "External id": 128856,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382291.289, "dur": 17.480, + "args": { + "External id": 128857,"Record function id": 0, "Ev Idx": 1880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382294.049, "dur": 13.420, + "args": { + "External id": 128858,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685382295.889, "dur": 10.920, + "args": { + "External id": 128859,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685382305.569, "dur": 0.990, + "args": { + "External id": 128860,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382313.139, "dur": 5.260, + "args": { + "External id": 128861,"Record function id": 0, "Ev Idx": 1884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382314.799, "dur": 2.620, + "args": { + "External id": 128862,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1885 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685382315.509, "dur": 1.420, + "args": { + "External id": 128863,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685382315.909, "dur": 0.860, + "args": { + "External id": 128864,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685382322.349, "dur": 369.899, + "args": { + "External id": 128865,"Record function id": 0, "Sequence number": 2575833, "Fwd thread id": 1, "Ev Idx": 1888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685382323.839, "dur": 360.219, + "args": { + "External id": 128866,"Sequence number": 2575833, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1889 + } + }, + { + "ph": "f", "id": 233, "pid": 5717, "tid": 6759, "ts": 6302685382323.839, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685382429.649, "dur": 40.080, + "args": { + "External id": 128867,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 
6302685382483.089, "dur": 18.790, + "args": { + "External id": 128868,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685382531.119, "dur": 126.859, + "args": { + "External id": 128869,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685382585.739, "dur": 6.209, + "args": { + "External id": 128870,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685382593.799, "dur": 3.820, + "args": { + "External id": 128871,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382705.118, "dur": 11.220, + "args": { + "External id": 128872,"Record function id": 0, "Ev Idx": 1895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685382707.988, "dur": 7.070, + "args": { + "External id": 128873,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685382710.528, "dur": 3.670, + "args": { + "External id": 128874,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685382711.658, "dur": 2.350, + "args": { + "External id": 128875,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685382720.558, "dur": 168.930, + "args": { + "External id": 128876,"Record function id": 0, "Sequence number": 2575832, "Fwd thread id": 1, "Ev Idx": 1899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685382722.188, "dur": 160.980, + "args": { + "External id": 128877,"Sequence 
number": 2575832, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1900 + } + }, + { + "ph": "f", "id": 234, "pid": 5717, "tid": 6759, "ts": 6302685382722.188, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685382736.638, "dur": 34.930, + "args": { + "External id": 128878,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685382739.758, "dur": 6.260, + "args": { + "External id": 128879,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685382747.168, "dur": 23.820, + "args": { + "External id": 128880,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685382780.108, "dur": 7.080, + "args": { + "External id": 128881,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 
64], [], [], [], [], []], "Ev Idx": 1904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685382782.078, "dur": 4.670, + "args": { + "External id": 128882,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685382900.588, "dur": 153.489, + "args": { + "External id": 128883,"Record function id": 0, "Sequence number": 2575831, "Fwd thread id": 1, "Ev Idx": 1906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685382903.008, "dur": 143.929, + "args": { + "External id": 128884,"Sequence number": 2575831, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1907 + } + }, + { + "ph": "f", "id": 235, "pid": 5717, "tid": 6759, "ts": 6302685382903.008, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685382915.808, "dur": 30.319, + "args": { + "External id": 128885,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685382918.928, "dur": 6.220, + "args": { + "External id": 128886,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", 
"15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685382926.268, "dur": 19.310, + "args": { + "External id": 128887,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685382953.998, "dur": 6.640, + "args": { + "External id": 128888,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685382955.718, "dur": 4.480, + "args": { + "External id": 128889,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685383065.317, "dur": 411.120, + "args": { + "External id": 128890,"Record function id": 0, "Sequence number": 2575830, "Fwd thread id": 1, "Ev Idx": 1913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685383070.797, "dur": 392.209, + 
"args": { + "External id": 128891,"Sequence number": 2575830, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 1914 + } + }, + { + "ph": "f", "id": 236, "pid": 5717, "tid": 6759, "ts": 6302685383070.797, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685383152.957, "dur": 39.050, + "args": { + "External id": 128892,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685383208.797, "dur": 36.800, + "args": { + "External id": 128893,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685383259.007, "dur": 31.060, + "args": { + "External id": 128894,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685383315.157, "dur": 28.280, + "args": { + "External id": 128895,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685383357.577, "dur": 16.020, + "args": { + "External id": 128896,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685383382.147, "dur": 14.659, + "args": { + "External id": 128897,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685383418.277, "dur": 24.720, + "args": { + "External id": 128898,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 1921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383495.976, "dur": 18.450, + "args": { + "External id": 128899,"Record function id": 0, "Ev Idx": 1922 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383500.066, "dur": 11.800, + "args": { + "External id": 128900,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685383504.946, "dur": 6.200, + "args": { + "External id": 128901,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685383506.206, "dur": 4.740, + "args": { + "External id": 128902,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383521.366, "dur": 9.040, + "args": { + "External id": 128903,"Record function id": 0, "Ev Idx": 1926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383524.286, "dur": 3.910, + "args": { + "External id": 128904,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685383524.976, "dur": 2.660, + "args": { + "External id": 128905,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 
6302685383525.466, "dur": 1.970, + "args": { + "External id": 128906,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383536.486, "dur": 7.530, + "args": { + "External id": 128907,"Record function id": 0, "Ev Idx": 1930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383538.186, "dur": 4.720, + "args": { + "External id": 128908,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685383539.956, "dur": 1.350, + "args": { + "External id": 128909,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685383540.296, "dur": 0.840, + "args": { + "External id": 128910,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685383550.576, "dur": 280.800, + "args": { + "External id": 128911,"Record function id": 0, "Sequence number": 2575829, "Fwd thread id": 1, "Ev Idx": 1934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685383553.306, "dur": 248.330, + "args": { + "External id": 128912,"Sequence number": 2575829, 
"Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1935 + } + }, + { + "ph": "f", "id": 237, "pid": 5717, "tid": 6759, "ts": 6302685383553.306, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685383645.516, "dur": 41.390, + "args": { + "External id": 128913,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 1936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685383717.006, "dur": 26.040, + "args": { + "External id": 128914,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 1937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685383762.526, "dur": 19.830, + "args": { 
+ "External id": 128915,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 1938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685383809.516, "dur": 16.960, + "args": { + "External id": 128916,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383843.116, "dur": 11.109, + "args": { + "External id": 128917,"Record function id": 0, "Ev Idx": 1940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685383845.896, "dur": 7.000, + "args": { + "External id": 128918,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685383847.985, "dur": 4.291, + "args": { + "External id": 128919,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1942 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685383849.056, "dur": 3.009, + "args": { + "External id": 128920,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685383858.705, "dur": 929.149, + "args": { + "External id": 128921,"Record function id": 0, "Sequence number": 2575828, "Fwd thread id": 1, "Ev Idx": 1944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685383860.365, "dur": 921.458, + "args": { + "External id": 128922,"Sequence number": 2575828, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1945 + } + }, + { + "ph": "f", "id": 238, "pid": 5717, "tid": 6759, "ts": 6302685383860.365, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.9)", "pid": 5717, "tid": 6759, + "ts": 6302685383883.536, "dur": 35.000, + "args": { + "External id": 128923,"Record function id": 0, "Ev Idx": 1946 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.9)", "pid": 5717, "tid": 6759, + "ts": 6302685383927.375, "dur": 79.980, + "args": { + "External id": 128924,"Record function id": 0, "Ev Idx": 1947 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.9)", "pid": 5717, "tid": 6759, + "ts": 6302685384014.535, "dur": 761.468, + "args": { + "External id": 128925,"Record function id": 0, "Ev Idx": 1948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", 
"pid": 5717, "tid": 6759, + "ts": 6302685384062.235, "dur": 8.620, + "args": { + "External id": 128926,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384080.785, "dur": 3.200, + "args": { + "External id": 128927,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 1950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685384099.755, "dur": 104.610, + "args": { + "External id": 128928,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 1951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685384111.775, "dur": 87.130, + "args": { + "External id": 128929,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 1952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685384133.985, "dur": 2.470, + "args": { + 
"External id": 128930,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685384140.385, "dur": 31.140, + "args": { + "External id": 128931,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 1954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685384141.935, "dur": 29.260, + "args": { + "External id": 128932,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 1955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384144.215, "dur": 6.020, + "args": { + "External id": 128933,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685384151.455, "dur": 19.250, + "args": { + "External id": 128934,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 1957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", 
"pid": 5717, "tid": 6759, + "ts": 6302685384311.655, "dur": 13.009, + "args": { + "External id": 128935,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 1958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685384314.355, "dur": 9.389, + "args": { + "External id": 128936,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685384360.524, "dur": 130.800, + "args": { + "External id": 128937,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 1960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685384390.514, "dur": 97.380, + "args": { + "External id": 128938,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], 
[], [], []], "Global rank start": 0, "Ev Idx": 1961, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685384412.304, "dur": 70.660, + "args": { + "External id": 128939,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 1962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685384506.294, "dur": 3.830, + "args": { + "External id": 128940,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 1963, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384564.594, "dur": 4.680, + "args": { + "External id": 128941,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384611.314, "dur": 3.600, + "args": { + "External id": 128942,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1965 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384638.754, "dur": 1.040, + "args": { + "External id": 128943,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384654.204, "dur": 0.880, + "args": { + "External id": 128944,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384668.324, "dur": 0.850, + "args": { + "External id": 128945,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384681.624, "dur": 1.060, + "args": { + "External id": 128946,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384695.704, "dur": 0.830, + "args": { + "External id": 128947,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], 
[], []], "Ev Idx": 1970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384709.463, "dur": 1.711, + "args": { + "External id": 128948,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685384723.394, "dur": 0.789, + "args": { + "External id": 128949,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685384799.583, "dur": 1508.877, + "args": { + "External id": 128950,"Record function id": 0, "Ev Idx": 1973 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.8)", "pid": 5717, "tid": 6759, + "ts": 6302685384815.053, "dur": 985.108, + "args": { + "External id": 128951,"Record function id": 0, "Ev Idx": 1974 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 5717, "tid": 6759, + "ts": 6302685384832.083, "dur": 323.779, + "args": { + "External id": 128952,"Record function id": 0, "Ev Idx": 1975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384923.193, "dur": 3.590, + "args": { + "External id": 128953,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 1976 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384930.763, "dur": 1.770, + "args": { + "External id": 128954,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 1977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384934.313, "dur": 3.170, + "args": { + "External id": 128955,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384939.923, "dur": 0.750, + "args": { + "External id": 128956,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384942.103, "dur": 0.770, + "args": { + "External id": 128957,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384944.383, "dur": 0.780, + "args": { + "External id": 128958,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384946.603, "dur": 1.400, + "args": { + "External id": 128959,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 1982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384949.313, "dur": 0.730, + "args": { + "External id": 128960,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384951.493, "dur": 0.820, + "args": { + "External id": 128961,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685384953.493, "dur": 1.470, + "args": { + "External id": 128962,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685384977.773, "dur": 135.150, + "args": { + "External id": 128963,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 1986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685384998.273, "dur": 108.720, + "args": { + "External id": 
128964,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 1987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685385019.953, "dur": 11.090, + "args": { + "External id": 128965,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685385035.913, "dur": 37.860, + "args": { + "External id": 128966,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 1989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685385037.453, "dur": 35.870, + "args": { + "External id": 128967,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 1990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385042.023, "dur": 7.330, + "args": { + "External id": 128968,"Record function id": 0, 
"Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685385050.723, "dur": 21.910, + "args": { + "External id": 128969,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 1992 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.7", "pid": 5717, "tid": 6759, + "ts": 6302685385269.602, "dur": 522.279, + "args": { + "External id": 128970,"Record function id": 0, "Ev Idx": 1993 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 5717, "tid": 6759, + "ts": 6302685385287.812, "dur": 489.579, + "args": { + "External id": 128971,"Record function id": 0, "Ev Idx": 1994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685385362.082, "dur": 7.850, + "args": { + "External id": 128972,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685385381.362, "dur": 21.450, + "args": { + "External id": 128973,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1996 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385384.662, "dur": 1.340, + "args": { + "External id": 128974,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385388.022, "dur": 0.320, + "args": { + "External id": 128975,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385389.682, "dur": 0.280, + "args": { + "External id": 128976,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385391.152, "dur": 0.310, + "args": { + "External id": 128977,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385392.622, "dur": 1.420, + "args": { + "External id": 128978,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], 
[]], "Ev Idx": 2001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385395.172, "dur": 0.260, + "args": { + "External id": 128979,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385396.622, "dur": 0.280, + "args": { + "External id": 128980,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385398.042, "dur": 0.230, + "args": { + "External id": 128981,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385399.452, "dur": 0.250, + "args": { + "External id": 128982,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685385411.332, "dur": 24.530, + "args": { + "External id": 128983,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], 
[1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685385470.582, "dur": 96.930, + "args": { + "External id": 128984,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685385480.562, "dur": 6.560, + "args": { + "External id": 128985,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685385492.032, "dur": 12.240, + "args": { + "External id": 128986,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685385495.162, "dur": 8.720, + "args": { + "External id": 128987,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", 
"7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385501.462, "dur": 0.690, + "args": { + "External id": 128988,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685385511.492, "dur": 17.670, + "args": { + "External id": 128989,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385513.412, "dur": 0.390, + "args": { + "External id": 128990,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385515.232, "dur": 1.150, + "args": { + "External id": 128991,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385517.592, 
"dur": 0.300, + "args": { + "External id": 128992,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385519.392, "dur": 0.270, + "args": { + "External id": 128993,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385520.792, "dur": 0.280, + "args": { + "External id": 128994,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385522.472, "dur": 0.260, + "args": { + "External id": 128995,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385523.912, "dur": 0.240, + "args": { + "External id": 128996,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385525.332, "dur": 0.250, + "args": { + "External id": 128997,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685385526.812, "dur": 0.260, + "args": { + "External id": 128998,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685385540.062, "dur": 18.380, + "args": { + "External id": 128999,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685385622.412, "dur": 88.749, + "args": { + "External id": 129000,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685385638.432, "dur": 69.749, + "args": { + "External 
id": 129001,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2024, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685385650.481, "dur": 52.610, + "args": { + "External id": 129002,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685385724.831, "dur": 3.150, + "args": { + "External id": 129003,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2026, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685385805.971, "dur": 482.969, + "args": { + "External id": 129004,"Sequence number": 2575827, 
"Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2027 + } + }, + { + "ph": "f", "id": 239, "pid": 5717, "tid": 6759, "ts": 6302685385805.971, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685385874.551, "dur": 31.920, + "args": { + "External id": 129005,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685385936.491, "dur": 27.180, + "args": { + "External id": 129006,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685385988.961, "dur": 34.500, + "args": { + "External id": 129007,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], 
"Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685386037.260, "dur": 25.860, + "args": { + "External id": 129008,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685386073.871, "dur": 19.520, + "args": { + "External id": 129009,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685386103.360, "dur": 22.960, + "args": { + "External id": 129010,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685386136.530, "dur": 18.340, + "args": { + "External id": 129011,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685386177.920, "dur": 22.200, + "args": { + "External id": 129012,"kernel_hash": 
"cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685386216.830, "dur": 15.250, + "args": { + "External id": 129013,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685386248.150, "dur": 18.570, + "args": { + "External id": 129014,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input 
Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386322.530, "dur": 11.470, + "args": { + "External id": 129015,"Record function id": 0, "Ev Idx": 2038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386325.740, "dur": 6.920, + "args": { + "External id": 129016,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685386328.220, "dur": 3.570, + "args": { + "External id": 129017,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685386329.160, "dur": 2.410, + "args": { + "External id": 129018,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386338.580, "dur": 5.870, + "args": { + "External id": 129019,"Record function id": 0, "Ev Idx": 2042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386340.340, "dur": 2.890, + "args": { + "External id": 129020,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685386341.070, "dur": 1.690, + "args": { + "External id": 129021,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685386341.790, "dur": 0.800, + "args": { + "External id": 129022,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386351.220, "dur": 15.370, + "args": { + "External id": 129023,"Record function id": 0, "Ev Idx": 2046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386353.950, "dur": 11.610, + "args": { + "External id": 129024,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685386363.480, "dur": 1.560, + "args": { + "External id": 129025,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685386364.050, "dur": 0.810, + "args": { + "External id": 129026,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2049 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386370.400, "dur": 4.840, + "args": { + "External id": 129027,"Record function id": 0, "Ev Idx": 2050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386372.010, "dur": 2.270, + "args": { + "External id": 129028,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685386372.570, "dur": 1.320, + "args": { + "External id": 129029,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685386372.920, "dur": 0.810, + "args": { + "External id": 129030,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685386379.170, "dur": 364.909, + "args": { + "External id": 129031,"Record function id": 0, "Sequence number": 2575826, "Fwd thread id": 1, "Ev Idx": 2054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685386380.640, "dur": 355.189, + "args": { + "External id": 129032,"Sequence number": 2575826, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2055 + } + }, + { + "ph": "f", "id": 240, "pid": 5717, "tid": 
6759, "ts": 6302685386380.640, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685386444.610, "dur": 38.729, + "args": { + "External id": 129033,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685386496.239, "dur": 18.531, + "args": { + "External id": 129034,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685386543.479, "dur": 162.960, + "args": { + "External id": 129035,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2058 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685386618.379, "dur": 10.040, + "args": { + "External id": 129036,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685386631.389, "dur": 6.260, + "args": { + "External id": 129037,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386757.429, "dur": 11.430, + "args": { + "External id": 129038,"Record function id": 0, "Ev Idx": 2061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685386760.779, "dur": 6.790, + "args": { + "External id": 129039,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685386763.189, "dur": 3.580, + "args": { + "External id": 129040,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685386764.229, "dur": 2.330, + "args": { + "External id": 129041,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685386773.149, "dur": 250.439, + "args": { + "External id": 129042,"Record function id": 0, "Sequence number": 2575825, "Fwd thread id": 1, "Ev Idx": 2065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685386775.199, "dur": 240.739, + "args": { + "External id": 129043,"Sequence number": 2575825, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2066 + } + }, + { + "ph": "f", "id": 241, "pid": 5717, "tid": 6759, "ts": 6302685386775.199, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685386789.969, "dur": 56.320, + "args": { + "External id": 129044,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685386793.229, "dur": 11.860, + "args": { + "External id": 129045,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685386806.349, "dur": 38.230, + "args": { + "External id": 129046,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685386859.759, "dur": 12.740, + "args": { + "External id": 129047,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685386863.159, "dur": 8.800, + "args": { + "External id": 129048,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387035.118, "dur": 164.940, + "args": { + "External id": 129049,"Record function id": 0, "Sequence number": 2575824, "Fwd thread id": 1, "Ev Idx": 2072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387037.668, "dur": 153.340, + "args": { + "External id": 129050,"Sequence number": 2575824, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2073 + } + }, + { + "ph": "f", "id": 242, "pid": 5717, "tid": 6759, "ts": 6302685387037.668, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685387051.368, "dur": 31.350, + "args": { + "External id": 129051,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685387054.548, "dur": 6.410, + "args": { + "External id": 129052,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685387061.988, "dur": 20.180, + "args": { + "External id": 129053,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685387090.588, "dur": 7.160, + "args": { + "External id": 129054,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685387092.548, "dur": 4.770, + "args": { + "External id": 129055,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", 
"Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387214.138, "dur": 382.959, + "args": { + "External id": 129056,"Record function id": 0, "Sequence number": 2575823, "Fwd thread id": 1, "Ev Idx": 2079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387217.708, "dur": 366.349, + "args": { + "External id": 129057,"Sequence number": 2575823, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2080 + } + }, + { + "ph": "f", "id": 243, "pid": 5717, "tid": 6759, "ts": 6302685387217.708, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685387312.858, "dur": 45.839, + "args": { + "External id": 129058,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685387373.348, "dur": 25.709, + "args": { + "External id": 129059,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 
5717, "tid": 6759, + "ts": 6302685387412.657, "dur": 24.420, + "args": { + "External id": 129060,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685387449.148, "dur": 19.199, + "args": { + "External id": 129061,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685387477.797, "dur": 16.080, + "args": { + "External id": 129062,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685387504.787, "dur": 14.980, + "args": { + "External id": 129063,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685387542.617, "dur": 23.490, + "args": { + "External id": 129064,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": 
"/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387610.427, "dur": 11.790, + "args": { + "External id": 129065,"Record function id": 0, "Ev Idx": 2088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387613.167, "dur": 7.690, + "args": { + "External id": 129066,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685387615.797, "dur": 4.350, + "args": { + "External id": 129067,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685387617.257, "dur": 2.670, + "args": { + "External id": 129068,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387626.687, "dur": 5.460, + "args": { + "External id": 129069,"Record function id": 0, "Ev Idx": 2092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387628.327, "dur": 
2.770, + "args": { + "External id": 129070,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685387629.087, "dur": 1.450, + "args": { + "External id": 129071,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685387629.537, "dur": 0.840, + "args": { + "External id": 129072,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387636.067, "dur": 5.520, + "args": { + "External id": 129073,"Record function id": 0, "Ev Idx": 2096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387637.757, "dur": 2.770, + "args": { + "External id": 129074,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685387638.587, "dur": 1.500, + "args": { + "External id": 129075,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685387639.087, "dur": 0.830, + "args": { + "External id": 129076,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387645.647, "dur": 247.200, + "args": { + "External id": 129077,"Record function id": 0, "Sequence number": 2575822, "Fwd thread id": 1, "Ev Idx": 2100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387647.197, "dur": 202.950, + "args": { + "External id": 129078,"Sequence number": 2575822, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2101 + } + }, + { + "ph": "f", "id": 244, "pid": 5717, "tid": 6759, "ts": 6302685387647.197, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685387721.507, "dur": 25.930, + "args": { + "External id": 129079,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685387767.797, "dur": 15.950, + "args": { + "External id": 129080,"kernel_hash": 
"cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685387804.476, "dur": 23.671, + "args": { + "External id": 129081,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685387858.996, "dur": 25.640, + "args": { + "External id": 129082,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387905.626, "dur": 12.080, + "args": { + "External id": 129083,"Record function id": 
0, "Ev Idx": 2106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685387908.646, "dur": 7.630, + "args": { + "External id": 129084,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685387910.876, "dur": 4.750, + "args": { + "External id": 129085,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685387912.016, "dur": 3.420, + "args": { + "External id": 129086,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387922.326, "dur": 910.828, + "args": { + "External id": 129087,"Record function id": 0, "Sequence number": 2575821, "Fwd thread id": 1, "Ev Idx": 2110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685387924.056, "dur": 902.188, + "args": { + "External id": 129088,"Sequence number": 2575821, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2111 + } + }, + { + "ph": "f", "id": 245, "pid": 5717, "tid": 6759, "ts": 6302685387924.056, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.8)", "pid": 5717, "tid": 6759, + "ts": 
6302685387944.436, "dur": 29.660, + "args": { + "External id": 129089,"Record function id": 0, "Ev Idx": 2112 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.8)", "pid": 5717, "tid": 6759, + "ts": 6302685387983.756, "dur": 79.730, + "args": { + "External id": 129090,"Record function id": 0, "Ev Idx": 2113 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.8)", "pid": 5717, "tid": 6759, + "ts": 6302685388070.516, "dur": 749.538, + "args": { + "External id": 129091,"Record function id": 0, "Ev Idx": 2114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685388157.306, "dur": 11.180, + "args": { + "External id": 129092,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388178.856, "dur": 4.150, + "args": { + "External id": 129093,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685388200.116, "dur": 131.279, + "args": { + "External id": 129094,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685388214.616, "dur": 112.590, + "args": { + "External id": 129095,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685388238.366, "dur": 9.000, + "args": { + "External id": 129096,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685388252.215, "dur": 41.760, + "args": { + "External id": 129097,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685388254.386, "dur": 39.169, + "args": { + "External id": 129098,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388257.915, "dur": 8.360, + "args": { + "External id": 129099,"Record function id": 
0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685388268.006, "dur": 24.869, + "args": { + "External id": 129100,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685388403.575, "dur": 9.880, + "args": { + "External id": 129101,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685388406.135, "dur": 6.800, + "args": { + "External id": 129102,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685388437.815, "dur": 98.500, + "args": { + "External id": 129103,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 
6302685388455.425, "dur": 77.350, + "args": { + "External id": 129104,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2127, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685388469.345, "dur": 58.490, + "args": { + "External id": 129105,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685388551.745, "dur": 3.580, + "args": { + "External id": 129106,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2129, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388611.605, "dur": 4.140, + "args": { + "External id": 
129107,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388649.365, "dur": 1.340, + "args": { + "External id": 129108,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388672.295, "dur": 1.000, + "args": { + "External id": 129109,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388689.394, "dur": 0.920, + "args": { + "External id": 129110,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388704.685, "dur": 0.840, + "args": { + "External id": 129111,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 
6302685388719.105, "dur": 0.920, + "args": { + "External id": 129112,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388734.685, "dur": 0.849, + "args": { + "External id": 129113,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388749.894, "dur": 1.171, + "args": { + "External id": 129114,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685388764.314, "dur": 0.800, + "args": { + "External id": 129115,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685388846.204, "dur": 1674.357, + "args": { + "External id": 129116,"Record function id": 0, "Ev Idx": 2139 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.7)", "pid": 5717, "tid": 6759, + "ts": 6302685388863.064, "dur": 1028.608, + "args": { + "External id": 
129117,"Record function id": 0, "Ev Idx": 2140 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 5717, "tid": 6759, + "ts": 6302685388877.184, "dur": 276.709, + "args": { + "External id": 129118,"Record function id": 0, "Ev Idx": 2141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388973.994, "dur": 4.040, + "args": { + "External id": 129119,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388982.454, "dur": 1.100, + "args": { + "External id": 129120,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388985.374, "dur": 1.650, + "args": { + "External id": 129121,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388988.604, "dur": 0.630, + "args": { + "External id": 129122,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388990.554, "dur": 0.750, + "args": { + "External id": 129123,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], 
"Input Dims": [[589824], []], "Ev Idx": 2146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388993.064, "dur": 0.690, + "args": { + "External id": 129124,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388995.364, "dur": 1.050, + "args": { + "External id": 129125,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685388997.854, "dur": 0.820, + "args": { + "External id": 129126,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685389000.214, "dur": 0.800, + "args": { + "External id": 129127,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685389002.514, "dur": 0.690, + "args": { + "External id": 129128,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685389019.424, "dur": 101.930, + "args": { + "External id": 129129,"Record function id": 
0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685389032.654, "dur": 84.650, + "args": { + "External id": 129130,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685389045.414, "dur": 8.360, + "args": { + "External id": 129131,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685389056.924, "dur": 35.290, + "args": { + "External id": 129132,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input 
Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685389058.634, "dur": 33.190, + "args": { + "External id": 129133,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389061.184, "dur": 7.830, + "args": { + "External id": 129134,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685389070.324, "dur": 20.910, + "args": { + "External id": 129135,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2158 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.6", "pid": 5717, "tid": 6759, + "ts": 6302685389271.323, "dur": 610.039, + "args": { + "External id": 129136,"Record function id": 0, "Ev Idx": 2159 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 5717, "tid": 6759, + "ts": 6302685389294.693, "dur": 566.689, + "args": { + "External id": 129137,"Record function id": 0, "Ev Idx": 2160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685389388.583, "dur": 9.380, + "args": { + "External id": 129138,"Record function id": 0, "Concrete Inputs": ["[1769856]", 
"15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685389411.643, "dur": 23.150, + "args": { + "External id": 129139,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389415.293, "dur": 1.400, + "args": { + "External id": 129140,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389418.713, "dur": 0.350, + "args": { + "External id": 129141,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389420.433, "dur": 0.300, + "args": { + "External id": 129142,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389421.963, "dur": 
0.420, + "args": { + "External id": 129143,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389423.663, "dur": 0.290, + "args": { + "External id": 129144,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389425.303, "dur": 1.410, + "args": { + "External id": 129145,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389428.053, "dur": 0.300, + "args": { + "External id": 129146,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389429.473, "dur": 0.290, + "args": { + "External id": 129147,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389431.053, "dur": 0.270, + "args": { + "External id": 129148,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685389444.593, "dur": 27.580, + "args": { + "External id": 129149,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685389509.063, "dur": 106.820, + "args": { + "External id": 129150,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685389520.753, "dur": 7.880, + "args": { + "External id": 129151,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], 
"Input Dims": [[], [], [], [], [], []], "Ev Idx": 2174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685389534.333, "dur": 10.350, + "args": { + "External id": 129152,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685389537.423, "dur": 6.810, + "args": { + "External id": 129153,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389541.293, "dur": 0.960, + "args": { + "External id": 129154,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685389553.033, "dur": 19.550, + "args": { + "External id": 129155,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389555.383, "dur": 0.360, + "args": { + "External id": 129156,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389557.283, "dur": 0.270, + "args": { + "External id": 129157,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389558.973, "dur": 1.330, + "args": { + "External id": 129158,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389561.813, "dur": 0.280, + "args": { + "External id": 129159,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389563.283, "dur": 0.320, + "args": { + "External id": 129160,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389564.863, "dur": 0.290, + "args": { + "External id": 129161,"Record 
function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389566.433, "dur": 0.519, + "args": { + "External id": 129162,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389568.212, "dur": 0.300, + "args": { + "External id": 129163,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685389569.672, "dur": 0.331, + "args": { + "External id": 129164,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685389584.792, "dur": 21.271, + "args": { + "External id": 129165,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], 
[147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685389680.452, "dur": 99.230, + "args": { + "External id": 129166,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685389698.402, "dur": 77.650, + "args": { + "External id": 129167,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2190, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685389711.942, "dur": 58.680, + "args": { + "External id": 129168,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685389794.722, "dur": 3.730, + "args": { + "External id": 129169,"Record function id": 0, "Collective name": "wait", "Process Group Description": 
"default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2192, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685389898.682, "dur": 608.599, + "args": { + "External id": 129170,"Sequence number": 2575820, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2193 + } + }, + { + "ph": "f", "id": 246, "pid": 5717, "tid": 6759, "ts": 6302685389898.682, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390001.502, "dur": 39.969, + "args": { + "External id": 129171,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685390076.262, "dur": 30.849, + "args": { + "External id": 129172,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685390127.751, "dur": 42.520, + "args": { + "External id": 129173,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390187.261, "dur": 31.080, + "args": { + "External id": 129174,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390232.951, "dur": 23.880, + "args": { + "External id": 129175,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390269.671, "dur": 38.630, + "args": { + "External id": 129176,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], 
[2048, 768]], "Ev Idx": 2199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390324.061, "dur": 26.270, + "args": { + "External id": 129177,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685390378.281, "dur": 26.120, + "args": { + "External id": 129178,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685390423.901, "dur": 18.550, + "args": { + "External id": 129179,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2202 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685390459.281, "dur": 21.360, + "args": { + "External id": 129180,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390537.241, "dur": 14.099, + "args": { + "External id": 129181,"Record function id": 0, "Ev Idx": 2204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390541.141, "dur": 8.529, + "args": { + "External id": 129182,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685390544.201, "dur": 4.539, + "args": { + "External id": 129183,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685390545.521, "dur": 2.969, + "args": { + "External id": 129184,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390556.720, "dur": 6.710, + "args": { + "External id": 129185,"Record function id": 0, "Ev Idx": 2208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390558.720, "dur": 3.510, + "args": { + "External id": 129186,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685390559.680, "dur": 2.020, + "args": { + "External id": 129187,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685390560.550, "dur": 0.940, + "args": { + "External id": 129188,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390568.000, "dur": 6.090, + "args": { + "External id": 129189,"Record function id": 0, "Ev Idx": 2212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390569.900, "dur": 3.070, + "args": { + "External id": 129190,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2213 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685390570.780, "dur": 1.710, + "args": { + "External id": 129191,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685390571.360, "dur": 0.920, + "args": { + "External id": 129192,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390578.490, "dur": 5.870, + "args": { + "External id": 129193,"Record function id": 0, "Ev Idx": 2216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390580.470, "dur": 2.720, + "args": { + "External id": 129194,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685390581.250, "dur": 1.420, + "args": { + "External id": 129195,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685390581.670, "dur": 0.810, + "args": { + "External id": 129196,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", 
"pid": 5717, "tid": 6759, + "ts": 6302685390588.980, "dur": 383.380, + "args": { + "External id": 129197,"Record function id": 0, "Sequence number": 2575819, "Fwd thread id": 1, "Ev Idx": 2220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685390590.760, "dur": 372.100, + "args": { + "External id": 129198,"Sequence number": 2575819, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2221 + } + }, + { + "ph": "f", "id": 247, "pid": 5717, "tid": 6759, "ts": 6302685390590.760, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390665.010, "dur": 47.020, + "args": { + "External id": 129199,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685390730.760, "dur": 23.010, + "args": { + "External id": 129200,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685390784.220, "dur": 149.980, + "args": { + "External id": 129201,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685390847.020, "dur": 7.420, + "args": { + "External id": 129202,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685390856.540, "dur": 4.080, + "args": { + "External id": 129203,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390988.429, "dur": 13.290, + "args": { + "External id": 129204,"Record function id": 0, "Ev Idx": 2227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685390992.009, "dur": 8.130, + "args": { + "External id": 129205,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685390994.779, "dur": 4.230, + "args": { + "External id": 129206,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685390996.039, "dur": 2.700, + "args": { + "External id": 129207,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391006.719, "dur": 209.480, + "args": { + "External id": 129208,"Record function id": 0, "Sequence number": 2575818, "Fwd thread id": 1, "Ev Idx": 2231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391008.629, "dur": 199.780, + "args": { + "External id": 129209,"Sequence number": 2575818, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2232 + } + }, + { + "ph": "f", "id": 248, "pid": 5717, "tid": 6759, "ts": 6302685391008.629, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685391026.499, "dur": 43.230, + "args": { + "External id": 129210,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2233 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685391030.189, "dur": 8.450, + "args": { + "External id": 129211,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685391040.059, "dur": 28.980, + "args": { + "External id": 129212,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685391079.909, "dur": 8.660, + "args": { + "External id": 129213,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685391082.169, "dur": 5.880, + "args": { + "External id": 129214,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391230.089, "dur": 194.799, + "args": { + "External id": 129215,"Record 
function id": 0, "Sequence number": 2575817, "Fwd thread id": 1, "Ev Idx": 2238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391233.529, "dur": 182.719, + "args": { + "External id": 129216,"Sequence number": 2575817, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2239 + } + }, + { + "ph": "f", "id": 249, "pid": 5717, "tid": 6759, "ts": 6302685391233.529, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685391250.169, "dur": 37.220, + "args": { + "External id": 129217,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685391253.979, "dur": 7.440, + "args": { + "External id": 129218,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685391262.739, "dur": 23.880, + "args": { + "External id": 129219,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 
6302685391305.539, "dur": 9.310, + "args": { + "External id": 129220,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685391307.659, "dur": 6.600, + "args": { + "External id": 129221,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391438.078, "dur": 375.609, + "args": { + "External id": 129222,"Record function id": 0, "Sequence number": 2575816, "Fwd thread id": 1, "Ev Idx": 2245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391441.158, "dur": 359.389, + "args": { + "External id": 129223,"Sequence number": 2575816, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2246 + } + }, + { + "ph": "f", "id": 250, "pid": 5717, "tid": 6759, "ts": 6302685391441.158, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685391522.428, "dur": 45.640, + "args": { + "External id": 129224,"Record function id": 0, "Concrete Inputs": ["", "", 
""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685391583.008, "dur": 27.690, + "args": { + "External id": 129225,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685391622.128, "dur": 26.010, + "args": { + "External id": 129226,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685391662.118, "dur": 20.860, + "args": { + "External id": 129227,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685391692.678, "dur": 17.160, + "args": { + "External id": 129228,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685391718.658, "dur": 16.590, + "args": { + "External id": 
129229,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685391757.618, "dur": 23.700, + "args": { + "External id": 129230,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685391828.118, "dur": 12.780, + "args": { + "External id": 129231,"Record function id": 0, "Ev Idx": 2254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685391831.367, "dur": 8.051, + "args": { + "External id": 129232,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685391834.207, "dur": 4.451, + "args": { + "External id": 129233,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 
6302685391835.607, "dur": 2.800, + "args": { + "External id": 129234,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685391845.827, "dur": 6.980, + "args": { + "External id": 129235,"Record function id": 0, "Ev Idx": 2258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685391847.758, "dur": 3.940, + "args": { + "External id": 129236,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685391848.578, "dur": 2.660, + "args": { + "External id": 129237,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685391849.138, "dur": 1.889, + "args": { + "External id": 129238,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685391857.058, "dur": 5.880, + "args": { + "External id": 129239,"Record function id": 0, "Ev Idx": 2262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685391858.958, "dur": 2.900, + "args": { + "External id": 129240,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685391859.647, "dur": 1.680, + "args": { + "External id": 129241,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685391860.378, "dur": 0.760, + "args": { + "External id": 129242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391867.327, "dur": 236.790, + "args": { + "External id": 129243,"Record function id": 0, "Sequence number": 2575815, "Fwd thread id": 1, "Ev Idx": 2266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685391869.187, "dur": 200.020, + "args": { + "External id": 129244,"Sequence number": 2575815, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2267 + } + }, + { + "ph": "f", "id": 251, "pid": 5717, "tid": 6759, "ts": 6302685391869.187, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685391944.427, "dur": 27.360, + "args": { + "External id": 129245,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": 
"/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685391991.837, "dur": 16.300, + "args": { + "External id": 129246,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685392028.107, "dur": 20.010, + "args": { + "External id": 129247,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2270 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685392078.247, "dur": 19.220, + "args": { + "External id": 129248,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685392117.207, "dur": 11.450, + "args": { + "External id": 129249,"Record function id": 0, "Ev Idx": 2272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685392120.307, "dur": 6.840, + "args": { + "External id": 129250,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685392122.667, "dur": 3.590, + "args": { + "External id": 129251,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685392123.667, "dur": 2.370, + "args": { + "External id": 129252,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685392133.267, "dur": 945.188, + "args": { + "External id": 129253,"Record function id": 0, "Sequence number": 2575814, "Fwd thread id": 1, "Ev Idx": 2276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685392135.047, "dur": 936.598, + "args": { + "External id": 129254,"Sequence number": 2575814, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2277 + } + }, + { + "ph": "f", "id": 252, "pid": 5717, "tid": 6759, "ts": 6302685392135.047, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.7)", "pid": 5717, "tid": 6759, + "ts": 6302685392155.647, "dur": 26.350, + "args": { + "External id": 129255,"Record function id": 0, "Ev Idx": 2278 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.7)", "pid": 5717, "tid": 6759, + "ts": 6302685392191.217, "dur": 68.409, + "args": { + "External id": 129256,"Record function id": 0, "Ev Idx": 2279 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.7)", "pid": 5717, "tid": 6759, + "ts": 6302685392266.457, "dur": 798.778, + "args": { + "External id": 129257,"Record function id": 0, "Ev Idx": 2280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685392355.966, "dur": 9.620, + "args": { + "External id": 129258,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685392378.846, "dur": 6.460, + "args": { + "External id": 129259,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev 
Idx": 2282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685392406.326, "dur": 118.840, + "args": { + "External id": 129260,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685392421.956, "dur": 98.650, + "args": { + "External id": 129261,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685392446.826, "dur": 8.870, + "args": { + "External id": 129262,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685392459.696, "dur": 34.420, + "args": { + "External id": 129263,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], 
[]], "Ev Idx": 2286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685392461.536, "dur": 32.210, + "args": { + "External id": 129264,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392464.106, "dur": 6.950, + "args": { + "External id": 129265,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685392472.576, "dur": 20.540, + "args": { + "External id": 129266,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685392622.186, "dur": 14.490, + "args": { + "External id": 129267,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685392625.926, "dur": 10.190, + "args": { + "External id": 129268,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685392658.476, "dur": 97.559, + "args": { + "External id": 129269,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685392677.776, "dur": 74.749, + "args": { + "External id": 129270,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2293, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685392692.165, "dur": 55.711, + "args": { + "External id": 129271,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685392777.985, "dur": 6.160, + "args": { + "External id": 129272,"Record function id": 0, "Collective name": "wait", "Process Group Description": 
"default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2295, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392856.975, "dur": 4.290, + "args": { + "External id": 129273,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392893.905, "dur": 1.330, + "args": { + "External id": 129274,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392914.675, "dur": 2.100, + "args": { + "External id": 129275,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392932.255, "dur": 0.900, + "args": { + "External id": 129276,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392947.615, "dur": 0.840, + "args": { + "External id": 129277,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392962.595, "dur": 0.840, + "args": { + "External id": 129278,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392977.015, "dur": 1.730, + "args": { + "External id": 129279,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685392993.125, "dur": 1.100, + "args": { + "External id": 129280,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393008.675, "dur": 0.800, + "args": { + "External id": 129281,"Record function id": 0, "Concrete Inputs": ["", 
"[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685393091.775, "dur": 1519.546, + "args": { + "External id": 129282,"Record function id": 0, "Ev Idx": 2305 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.6)", "pid": 5717, "tid": 6759, + "ts": 6302685393107.275, "dur": 955.487, + "args": { + "External id": 129283,"Record function id": 0, "Ev Idx": 2306 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 5717, "tid": 6759, + "ts": 6302685393119.924, "dur": 333.680, + "args": { + "External id": 129284,"Record function id": 0, "Ev Idx": 2307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393225.664, "dur": 4.590, + "args": { + "External id": 129285,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393235.424, "dur": 1.080, + "args": { + "External id": 129286,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393238.614, "dur": 1.900, + "args": { + "External id": 129287,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2310 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393242.384, "dur": 0.800, + "args": { + "External id": 129288,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393244.774, "dur": 0.760, + "args": { + "External id": 129289,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393247.504, "dur": 0.810, + "args": { + "External id": 129290,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393250.054, "dur": 1.110, + "args": { + "External id": 129291,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393252.874, "dur": 0.740, + "args": { + "External id": 129292,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393255.384, "dur": 0.810, + "args": { + "External id": 129293,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685393257.944, "dur": 0.790, + "args": { + "External id": 129294,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685393277.144, "dur": 141.190, + "args": { + "External id": 129295,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685393293.824, "dur": 118.980, + "args": { + "External id": 129296,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685393315.904, "dur": 8.910, + "args": { + 
"External id": 129297,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685393327.504, "dur": 49.270, + "args": { + "External id": 129298,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685393332.014, "dur": 44.280, + "args": { + "External id": 129299,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393335.024, "dur": 9.910, + "args": { + "External id": 129300,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685393346.334, "dur": 29.360, + "args": { + "External id": 129301,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2324 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::backward_prefetch for model.layers.5", "pid": 5717, "tid": 6759, + "ts": 6302685393558.443, "dur": 496.819, + "args": { + "External id": 129302,"Record function id": 0, "Ev Idx": 2325 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 5717, "tid": 6759, + "ts": 6302685393580.863, "dur": 459.910, + "args": { + "External id": 129303,"Record function id": 0, "Ev Idx": 2326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685393649.163, "dur": 8.170, + "args": { + "External id": 129304,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685393668.733, "dur": 19.170, + "args": { + "External id": 129305,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393671.853, "dur": 1.410, + "args": { + "External id": 129306,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393674.893, "dur": 0.370, + "args": { + "External id": 129307,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393676.413, "dur": 0.280, + "args": { + "External id": 129308,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393677.553, "dur": 0.260, + "args": { + "External id": 129309,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393678.833, "dur": 0.410, + "args": { + "External id": 129310,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393680.253, "dur": 1.000, + "args": { + "External id": 129311,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393682.193, "dur": 0.280, + "args": { + "External id": 129312,"Record function id": 0, "Concrete Inputs": ["", 
"[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393683.363, "dur": 0.290, + "args": { + "External id": 129313,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393684.653, "dur": 0.270, + "args": { + "External id": 129314,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685393696.073, "dur": 23.240, + "args": { + "External id": 129315,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685393749.653, "dur": 89.300, + "args": { + "External id": 129316,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": 
["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685393759.333, "dur": 7.230, + "args": { + "External id": 129317,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685393771.073, "dur": 8.750, + "args": { + "External id": 129318,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685393773.803, "dur": 5.640, + "args": { + "External id": 129319,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393776.783, "dur": 0.830, + "args": { + "External id": 129320,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2343 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685393786.843, "dur": 15.500, + "args": { + "External id": 129321,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393788.663, "dur": 0.280, + "args": { + "External id": 129322,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393790.123, "dur": 0.280, + "args": { + "External id": 129323,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393791.403, "dur": 1.150, + "args": { + "External id": 129324,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393793.603, "dur": 0.180, + "args": { + "External id": 129325,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": 
[[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393794.923, "dur": 0.330, + "args": { + "External id": 129326,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393796.413, "dur": 0.270, + "args": { + "External id": 129327,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393797.613, "dur": 0.180, + "args": { + "External id": 129328,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393798.853, "dur": 0.300, + "args": { + "External id": 129329,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685393800.083, "dur": 0.170, + "args": { + "External id": 129330,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685393813.123, "dur": 17.840, + "args": { + "External id": 129331,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685393891.213, "dur": 83.200, + "args": { + "External id": 129332,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685393906.243, "dur": 65.340, + "args": { + "External id": 129333,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], 
[]], "Global rank start": 0, "Ev Idx": 2356, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685393917.283, "dur": 49.930, + "args": { + "External id": 129334,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685393986.553, "dur": 3.600, + "args": { + "External id": 129335,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2358, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685394068.282, "dur": 531.579, + "args": { + "External id": 129336,"Sequence number": 2575813, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2359 + } + }, + { + "ph": "f", "id": 253, "pid": 5717, "tid": 6759, "ts": 6302685394068.282, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394135.442, "dur": 32.690, + "args": { + "External id": 129337,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 
1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685394199.272, "dur": 30.560, + "args": { + "External id": 129338,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685394257.552, "dur": 56.280, + "args": { + "External id": 129339,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394329.702, "dur": 36.520, + "args": { + "External id": 129340,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394376.452, "dur": 19.840, + "args": { 
+ "External id": 129341,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394406.292, "dur": 22.709, + "args": { + "External id": 129342,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394439.112, "dur": 18.509, + "args": { + "External id": 129343,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685394481.252, "dur": 22.229, + "args": { + "External id": 129344,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685394521.312, "dur": 18.689, + "args": { + "External id": 129345,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685394560.411, "dur": 17.870, + "args": { + "External id": 129346,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394625.261, "dur": 11.290, + "args": { + "External id": 129347,"Record function id": 0, "Ev Idx": 2370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394628.241, "dur": 6.990, + "args": { + "External id": 
129348,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685394630.711, "dur": 3.800, + "args": { + "External id": 129349,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685394631.711, "dur": 2.570, + "args": { + "External id": 129350,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394641.121, "dur": 6.340, + "args": { + "External id": 129351,"Record function id": 0, "Ev Idx": 2374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394643.011, "dur": 3.440, + "args": { + "External id": 129352,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685394643.971, "dur": 1.990, + "args": { + "External id": 129353,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685394644.751, "dur": 1.040, + "args": { + "External id": 129354,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[2048, 768]], "Ev Idx": 2377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394651.121, "dur": 5.000, + "args": { + "External id": 129355,"Record function id": 0, "Ev Idx": 2378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394652.881, "dur": 2.330, + "args": { + "External id": 129356,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685394653.541, "dur": 1.210, + "args": { + "External id": 129357,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685394653.971, "dur": 0.620, + "args": { + "External id": 129358,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394662.231, "dur": 6.960, + "args": { + "External id": 129359,"Record function id": 0, "Ev Idx": 2382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685394664.461, "dur": 3.710, + "args": { + "External id": 129360,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 
6759, + "ts": 6302685394665.381, "dur": 2.170, + "args": { + "External id": 129361,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685394665.831, "dur": 1.540, + "args": { + "External id": 129362,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685394673.411, "dur": 368.169, + "args": { + "External id": 129363,"Record function id": 0, "Sequence number": 2575812, "Fwd thread id": 1, "Ev Idx": 2386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685394675.041, "dur": 357.869, + "args": { + "External id": 129364,"Sequence number": 2575812, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2387 + } + }, + { + "ph": "f", "id": 254, "pid": 5717, "tid": 6759, "ts": 6302685394675.041, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394759.931, "dur": 46.580, + "args": { + "External id": 129365,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685394820.561, "dur": 28.720, + "args": { + "External id": 129366,"Record function id": 
0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685394876.720, "dur": 132.520, + "args": { + "External id": 129367,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685394928.791, "dur": 6.360, + "args": { + "External id": 129368,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685394936.971, "dur": 3.620, + "args": { + "External id": 129369,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", 
"Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395054.660, "dur": 11.350, + "args": { + "External id": 129370,"Record function id": 0, "Ev Idx": 2393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395057.900, "dur": 6.770, + "args": { + "External id": 129371,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685395060.300, "dur": 3.490, + "args": { + "External id": 129372,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685395061.240, "dur": 2.320, + "args": { + "External id": 129373,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395070.280, "dur": 171.800, + "args": { + "External id": 129374,"Record function id": 0, "Sequence number": 2575811, "Fwd thread id": 1, "Ev Idx": 2397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395071.700, "dur": 164.130, + "args": { + "External id": 129375,"Sequence number": 2575811, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2398 + } + }, + { + "ph": "f", "id": 255, "pid": 5717, "tid": 6759, "ts": 6302685395071.700, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685395086.150, "dur": 36.250, + "args": { + "External id": 129376,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685395089.240, "dur": 6.720, + "args": { + "External id": 129377,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685395097.460, "dur": 24.390, + "args": { + "External id": 129378,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685395131.000, "dur": 7.270, + "args": { + "External id": 129379,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685395133.040, "dur": 4.770, + "args": { + "External id": 129380,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395253.320, "dur": 180.439, + "args": { + "External id": 129381,"Record function id": 0, "Sequence number": 2575810, "Fwd thread id": 1, "Ev Idx": 2404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395255.980, "dur": 165.809, + "args": { + "External id": 129382,"Sequence number": 2575810, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2405 + } + }, + { + "ph": "f", "id": 256, "pid": 5717, "tid": 6759, "ts": 6302685395255.980, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685395269.910, "dur": 38.469, + "args": { + "External id": 129383,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685395272.990, "dur": 5.890, + "args": { + "External id": 129384,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685395280.000, "dur": 27.599, + "args": { + "External id": 129385,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685395316.770, "dur": 6.949, + "args": { + "External id": 129386,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685395318.799, "dur": 4.451, + "args": { + "External id": 129387,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395451.249, "dur": 387.400, + "args": { + "External id": 129388,"Record function id": 0, "Sequence number": 2575809, "Fwd thread id": 1, "Ev Idx": 2411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395456.939, "dur": 370.799, + "args": { + "External id": 129389,"Sequence number": 2575809, "Fwd thread id": 1, "Record function 
id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2412 + } + }, + { + "ph": "f", "id": 257, "pid": 5717, "tid": 6759, "ts": 6302685395456.939, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685395548.029, "dur": 49.380, + "args": { + "External id": 129390,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685395619.219, "dur": 43.790, + "args": { + "External id": 129391,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685395673.039, "dur": 22.680, + "args": { + "External id": 129392,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685395707.059, "dur": 18.300, + "args": { + "External id": 129393,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input 
Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685395734.099, "dur": 15.050, + "args": { + "External id": 129394,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685395757.189, "dur": 14.320, + "args": { + "External id": 129395,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685395790.489, "dur": 20.540, + "args": { + "External id": 129396,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395851.248, "dur": 11.050, + "args": { + "External id": 129397,"Record function id": 0, "Ev Idx": 2420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 
6302685395854.118, "dur": 6.850, + "args": { + "External id": 129398,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685395856.628, "dur": 3.670, + "args": { + "External id": 129399,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685395857.638, "dur": 2.450, + "args": { + "External id": 129400,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395866.468, "dur": 5.120, + "args": { + "External id": 129401,"Record function id": 0, "Ev Idx": 2424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395868.248, "dur": 2.300, + "args": { + "External id": 129402,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685395868.928, "dur": 1.230, + "args": { + "External id": 129403,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685395869.378, "dur": 0.620, + "args": { + "External id": 129404,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395875.218, "dur": 4.900, + "args": { + "External id": 129405,"Record function id": 0, "Ev Idx": 2428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685395876.788, "dur": 2.400, + "args": { + "External id": 129406,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685395877.338, "dur": 1.430, + "args": { + "External id": 129407,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685395878.038, "dur": 0.560, + "args": { + "External id": 129408,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395884.008, "dur": 202.990, + "args": { + "External id": 129409,"Record function id": 0, "Sequence number": 2575808, "Fwd thread id": 1, "Ev Idx": 2432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685395885.378, "dur": 171.430, + "args": { + "External id": 129410,"Sequence number": 2575808, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2433 + } + }, + { + "ph": "f", "id": 258, "pid": 5717, "tid": 6759, "ts": 6302685395885.378, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685395949.598, "dur": 23.540, + "args": { + "External id": 129411,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685395991.258, "dur": 13.760, + "args": { + "External id": 129412,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685396021.798, "dur": 16.810, + "args": { + "External id": 129413,"kernel_hash": 
"cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685396064.768, "dur": 16.790, + "args": { + "External id": 129414,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685396098.348, "dur": 9.990, + "args": { + "External id": 129415,"Record function id": 0, "Ev Idx": 2438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685396101.058, "dur": 5.910, + "args": { + "External id": 129416,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685396103.188, "dur": 3.170, + "args": { + "External id": 129417,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2440 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685396104.108, "dur": 2.060, + "args": { + "External id": 129418,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685396112.678, "dur": 821.488, + "args": { + "External id": 129419,"Record function id": 0, "Sequence number": 2575807, "Fwd thread id": 1, "Ev Idx": 2442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685396114.268, "dur": 813.398, + "args": { + "External id": 129420,"Sequence number": 2575807, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2443 + } + }, + { + "ph": "f", "id": 259, "pid": 5717, "tid": 6759, "ts": 6302685396114.268, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.6)", "pid": 5717, "tid": 6759, + "ts": 6302685396132.038, "dur": 23.870, + "args": { + "External id": 129421,"Record function id": 0, "Ev Idx": 2444 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.6)", "pid": 5717, "tid": 6759, + "ts": 6302685396163.758, "dur": 59.799, + "args": { + "External id": 129422,"Record function id": 0, "Ev Idx": 2445 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.6)", "pid": 5717, "tid": 6759, + "ts": 6302685396229.808, "dur": 692.858, + "args": { + "External id": 129423,"Record function id": 0, "Ev Idx": 2446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 
6302685396295.777, "dur": 14.800, + "args": { + "External id": 129424,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685396319.327, "dur": 3.380, + "args": { + "External id": 129425,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685396335.267, "dur": 88.950, + "args": { + "External id": 129426,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685396345.917, "dur": 74.970, + "args": { + "External id": 129427,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685396360.737, "dur": 5.990, + "args": { + "External id": 129428,"Record function 
id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685396370.417, "dur": 29.400, + "args": { + "External id": 129429,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685396371.667, "dur": 27.840, + "args": { + "External id": 129430,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396373.687, "dur": 5.810, + "args": { + "External id": 129431,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685396380.667, "dur": 18.330, + "args": { + "External id": 129432,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 
6302685396486.497, "dur": 9.130, + "args": { + "External id": 129433,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685396488.647, "dur": 6.500, + "args": { + "External id": 129434,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685396513.677, "dur": 95.590, + "args": { + "External id": 129435,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685396527.977, "dur": 78.250, + "args": { + "External id": 129436,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, 
"Ev Idx": 2459, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685396541.507, "dur": 59.500, + "args": { + "External id": 129437,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685396622.197, "dur": 3.230, + "args": { + "External id": 129438,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2461, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396680.567, "dur": 3.869, + "args": { + "External id": 129439,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396712.407, "dur": 1.129, + "args": { + "External id": 129440,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5717, "tid": 6759, + "ts": 6302685396730.156, "dur": 0.970, + "args": { + "External id": 129441,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396754.096, "dur": 0.870, + "args": { + "External id": 129442,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396773.686, "dur": 0.750, + "args": { + "External id": 129443,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396795.236, "dur": 0.800, + "args": { + "External id": 129444,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396809.586, "dur": 0.750, + "args": { + "External id": 129445,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2468 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396828.796, "dur": 1.290, + "args": { + "External id": 129446,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685396851.446, "dur": 0.780, + "args": { + "External id": 129447,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685396946.056, "dur": 1453.476, + "args": { + "External id": 129448,"Record function id": 0, "Ev Idx": 2471 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.5)", "pid": 5717, "tid": 6759, + "ts": 6302685396960.106, "dur": 913.138, + "args": { + "External id": 129449,"Record function id": 0, "Ev Idx": 2472 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 5717, "tid": 6759, + "ts": 6302685396972.156, "dur": 261.999, + "args": { + "External id": 129450,"Record function id": 0, "Ev Idx": 2473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397057.076, "dur": 3.390, + "args": { + "External id": 129451,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, 
"tid": 6759, + "ts": 6302685397064.226, "dur": 0.870, + "args": { + "External id": 129452,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397066.756, "dur": 0.630, + "args": { + "External id": 129453,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397068.916, "dur": 0.520, + "args": { + "External id": 129454,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397071.356, "dur": 0.530, + "args": { + "External id": 129455,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397073.256, "dur": 1.550, + "args": { + "External id": 129456,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397076.335, "dur": 0.791, + "args": { + "External id": 129457,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev 
Idx": 2480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397078.475, "dur": 0.800, + "args": { + "External id": 129458,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397080.986, "dur": 0.529, + "args": { + "External id": 129459,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685397082.895, "dur": 0.540, + "args": { + "External id": 129460,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685397096.835, "dur": 94.770, + "args": { + "External id": 129461,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685397108.695, "dur": 76.690, + "args": { + "External id": 129462,"Record function id": 0, "Concrete Inputs": 
["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685397118.275, "dur": 5.811, + "args": { + "External id": 129463,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685397126.106, "dur": 31.280, + "args": { + "External id": 129464,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685397127.335, "dur": 29.740, + "args": { + "External id": 129465,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397129.435, "dur": 7.140, + "args": { + "External id": 129466,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", 
"False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685397137.935, "dur": 18.691, + "args": { + "External id": 129467,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2490 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.4", "pid": 5717, "tid": 6759, + "ts": 6302685397357.755, "dur": 507.949, + "args": { + "External id": 129468,"Record function id": 0, "Ev Idx": 2491 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 5717, "tid": 6759, + "ts": 6302685397373.205, "dur": 479.089, + "args": { + "External id": 129469,"Record function id": 0, "Ev Idx": 2492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685397434.275, "dur": 7.840, + "args": { + "External id": 129470,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685397453.125, "dur": 19.300, + "args": { + "External id": 129471,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, 
+ "ts": 6302685397455.975, "dur": 1.850, + "args": { + "External id": 129472,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397459.585, "dur": 0.220, + "args": { + "External id": 129473,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397461.155, "dur": 0.470, + "args": { + "External id": 129474,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397462.495, "dur": 0.250, + "args": { + "External id": 129475,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397463.825, "dur": 0.180, + "args": { + "External id": 129476,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2499 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397464.855, "dur": 0.230, + "args": { + "External id": 129477,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397466.015, "dur": 0.280, + "args": { + "External id": 129478,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397467.165, "dur": 0.240, + "args": { + "External id": 129479,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397468.245, "dur": 1.230, + "args": { + "External id": 129480,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685397481.345, "dur": 22.600, + "args": { + "External id": 129481,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], 
[1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685397533.234, "dur": 85.020, + "args": { + "External id": 129482,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685397542.554, "dur": 7.051, + "args": { + "External id": 129483,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685397553.674, "dur": 8.380, + "args": { + "External id": 129484,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685397556.314, "dur": 5.411, + "args": { + "External id": 129485,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", 
"Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397559.214, "dur": 0.831, + "args": { + "External id": 129486,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685397568.825, "dur": 15.489, + "args": { + "External id": 129487,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397570.645, "dur": 0.309, + "args": { + "External id": 129488,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397571.905, "dur": 0.269, + "args": { + "External id": 129489,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397573.025, "dur": 0.160, + "args": { + "External id": 
129490,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397575.125, "dur": 0.180, + "args": { + "External id": 129491,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397576.225, "dur": 0.249, + "args": { + "External id": 129492,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397577.334, "dur": 1.291, + "args": { + "External id": 129493,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397579.785, "dur": 0.329, + "args": { + "External id": 129494,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
6759, + "ts": 6302685397580.974, "dur": 0.231, + "args": { + "External id": 129495,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685397582.114, "dur": 0.160, + "args": { + "External id": 129496,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685397593.605, "dur": 17.100, + "args": { + "External id": 129497,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685397667.364, "dur": 100.790, + "args": { + "External id": 129498,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685397681.484, "dur": 82.380, + "args": { + "External id": 129499,"Record function id": 0, 
"Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2522, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685397692.354, "dur": 64.570, + "args": { + "External id": 129500,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685397786.974, "dur": 5.690, + "args": { + "External id": 129501,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2524, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685397878.444, "dur": 508.139, + "args": { + "External id": 129502,"Sequence number": 2575806, "Fwd thread id": 1, "Record function 
id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2525 + } + }, + { + "ph": "f", "id": 260, "pid": 5717, "tid": 6759, "ts": 6302685397878.444, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685397942.844, "dur": 32.060, + "args": { + "External id": 129503,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685398003.973, "dur": 24.491, + "args": { + "External id": 129504,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685398044.653, "dur": 34.090, + "args": { + "External id": 129505,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 
16384, 2048], [1, 768, 2048]], "Ev Idx": 2528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685398091.073, "dur": 25.180, + "args": { + "External id": 129506,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685398126.033, "dur": 20.100, + "args": { + "External id": 129507,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685398160.913, "dur": 33.070, + "args": { + "External id": 129508,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685398206.713, "dur": 19.120, + "args": { + "External id": 129509,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685398258.063, "dur": 24.560, + "args": { + "External id": 129510,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", 
"Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685398308.243, "dur": 16.030, + "args": { + "External id": 129511,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685398339.353, "dur": 17.440, + "args": { + "External id": 129512,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], 
[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398412.783, "dur": 11.829, + "args": { + "External id": 129513,"Record function id": 0, "Ev Idx": 2536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398415.952, "dur": 7.271, + "args": { + "External id": 129514,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685398418.503, "dur": 3.849, + "args": { + "External id": 129515,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685398419.532, "dur": 2.600, + "args": { + "External id": 129516,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398429.123, "dur": 5.240, + "args": { + "External id": 129517,"Record function id": 0, "Ev Idx": 2540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398430.763, "dur": 2.560, + "args": { + "External id": 129518,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2541 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685398431.512, "dur": 1.340, + "args": { + "External id": 129519,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685398431.983, "dur": 0.709, + "args": { + "External id": 129520,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398438.112, "dur": 4.760, + "args": { + "External id": 129521,"Record function id": 0, "Ev Idx": 2544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398439.712, "dur": 2.231, + "args": { + "External id": 129522,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685398440.303, "dur": 1.249, + "args": { + "External id": 129523,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685398440.823, "dur": 0.549, + "args": { + "External id": 129524,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398446.503, "dur": 4.689, + "args": { + "External id": 129525,"Record function id": 0, "Ev Idx": 2548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398447.983, "dur": 2.289, + "args": { + "External id": 129526,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685398448.612, "dur": 1.151, + "args": { + "External id": 129527,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685398448.992, "dur": 0.611, + "args": { + "External id": 129528,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685398454.943, "dur": 320.669, + "args": { + "External id": 129529,"Record function id": 0, "Sequence number": 2575805, "Fwd thread id": 1, "Ev Idx": 2552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685398456.312, "dur": 311.010, + "args": { + "External id": 129530,"Sequence number": 2575805, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2553 + } + }, + { + "ph": "f", "id": 261, "pid": 5717, "tid": 6759, "ts": 6302685398456.312, + "cat": "fwdbwd", "name": 
"fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685398522.272, "dur": 43.400, + "args": { + "External id": 129531,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685398579.472, "dur": 18.500, + "args": { + "External id": 129532,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685398623.152, "dur": 121.120, + "args": { + "External id": 129533,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 
6759, + "ts": 6302685398672.922, "dur": 6.110, + "args": { + "External id": 129534,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685398680.632, "dur": 3.370, + "args": { + "External id": 129535,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398788.652, "dur": 19.030, + "args": { + "External id": 129536,"Record function id": 0, "Ev Idx": 2559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685398799.562, "dur": 6.690, + "args": { + "External id": 129537,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685398802.142, "dur": 3.160, + "args": { + "External id": 129538,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685398802.962, "dur": 2.130, + "args": { + "External id": 129539,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 
2562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685398812.102, "dur": 170.349, + "args": { + "External id": 129540,"Record function id": 0, "Sequence number": 2575804, "Fwd thread id": 1, "Ev Idx": 2563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685398813.662, "dur": 162.519, + "args": { + "External id": 129541,"Sequence number": 2575804, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2564 + } + }, + { + "ph": "f", "id": 262, "pid": 5717, "tid": 6759, "ts": 6302685398813.662, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685398828.442, "dur": 34.589, + "args": { + "External id": 129542,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685398831.412, "dur": 6.280, + "args": { + "External id": 129543,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685398838.832, "dur": 23.650, + "args": { + "External id": 129544,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": 
[[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685398871.422, "dur": 7.569, + "args": { + "External id": 129545,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685398873.162, "dur": 5.380, + "args": { + "External id": 129546,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685398993.591, "dur": 198.440, + "args": { + "External id": 129547,"Record function id": 0, "Sequence number": 2575803, "Fwd thread id": 1, "Ev Idx": 2570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685398996.231, "dur": 187.700, + "args": { + "External id": 129548,"Sequence number": 2575803, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2571 + } + }, + { + "ph": "f", "id": 263, "pid": 5717, "tid": 6759, "ts": 6302685398996.231, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 
6302685399009.401, "dur": 34.100, + "args": { + "External id": 129549,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685399012.481, "dur": 6.220, + "args": { + "External id": 129550,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685399019.741, "dur": 22.780, + "args": { + "External id": 129551,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685399052.581, "dur": 10.030, + "args": { + "External id": 129552,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685399054.521, "dur": 7.640, + "args": { + "External id": 129553,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], 
[], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685399203.481, "dur": 387.999, + "args": { + "External id": 129554,"Record function id": 0, "Sequence number": 2575802, "Fwd thread id": 1, "Ev Idx": 2577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685399206.331, "dur": 374.469, + "args": { + "External id": 129555,"Sequence number": 2575802, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2578 + } + }, + { + "ph": "f", "id": 264, "pid": 5717, "tid": 6759, "ts": 6302685399206.331, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685399274.661, "dur": 67.020, + "args": { + "External id": 129556,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685399363.741, "dur": 35.099, + "args": { + "External id": 129557,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685399410.590, "dur": 23.270, + 
"args": { + "External id": 129558,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685399445.260, "dur": 18.330, + "args": { + "External id": 129559,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685399476.410, "dur": 25.000, + "args": { + "External id": 129560,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685399510.350, "dur": 14.990, + "args": { + "External id": 129561,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685399543.970, "dur": 20.190, + "args": { + "External id": 129562,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": 
"triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399603.950, "dur": 11.640, + "args": { + "External id": 129563,"Record function id": 0, "Ev Idx": 2586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399606.790, "dur": 7.480, + "args": { + "External id": 129564,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685399609.320, "dur": 4.270, + "args": { + "External id": 129565,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685399610.260, "dur": 3.120, + "args": { + "External id": 129566,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399619.970, "dur": 5.030, + "args": { + "External id": 129567,"Record function id": 0, "Ev Idx": 2590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399621.720, "dur": 2.270, + "args": { + "External id": 129568,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685399622.390, "dur": 1.110, + "args": { + "External id": 129569,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685399622.750, "dur": 0.600, + "args": { + "External id": 129570,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399628.570, "dur": 4.720, + "args": { + "External id": 129571,"Record function id": 0, "Ev Idx": 2594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399630.100, "dur": 2.230, + "args": { + "External id": 129572,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685399630.650, "dur": 1.280, + "args": { + "External id": 129573,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685399631.190, "dur": 0.590, + "args": { + "External id": 129574,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2597 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685399636.920, "dur": 215.469, + "args": { + "External id": 129575,"Record function id": 0, "Sequence number": 2575801, "Fwd thread id": 1, "Ev Idx": 2598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685399638.420, "dur": 169.760, + "args": { + "External id": 129576,"Sequence number": 2575801, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2599 + } + }, + { + "ph": "f", "id": 265, "pid": 5717, "tid": 6759, "ts": 6302685399638.420, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685399702.690, "dur": 22.659, + "args": { + "External id": 129577,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685399742.349, "dur": 13.491, + "args": { + "External id": 129578,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", 
"128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685399772.780, "dur": 16.849, + "args": { + "External id": 129579,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685399819.380, "dur": 25.129, + "args": { + "External id": 129580,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685399870.669, "dur": 12.490, + "args": { + "External id": 129581,"Record function id": 0, "Ev Idx": 2604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 
6302685399874.619, "dur": 7.040, + "args": { + "External id": 129582,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685399876.689, "dur": 4.260, + "args": { + "External id": 129583,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685399877.559, "dur": 3.190, + "args": { + "External id": 129584,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685399887.499, "dur": 758.179, + "args": { + "External id": 129585,"Record function id": 0, "Sequence number": 2575800, "Fwd thread id": 1, "Ev Idx": 2608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685399888.999, "dur": 746.399, + "args": { + "External id": 129586,"Sequence number": 2575800, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2609 + } + }, + { + "ph": "f", "id": 266, "pid": 5717, "tid": 6759, "ts": 6302685399888.999, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.5)", "pid": 5717, "tid": 6759, + "ts": 6302685399907.159, "dur": 23.290, + "args": { + "External id": 129587,"Record function id": 0, "Ev Idx": 2610 + } + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::post_backward_reshard (model.layers.5)", "pid": 5717, "tid": 6759, + "ts": 6302685399938.389, "dur": 58.030, + "args": { + "External id": 129588,"Record function id": 0, "Ev Idx": 2611 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.5)", "pid": 5717, "tid": 6759, + "ts": 6302685400002.689, "dur": 623.618, + "args": { + "External id": 129589,"Record function id": 0, "Ev Idx": 2612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685400068.379, "dur": 7.640, + "args": { + "External id": 129590,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400084.439, "dur": 3.380, + "args": { + "External id": 129591,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685400100.599, "dur": 88.109, + "args": { + "External id": 129592,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685400110.589, "dur": 74.770, + "args": { + "External id": 129593,"Record function 
id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685400125.409, "dur": 5.540, + "args": { + "External id": 129594,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685400134.349, "dur": 29.880, + "args": { + "External id": 129595,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685400135.679, "dur": 28.240, + "args": { + "External id": 129596,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400137.899, "dur": 6.010, + "args": { + "External id": 129597,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685400145.109, "dur": 18.280, + "args": { + "External id": 129598,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685400250.839, "dur": 7.920, + "args": { + "External id": 129599,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685400252.488, "dur": 5.891, + "args": { + "External id": 129600,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685400277.198, "dur": 90.150, + "args": { + "External id": 129601,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685400293.218, "dur": 70.990, + "args": { + "External id": 129602,"Record function id": 0, "Collective name": "_reduce_scatter_base", 
"Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2625, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685400311.738, "dur": 48.100, + "args": { + "External id": 129603,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685400380.468, "dur": 3.070, + "args": { + "External id": 129604,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2627, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400433.318, "dur": 3.640, + "args": { + "External id": 129605,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400464.488, "dur": 2.040, + "args": { + "External id": 129606,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400485.498, "dur": 1.050, + "args": { + "External id": 129607,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400500.628, "dur": 0.790, + "args": { + "External id": 129608,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400514.208, "dur": 0.770, + "args": { + "External id": 129609,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400527.558, "dur": 1.460, + "args": { + "External id": 129610,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], 
"Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400541.088, "dur": 0.780, + "args": { + "External id": 129611,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400554.518, "dur": 1.000, + "args": { + "External id": 129612,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400567.938, "dur": 0.820, + "args": { + "External id": 129613,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685400663.947, "dur": 1351.597, + "args": { + "External id": 129614,"Record function id": 0, "Ev Idx": 2637 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.4)", "pid": 5717, "tid": 6759, + "ts": 6302685400681.618, "dur": 838.578, + "args": { + "External id": 129615,"Record function id": 0, "Ev Idx": 2638 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out 
(model.layers.4)", "pid": 5717, "tid": 6759, + "ts": 6302685400693.467, "dur": 258.550, + "args": { + "External id": 129616,"Record function id": 0, "Ev Idx": 2639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400788.507, "dur": 7.010, + "args": { + "External id": 129617,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400800.697, "dur": 0.880, + "args": { + "External id": 129618,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400803.317, "dur": 0.700, + "args": { + "External id": 129619,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400805.577, "dur": 0.610, + "args": { + "External id": 129620,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400807.897, "dur": 0.580, + "args": { + "External id": 129621,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + 
"ts": 6302685400809.737, "dur": 0.620, + "args": { + "External id": 129622,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400813.177, "dur": 0.790, + "args": { + "External id": 129623,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400815.697, "dur": 0.580, + "args": { + "External id": 129624,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400817.677, "dur": 1.320, + "args": { + "External id": 129625,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685400820.277, "dur": 0.510, + "args": { + "External id": 129626,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685400834.337, "dur": 90.830, + "args": { + "External id": 129627,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685400850.417, "dur": 71.310, + "args": { + "External id": 129628,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685400862.537, "dur": 5.920, + "args": { + "External id": 129629,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685400871.237, "dur": 30.280, + "args": { + "External id": 129630,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, 
"tid": 6759, + "ts": 6302685400872.707, "dur": 28.510, + "args": { + "External id": 129631,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685400874.817, "dur": 6.340, + "args": { + "External id": 129632,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685400882.297, "dur": 18.450, + "args": { + "External id": 129633,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2656 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.3", "pid": 5717, "tid": 6759, + "ts": 6302685401036.727, "dur": 470.049, + "args": { + "External id": 129634,"Record function id": 0, "Ev Idx": 2657 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 5717, "tid": 6759, + "ts": 6302685401051.357, "dur": 442.059, + "args": { + "External id": 129635,"Record function id": 0, "Ev Idx": 2658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685401110.986, "dur": 7.411, + "args": { + "External id": 129636,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 2659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685401129.277, "dur": 18.279, + "args": { + "External id": 129637,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401132.046, "dur": 1.231, + "args": { + "External id": 129638,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401134.926, "dur": 0.340, + "args": { + "External id": 129639,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401136.266, "dur": 0.251, + "args": { + "External id": 129640,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401137.417, "dur": 1.100, + "args": { + "External id": 129641,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401139.526, "dur": 0.311, + "args": { + "External id": 129642,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401140.677, "dur": 0.229, + "args": { + "External id": 129643,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401141.786, "dur": 0.271, + "args": { + "External id": 129644,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401143.266, "dur": 0.280, + "args": { + "External id": 129645,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401144.437, "dur": 0.329, + "args": { + "External id": 129646,"Record function id": 0, 
"Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685401155.146, "dur": 21.820, + "args": { + "External id": 129647,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685401205.916, "dur": 84.610, + "args": { + "External id": 129648,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685401215.286, "dur": 6.130, + "args": { + "External id": 129649,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + 
"ts": 6302685401225.396, "dur": 8.110, + "args": { + "External id": 129650,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685401227.916, "dur": 5.230, + "args": { + "External id": 129651,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401230.836, "dur": 0.750, + "args": { + "External id": 129652,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685401240.346, "dur": 15.910, + "args": { + "External id": 129653,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401241.986, "dur": 1.120, + "args": { + "External id": 129654,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2677 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401244.386, "dur": 0.210, + "args": { + "External id": 129655,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401245.786, "dur": 0.310, + "args": { + "External id": 129656,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401247.156, "dur": 0.150, + "args": { + "External id": 129657,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401248.166, "dur": 0.350, + "args": { + "External id": 129658,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401249.536, "dur": 0.160, + "args": { + "External id": 129659,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": 
[[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401250.656, "dur": 0.170, + "args": { + "External id": 129660,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401252.076, "dur": 0.230, + "args": { + "External id": 129661,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685401253.276, "dur": 1.140, + "args": { + "External id": 129662,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685401266.096, "dur": 16.580, + "args": { + "External id": 129663,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685401349.686, "dur": 80.340, + "args": { + "External id": 129664,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685401366.416, "dur": 60.650, + "args": { + "External id": 129665,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2688, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685401377.116, "dur": 45.960, + "args": { + "External id": 129666,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685401442.486, "dur": 3.190, + "args": { + "External id": 129667,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": 
["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2690, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685401525.696, "dur": 474.939, + "args": { + "External id": 129668,"Sequence number": 2575799, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2691 + } + }, + { + "ph": "f", "id": 267, "pid": 5717, "tid": 6759, "ts": 6302685401525.696, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685401592.955, "dur": 32.270, + "args": { + "External id": 129669,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685401655.515, "dur": 23.990, + "args": { + "External id": 129670,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 
2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685401696.235, "dur": 35.830, + "args": { + "External id": 129671,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685401752.545, "dur": 36.450, + "args": { + "External id": 129672,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685401799.175, "dur": 20.260, + "args": { + "External id": 129673,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685401828.805, "dur": 22.580, + "args": { + "External id": 129674,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685401860.945, "dur": 
18.140, + "args": { + "External id": 129675,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685401901.665, "dur": 21.190, + "args": { + "External id": 129676,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685401938.115, "dur": 12.509, + "args": { + "External id": 129677,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685401964.564, "dur": 
16.011, + "args": { + "External id": 129678,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402030.614, "dur": 18.340, + "args": { + "External id": 129679,"Record function id": 0, "Ev Idx": 2702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402036.094, "dur": 10.390, + "args": { + "External id": 129680,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685402039.764, "dur": 5.960, + "args": { + "External id": 129681,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685402041.964, "dur": 3.520, + "args": { + "External id": 129682,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2705 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402055.884, "dur": 8.560, + "args": { + "External id": 129683,"Record function id": 0, "Ev Idx": 2706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402058.714, "dur": 3.650, + "args": { + "External id": 129684,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685402059.434, "dur": 2.380, + "args": { + "External id": 129685,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685402060.974, "dur": 0.630, + "args": { + "External id": 129686,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402069.704, "dur": 4.870, + "args": { + "External id": 129687,"Record function id": 0, "Ev Idx": 2710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402071.324, "dur": 2.170, + "args": { + "External id": 129688,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685402071.904, "dur": 1.180, + "args": { + "External id": 
129689,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685402072.334, "dur": 0.590, + "args": { + "External id": 129690,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402078.104, "dur": 4.770, + "args": { + "External id": 129691,"Record function id": 0, "Ev Idx": 2714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402079.704, "dur": 2.210, + "args": { + "External id": 129692,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685402080.254, "dur": 1.230, + "args": { + "External id": 129693,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685402080.764, "dur": 0.550, + "args": { + "External id": 129694,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402086.924, "dur": 387.369, + "args": { + "External id": 129695,"Record function id": 0, "Sequence 
number": 2575798, "Fwd thread id": 1, "Ev Idx": 2718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402088.334, "dur": 377.189, + "args": { + "External id": 129696,"Sequence number": 2575798, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2719 + } + }, + { + "ph": "f", "id": 268, "pid": 5717, "tid": 6759, "ts": 6302685402088.334, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685402151.114, "dur": 42.610, + "args": { + "External id": 129697,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685402207.524, "dur": 22.490, + "args": { + "External id": 129698,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685402274.384, "dur": 167.259, + "args": { + "External id": 129699,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input 
Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685402363.024, "dur": 10.699, + "args": { + "External id": 129700,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685402376.754, "dur": 3.880, + "args": { + "External id": 129701,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402491.603, "dur": 12.510, + "args": { + "External id": 129702,"Record function id": 0, "Ev Idx": 2725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685402494.743, "dur": 7.930, + "args": { + "External id": 129703,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", 
"pid": 5717, "tid": 6759, + "ts": 6302685402497.233, "dur": 4.080, + "args": { + "External id": 129704,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685402498.083, "dur": 2.950, + "args": { + "External id": 129705,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402508.493, "dur": 171.900, + "args": { + "External id": 129706,"Record function id": 0, "Sequence number": 2575797, "Fwd thread id": 1, "Ev Idx": 2729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402510.163, "dur": 163.710, + "args": { + "External id": 129707,"Sequence number": 2575797, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2730 + } + }, + { + "ph": "f", "id": 269, "pid": 5717, "tid": 6759, "ts": 6302685402510.163, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685402525.093, "dur": 36.030, + "args": { + "External id": 129708,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685402528.193, "dur": 6.410, + "args": { + "External id": 129709,"Record function id": 
0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685402535.773, "dur": 24.700, + "args": { + "External id": 129710,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685402569.713, "dur": 7.310, + "args": { + "External id": 129711,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685402571.733, "dur": 4.840, + "args": { + "External id": 129712,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402692.363, "dur": 153.290, + "args": { + "External id": 129713,"Record function id": 0, "Sequence number": 2575796, "Fwd thread id": 1, "Ev Idx": 2736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402694.873, "dur": 143.520, + "args": { + "External id": 129714,"Sequence number": 2575796, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2737 + } + }, + { + "ph": "f", "id": 270, "pid": 5717, "tid": 6759, "ts": 6302685402694.873, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685402708.163, "dur": 29.740, + "args": { + "External id": 129715,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685402711.113, "dur": 5.950, + "args": { + "External id": 129716,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685402718.153, "dur": 19.170, + "args": { + "External id": 129717,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685402745.933, "dur": 6.470, + "args": { + "External id": 129718,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], 
"Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685402747.613, "dur": 4.320, + "args": { + "External id": 129719,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402856.613, "dur": 311.419, + "args": { + "External id": 129720,"Record function id": 0, "Sequence number": 2575795, "Fwd thread id": 1, "Ev Idx": 2743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685402859.502, "dur": 297.910, + "args": { + "External id": 129721,"Sequence number": 2575795, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2744 + } + }, + { + "ph": "f", "id": 271, "pid": 5717, "tid": 6759, "ts": 6302685402859.502, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685402920.782, "dur": 39.760, + "args": { + "External id": 129722,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 
16384], [16384, 768], [768, 768]], "Ev Idx": 2745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685402972.982, "dur": 24.330, + "args": { + "External id": 129723,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685403006.942, "dur": 22.740, + "args": { + "External id": 129724,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685403039.742, "dur": 18.070, + "args": { + "External id": 129725,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685403065.872, "dur": 14.790, + "args": { + "External id": 129726,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685403088.812, "dur": 14.200, + "args": { + "External id": 129727,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685403121.802, "dur": 19.640, + "args": { + "External id": 129728,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403180.292, "dur": 10.650, + "args": { + "External id": 129729,"Record function id": 0, "Ev Idx": 2752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403183.002, "dur": 6.650, + "args": { + "External id": 129730,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685403185.432, "dur": 3.530, + "args": { + "External id": 129731,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685403186.332, "dur": 2.440, + "args": { + "External id": 129732,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403195.192, "dur": 5.030, + "args": { + "External id": 129733,"Record function id": 0, "Ev Idx": 2756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403196.812, "dur": 2.420, + "args": { + "External id": 129734,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685403197.552, "dur": 1.250, + "args": { + "External id": 129735,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685403198.042, "dur": 0.600, + "args": { + "External id": 129736,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403203.802, "dur": 4.540, + "args": { + "External id": 129737,"Record function id": 0, "Ev Idx": 2760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403205.262, "dur": 2.080, + "args": { + "External id": 129738,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2761 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685403205.802, "dur": 1.100, + "args": { + "External id": 129739,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685403206.232, "dur": 0.510, + "args": { + "External id": 129740,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685403212.002, "dur": 218.959, + "args": { + "External id": 129741,"Record function id": 0, "Sequence number": 2575794, "Fwd thread id": 1, "Ev Idx": 2764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685403213.462, "dur": 184.639, + "args": { + "External id": 129742,"Sequence number": 2575794, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2765 + } + }, + { + "ph": "f", "id": 272, "pid": 5717, "tid": 6759, "ts": 6302685403213.462, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685403275.452, "dur": 28.509, + "args": { + "External id": 129743,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685403324.361, "dur": 16.851, + "args": { + "External id": 129744,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685403361.341, "dur": 16.180, + "args": { + "External id": 129745,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685403408.661, "dur": 17.140, + "args": { + "External id": 129746,"Record function id": 0, "Concrete 
Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403442.791, "dur": 11.020, + "args": { + "External id": 129747,"Record function id": 0, "Ev Idx": 2770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685403445.841, "dur": 6.620, + "args": { + "External id": 129748,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685403447.961, "dur": 3.850, + "args": { + "External id": 129749,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685403448.741, "dur": 2.870, + "args": { + "External id": 129750,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685403457.901, "dur": 798.618, + "args": { + "External id": 129751,"Record function id": 0, "Sequence number": 2575793, "Fwd thread id": 1, "Ev Idx": 2774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685403459.791, "dur": 790.618, + "args": { + "External id": 129752,"Sequence number": 2575793, "Fwd thread id": 
1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2775 + } + }, + { + "ph": "f", "id": 273, "pid": 5717, "tid": 6759, "ts": 6302685403459.791, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.4)", "pid": 5717, "tid": 6759, + "ts": 6302685403477.021, "dur": 23.850, + "args": { + "External id": 129753,"Record function id": 0, "Ev Idx": 2776 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.4)", "pid": 5717, "tid": 6759, + "ts": 6302685403508.711, "dur": 63.170, + "args": { + "External id": 129754,"Record function id": 0, "Ev Idx": 2777 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.4)", "pid": 5717, "tid": 6759, + "ts": 6302685403583.921, "dur": 661.348, + "args": { + "External id": 129755,"Record function id": 0, "Ev Idx": 2778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685403677.601, "dur": 10.770, + "args": { + "External id": 129756,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685403701.171, "dur": 5.520, + "args": { + "External id": 129757,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685403720.351, "dur": 90.429, + "args": { + "External id": 129758,"Record 
function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685403730.051, "dur": 77.339, + "args": { + "External id": 129759,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685403744.811, "dur": 9.089, + "args": { + "External id": 129760,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685403757.640, "dur": 29.691, + "args": { + "External id": 129761,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685403758.971, "dur": 28.080, + "args": { + "External id": 
129762,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685403761.211, "dur": 5.960, + "args": { + "External id": 129763,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685403768.471, "dur": 18.100, + "args": { + "External id": 129764,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685403897.360, "dur": 8.710, + "args": { + "External id": 129765,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685403899.070, "dur": 6.530, + "args": { + "External id": 129766,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, 
"tid": 6759, + "ts": 6302685403922.800, "dur": 81.100, + "args": { + "External id": 129767,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685403938.620, "dur": 62.330, + "args": { + "External id": 129768,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2791, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685403950.000, "dur": 46.910, + "args": { + "External id": 129769,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685404016.320, "dur": 3.140, + "args": { + "External id": 129770,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", 
"Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2793, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404068.060, "dur": 3.530, + "args": { + "External id": 129771,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404098.570, "dur": 1.080, + "args": { + "External id": 129772,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404117.340, "dur": 0.890, + "args": { + "External id": 129773,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404131.030, "dur": 0.650, + "args": { + "External id": 129774,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5717, "tid": 6759, + "ts": 6302685404144.570, "dur": 0.660, + "args": { + "External id": 129775,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404157.250, "dur": 0.669, + "args": { + "External id": 129776,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404171.279, "dur": 0.800, + "args": { + "External id": 129777,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404184.250, "dur": 0.969, + "args": { + "External id": 129778,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404196.830, "dur": 0.769, + "args": { + "External id": 129779,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2802 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685404267.549, "dur": 1284.998, + "args": { + "External id": 129780,"Record function id": 0, "Ev Idx": 2803 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.3)", "pid": 5717, "tid": 6759, + "ts": 6302685404281.329, "dur": 793.168, + "args": { + "External id": 129781,"Record function id": 0, "Ev Idx": 2804 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 5717, "tid": 6759, + "ts": 6302685404292.489, "dur": 239.010, + "args": { + "External id": 129782,"Record function id": 0, "Ev Idx": 2805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404382.449, "dur": 3.550, + "args": { + "External id": 129783,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404390.189, "dur": 0.820, + "args": { + "External id": 129784,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404392.859, "dur": 1.550, + "args": { + "External id": 129785,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404396.309, "dur": 0.570, + "args": { + "External id": 129786,"Record function id": 0, "Concrete Inputs": 
["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404398.239, "dur": 0.590, + "args": { + "External id": 129787,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404400.239, "dur": 0.520, + "args": { + "External id": 129788,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404402.339, "dur": 0.850, + "args": { + "External id": 129789,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404404.609, "dur": 0.630, + "args": { + "External id": 129790,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404406.649, "dur": 0.530, + "args": { + "External id": 129791,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685404408.829, 
"dur": 0.560, + "args": { + "External id": 129792,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685404422.329, "dur": 83.450, + "args": { + "External id": 129793,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685404433.469, "dur": 68.920, + "args": { + "External id": 129794,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685404443.029, "dur": 6.550, + "args": { + "External id": 129795,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], 
"Input Dims": [[], [], [], [], [], []], "Ev Idx": 2818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685404451.949, "dur": 30.350, + "args": { + "External id": 129796,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685404453.529, "dur": 28.450, + "args": { + "External id": 129797,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404455.659, "dur": 6.280, + "args": { + "External id": 129798,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685404463.099, "dur": 18.430, + "args": { + "External id": 129799,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2822 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.2", "pid": 5717, "tid": 6759, + "ts": 6302685404613.569, "dur": 452.619, + "args": { + "External id": 129800,"Record function id": 0, "Ev Idx": 2823 + } + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 5717, "tid": 6759, + "ts": 6302685404628.958, "dur": 424.370, + "args": { + "External id": 129801,"Record function id": 0, "Ev Idx": 2824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685404687.488, "dur": 7.390, + "args": { + "External id": 129802,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685404705.968, "dur": 17.650, + "args": { + "External id": 129803,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404708.878, "dur": 1.200, + "args": { + "External id": 129804,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404711.808, "dur": 0.250, + "args": { + "External id": 129805,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 
6302685404713.148, "dur": 0.300, + "args": { + "External id": 129806,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404714.498, "dur": 0.240, + "args": { + "External id": 129807,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404715.598, "dur": 0.250, + "args": { + "External id": 129808,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404716.648, "dur": 0.920, + "args": { + "External id": 129809,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404718.468, "dur": 0.360, + "args": { + "External id": 129810,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2833 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404719.728, "dur": 0.290, + "args": { + "External id": 129811,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404720.848, "dur": 0.190, + "args": { + "External id": 129812,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685404731.678, "dur": 20.840, + "args": { + "External id": 129813,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685404781.118, "dur": 82.540, + "args": { + "External id": 129814,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], 
[393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685404790.318, "dur": 6.200, + "args": { + "External id": 129815,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685404800.518, "dur": 7.750, + "args": { + "External id": 129816,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685404802.848, "dur": 5.030, + "args": { + "External id": 129817,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404805.488, "dur": 0.750, + "args": { + "External id": 129818,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685404814.908, "dur": 15.140, + "args": { + "External id": 129819,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 
393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404816.498, "dur": 0.580, + "args": { + "External id": 129820,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404818.138, "dur": 0.320, + "args": { + "External id": 129821,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404819.498, "dur": 1.130, + "args": { + "External id": 129822,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404821.528, "dur": 0.270, + "args": { + "External id": 129823,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404822.668, "dur": 0.330, + "args": { + "External id": 
129824,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404823.938, "dur": 0.240, + "args": { + "External id": 129825,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404825.298, "dur": 0.250, + "args": { + "External id": 129826,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404826.468, "dur": 0.270, + "args": { + "External id": 129827,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685404827.648, "dur": 0.220, + "args": { + "External id": 129828,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 
6759, + "ts": 6302685404839.728, "dur": 16.140, + "args": { + "External id": 129829,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685404912.748, "dur": 78.130, + "args": { + "External id": 129830,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685404928.218, "dur": 59.910, + "args": { + "External id": 129831,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2854, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685404939.038, "dur": 45.030, + "args": { + 
"External id": 129832,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685405003.098, "dur": 3.170, + "args": { + "External id": 129833,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2856, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685405079.397, "dur": 462.059, + "args": { + "External id": 129834,"Sequence number": 2575792, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2857 + } + }, + { + "ph": "f", "id": 274, "pid": 5717, "tid": 6759, "ts": 6302685405079.397, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405141.807, "dur": 31.320, + "args": { + "External id": 129835,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 
6302685405203.067, "dur": 23.120, + "args": { + "External id": 129836,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685405242.127, "dur": 33.440, + "args": { + "External id": 129837,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405287.997, "dur": 35.480, + "args": { + "External id": 129838,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405335.887, "dur": 20.420, + "args": { + "External id": 129839,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], 
[2048, 768], [16384, 768]], "Ev Idx": 2862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405366.187, "dur": 22.630, + "args": { + "External id": 129840,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405398.967, "dur": 18.320, + "args": { + "External id": 129841,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685405440.017, "dur": 20.430, + "args": { + "External id": 129842,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685405476.547, "dur": 13.240, + "args": { + "External id": 129843,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": 
"grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685405503.916, "dur": 16.240, + "args": { + "External id": 129844,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405565.606, "dur": 11.220, + "args": { + "External id": 129845,"Record function id": 0, "Ev Idx": 2868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405568.766, "dur": 6.750, + "args": { + "External id": 129846,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, 
"tid": 6759, + "ts": 6302685405571.186, "dur": 3.490, + "args": { + "External id": 129847,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685405572.026, "dur": 2.420, + "args": { + "External id": 129848,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405581.266, "dur": 5.020, + "args": { + "External id": 129849,"Record function id": 0, "Ev Idx": 2872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405582.986, "dur": 2.260, + "args": { + "External id": 129850,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685405583.686, "dur": 1.110, + "args": { + "External id": 129851,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685405584.056, "dur": 0.580, + "args": { + "External id": 129852,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405590.236, "dur": 4.740, + 
"args": { + "External id": 129853,"Record function id": 0, "Ev Idx": 2876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405591.776, "dur": 2.240, + "args": { + "External id": 129854,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685405592.366, "dur": 1.270, + "args": { + "External id": 129855,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685405592.926, "dur": 0.550, + "args": { + "External id": 129856,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405598.456, "dur": 4.430, + "args": { + "External id": 129857,"Record function id": 0, "Ev Idx": 2880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405599.996, "dur": 1.970, + "args": { + "External id": 129858,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685405600.556, "dur": 1.020, + "args": { + "External id": 129859,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 
2882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685405600.896, "dur": 0.520, + "args": { + "External id": 129860,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685405606.596, "dur": 305.010, + "args": { + "External id": 129861,"Record function id": 0, "Sequence number": 2575791, "Fwd thread id": 1, "Ev Idx": 2884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685405608.066, "dur": 295.060, + "args": { + "External id": 129862,"Sequence number": 2575791, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2885 + } + }, + { + "ph": "f", "id": 275, "pid": 5717, "tid": 6759, "ts": 6302685405608.066, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405667.296, "dur": 37.600, + "args": { + "External id": 129863,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685405717.186, "dur": 18.460, + "args": { + "External id": 129864,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2887 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685405759.526, "dur": 119.920, + "args": { + "External id": 129865,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685405808.966, "dur": 6.320, + "args": { + "External id": 129866,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685405816.876, "dur": 3.840, + "args": { + "External id": 129867,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405924.506, "dur": 10.869, + "args": { + "External id": 129868,"Record function id": 0, "Ev Idx": 2891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685405927.755, "dur": 6.411, + "args": { + "External id": 129869,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685405930.135, "dur": 3.171, + "args": { + "External id": 129870,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685405931.095, "dur": 2.020, + "args": { + "External id": 129871,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685405939.495, "dur": 169.210, + "args": { + "External id": 129872,"Record function id": 0, "Sequence number": 2575790, "Fwd thread id": 1, "Ev Idx": 2895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685405941.135, "dur": 161.150, + "args": { + "External id": 129873,"Sequence number": 2575790, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2896 + } + }, + { + "ph": "f", "id": 276, "pid": 5717, "tid": 6759, "ts": 6302685405941.135, + "cat": 
"fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685405955.046, "dur": 35.009, + "args": { + "External id": 129874,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685405958.086, "dur": 6.249, + "args": { + "External id": 129875,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685405965.355, "dur": 24.091, + "args": { + "External id": 129876,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685405998.655, "dur": 6.910, + "args": { + "External id": 129877,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685406000.406, "dur": 4.749, + "args": { + "External id": 129878,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", 
"15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406119.715, "dur": 157.160, + "args": { + "External id": 129879,"Record function id": 0, "Sequence number": 2575789, "Fwd thread id": 1, "Ev Idx": 2902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406122.355, "dur": 145.940, + "args": { + "External id": 129880,"Sequence number": 2575789, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2903 + } + }, + { + "ph": "f", "id": 277, "pid": 5717, "tid": 6759, "ts": 6302685406122.355, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685406135.815, "dur": 29.930, + "args": { + "External id": 129881,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685406138.905, "dur": 6.300, + "args": { + "External id": 129882,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685406146.225, 
"dur": 18.940, + "args": { + "External id": 129883,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685406173.735, "dur": 6.720, + "args": { + "External id": 129884,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685406175.655, "dur": 4.390, + "args": { + "External id": 129885,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406288.355, "dur": 378.589, + "args": { + "External id": 129886,"Record function id": 0, "Sequence number": 2575788, "Fwd thread id": 1, "Ev Idx": 2909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406292.655, "dur": 363.449, + "args": { + "External id": 129887,"Sequence number": 2575788, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 
2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2910 + } + }, + { + "ph": "f", "id": 278, "pid": 5717, "tid": 6759, "ts": 6302685406292.655, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685406394.314, "dur": 61.010, + "args": { + "External id": 129888,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685406467.894, "dur": 25.060, + "args": { + "External id": 129889,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685406503.174, "dur": 22.850, + "args": { + "External id": 129890,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685406536.884, "dur": 18.020, + "args": { + "External id": 129891,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685406563.154, "dur": 15.200, + "args": { + 
"External id": 129892,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685406585.724, "dur": 14.960, + "args": { + "External id": 129893,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685406620.464, "dur": 19.220, + "args": { + "External id": 129894,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406679.524, "dur": 10.920, + "args": { + "External id": 129895,"Record function id": 0, "Ev Idx": 2918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406682.284, "dur": 6.830, + "args": { + "External id": 129896,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2919 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685406684.744, "dur": 3.700, + "args": { + "External id": 129897,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685406685.704, "dur": 2.550, + "args": { + "External id": 129898,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406694.824, "dur": 5.310, + "args": { + "External id": 129899,"Record function id": 0, "Ev Idx": 2922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406696.514, "dur": 2.600, + "args": { + "External id": 129900,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685406697.494, "dur": 1.250, + "args": { + "External id": 129901,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685406697.874, "dur": 0.700, + "args": { + "External id": 129902,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406703.734, "dur": 4.770, + "args": { + "External id": 129903,"Record function id": 0, "Ev Idx": 2926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406705.324, "dur": 2.280, + "args": { + "External id": 129904,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685406705.924, "dur": 1.270, + "args": { + "External id": 129905,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685406706.394, "dur": 0.640, + "args": { + "External id": 129906,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406712.134, "dur": 198.159, + "args": { + "External id": 129907,"Record function id": 0, "Sequence number": 2575787, "Fwd thread id": 1, "Ev Idx": 2930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406714.024, "dur": 166.589, + "args": { + "External id": 129908,"Sequence number": 2575787, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2931 + } + }, + { + "ph": "f", "id": 279, "pid": 5717, "tid": 6759, "ts": 6302685406714.024, + "cat": "fwdbwd", "name": "fwdbwd", 
"bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685406777.384, "dur": 21.900, + "args": { + "External id": 129909,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685406816.213, "dur": 12.460, + "args": { + "External id": 129910,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685406844.913, "dur": 15.820, + "args": { + "External id": 129911,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": 
"/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685406888.803, "dur": 16.580, + "args": { + "External id": 129912,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406921.673, "dur": 10.000, + "args": { + "External id": 129913,"Record function id": 0, "Ev Idx": 2936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685406924.543, "dur": 5.860, + "args": { + "External id": 129914,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685406926.673, "dur": 3.100, + "args": { + "External id": 129915,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685406927.483, "dur": 2.100, + "args": { + "External id": 129916,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406935.813, "dur": 717.479, + "args": { + "External id": 129917,"Record function id": 0, "Sequence number": 2575786, "Fwd thread id": 1, "Ev Idx": 2940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685406937.403, "dur": 710.129, + "args": { + "External id": 129918,"Sequence number": 2575786, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2941 + } + }, + { + "ph": "f", "id": 280, "pid": 5717, "tid": 6759, "ts": 6302685406937.403, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.3)", "pid": 5717, "tid": 6759, + "ts": 6302685406954.523, "dur": 23.880, + "args": { + "External id": 129919,"Record function id": 0, "Ev Idx": 2942 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.3)", "pid": 5717, "tid": 6759, + "ts": 6302685406986.463, "dur": 53.960, + "args": { + "External id": 129920,"Record function id": 0, "Ev Idx": 2943 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.3)", "pid": 5717, "tid": 6759, + "ts": 6302685407046.663, "dur": 595.769, + "args": { + "External id": 129921,"Record function id": 0, "Ev Idx": 2944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685407112.163, "dur": 7.570, + "args": { + "External id": 129922,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", 
"", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407127.573, "dur": 3.250, + "args": { + "External id": 129923,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685407142.433, "dur": 84.870, + "args": { + "External id": 129924,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685407151.783, "dur": 72.050, + "args": { + "External id": 129925,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685407165.613, "dur": 5.910, + "args": { + "External id": 129926,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], 
[], [], []], "Ev Idx": 2949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685407175.573, "dur": 29.150, + "args": { + "External id": 129927,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685407176.803, "dur": 27.600, + "args": { + "External id": 129928,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407178.913, "dur": 6.120, + "args": { + "External id": 129929,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685407186.153, "dur": 17.770, + "args": { + "External id": 129930,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685407287.632, "dur": 8.131, + "args": { + "External id": 129931,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", 
"", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685407289.203, "dur": 6.069, + "args": { + "External id": 129932,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685407320.043, "dur": 79.839, + "args": { + "External id": 129933,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685407334.712, "dur": 62.280, + "args": { + "External id": 129934,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2957, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685407346.122, "dur": 46.890, 
+ "args": { + "External id": 129935,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685407411.992, "dur": 3.310, + "args": { + "External id": 129936,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2959, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407465.072, "dur": 3.620, + "args": { + "External id": 129937,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407496.092, "dur": 1.230, + "args": { + "External id": 129938,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407514.232, "dur": 1.050, + "args": { + "External id": 129939,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input 
type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407529.722, "dur": 0.830, + "args": { + "External id": 129940,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407543.742, "dur": 0.780, + "args": { + "External id": 129941,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407556.642, "dur": 0.840, + "args": { + "External id": 129942,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407569.342, "dur": 0.750, + "args": { + "External id": 129943,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407582.362, "dur": 1.030, + "args": { + "External id": 129944,"Record function id": 0, "Concrete 
Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407595.112, "dur": 0.830, + "args": { + "External id": 129945,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685407664.152, "dur": 1356.736, + "args": { + "External id": 129946,"Record function id": 0, "Ev Idx": 2969 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.2)", "pid": 5717, "tid": 6759, + "ts": 6302685407677.771, "dur": 848.359, + "args": { + "External id": 129947,"Record function id": 0, "Ev Idx": 2970 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 5717, "tid": 6759, + "ts": 6302685407688.611, "dur": 228.260, + "args": { + "External id": 129948,"Record function id": 0, "Ev Idx": 2971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407770.081, "dur": 3.480, + "args": { + "External id": 129949,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407777.341, "dur": 0.890, + "args": { + "External id": 129950,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407779.981, "dur": 0.840, + "args": { + "External id": 129951,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407782.261, "dur": 0.730, + "args": { + "External id": 129952,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407784.311, "dur": 0.660, + "args": { + "External id": 129953,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407786.261, "dur": 0.720, + "args": { + "External id": 129954,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407788.491, "dur": 0.980, + "args": { + "External id": 129955,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407790.651, "dur": 0.680, + "args": { + "External id": 
129956,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407792.701, "dur": 0.700, + "args": { + "External id": 129957,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685407794.911, "dur": 0.750, + "args": { + "External id": 129958,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685407808.041, "dur": 83.430, + "args": { + "External id": 129959,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685407818.681, "dur": 69.520, + "args": { + "External id": 129960,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": 
[[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685407828.321, "dur": 6.160, + "args": { + "External id": 129961,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685407836.951, "dur": 30.310, + "args": { + "External id": 129962,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685407838.331, "dur": 28.660, + "args": { + "External id": 129963,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685407840.401, "dur": 6.720, + "args": { + "External id": 129964,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 2987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685407848.291, "dur": 18.200, + "args": { + "External id": 129965,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2988 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.1", "pid": 5717, "tid": 6759, + "ts": 6302685407998.891, "dur": 519.209, + "args": { + "External id": 129966,"Record function id": 0, "Ev Idx": 2989 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 5717, "tid": 6759, + "ts": 6302685408013.531, "dur": 491.479, + "args": { + "External id": 129967,"Record function id": 0, "Ev Idx": 2990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685408087.741, "dur": 8.550, + "args": { + "External id": 129968,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685408107.761, "dur": 18.860, + "args": { + "External id": 129969,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408110.531, "dur": 1.400, + "args": { + "External id": 129970,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408113.661, "dur": 0.269, + "args": { + "External id": 129971,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408114.890, "dur": 0.331, + "args": { + "External id": 129972,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408116.150, "dur": 0.791, + "args": { + "External id": 129973,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408117.810, "dur": 0.260, + "args": { + "External id": 129974,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408118.981, "dur": 0.260, + "args": { + "External id": 129975,"Record function id": 0, 
"Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408120.181, "dur": 0.260, + "args": { + "External id": 129976,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408121.221, "dur": 0.269, + "args": { + "External id": 129977,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408122.350, "dur": 0.240, + "args": { + "External id": 129978,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685408139.350, "dur": 32.960, + "args": { + "External id": 129979,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], 
[147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685408212.310, "dur": 83.660, + "args": { + "External id": 129980,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685408221.870, "dur": 6.590, + "args": { + "External id": 129981,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 6302685408232.360, "dur": 7.910, + "args": { + "External id": 129982,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685408234.830, "dur": 5.090, + "args": { + "External id": 129983,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3006 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408237.410, "dur": 0.880, + "args": { + "External id": 129984,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685408247.450, "dur": 14.480, + "args": { + "External id": 129985,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408249.150, "dur": 0.330, + "args": { + "External id": 129986,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408250.570, "dur": 0.250, + "args": { + "External id": 129987,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408251.710, "dur": 0.270, + "args": { + "External id": 129988,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408253.060, "dur": 0.260, + "args": { + "External id": 129989,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408254.230, "dur": 0.270, + "args": { + "External id": 129990,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408255.360, "dur": 0.290, + "args": { + "External id": 129991,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408256.560, "dur": 0.540, + "args": { + "External id": 129992,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408257.990, "dur": 0.190, + "args": { + "External id": 129993,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685408259.020, "dur": 0.290, + "args": { + "External id": 129994,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685408271.830, "dur": 16.390, + "args": { + "External id": 129995,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685408359.550, "dur": 80.960, + "args": { + "External id": 129996,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685408376.200, "dur": 61.350, + "args": { + "External id": 129997,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], 
[]], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3020, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685408387.290, "dur": 46.140, + "args": { + "External id": 129998,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685408452.810, "dur": 3.130, + "args": { + "External id": 129999,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3022, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685408531.240, "dur": 474.528, + "args": { + "External id": 130000,"Sequence number": 2575785, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3023 + } + }, + { + 
"ph": "f", "id": 281, "pid": 5717, "tid": 6759, "ts": 6302685408531.240, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685408593.349, "dur": 31.980, + "args": { + "External id": 130001,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685408653.949, "dur": 22.280, + "args": { + "External id": 130002,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685408691.759, "dur": 33.290, + "args": { + "External id": 130003,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685408737.189, "dur": 
25.200, + "args": { + "External id": 130004,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685408772.079, "dur": 19.800, + "args": { + "External id": 130005,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685408801.429, "dur": 22.540, + "args": { + "External id": 130006,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685408834.319, "dur": 18.090, + "args": { + "External id": 130007,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685408876.529, "dur": 22.450, + "args": { + "External id": 130008,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": 
"/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685408922.019, "dur": 16.750, + "args": { + "External id": 130009,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685408956.699, "dur": 21.390, + "args": { + "External id": 130010,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 
2048, 1], [], []], "Ev Idx": 3033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409037.799, "dur": 15.080, + "args": { + "External id": 130011,"Record function id": 0, "Ev Idx": 3034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409042.028, "dur": 8.200, + "args": { + "External id": 130012,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685409044.559, "dur": 4.840, + "args": { + "External id": 130013,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685409045.439, "dur": 3.720, + "args": { + "External id": 130014,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409058.628, "dur": 6.911, + "args": { + "External id": 130015,"Record function id": 0, "Ev Idx": 3038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409060.699, "dur": 3.829, + "args": { + "External id": 130016,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685409061.468, "dur": 
2.591, + "args": { + "External id": 130017,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685409063.028, "dur": 0.840, + "args": { + "External id": 130018,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409069.499, "dur": 6.869, + "args": { + "External id": 130019,"Record function id": 0, "Ev Idx": 3042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409071.059, "dur": 2.369, + "args": { + "External id": 130020,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685409071.719, "dur": 1.249, + "args": { + "External id": 130021,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685409072.159, "dur": 0.629, + "args": { + "External id": 130022,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409080.728, "dur": 6.720, + "args": { + "External id": 
130023,"Record function id": 0, "Ev Idx": 3046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409082.459, "dur": 3.980, + "args": { + "External id": 130024,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685409084.468, "dur": 1.380, + "args": { + "External id": 130025,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685409085.048, "dur": 0.651, + "args": { + "External id": 130026,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409092.639, "dur": 329.639, + "args": { + "External id": 130027,"Record function id": 0, "Sequence number": 2575784, "Fwd thread id": 1, "Ev Idx": 3050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409094.099, "dur": 319.959, + "args": { + "External id": 130028,"Sequence number": 2575784, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3051 + } + }, + { + "ph": "f", "id": 282, "pid": 5717, "tid": 6759, "ts": 6302685409094.099, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685409155.168, 
"dur": 39.030, + "args": { + "External id": 130029,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685409207.188, "dur": 29.330, + "args": { + "External id": 130030,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685409260.768, "dur": 129.010, + "args": { + "External id": 130031,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685409316.938, "dur": 6.660, + "args": { + "External id": 130032,"Record function id": 0, "Concrete Inputs": ["[8, 
12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685409325.358, "dur": 3.560, + "args": { + "External id": 130033,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409435.318, "dur": 10.849, + "args": { + "External id": 130034,"Record function id": 0, "Ev Idx": 3057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685409438.567, "dur": 6.340, + "args": { + "External id": 130035,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685409440.938, "dur": 3.149, + "args": { + "External id": 130036,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685409441.727, "dur": 2.171, + "args": { + "External id": 130037,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 
5717, "tid": 6759, + "ts": 6302685409450.267, "dur": 168.300, + "args": { + "External id": 130038,"Record function id": 0, "Sequence number": 2575783, "Fwd thread id": 1, "Ev Idx": 3061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409451.847, "dur": 160.380, + "args": { + "External id": 130039,"Sequence number": 2575783, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3062 + } + }, + { + "ph": "f", "id": 283, "pid": 5717, "tid": 6759, "ts": 6302685409451.847, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685409466.678, "dur": 35.069, + "args": { + "External id": 130040,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685409469.858, "dur": 6.380, + "args": { + "External id": 130041,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685409477.447, "dur": 23.680, + "args": { + "External id": 130042,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3065 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685409510.298, "dur": 6.689, + "args": { + "External id": 130043,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685409512.167, "dur": 4.400, + "args": { + "External id": 130044,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409629.867, "dur": 160.060, + "args": { + "External id": 130045,"Record function id": 0, "Sequence number": 2575782, "Fwd thread id": 1, "Ev Idx": 3068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409632.437, "dur": 150.470, + "args": { + "External id": 130046,"Sequence number": 2575782, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3069 + } + }, + { + "ph": "f", "id": 284, "pid": 5717, "tid": 6759, "ts": 6302685409632.437, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685409645.677, "dur": 38.140, + "args": { + "External id": 130047,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685409648.877, "dur": 6.160, + "args": { + "External id": 130048,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685409663.447, "dur": 19.790, + "args": { + "External id": 130049,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685409691.777, "dur": 6.520, + "args": { + "External id": 130050,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685409693.607, "dur": 4.280, + "args": { + "External id": 130051,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409800.837, "dur": 307.579, + "args": { + "External id": 130052,"Record function id": 0, "Sequence number": 2575781, "Fwd thread id": 1, "Ev Idx": 3075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685409803.647, "dur": 293.209, + "args": { + "External id": 130053,"Sequence number": 2575781, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3076 + } + }, + { + "ph": "f", "id": 285, "pid": 5717, "tid": 6759, "ts": 6302685409803.647, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685409864.647, "dur": 38.970, + "args": { + "External id": 130054,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685409915.686, "dur": 23.631, + "args": { + "External id": 130055,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685409948.786, "dur": 22.120, + "args": { + "External id": 130056,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685409981.616, "dur": 18.040, + "args": { + "External id": 130057,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685410007.336, "dur": 14.600, + "args": { + "External id": 130058,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685410029.536, "dur": 14.630, + "args": { + "External id": 130059,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685410062.996, "dur": 18.850, + "args": { + "External id": 130060,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": 
[[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410120.796, "dur": 11.740, + "args": { + "External id": 130061,"Record function id": 0, "Ev Idx": 3084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410123.576, "dur": 7.260, + "args": { + "External id": 130062,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685410125.936, "dur": 3.880, + "args": { + "External id": 130063,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685410126.886, "dur": 2.640, + "args": { + "External id": 130064,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410136.776, "dur": 5.200, + "args": { + "External id": 130065,"Record function id": 0, "Ev Idx": 3088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410138.436, "dur": 2.590, + "args": { + "External id": 130066,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3089 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685410139.146, "dur": 1.390, + "args": { + "External id": 130067,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685410139.596, "dur": 0.790, + "args": { + "External id": 130068,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410145.576, "dur": 4.680, + "args": { + "External id": 130069,"Record function id": 0, "Ev Idx": 3092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410147.096, "dur": 2.220, + "args": { + "External id": 130070,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685410147.646, "dur": 1.230, + "args": { + "External id": 130071,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685410148.046, "dur": 0.680, + "args": { + "External id": 130072,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685410153.906, "dur": 200.679, + "args": { + "External id": 130073,"Record function id": 0, "Sequence number": 2575780, "Fwd thread id": 1, "Ev Idx": 3096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685410155.386, "dur": 169.130, + "args": { + "External id": 130074,"Sequence number": 2575780, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3097 + } + }, + { + "ph": "f", "id": 286, "pid": 5717, "tid": 6759, "ts": 6302685410155.386, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685410217.526, "dur": 20.490, + "args": { + "External id": 130075,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685410254.576, "dur": 12.410, + "args": { + "External id": 130076,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": 
"/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685410283.056, "dur": 22.660, + "args": { + "External id": 130077,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685410332.745, "dur": 17.291, + "args": { + "External id": 130078,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410366.256, "dur": 9.860, + "args": { + "External id": 130079,"Record function id": 0, "Ev Idx": 3102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685410368.936, "dur": 5.769, + 
"args": { + "External id": 130080,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685410370.985, "dur": 3.120, + "args": { + "External id": 130081,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685410371.696, "dur": 2.220, + "args": { + "External id": 130082,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685410380.325, "dur": 769.899, + "args": { + "External id": 130083,"Record function id": 0, "Sequence number": 2575779, "Fwd thread id": 1, "Ev Idx": 3106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685410381.876, "dur": 762.548, + "args": { + "External id": 130084,"Sequence number": 2575779, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3107 + } + }, + { + "ph": "f", "id": 287, "pid": 5717, "tid": 6759, "ts": 6302685410381.876, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.2)", "pid": 5717, "tid": 6759, + "ts": 6302685410399.676, "dur": 23.209, + "args": { + "External id": 130085,"Record function id": 0, "Ev Idx": 3108 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::post_backward_reshard (model.layers.2)", "pid": 5717, "tid": 6759, + "ts": 6302685410430.725, "dur": 54.390, + "args": { + "External id": 130086,"Record function id": 0, "Ev Idx": 3109 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.2)", "pid": 5717, "tid": 6759, + "ts": 6302685410491.705, "dur": 647.629, + "args": { + "External id": 130087,"Record function id": 0, "Ev Idx": 3110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685410556.215, "dur": 7.390, + "args": { + "External id": 130088,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685410571.395, "dur": 3.390, + "args": { + "External id": 130089,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685410586.525, "dur": 85.860, + "args": { + "External id": 130090,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685410595.935, "dur": 73.110, + "args": { + "External id": 130091,"Record function id": 0, "Concrete Inputs": 
["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685410611.235, "dur": 5.540, + "args": { + "External id": 130092,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685410620.075, "dur": 29.480, + "args": { + "External id": 130093,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685410621.325, "dur": 27.910, + "args": { + "External id": 130094,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685410623.465, "dur": 6.190, + "args": { + "External id": 130095,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], 
[], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685410630.735, "dur": 18.010, + "args": { + "External id": 130096,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685410733.255, "dur": 7.880, + "args": { + "External id": 130097,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685410734.805, "dur": 5.870, + "args": { + "External id": 130098,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685410757.035, "dur": 78.920, + "args": { + "External id": 130099,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685410771.995, "dur": 61.069, + "args": { + "External id": 130100,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": 
"default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3123, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685410782.964, "dur": 45.940, + "args": { + "External id": 130101,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685410848.155, "dur": 3.180, + "args": { + "External id": 130102,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3125, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685410902.864, "dur": 4.020, + "args": { + "External id": 130103,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685410947.604, "dur": 2.920, + "args": { + "External id": 130104,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685410977.104, "dur": 2.360, + "args": { + "External id": 130105,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685410995.884, "dur": 0.900, + "args": { + "External id": 130106,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411009.624, "dur": 0.780, + "args": { + "External id": 130107,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411024.844, "dur": 1.020, + "args": { + "External id": 130108,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411040.724, "dur": 0.870, + "args": { + "External id": 130109,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411056.504, "dur": 0.940, + "args": { + "External id": 130110,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411076.444, "dur": 2.120, + "args": { + "External id": 130111,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685411161.414, "dur": 1440.006, + "args": { + "External id": 130112,"Record function id": 0, "Ev Idx": 3135 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.1)", "pid": 5717, "tid": 6759, + "ts": 6302685411175.604, "dur": 837.928, + "args": { + "External id": 130113,"Record function id": 0, "Ev Idx": 3136 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 5717, 
"tid": 6759, + "ts": 6302685411186.554, "dur": 241.549, + "args": { + "External id": 130114,"Record function id": 0, "Ev Idx": 3137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411271.143, "dur": 3.580, + "args": { + "External id": 130115,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411278.483, "dur": 1.100, + "args": { + "External id": 130116,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411281.414, "dur": 1.040, + "args": { + "External id": 130117,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411284.174, "dur": 0.749, + "args": { + "External id": 130118,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411286.343, "dur": 0.691, + "args": { + "External id": 130119,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411288.274, "dur": 
0.780, + "args": { + "External id": 130120,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411290.663, "dur": 1.031, + "args": { + "External id": 130121,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411292.943, "dur": 0.740, + "args": { + "External id": 130122,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411294.983, "dur": 0.800, + "args": { + "External id": 130123,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685411304.453, "dur": 0.860, + "args": { + "External id": 130124,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685411318.273, "dur": 84.010, + "args": { + "External id": 130125,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", 
"TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685411329.253, "dur": 69.800, + "args": { + "External id": 130126,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685411339.533, "dur": 6.290, + "args": { + "External id": 130127,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685411348.053, "dur": 30.890, + "args": { + "External id": 130128,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 
6302685411349.463, "dur": 29.160, + "args": { + "External id": 130129,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411351.703, "dur": 6.490, + "args": { + "External id": 130130,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685411359.343, "dur": 18.800, + "args": { + "External id": 130131,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3154 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.0", "pid": 5717, "tid": 6759, + "ts": 6302685411512.743, "dur": 492.749, + "args": { + "External id": 130132,"Record function id": 0, "Ev Idx": 3155 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 5717, "tid": 6759, + "ts": 6302685411526.823, "dur": 463.019, + "args": { + "External id": 130133,"Record function id": 0, "Ev Idx": 3156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685411594.483, "dur": 12.940, + "args": { + "External id": 130134,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], 
[], [], []], "Ev Idx": 3157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685411621.753, "dur": 19.340, + "args": { + "External id": 130135,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411624.693, "dur": 1.360, + "args": { + "External id": 130136,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411627.903, "dur": 0.390, + "args": { + "External id": 130137,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411629.353, "dur": 0.330, + "args": { + "External id": 130138,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411630.763, "dur": 0.310, + "args": { + "External id": 130139,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411632.013, "dur": 0.360, + "args": { + "External id": 130140,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411633.383, "dur": 0.330, + "args": { + "External id": 130141,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411634.663, "dur": 0.420, + "args": { + "External id": 130142,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411636.043, "dur": 0.310, + "args": { + "External id": 130143,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411637.413, "dur": 0.310, + "args": { + "External id": 130144,"Record function id": 0, "Concrete Inputs": 
["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685411648.793, "dur": 21.820, + "args": { + "External id": 130145,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 6759, + "ts": 6302685411699.502, "dur": 84.390, + "args": { + "External id": 130146,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685411709.482, "dur": 6.660, + "args": { + "External id": 130147,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 6759, + "ts": 
6302685411720.342, "dur": 7.420, + "args": { + "External id": 130148,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 6759, + "ts": 6302685411722.653, "dur": 4.740, + "args": { + "External id": 130149,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411725.102, "dur": 0.751, + "args": { + "External id": 130150,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 6759, + "ts": 6302685411734.733, "dur": 15.039, + "args": { + "External id": 130151,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411736.502, "dur": 0.451, + "args": { + "External id": 130152,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3175 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411738.093, "dur": 0.369, + "args": { + "External id": 130153,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411739.422, "dur": 0.191, + "args": { + "External id": 130154,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411740.522, "dur": 0.351, + "args": { + "External id": 130155,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411741.802, "dur": 0.340, + "args": { + "External id": 130156,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411743.193, "dur": 0.300, + "args": { + "External id": 130157,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411744.452, "dur": 0.280, + "args": { + "External id": 130158,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411745.672, "dur": 0.270, + "args": { + "External id": 130159,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685411746.802, "dur": 0.270, + "args": { + "External id": 130160,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 6759, + "ts": 6302685411759.342, "dur": 16.710, + "args": { + "External id": 130161,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"c10d::_allgather_base_", "pid": 5717, "tid": 6759, + "ts": 6302685411839.032, "dur": 85.730, + "args": { + "External id": 130162,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685411854.072, "dur": 67.820, + "args": { + "External id": 130163,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3186, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 6759, + "ts": 6302685411864.592, "dur": 52.970, + "args": { + "External id": 130164,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685411936.722, "dur": 3.310, + "args": { + "External id": 130165,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": 
["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3188, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685412018.872, "dur": 571.028, + "args": { + "External id": 130166,"Sequence number": 2575778, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3189 + } + }, + { + "ph": "f", "id": 288, "pid": 5717, "tid": 6759, "ts": 6302685412018.872, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412091.672, "dur": 32.169, + "args": { + "External id": 130167,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685412158.061, "dur": 26.591, + "args": { + "External id": 130168,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 
2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685412205.111, "dur": 43.890, + "args": { + "External id": 130169,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412265.211, "dur": 42.410, + "args": { + "External id": 130170,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412321.221, "dur": 30.000, + "args": { + "External id": 130171,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412368.561, "dur": 40.070, + "args": { + "External id": 130172,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412429.001, "dur": 
26.370, + "args": { + "External id": 130173,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685412480.971, "dur": 21.700, + "args": { + "External id": 130174,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685412520.671, "dur": 13.609, + "args": { + "External id": 130175,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685412549.851, "dur": 
17.469, + "args": { + "External id": 130176,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412615.611, "dur": 12.099, + "args": { + "External id": 130177,"Record function id": 0, "Ev Idx": 3200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412618.740, "dur": 7.511, + "args": { + "External id": 130178,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685412621.451, "dur": 4.000, + "args": { + "External id": 130179,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685412622.511, "dur": 2.700, + "args": { + "External id": 130180,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3203 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412632.410, "dur": 5.390, + "args": { + "External id": 130181,"Record function id": 0, "Ev Idx": 3204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412634.130, "dur": 2.640, + "args": { + "External id": 130182,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685412634.950, "dur": 1.340, + "args": { + "External id": 130183,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685412635.360, "dur": 0.750, + "args": { + "External id": 130184,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412641.820, "dur": 5.100, + "args": { + "External id": 130185,"Record function id": 0, "Ev Idx": 3208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412643.370, "dur": 2.550, + "args": { + "External id": 130186,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685412643.970, "dur": 1.500, + "args": { + "External id": 
130187,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685412644.610, "dur": 0.680, + "args": { + "External id": 130188,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412650.780, "dur": 4.980, + "args": { + "External id": 130189,"Record function id": 0, "Ev Idx": 3212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685412652.380, "dur": 2.390, + "args": { + "External id": 130190,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685412653.000, "dur": 1.350, + "args": { + "External id": 130191,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685412653.510, "dur": 0.680, + "args": { + "External id": 130192,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685412659.670, "dur": 346.329, + "args": { + "External id": 130193,"Record function id": 0, "Sequence 
number": 2575777, "Fwd thread id": 1, "Ev Idx": 3216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685412661.110, "dur": 335.840, + "args": { + "External id": 130194,"Sequence number": 2575777, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3217 + } + }, + { + "ph": "f", "id": 289, "pid": 5717, "tid": 6759, "ts": 6302685412661.110, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412733.130, "dur": 45.780, + "args": { + "External id": 130195,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685412793.110, "dur": 21.050, + "args": { + "External id": 130196,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685412840.690, "dur": 129.710, + "args": { + "External id": 130197,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input 
Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685412894.040, "dur": 6.520, + "args": { + "External id": 130198,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685412902.300, "dur": 4.280, + "args": { + "External id": 130199,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413020.030, "dur": 11.329, + "args": { + "External id": 130200,"Record function id": 0, "Ev Idx": 3223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413023.079, "dur": 6.931, + "args": { + "External id": 130201,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", 
"pid": 5717, "tid": 6759, + "ts": 6302685413025.690, "dur": 3.349, + "args": { + "External id": 130202,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685413026.570, "dur": 2.280, + "args": { + "External id": 130203,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413035.919, "dur": 187.270, + "args": { + "External id": 130204,"Record function id": 0, "Sequence number": 2575776, "Fwd thread id": 1, "Ev Idx": 3227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413037.739, "dur": 178.260, + "args": { + "External id": 130205,"Sequence number": 2575776, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3228 + } + }, + { + "ph": "f", "id": 290, "pid": 5717, "tid": 6759, "ts": 6302685413037.739, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685413053.770, "dur": 37.139, + "args": { + "External id": 130206,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685413057.050, "dur": 6.800, + "args": { + "External id": 130207,"Record function id": 
0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685413065.090, "dur": 25.239, + "args": { + "External id": 130208,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685413100.329, "dur": 7.400, + "args": { + "External id": 130209,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685413102.249, "dur": 4.990, + "args": { + "External id": 130210,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413235.919, "dur": 208.230, + "args": { + "External id": 130211,"Record function id": 0, "Sequence number": 2575775, "Fwd thread id": 1, "Ev Idx": 3234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413238.879, "dur": 196.579, + "args": { + "External id": 130212,"Sequence number": 2575775, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3235 + } + }, + { + "ph": "f", "id": 291, "pid": 5717, "tid": 6759, "ts": 6302685413238.879, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685413254.259, "dur": 34.500, + "args": { + "External id": 130213,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685413257.819, "dur": 6.820, + "args": { + "External id": 130214,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685413265.899, "dur": 22.220, + "args": { + "External id": 130215,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685413307.079, "dur": 7.840, + "args": { + "External id": 130216,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], 
"Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685413309.149, "dur": 5.260, + "args": { + "External id": 130217,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413457.589, "dur": 398.799, + "args": { + "External id": 130218,"Record function id": 0, "Sequence number": 2575774, "Fwd thread id": 1, "Ev Idx": 3241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413460.798, "dur": 383.450, + "args": { + "External id": 130219,"Sequence number": 2575774, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3242 + } + }, + { + "ph": "f", "id": 292, "pid": 5717, "tid": 6759, "ts": 6302685413460.798, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685413544.358, "dur": 48.670, + "args": { + "External id": 130220,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 
16384], [16384, 768], [768, 768]], "Ev Idx": 3243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685413607.298, "dur": 27.100, + "args": { + "External id": 130221,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685413645.158, "dur": 25.590, + "args": { + "External id": 130222,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685413682.718, "dur": 20.470, + "args": { + "External id": 130223,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685413712.018, "dur": 16.810, + "args": { + "External id": 130224,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685413741.648, "dur": 28.570, + "args": { + "External id": 130225,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685413803.508, "dur": 21.510, + "args": { + "External id": 130226,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413873.617, "dur": 14.051, + "args": { + "External id": 130227,"Record function id": 0, "Ev Idx": 3250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413877.788, "dur": 8.329, + "args": { + "External id": 130228,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685413880.597, "dur": 4.351, + "args": { + "External id": 130229,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685413881.597, "dur": 3.091, + "args": { + "External id": 130230,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413895.797, "dur": 10.240, + "args": { + "External id": 130231,"Record function id": 0, "Ev Idx": 3254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413899.257, "dur": 4.340, + "args": { + "External id": 130232,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685413900.208, "dur": 2.820, + "args": { + "External id": 130233,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685413901.897, "dur": 0.911, + "args": { + "External id": 130234,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413912.917, "dur": 8.100, + "args": { + "External id": 130235,"Record function id": 0, "Ev Idx": 3258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685413916.077, "dur": 3.791, + "args": { + "External id": 130236,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3259 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685413916.848, "dur": 1.369, + "args": { + "External id": 130237,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685413917.217, "dur": 0.820, + "args": { + "External id": 130238,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413927.877, "dur": 268.200, + "args": { + "External id": 130239,"Record function id": 0, "Sequence number": 2575773, "Fwd thread id": 1, "Ev Idx": 3262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685413930.937, "dur": 223.610, + "args": { + "External id": 130240,"Sequence number": 2575773, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3263 + } + }, + { + "ph": "f", "id": 293, "pid": 5717, "tid": 6759, "ts": 6302685413930.937, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685414028.047, "dur": 25.060, + "args": { + "External id": 130241,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685414071.457, "dur": 14.450, + "args": { + "External id": 130242,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685414108.297, "dur": 20.560, + "args": { + "External id": 130243,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685414165.027, "dur": 22.510, + "args": { + "External id": 130244,"Record function id": 0, "Concrete 
Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685414212.597, "dur": 14.440, + "args": { + "External id": 130245,"Record function id": 0, "Ev Idx": 3268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685414216.387, "dur": 8.640, + "args": { + "External id": 130246,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685414219.507, "dur": 4.560, + "args": { + "External id": 130247,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685414220.597, "dur": 3.190, + "args": { + "External id": 130248,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685414234.507, "dur": 1154.087, + "args": { + "External id": 130249,"Record function id": 0, "Sequence number": 2575772, "Fwd thread id": 1, "Ev Idx": 3272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685414236.777, "dur": 1143.447, + "args": { + "External id": 130250,"Sequence number": 2575772, "Fwd thread 
id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3273 + } + }, + { + "ph": "f", "id": 294, "pid": 5717, "tid": 6759, "ts": 6302685414236.777, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.1)", "pid": 5717, "tid": 6759, + "ts": 6302685414270.977, "dur": 64.019, + "args": { + "External id": 130251,"Record function id": 0, "Ev Idx": 3274 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.1)", "pid": 5717, "tid": 6759, + "ts": 6302685414348.176, "dur": 80.950, + "args": { + "External id": 130252,"Record function id": 0, "Ev Idx": 3275 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.1)", "pid": 5717, "tid": 6759, + "ts": 6302685414437.366, "dur": 935.448, + "args": { + "External id": 130253,"Record function id": 0, "Ev Idx": 3276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685414517.826, "dur": 9.100, + "args": { + "External id": 130254,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685414538.816, "dur": 7.210, + "args": { + "External id": 130255,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685414568.696, "dur": 108.330, + "args": { + "External id": 
130256,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685414581.976, "dur": 90.680, + "args": { + "External id": 130257,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685414599.376, "dur": 7.490, + "args": { + "External id": 130258,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685414611.236, "dur": 37.110, + "args": { + "External id": 130259,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685414612.886, "dur": 35.080, + "args": { + "External 
id": 130260,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685414615.586, "dur": 7.540, + "args": { + "External id": 130261,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685414624.496, "dur": 22.820, + "args": { + "External id": 130262,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685414753.655, "dur": 10.140, + "args": { + "External id": 130263,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685414755.755, "dur": 7.440, + "args": { + "External id": 130264,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 
5717, "tid": 6759, + "ts": 6302685414785.726, "dur": 127.129, + "args": { + "External id": 130265,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685414804.366, "dur": 104.759, + "args": { + "External id": 130266,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3289, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685414823.315, "dur": 80.630, + "args": { + "External id": 130267,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685414928.535, "dur": 4.130, + "args": { + "External id": 130268,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", 
"ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3291, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415000.165, "dur": 6.770, + "args": { + "External id": 130269,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415061.905, "dur": 1.900, + "args": { + "External id": 130270,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415097.165, "dur": 3.450, + "args": { + "External id": 130271,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415128.315, "dur": 2.730, + "args": { + "External id": 130272,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415159.965, "dur": 3.200, + "args": { + "External id": 130273,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415192.085, "dur": 1.620, + "args": { + "External id": 130274,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415227.705, "dur": 1.449, + "args": { + "External id": 130275,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415259.665, "dur": 3.309, + "args": { + "External id": 130276,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415288.274, "dur": 1.370, + "args": { + "External id": 130277,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev 
Idx": 3300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685415404.744, "dur": 1284.377, + "args": { + "External id": 130278,"Record function id": 0, "Ev Idx": 3301 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.0)", "pid": 5717, "tid": 6759, + "ts": 6302685415424.584, "dur": 512.069, + "args": { + "External id": 130279,"Record function id": 0, "Ev Idx": 3302 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 5717, "tid": 6759, + "ts": 6302685415440.764, "dur": 377.659, + "args": { + "External id": 130280,"Record function id": 0, "Ev Idx": 3303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415566.254, "dur": 8.430, + "args": { + "External id": 130281,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415584.044, "dur": 1.350, + "args": { + "External id": 130282,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415589.194, "dur": 2.580, + "args": { + "External id": 130283,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415594.184, "dur": 2.670, + "args": { + "External id": 130284,"Record function 
id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415600.484, "dur": 1.100, + "args": { + "External id": 130285,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415605.114, "dur": 1.330, + "args": { + "External id": 130286,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415609.884, "dur": 1.170, + "args": { + "External id": 130287,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415614.354, "dur": 1.080, + "args": { + "External id": 130288,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685415618.814, "dur": 2.410, + "args": { + "External id": 130289,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, 
+ "ts": 6302685415623.224, "dur": 2.409, + "args": { + "External id": 130290,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685415652.984, "dur": 127.869, + "args": { + "External id": 130291,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 6759, + "ts": 6302685415676.824, "dur": 99.049, + "args": { + "External id": 130292,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685415692.804, "dur": 8.969, + "args": { + "External id": 130293,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": 
[[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685415705.044, "dur": 42.449, + "args": { + "External id": 130294,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685415707.044, "dur": 39.969, + "args": { + "External id": 130295,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685415710.013, "dur": 8.710, + "args": { + "External id": 130296,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685415720.353, "dur": 25.910, + "args": { + "External id": 130297,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685415946.293, "dur": 726.868, + "args": { + "External id": 130298,"Sequence number": 2575771, "Fwd thread id": 1, "Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3321 + } + }, + { + "ph": "f", "id": 295, "pid": 5717, "tid": 6759, "ts": 6302685415946.293, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416033.983, "dur": 42.529, + "args": { + "External id": 130299,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5717, "tid": 6759, + "ts": 6302685416115.243, "dur": 35.449, + "args": { + "External id": 130300,"kernel_hash": "czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/zc/czcdeklwyoiozqthu67wfcy2jui7cbqei3qpzku23hbntqfpkqqf.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5717, "tid": 6759, + "ts": 6302685416175.742, "dur": 59.520, + "args": { + "External id": 130301,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], 
[1, 768, 2048]], "Ev Idx": 3324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416256.912, "dur": 58.960, + "args": { + "External id": 130302,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416336.472, "dur": 33.790, + "args": { + "External id": 130303,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416385.492, "dur": 35.460, + "args": { + "External id": 130304,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416435.752, "dur": 28.550, + "args": { + "External id": 130305,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685416497.422, "dur": 30.540, + "args": { + "External id": 130306,"kernel_hash": "cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg", "grid": "grid(98304,)", "Record 
function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/f4/cf4etmfloeg3v7n5hiaeyudawbwjqutwoda7szw44yaw7rnmc7qg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685416551.542, "dur": 18.940, + "args": { + "External id": 130307,"kernel_hash": "c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/7m/c7mmy3cqfj4dpsbyqeg2rtkviy5d5ksh2mh6ficros7lty24zfzr.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5717, "tid": 6759, + "ts": 6302685416597.422, "dur": 33.269, + "args": { + "External id": 130308,"kernel_hash": "c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/5t/c5tro6aozk4roxmgsivsy4gmxvvqurypkdtvhffept3v4hxhngob.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 
1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416709.781, "dur": 17.440, + "args": { + "External id": 130309,"Record function id": 0, "Ev Idx": 3332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416714.491, "dur": 10.640, + "args": { + "External id": 130310,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685416718.291, "dur": 5.650, + "args": { + "External id": 130311,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685416719.841, "dur": 3.790, + "args": { + "External id": 130312,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416734.131, "dur": 7.940, + "args": { + "External id": 130313,"Record function id": 0, "Ev Idx": 3336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416736.731, "dur": 3.800, + "args": { + "External id": 130314,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3337 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685416737.981, "dur": 1.810, + "args": { + "External id": 130315,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685416738.531, "dur": 0.980, + "args": { + "External id": 130316,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416747.871, "dur": 7.490, + "args": { + "External id": 130317,"Record function id": 0, "Ev Idx": 3340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416750.491, "dur": 3.350, + "args": { + "External id": 130318,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685416751.411, "dur": 1.840, + "args": { + "External id": 130319,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685416752.151, "dur": 0.820, + "args": { + "External id": 130320,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416760.801, "dur": 7.090, + "args": { + "External id": 130321,"Record function id": 0, "Ev Idx": 3344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685416763.061, "dur": 3.390, + "args": { + "External id": 130322,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685416764.031, "dur": 1.770, + "args": { + "External id": 130323,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685416764.711, "dur": 0.840, + "args": { + "External id": 130324,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685416773.371, "dur": 622.649, + "args": { + "External id": 130325,"Record function id": 0, "Sequence number": 2575770, "Fwd thread id": 1, "Ev Idx": 3348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685416775.791, "dur": 605.449, + "args": { + "External id": 130326,"Sequence number": 2575770, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3349 + } + }, + { + "ph": "f", "id": 296, "pid": 5717, "tid": 6759, "ts": 6302685416775.791, + "cat": "fwdbwd", "name": 
"fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416874.981, "dur": 77.140, + "args": { + "External id": 130327,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685416981.181, "dur": 44.849, + "args": { + "External id": 130328,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5717, "tid": 6759, + "ts": 6302685417083.630, "dur": 254.730, + "args": { + "External id": 130329,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 
6759, + "ts": 6302685417165.050, "dur": 12.150, + "args": { + "External id": 130330,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685417180.410, "dur": 11.020, + "args": { + "External id": 130331,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685417420.780, "dur": 20.100, + "args": { + "External id": 130332,"Record function id": 0, "Ev Idx": 3355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685417426.180, "dur": 12.369, + "args": { + "External id": 130333,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685417430.580, "dur": 6.369, + "args": { + "External id": 130334,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685417432.149, "dur": 4.400, + "args": { + "External id": 130335,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 
3358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685417448.389, "dur": 305.790, + "args": { + "External id": 130336,"Record function id": 0, "Sequence number": 2575769, "Fwd thread id": 1, "Ev Idx": 3359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685417451.080, "dur": 291.589, + "args": { + "External id": 130337,"Sequence number": 2575769, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3360 + } + }, + { + "ph": "f", "id": 297, "pid": 5717, "tid": 6759, "ts": 6302685417451.080, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 6302685417477.549, "dur": 61.850, + "args": { + "External id": 130338,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685417483.569, "dur": 11.410, + "args": { + "External id": 130339,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685417496.959, "dur": 41.340, + "args": { + "External id": 130340,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": 
[[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685417554.749, "dur": 12.250, + "args": { + "External id": 130341,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685417557.669, "dur": 8.450, + "args": { + "External id": 130342,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685417774.719, "dur": 280.489, + "args": { + "External id": 130343,"Record function id": 0, "Sequence number": 2575768, "Fwd thread id": 1, "Ev Idx": 3366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685417780.609, "dur": 261.529, + "args": { + "External id": 130344,"Sequence number": 2575768, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3367 + } + }, + { + "ph": "f", "id": 298, "pid": 5717, "tid": 6759, "ts": 6302685417780.609, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 6759, + "ts": 
6302685417804.469, "dur": 53.539, + "args": { + "External id": 130345,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685417809.679, "dur": 11.240, + "args": { + "External id": 130346,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685417822.869, "dur": 34.150, + "args": { + "External id": 130347,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 6759, + "ts": 6302685417872.399, "dur": 12.040, + "args": { + "External id": 130348,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685417875.288, "dur": 8.371, + "args": { + "External id": 130349,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], 
[], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685418075.788, "dur": 734.068, + "args": { + "External id": 130350,"Record function id": 0, "Sequence number": 2575767, "Fwd thread id": 1, "Ev Idx": 3373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685418080.498, "dur": 705.848, + "args": { + "External id": 130351,"Sequence number": 2575767, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3374 + } + }, + { + "ph": "f", "id": 299, "pid": 5717, "tid": 6759, "ts": 6302685418080.498, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685418200.358, "dur": 126.989, + "args": { + "External id": 130352,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685418358.467, "dur": 57.370, + "args": { + "External id": 130353,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685418438.677, "dur": 51.790, + 
"args": { + "External id": 130354,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685418516.257, "dur": 41.990, + "args": { + "External id": 130355,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685418576.607, "dur": 33.930, + "args": { + "External id": 130356,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 6759, + "ts": 6302685418627.927, "dur": 32.340, + "args": { + "External id": 130357,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5717, "tid": 6759, + "ts": 6302685418704.157, "dur": 44.240, + "args": { + "External id": 130358,"kernel_hash": "cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/ug/cugegblztwuwfpswv3pcbpmf7lyfqx36q4efiotwm2gbhm2tndso.py", "kernel_backend": 
"triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685418839.726, "dur": 25.810, + "args": { + "External id": 130359,"Record function id": 0, "Ev Idx": 3382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685418846.166, "dur": 16.180, + "args": { + "External id": 130360,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685418852.046, "dur": 8.600, + "args": { + "External id": 130361,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685418854.236, "dur": 5.890, + "args": { + "External id": 130362,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685418875.836, "dur": 11.790, + "args": { + "External id": 130363,"Record function id": 0, "Ev Idx": 3386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685418879.596, "dur": 5.720, + "args": { + "External id": 130364,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685418881.416, "dur": 2.900, + "args": { + "External id": 130365,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685418882.306, "dur": 1.600, + "args": { + "External id": 130366,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685418896.376, "dur": 11.130, + "args": { + "External id": 130367,"Record function id": 0, "Ev Idx": 3390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685418899.906, "dur": 5.380, + "args": { + "External id": 130368,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685418901.316, "dur": 2.960, + "args": { + "External id": 130369,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685418902.436, "dur": 1.440, + "args": { + "External id": 130370,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3393 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685418916.166, "dur": 490.949, + "args": { + "External id": 130371,"Record function id": 0, "Sequence number": 2575766, "Fwd thread id": 1, "Ev Idx": 3394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685418919.396, "dur": 401.329, + "args": { + "External id": 130372,"Sequence number": 2575766, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3395 + } + }, + { + "ph": "f", "id": 300, "pid": 5717, "tid": 6759, "ts": 6302685418919.396, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5717, "tid": 6759, + "ts": 6302685419064.576, "dur": 50.130, + "args": { + "External id": 130373,"kernel_hash": "c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/6f/c6fw7a5wn64dm2e25t7py3pd4hex6j4gieqdplgacyxvq5wslkrm.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5717, "tid": 6759, + "ts": 6302685419153.186, "dur": 29.539, + "args": { + "External id": 130374,"kernel_hash": "cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", 
"768", "128"], "kernel_file": "/tmp/torchinductor_root/ie/cieux2wspn6sxk6ymyoi7prluajfh2qv44ykspfo55ajbpntta77.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5717, "tid": 6759, + "ts": 6302685419220.885, "dur": 37.590, + "args": { + "External id": 130375,"kernel_hash": "cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/ys/cysu6zlvi425sym65tahmimrarid6t4tjpub7zbyprzxusj632yw.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685419340.565, "dur": 54.350, + "args": { + "External id": 130376,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685419436.215, "dur": 24.260, + "args": { + "External id": 130377,"Record function id": 0, "Ev Idx": 3400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 
6302685419442.805, "dur": 14.470, + "args": { + "External id": 130378,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685419447.875, "dur": 7.860, + "args": { + "External id": 130379,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685419449.725, "dur": 5.520, + "args": { + "External id": 130380,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685419470.565, "dur": 1945.626, + "args": { + "External id": 130381,"Record function id": 0, "Sequence number": 2575765, "Fwd thread id": 1, "Ev Idx": 3404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685419474.165, "dur": 1920.575, + "args": { + "External id": 130382,"Sequence number": 2575765, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3405 + } + }, + { + "ph": "f", "id": 301, "pid": 5717, "tid": 6759, "ts": 6302685419474.165, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.0)", "pid": 5717, "tid": 6759, + "ts": 6302685419518.985, "dur": 57.590, + "args": { + "External id": 130383,"Record function id": 0, "Ev Idx": 3406 + } + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::post_backward_reshard (model.layers.0)", "pid": 5717, "tid": 6759, + "ts": 6302685419596.804, "dur": 131.070, + "args": { + "External id": 130384,"Record function id": 0, "Ev Idx": 3407 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.0)", "pid": 5717, "tid": 6759, + "ts": 6302685419744.274, "dur": 1631.777, + "args": { + "External id": 130385,"Record function id": 0, "Ev Idx": 3408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685419904.574, "dur": 18.070, + "args": { + "External id": 130386,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685419942.824, "dur": 8.310, + "args": { + "External id": 130387,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685419979.204, "dur": 207.129, + "args": { + "External id": 130388,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685420002.094, "dur": 175.669, + "args": { + "External id": 130389,"Record 
function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685420033.334, "dur": 17.620, + "args": { + "External id": 130390,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685420058.674, "dur": 73.049, + "args": { + "External id": 130391,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685420062.003, "dur": 69.000, + "args": { + "External id": 130392,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685420067.594, "dur": 14.400, + "args": { + "External id": 130393,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", 
"Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685420084.694, "dur": 44.989, + "args": { + "External id": 130394,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685420356.493, "dur": 20.620, + "args": { + "External id": 130395,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685420360.323, "dur": 15.460, + "args": { + "External id": 130396,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685420416.303, "dur": 193.629, + "args": { + "External id": 130397,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685420450.503, "dur": 152.029, + "args": { + "External id": 130398,"Record function id": 0, "Collective name": 
"_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3421, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685420479.273, "dur": 112.839, + "args": { + "External id": 130399,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685420638.962, "dur": 7.440, + "args": { + "External id": 130400,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3423, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685420768.922, "dur": 9.300, + "args": { + "External id": 130401,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685420849.262, "dur": 4.170, + "args": { + "External id": 130402,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685420912.142, "dur": 3.700, + "args": { + "External id": 130403,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685420964.781, "dur": 2.880, + "args": { + "External id": 130404,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685421010.721, "dur": 2.920, + "args": { + "External id": 130405,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685421054.541, "dur": 2.800, + "args": { + "External id": 130406,"Record function id": 0, "Concrete Inputs": 
["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685421097.551, "dur": 2.790, + "args": { + "External id": 130407,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685421140.731, "dur": 2.790, + "args": { + "External id": 130408,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685421183.171, "dur": 2.830, + "args": { + "External id": 130409,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685421456.151, "dur": 814.738, + "args": { + "External id": 130410,"Record function id": 0, "Sequence number": 2575764, "Fwd thread id": 1, "Ev Idx": 3433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5717, "tid": 6759, + "ts": 6302685421465.280, "dur": 675.639, + "args": { + "External id": 130411,"Sequence number": 2575764, "Fwd thread id": 1, "Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3434 + } + }, + { + "ph": "f", "id": 302, "pid": 5717, "tid": 6759, "ts": 6302685421465.280, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_0", "pid": 5717, "tid": 6759, + "ts": 6302685421728.680, "dur": 98.839, + "args": { + "External id": 130412,"kernel_hash": "cmn6dur6j7tozbnh4iryxy5t6hlwn4iydb4cd3fcublho4hrtelw", "grid": "grid(24576000,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "24576000"], "kernel_file": "/tmp/torchinductor_root/mn/cmn6dur6j7tozbnh4iryxy5t6hlwn4iydb4cd3fcublho4hrtelw.py", "kernel_backend": "triton", "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 3435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_1", "pid": 5717, "tid": 6759, + "ts": 6302685421877.610, "dur": 66.009, + "args": { + "External id": 130413,"kernel_hash": "c7hnqeykk5jb3g3ry67anwi5ylv3c4scdnhk7tclmyzgevv6iph4", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/7h/c7hnqeykk5jb3g3ry67anwi5ylv3c4scdnhk7tclmyzgevv6iph4.py", "kernel_backend": "triton", "Input type": ["long int", "c10::BFloat16", "float", "Scalar"], "Input Strides": [[2048, 1], [1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048], [8, 2048, 768], [32000, 768], []], "Ev Idx": 3436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_2", "pid": 5717, "tid": 6759, + "ts": 6302685422016.719, "dur": 58.720, + "args": { + "External id": 130414,"kernel_hash": "cebx22tiyqg33dxeitesxdrka7b74inhm3i35igy3bow4qudchxk", "grid": "grid(24576000,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "24576000"], 
"kernel_file": "/tmp/torchinductor_root/eb/cebx22tiyqg33dxeitesxdrka7b74inhm3i35igy3bow4qudchxk.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 3437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 6759, + "ts": 6302685422177.689, "dur": 71.989, + "args": { + "External id": 130415,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 3438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685422355.458, "dur": 52.460, + "args": { + "External id": 130416,"Record function id": 0, "Ev Idx": 3439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5717, "tid": 6759, + "ts": 6302685422369.848, "dur": 31.350, + "args": { + "External id": 130417,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 6759, + "ts": 6302685422380.878, "dur": 16.030, + "args": { + "External id": 130418,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 6759, + "ts": 6302685422384.668, "dur": 11.190, + "args": { + "External id": 130419,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3442 + } + }, + { + "ph": "X", "cat": "user_annotation", 
"name": "FSDP::root_post_backward_callback", "pid": 5717, "tid": 6759, + "ts": 6302685422482.658, "dur": 10994.945, + "args": { + "External id": 130420,"Record function id": 0, "Ev Idx": 3443 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate", "pid": 5717, "tid": 6759, + "ts": 6302685422548.178, "dur": 119.040, + "args": { + "External id": 130421,"Record function id": 0, "Ev Idx": 3444 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard", "pid": 5717, "tid": 6759, + "ts": 6302685422710.237, "dur": 900.018, + "args": { + "External id": 130422,"Record function id": 0, "Ev Idx": 3445 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce", "pid": 5717, "tid": 6759, + "ts": 6302685423644.546, "dur": 8389.170, + "args": { + "External id": 130423,"Record function id": 0, "Ev Idx": 3446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685424101.005, "dur": 38.189, + "args": { + "External id": 130424,"Record function id": 0, "Concrete Inputs": ["[52894464]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 6759, + "ts": 6302685424177.404, "dur": 16.370, + "args": { + "External id": 130425,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[52894464], []], "Ev Idx": 3448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685424267.154, "dur": 633.029, + "args": { + "External id": 130426,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[], [], [], 
[13223616, 1]], "Input Dims": [[], [], [], [4, 13223616]], "Ev Idx": 3449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5717, "tid": 6759, + "ts": 6302685424362.154, "dur": 518.009, + "args": { + "External id": 130427,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[], [], [], [13223616, 1]], "Input Dims": [[], [], [], [4, 13223616]], "Ev Idx": 3450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685424543.884, "dur": 37.589, + "args": { + "External id": 130428,"Record function id": 0, "Concrete Inputs": ["[26063]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 6759, + "ts": 6302685424626.393, "dur": 153.740, + "args": { + "External id": 130429,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[26063], [], [], [], [], [], [], []], "Ev Idx": 3452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 6759, + "ts": 6302685424633.213, "dur": 145.580, + "args": { + "External id": 130430,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[26063], [], [], [], [], [], []], "Ev Idx": 3453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 6759, + "ts": 6302685424644.203, "dur": 33.050, + "args": { + "External id": 130431,"Record function id": 0, "Concrete Inputs": ["[26063]", "[1]", "4", 
"0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 6759, + "ts": 6302685424683.123, "dur": 93.200, + "args": { + "External id": 130432,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[26063], [26063], []], "Ev Idx": 3455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 6759, + "ts": 6302685425362.671, "dur": 60.131, + "args": { + "External id": 130433,"Record function id": 0, "Concrete Inputs": ["", "[13223616]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[52894464], [], [], [], [], []], "Ev Idx": 3456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 6759, + "ts": 6302685425375.142, "dur": 43.740, + "args": { + "External id": 130434,"Record function id": 0, "Concrete Inputs": ["[13223616]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5717, "tid": 6759, + "ts": 6302685425532.171, "dur": 554.539, + "args": { + "External id": 130435,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[13223616], [52894464], [], [], [], []], "Ev Idx": 3458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685425627.541, "dur": 437.579, + 
"args": { + "External id": 130436,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 13223616, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[52894464], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3459, "In msg nelems": 52894464 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5717, "tid": 6759, + "ts": 6302685425709.741, "dur": 326.069, + "args": { + "External id": 130437,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[52894464]], "Ev Idx": 3460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 6759, + "ts": 6302685426182.240, "dur": 29.200, + "args": { + "External id": 130438,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3461, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685426897.068, "dur": 47.450, + "args": { + "External id": 130439,"Record function id": 
0, "Concrete Inputs": ["", "[8000, 768]", "[768, 1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685427280.307, "dur": 13.740, + "args": { + "External id": 130440,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685427488.517, "dur": 11.160, + "args": { + "External id": 130441,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6144192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685427623.646, "dur": 7.940, + "args": { + "External id": 130442,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6291648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685427745.916, "dur": 6.500, + "args": { + "External id": 130443,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6439104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685427857.126, "dur": 
6.540, + "args": { + "External id": 130444,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6586560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685427993.465, "dur": 8.231, + "args": { + "External id": 130445,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428105.415, "dur": 6.620, + "args": { + "External id": 130446,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "6734208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428208.985, "dur": 6.110, + "args": { + "External id": 130447,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "7127424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428340.345, "dur": 8.090, + "args": { + "External id": 130448,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "7520640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428453.435, "dur": 8.089, + "args": { + "External id": 130449,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428558.334, "dur": 6.230, + "args": { + "External id": 130450,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "7914048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428664.974, "dur": 6.500, + "args": { + "External id": 130451,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8061504"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428768.254, "dur": 7.220, + "args": { + "External id": 130452,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8208960"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428873.714, "dur": 6.960, + "args": { + "External id": 130453,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8356416"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], 
"Ev Idx": 3476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685428975.523, "dur": 6.420, + "args": { + "External id": 130454,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429074.983, "dur": 6.860, + "args": { + "External id": 130455,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "8504064"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429176.003, "dur": 6.340, + "args": { + "External id": 130456,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "8897280"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429276.733, "dur": 6.080, + "args": { + "External id": 130457,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "9290496"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429416.542, "dur": 7.810, + "args": { + "External id": 130458,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": 
[[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429524.222, "dur": 6.060, + "args": { + "External id": 130459,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9683904"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429624.692, "dur": 6.350, + "args": { + "External id": 130460,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9831360"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429768.752, "dur": 10.469, + "args": { + "External id": 130461,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9978816"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429881.471, "dur": 6.540, + "args": { + "External id": 130462,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "10126272"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685429983.641, "dur": 6.590, + "args": { + "External id": 130463,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input 
type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430123.371, "dur": 7.590, + "args": { + "External id": 130464,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "10273920"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430254.020, "dur": 14.151, + "args": { + "External id": 130465,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "10667136"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430445.080, "dur": 14.670, + "args": { + "External id": 130466,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "11060352"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430595.370, "dur": 6.720, + "args": { + "External id": 130467,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430701.270, "dur": 6.360, + "args": { + "External id": 130468,"Record function 
id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11453760"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430802.969, "dur": 7.520, + "args": { + "External id": 130469,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11601216"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685430904.829, "dur": 6.520, + "args": { + "External id": 130470,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11748672"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685431011.049, "dur": 7.190, + "args": { + "External id": 130471,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11896128"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685431115.738, "dur": 6.151, + "args": { + "External id": 130472,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 
6302685431218.178, "dur": 6.860, + "args": { + "External id": 130473,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "12043776"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685431364.528, "dur": 8.000, + "args": { + "External id": 130474,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "12436992"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685431476.848, "dur": 6.160, + "args": { + "External id": 130475,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "12830208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 6759, + "ts": 6302685431580.768, "dur": 6.709, + "args": { + "External id": 130476,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3499 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "ProfilerStep#8191", "pid": 5717, "tid": 5717, + "ts": 6302684941994.776, "dur": 677595.446, + "args": { + "External id": 122881,"Record function id": 0, "Ev Idx": 3500 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.zero_grad#AdamW.zero_grad", "pid": 5717, "tid": 5717, + "ts": 6302684942042.656, "dur": 388.789, + "args": { + "External id": 122882,"Record 
function id": 0, "Ev Idx": 3501 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "enumerate(DataLoader)#_StatefulMultiProcessingDataLoaderIter.__next__", "pid": 5717, "tid": 5717, + "ts": 6302684942473.455, "dur": 238836.289, + "args": { + "External id": 122883,"Record function id": 0, "Ev Idx": 3502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685180599.626, "dur": 6.230, + "args": { + "External id": 122884,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5717, "tid": 5717, + "ts": 6302685180625.626, "dur": 4.970, + "args": { + "External id": 122885,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685180895.055, "dur": 2.470, + "args": { + "External id": 122886,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5717, "tid": 5717, + "ts": 6302685180905.085, "dur": 2.400, + "args": { + "External id": 122887,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], 
[]], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685181157.404, "dur": 2.800, + "args": { + "External id": 122888,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5717, "tid": 5717, + "ts": 6302685181168.795, "dur": 2.780, + "args": { + "External id": 122889,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685181602.923, "dur": 15.251, + "args": { + "External id": 122890,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], []], "Ev Idx": 3509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685181611.663, "dur": 2.031, + "args": { + "External id": 122891,"Record function id": 0, "Concrete Inputs": ["", "[8, 4096]", "[4096, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 3510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685181620.083, "dur": 6.080, + "args": { + "External id": 
122892,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], []], "Ev Idx": 3511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685181622.903, "dur": 1.851, + "args": { + "External id": 122893,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 3512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685181654.443, "dur": 182.060, + "args": { + "External id": 122894,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], [], []], "Ev Idx": 3513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685181673.683, "dur": 162.440, + "args": { + "External id": 122895,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], []], "Ev Idx": 3514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685181680.553, "dur": 15.860, + "args": { + "External id": 122896,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "[2048, 1]", "4", "0", "", "False"], 
"Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685181698.223, "dur": 137.370, + "args": { + "External id": 122897,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685181706.273, "dur": 0.300, + "args": { + "External id": 122898,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 3517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand_as", "pid": 5717, "tid": 5717, + "ts": 6302685181709.043, "dur": 8.600, + "args": { + "External id": 122899,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["long int", "long int"], "Input Strides": [[4096, 1], [2048, 1]], "Input Dims": [[8, 2048], [8, 2048]], "Ev Idx": 3518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 5717, + "ts": 6302685181713.533, "dur": 3.930, + "args": { + "External id": 122900,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], []], "Input Dims": [[8, 2048], [], []], "Ev Idx": 3519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685181716.523, "dur": 0.640, + "args": { + "External id": 122901,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", ""], "Input type": ["long int", 
"ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685181720.023, "dur": 43.050, + "args": { + "External id": 122902,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 3521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685181722.263, "dur": 40.540, + "args": { + "External id": 122903,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 3522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685181724.083, "dur": 7.930, + "args": { + "External id": 122904,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 3523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685181725.873, "dur": 5.740, + "args": { + "External id": 122905,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685181732.733, "dur": 29.460, + "args": { + "External id": 122906,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input 
Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685181764.823, "dur": 69.630, + "args": { + "External id": 122907,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685181845.153, "dur": 72.860, + "args": { + "External id": 122908,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], [], []], "Ev Idx": 3527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685181846.813, "dur": 70.900, + "args": { + "External id": 122909,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], []], "Ev Idx": 3528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685181850.623, "dur": 5.370, + "args": { + "External id": 122910,"Record function id": 0, "Concrete Inputs": ["[8, 4096]", "[4096, 1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 
6302685181856.913, "dur": 60.150, + "args": { + "External id": 122911,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[4096, 1], [4096, 1], []], "Input Dims": [[8, 4096], [8, 4096], []], "Ev Idx": 3530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::arange", "pid": 5717, "tid": 5717, + "ts": 6302685181933.873, "dur": 62.310, + "args": { + "External id": 122912,"Record function id": 0, "Concrete Inputs": ["0", "2048", "", "", "", "False"], "Input type": ["Scalar", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685181937.843, "dur": 6.590, + "args": { + "External id": 122913,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::arange", "pid": 5717, "tid": 5717, + "ts": 6302685181945.973, "dur": 49.540, + "args": { + "External id": 122914,"Record function id": 0, "Concrete Inputs": ["0", "2048", "1", ""], "Input type": ["Scalar", "Scalar", "Scalar", "long int"], "Input Strides": [[], [], [], [1]], "Input Dims": [[], [], [], [0]], "Ev Idx": 3533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685181951.433, "dur": 6.610, + "args": { + "External id": 122915,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["long int", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 3534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::repeat", "pid": 5717, "tid": 5717, + "ts": 6302685182010.853, "dur": 85.560, + "args": { + "External 
id": 122916,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 3535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 5717, + "ts": 6302685182016.283, "dur": 7.119, + "args": { + "External id": 122917,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[2048], [], []], "Ev Idx": 3536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182020.333, "dur": 2.640, + "args": { + "External id": 122918,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048]", "[2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 3537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685182025.533, "dur": 10.249, + "args": { + "External id": 122919,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 5717, + "ts": 6302685182041.482, "dur": 4.551, + "args": { + "External id": 122920,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[2048, 1]], "Input Dims": [[8, 2048]], "Ev Idx": 3539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5717, "tid": 5717, + "ts": 6302685182049.082, "dur": 6.800, + "args": { + "External id": 122921,"Record function id": 0, "Concrete Inputs": ["", "0", "1", "1"], "Input type": ["long int", 
"Scalar", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182055.193, "dur": 0.409, + "args": { + "External id": 122922,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5717, "tid": 5717, + "ts": 6302685182057.673, "dur": 3.289, + "args": { + "External id": 122923,"Record function id": 0, "Concrete Inputs": ["", "1", "2048", "2048"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 2048], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 3542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182058.562, "dur": 0.831, + "args": { + "External id": 122924,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "[2048, 2048, 2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 2048], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 3543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand_as", "pid": 5717, "tid": 5717, + "ts": 6302685182062.073, "dur": 6.229, + "args": { + "External id": 122925,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["long int", "long int"], "Input Strides": [[2048, 1], [2048, 2048, 2048, 1]], "Input Dims": [[1, 2048], [8, 1, 1, 2048]], "Ev Idx": 3544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5717, "tid": 5717, + "ts": 6302685182063.842, "dur": 4.280, + "args": { + "External id": 122926,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 
2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[1, 2048], [], []], "Ev Idx": 3545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182067.122, "dur": 0.780, + "args": { + "External id": 122927,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "[0, 2048, 2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[1, 2048], [], [], []], "Ev Idx": 3546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685182070.182, "dur": 25.520, + "args": { + "External id": 122928,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 2048, 2048, 1], [0, 2048, 2048, 1], []], "Input Dims": [[8, 1, 1, 2048], [8, 1, 1, 2048], []], "Ev Idx": 3547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685182104.382, "dur": 28.250, + "args": { + "External id": 122929,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "3", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 3548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685182105.493, "dur": 26.899, + "args": { + "External id": 122930,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "3", "", "", "", "False", ""], "Input type": ["long int", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[2048, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], []], "Ev Idx": 3549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182109.222, "dur": 5.491, + "args": { + "External id": 122931,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "[2048, 1]", "3", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685182115.973, "dur": 15.829, + "args": { + "External id": 122932,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["int", "long int", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3551 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::root_pre_forward", "pid": 5717, "tid": 5717, + "ts": 6302685182246.042, "dur": 172.360, + "args": { + "External id": 122933,"Record function id": 0, "Ev Idx": 3552 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::inputs_to_device", "pid": 5717, "tid": 5717, + "ts": 6302685182346.432, "dur": 56.310, + "args": { + "External id": 122934,"Record function id": 0, "Ev Idx": 3553 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685182425.632, "dur": 47.349, + "args": { + "External id": 122935,"Record function id": 0, "Ev Idx": 3554 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward", "pid": 5717, "tid": 5717, + "ts": 6302685182482.312, "dur": 1675.176, + "args": { + "External id": 122936,"Record function id": 0, "Ev Idx": 3555 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather", "pid": 5717, "tid": 5717, + "ts": 6302685182490.652, "dur": 912.887, + "args": { + "External id": 122937,"Record function id": 0, "Ev Idx": 3556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 
5717, "tid": 5717, + "ts": 6302685182610.901, "dur": 10.240, + "args": { + "External id": 122938,"Record function id": 0, "Concrete Inputs": ["[13223616]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685182640.021, "dur": 98.860, + "args": { + "External id": 122939,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["c10::BFloat16", "", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[13223616], [], []], "Ev Idx": 3558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182646.561, "dur": 1.530, + "args": { + "External id": 122940,"Record function id": 0, "Concrete Inputs": ["", "[6144000]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182649.631, "dur": 0.220, + "args": { + "External id": 122941,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182651.841, "dur": 0.230, + "args": { + "External id": 122942,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6144192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3561 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182654.621, "dur": 0.240, + "args": { + "External id": 122943,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6291648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182655.571, "dur": 1.770, + "args": { + "External id": 122944,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6439104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182659.261, "dur": 0.220, + "args": { + "External id": 122945,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6586560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182661.421, "dur": 0.250, + "args": { + "External id": 122946,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182662.501, "dur": 1.420, + "args": { + "External id": 122947,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6734208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[13223616], [], [], []], "Ev Idx": 3566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182665.811, "dur": 0.210, + "args": { + "External id": 122948,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7127424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182666.761, "dur": 0.230, + "args": { + "External id": 122949,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7520640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182669.551, "dur": 0.280, + "args": { + "External id": 122950,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182671.771, "dur": 0.220, + "args": { + "External id": 122951,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "7914048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182672.771, "dur": 1.420, + "args": { + "External id": 122952,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8061504"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182676.061, "dur": 0.170, + "args": { + "External id": 122953,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8208960"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182678.251, "dur": 0.160, + "args": { + "External id": 122954,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8356416"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182679.081, "dur": 1.430, + "args": { + "External id": 122955,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182682.401, "dur": 0.150, + "args": { + "External id": 122956,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8504064"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182683.191, "dur": 0.150, + "args": { + "External id": 122957,"Record function id": 0, "Concrete 
Inputs": ["", "[393216]", "[1]", "8897280"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182685.441, "dur": 0.220, + "args": { + "External id": 122958,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "9290496"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182687.231, "dur": 0.200, + "args": { + "External id": 122959,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182688.151, "dur": 1.480, + "args": { + "External id": 122960,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9683904"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182691.761, "dur": 0.160, + "args": { + "External id": 122961,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9831360"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182694.231, "dur": 
0.150, + "args": { + "External id": 122962,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9978816"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182695.051, "dur": 1.340, + "args": { + "External id": 122963,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "10126272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182698.551, "dur": 0.520, + "args": { + "External id": 122964,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182700.421, "dur": 0.200, + "args": { + "External id": 122965,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10273920"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182703.051, "dur": 0.230, + "args": { + "External id": 122966,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10667136"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3585 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182704.931, "dur": 0.200, + "args": { + "External id": 122967,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "11060352"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182705.751, "dur": 1.390, + "args": { + "External id": 122968,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182709.261, "dur": 0.190, + "args": { + "External id": 122969,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11453760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182711.281, "dur": 0.160, + "args": { + "External id": 122970,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11601216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182712.101, "dur": 1.450, + "args": { + "External id": 122971,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11748672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[13223616], [], [], []], "Ev Idx": 3590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182715.391, "dur": 0.190, + "args": { + "External id": 122972,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11896128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182716.201, "dur": 0.200, + "args": { + "External id": 122973,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182718.411, "dur": 0.230, + "args": { + "External id": 122974,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12043776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182720.601, "dur": 0.200, + "args": { + "External id": 122975,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12436992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182721.421, "dur": 1.220, + "args": { + "External id": 122976,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12830208"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182725.011, "dur": 0.150, + "args": { + "External id": 122977,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685182762.031, "dur": 44.280, + "args": { + "External id": 122978,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 3597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685182876.061, "dur": 198.439, + "args": { + "External id": 122979,"Record function id": 0, "Concrete Inputs": ["", "", "13223616", "4", "3", "15", ""], "Input type": ["TensorList", "", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 3598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685182895.101, "dur": 9.419, + "args": { + "External id": 122980,"Record function id": 0, "Concrete Inputs": ["[52894464]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685182910.231, "dur": 11.100, + "args": { + "External id": 122981,"Record function id": 0, "Concrete Inputs": 
["", "0", "39670848", "13223616"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[52894464], [], [], []], "Ev Idx": 3600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685182915.080, "dur": 5.871, + "args": { + "External id": 122982,"Record function id": 0, "Concrete Inputs": ["", "0", "39670848", "52894464", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[52894464], [], [], [], []], "Ev Idx": 3601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182918.800, "dur": 0.671, + "args": { + "External id": 122983,"Record function id": 0, "Concrete Inputs": ["", "[13223616]", "[1]", "39670848"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[52894464], [], [], []], "Ev Idx": 3602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685182929.911, "dur": 83.449, + "args": { + "External id": 122984,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["c10::BFloat16", "", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[13223616], [], []], "Ev Idx": 3603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182931.491, "dur": 0.420, + "args": { + "External id": 122985,"Record function id": 0, "Concrete Inputs": ["", "[6144000]", "[1]", "39670848"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182934.160, "dur": 0.171, + "args": { + "External id": 
122986,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "45814848"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182935.040, "dur": 0.160, + "args": { + "External id": 122987,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "45815040"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182936.080, "dur": 0.231, + "args": { + "External id": 122988,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "45962496"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182938.191, "dur": 1.329, + "args": { + "External id": 122989,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "46109952"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182940.420, "dur": 0.151, + "args": { + "External id": 122990,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "46257408"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, 
"tid": 5717, + "ts": 6302685182941.220, "dur": 0.280, + "args": { + "External id": 122991,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "46404864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182943.491, "dur": 0.149, + "args": { + "External id": 122992,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "46405056"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182944.500, "dur": 0.231, + "args": { + "External id": 122993,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "46798272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182946.431, "dur": 0.160, + "args": { + "External id": 122994,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "47191488"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182947.380, "dur": 0.160, + "args": { + "External id": 122995,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "47584704"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3614 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182948.220, "dur": 1.391, + "args": { + "External id": 122996,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "47584896"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182951.571, "dur": 1.420, + "args": { + "External id": 122997,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "47732352"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182953.711, "dur": 0.160, + "args": { + "External id": 122998,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "47879808"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182955.580, "dur": 0.231, + "args": { + "External id": 122999,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "48027264"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182957.791, "dur": 0.389, + "args": { + "External id": 123000,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "48174720"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182958.891, "dur": 0.160, + "args": { + "External id": 123001,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "48174912"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182962.480, "dur": 0.140, + "args": { + "External id": 123002,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "48568128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182963.251, "dur": 0.209, + "args": { + "External id": 123003,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "48961344"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182964.120, "dur": 1.871, + "args": { + "External id": 123004,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "49354560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182967.811, "dur": 1.409, + "args": { + "External id": 123005,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", 
"49354752"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182969.871, "dur": 0.169, + "args": { + "External id": 123006,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "49502208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182971.691, "dur": 0.220, + "args": { + "External id": 123007,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "49649664"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182974.000, "dur": 0.211, + "args": { + "External id": 123008,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "49797120"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182974.840, "dur": 0.160, + "args": { + "External id": 123009,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "49944576"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182977.920, "dur": 0.151, + "args": { + 
"External id": 123010,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "49944768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182978.711, "dur": 0.149, + "args": { + "External id": 123011,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "50337984"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182979.591, "dur": 1.229, + "args": { + "External id": 123012,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "50731200"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182982.831, "dur": 1.169, + "args": { + "External id": 123013,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "51124416"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182984.640, "dur": 0.171, + "args": { + "External id": 123014,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "51124608"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182986.880, "dur": 0.280, + "args": { + "External id": 123015,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "51272064"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182988.611, "dur": 0.189, + "args": { + "External id": 123016,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "51419520"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182989.851, "dur": 0.160, + "args": { + "External id": 123017,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "51566976"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182993.071, "dur": 0.149, + "args": { + "External id": 123018,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "51714432"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182993.931, "dur": 0.160, + "args": { + "External id": 123019,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "51714624"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[13223616], [], [], []], "Ev Idx": 3638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182994.891, "dur": 1.309, + "args": { + "External id": 123020,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "52107840"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182997.840, "dur": 1.331, + "args": { + "External id": 123021,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "52501056"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685182999.800, "dur": 0.170, + "args": { + "External id": 123022,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "52894272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685183032.580, "dur": 26.900, + "args": { + "External id": 123023,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 3642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685183141.030, "dur": 175.250, + "args": { + "External id": 123024,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input 
Strides": [[1], [1], [], [], []], "Input Dims": [[52894464], [13223616], [], [], []], "Ev Idx": 3643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685183175.960, "dur": 135.510, + "args": { + "External id": 123025,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 52894464, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[13223616], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3644, "In msg nelems": 13223616 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685183198.330, "dur": 105.940, + "args": { + "External id": 123026,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[13223616]], "Ev Idx": 3645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685183337.690, "dur": 4.800, + "args": { + "External id": 123027,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3646, "In msg nelems": 0, 
"Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out", "pid": 5717, "tid": 5717, + "ts": 6302685183415.899, "dur": 570.399, + "args": { + "External id": 123028,"Record function id": 0, "Ev Idx": 3647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183620.719, "dur": 7.470, + "args": { + "External id": 123029,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[52894464], []], "Ev Idx": 3648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183634.559, "dur": 1.840, + "args": { + "External id": 123030,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[24576000], []], "Ev Idx": 3649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183639.209, "dur": 0.570, + "args": { + "External id": 123031,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183642.499, "dur": 0.480, + "args": { + "External id": 123032,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183646.989, "dur": 0.630, + "args": { + "External id": 123033,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input 
Dims": [[589824], []], "Ev Idx": 3652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183650.339, "dur": 0.630, + "args": { + "External id": 123034,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183653.459, "dur": 2.960, + "args": { + "External id": 123035,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183657.759, "dur": 4.380, + "args": { + "External id": 123036,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183665.419, "dur": 0.710, + "args": { + "External id": 123037,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183669.579, "dur": 0.760, + "args": { + "External id": 123038,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183672.759, "dur": 0.670, + "args": { + "External id": 123039,"Record function id": 0, "Concrete Inputs": ["", 
"[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183675.649, "dur": 0.860, + "args": { + "External id": 123040,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183680.849, "dur": 0.660, + "args": { + "External id": 123041,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183682.779, "dur": 1.740, + "args": { + "External id": 123042,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183685.789, "dur": 2.960, + "args": { + "External id": 123043,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183691.369, "dur": 2.840, + "args": { + "External id": 123044,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183697.459, "dur": 
0.570, + "args": { + "External id": 123045,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183702.769, "dur": 0.630, + "args": { + "External id": 123046,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183705.929, "dur": 0.510, + "args": { + "External id": 123047,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183708.779, "dur": 0.640, + "args": { + "External id": 123048,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183712.689, "dur": 2.410, + "args": { + "External id": 123049,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183716.529, "dur": 1.760, + "args": { + "External id": 123050,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3669 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183719.629, "dur": 2.540, + "args": { + "External id": 123051,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183724.469, "dur": 1.530, + "args": { + "External id": 123052,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183730.799, "dur": 0.610, + "args": { + "External id": 123053,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183735.039, "dur": 0.600, + "args": { + "External id": 123054,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183737.989, "dur": 0.590, + "args": { + "External id": 123055,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183740.899, "dur": 0.550, + "args": { + "External id": 123056,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183744.769, "dur": 1.800, + "args": { + "External id": 123057,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183765.639, "dur": 0.650, + "args": { + "External id": 123058,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183768.879, "dur": 2.880, + "args": { + "External id": 123059,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183773.109, "dur": 2.910, + "args": { + "External id": 123060,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183779.429, "dur": 0.510, + "args": { + "External id": 123061,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183782.029, "dur": 0.550, + "args": { + "External id": 123062,"Record function id": 
0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183783.939, "dur": 0.530, + "args": { + "External id": 123063,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183785.638, "dur": 0.520, + "args": { + "External id": 123064,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183788.269, "dur": 0.549, + "args": { + "External id": 123065,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183789.949, "dur": 0.640, + "args": { + "External id": 123066,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685183791.758, "dur": 1.620, + "args": { + "External id": 123067,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, 
"tid": 5717, + "ts": 6302685183816.318, "dur": 123.870, + "args": { + "External id": 123068,"Record function id": 0, "Concrete Inputs": ["", "", "1", ""], "Input type": ["c10::BFloat16", "", "Scalar", "TensorList"], "Input Strides": [[13223616, 1], [], [], []], "Input Dims": [[4, 13223616], [], [], []], "Ev Idx": 3687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685183836.338, "dur": 97.020, + "args": { + "External id": 123069,"Record function id": 0, "Concrete Inputs": ["", "", "1", ""], "Input type": ["c10::BFloat16", "", "Scalar", "TensorList"], "Input Strides": [[13223616, 1], [], [], []], "Input Dims": [[4, 13223616], [], [], []], "Ev Idx": 3688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685183855.898, "dur": 5.280, + "args": { + "External id": 123070,"Record function id": 0, "Concrete Inputs": ["[2750]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685183866.609, "dur": 36.469, + "args": { + "External id": 123071,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[2750], [], [], [], [], [], [], []], "Ev Idx": 3690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685183868.298, "dur": 34.450, + "args": { + "External id": 123072,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[2750], [], [], [], [], [], []], "Ev Idx": 
3691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685183871.978, "dur": 7.411, + "args": { + "External id": 123073,"Record function id": 0, "Concrete Inputs": ["[2750]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685183880.669, "dur": 21.609, + "args": { + "External id": 123074,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2750], [2750], []], "Ev Idx": 3693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685184234.777, "dur": 30.640, + "args": { + "External id": 123075,"Record function id": 0, "Ev Idx": 3694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 5717, "tid": 5717, + "ts": 6302685184266.728, "dur": 215.759, + "args": { + "External id": 123076,"Record function id": 0, "Ev Idx": 3695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685184320.677, "dur": 149.170, + "args": { + "External id": 123077,"Sequence number": 2575764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "long int"], "Input Strides": [[768, 1], [2048, 1]], "Input Dims": [[32000, 768], [8, 2048]], "Ev Idx": 3696 + } + }, + { + "ph": "s", "id": 302, "pid": 5717, "tid": 5717, "ts": 6302685184320.677, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_0", "pid": 5717, "tid": 5717, + "ts": 6302685184384.247, "dur": 37.400, + "args": { + "External id": 123078,"kernel_hash": 
"cetti5xicbftggghr4t4digg525svif2uibmlnbq6rzsdmc2ud4d", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/et/cetti5xicbftggghr4t4digg525svif2uibmlnbq6rzsdmc2ud4d.py", "kernel_backend": "triton", "Input type": ["long int", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048], [32000, 768], [8, 2048, 768], []], "Ev Idx": 3697 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685184537.147, "dur": 54.150, + "args": { + "External id": 123079,"Record function id": 0, "Ev Idx": 3698 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.0)", "pid": 5717, "tid": 5717, + "ts": 6302685184606.427, "dur": 1338.247, + "args": { + "External id": 123080,"Record function id": 0, "Ev Idx": 3699 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 5717, "tid": 5717, + "ts": 6302685184616.197, "dur": 621.688, + "args": { + "External id": 123081,"Record function id": 0, "Ev Idx": 3700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685184697.807, "dur": 13.180, + "args": { + "External id": 123082,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685184729.136, "dur": 41.140, + "args": { + "External id": 123083,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184735.127, "dur": 2.300, + "args": { + "External id": 123084,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184738.887, "dur": 1.660, + "args": { + "External id": 123085,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184743.987, "dur": 0.629, + "args": { + "External id": 123086,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184747.776, "dur": 2.351, + "args": { + "External id": 123087,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184751.016, "dur": 2.611, + "args": { + "External id": 123088,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184754.607, "dur": 0.240, + "args": { + "External id": 123089,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184757.816, "dur": 0.151, + "args": { + "External id": 123090,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184761.027, "dur": 1.309, + "args": { + "External id": 123091,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184764.156, "dur": 1.260, + "args": { + "External id": 123092,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685184784.216, "dur": 37.990, + "args": { + "External id": 123093,"Record 
function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685184868.816, "dur": 140.090, + "args": { + "External id": 123094,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685184884.926, "dur": 13.450, + "args": { + "External id": 123095,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685184906.756, "dur": 12.910, + "args": { + "External id": 123096,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + 
"ts": 6302685184910.326, "dur": 8.950, + "args": { + "External id": 123097,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184915.706, "dur": 0.870, + "args": { + "External id": 123098,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685184932.146, "dur": 36.470, + "args": { + "External id": 123099,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184934.736, "dur": 1.520, + "args": { + "External id": 123100,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184942.136, "dur": 0.380, + "args": { + "External id": 123101,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], 
[]], "Ev Idx": 3720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184944.356, "dur": 0.230, + "args": { + "External id": 123102,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184947.466, "dur": 0.220, + "args": { + "External id": 123103,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184950.586, "dur": 0.160, + "args": { + "External id": 123104,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184951.586, "dur": 0.280, + "args": { + "External id": 123105,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184954.976, "dur": 2.860, + "args": { + "External id": 123106,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184958.756, "dur": 0.240, + "args": { + "External id": 123107,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685184960.906, "dur": 2.510, + "args": { + "External id": 123108,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685184981.366, "dur": 18.320, + "args": { + "External id": 123109,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685185063.156, "dur": 97.799, + "args": { + "External id": 123110,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3729 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685185081.736, "dur": 75.999, + "args": { + "External id": 123111,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3730, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685185093.376, "dur": 59.910, + "args": { + "External id": 123112,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685185177.515, "dur": 4.331, + "args": { + "External id": 123113,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3732, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::all_gather_copy_out (model.layers.0)", "pid": 5717, "tid": 5717, + "ts": 6302685185260.985, "dur": 479.089, + "args": { + "External id": 123114,"Record function id": 0, "Ev Idx": 3733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185411.435, "dur": 4.980, + "args": { + "External id": 123115,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185422.445, "dur": 1.150, + "args": { + "External id": 123116,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185429.715, "dur": 2.040, + "args": { + "External id": 123117,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185437.425, "dur": 1.760, + "args": { + "External id": 123118,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185442.155, "dur": 0.970, + "args": { + "External id": 123119,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", 
"pid": 5717, "tid": 5717, + "ts": 6302685185444.825, "dur": 0.990, + "args": { + "External id": 123120,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185450.085, "dur": 1.000, + "args": { + "External id": 123121,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185452.845, "dur": 4.030, + "args": { + "External id": 123122,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185458.495, "dur": 0.850, + "args": { + "External id": 123123,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185461.325, "dur": 1.370, + "args": { + "External id": 123124,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685185483.875, "dur": 201.279, + "args": { + "External id": 123125,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], 
"Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685185512.265, "dur": 164.089, + "args": { + "External id": 123126,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685185554.705, "dur": 14.620, + "args": { + "External id": 123127,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685185573.545, "dur": 62.769, + "args": { + "External id": 123128,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685185577.205, "dur": 58.689, + "args": { + "External id": 123129,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685185581.974, "dur": 16.540, + "args": { + "External id": 123130,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685185601.745, "dur": 32.480, + "args": { + "External id": 123131,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685185886.114, "dur": 29.190, + "args": { + "External id": 123132,"Sequence number": 2575765, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3751 + } + }, + { + "ph": "s", "id": 301, "pid": 5717, "tid": 5717, "ts": 6302685185886.114, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685185903.474, "dur": 7.670, + "args": { + "External id": 123133,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 
768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685185906.344, "dur": 4.200, + "args": { + "External id": 123134,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685185984.924, "dur": 31.420, + "args": { + "External id": 123135,"Record function id": 0, "Ev Idx": 3754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685186017.753, "dur": 2258.806, + "args": { + "External id": 123136,"Record function id": 0, "Ev Idx": 3755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685186046.684, "dur": 148.289, + "args": { + "External id": 123137,"Sequence number": 2575766, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 3756 + } + }, + { + "ph": "s", "id": 300, "pid": 5717, "tid": 5717, "ts": 6302685186046.684, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685186108.783, "dur": 38.530, + "args": { + "External id": 123138,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input 
type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 3757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685186164.273, "dur": 8.110, + "args": { + "External id": 123139,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685186166.493, "dur": 5.670, + "args": { + "External id": 123140,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186220.993, "dur": 27.270, + "args": { + "External id": 123141,"Record function id": 0, "Ev Idx": 3760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685186249.313, "dur": 1443.137, + "args": { + "External id": 123142,"Record function id": 0, "Ev Idx": 3761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685186278.173, "dur": 294.049, + "args": { + "External id": 123143,"Sequence number": 2575767, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 3762 + } + }, + { + "ph": "s", "id": 299, "pid": 5717, "tid": 5717, "ts": 6302685186278.173, + 
"cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685186326.393, "dur": 79.030, + "args": { + "External id": 123144,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685186423.253, "dur": 21.410, + "args": { + "External id": 123145,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685186458.063, "dur": 19.389, + "args": { + "External id": 123146,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685186523.803, "dur": 5.920, + "args": { + "External id": 123147,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685186540.092, "dur": 1.190, + "args": { + "External id": 123148,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], 
"Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685186546.492, "dur": 1.950, + "args": { + "External id": 123149,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186599.042, "dur": 17.900, + "args": { + "External id": 123150,"Record function id": 0, "Ev Idx": 3769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685186618.202, "dur": 635.429, + "args": { + "External id": 123151,"Record function id": 0, "Ev Idx": 3770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186644.862, "dur": 6.960, + "args": { + "External id": 123152,"Record function id": 0, "Ev Idx": 3771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685186652.662, "dur": 316.600, + "args": { + "External id": 123153,"Record function id": 0, "Ev Idx": 3772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685186669.352, "dur": 298.599, + "args": { + "External id": 123154,"Sequence number": 2575768, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3773 + } + }, + { + "ph": "s", "id": 298, "pid": 
5717, "tid": 5717, "ts": 6302685186669.352, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186677.392, "dur": 14.450, + "args": { + "External id": 123155,"Record function id": 0, "Ev Idx": 3774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685186694.182, "dur": 262.989, + "args": { + "External id": 123156,"Record function id": 0, "Ev Idx": 3775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186729.612, "dur": 7.090, + "args": { + "External id": 123157,"Record function id": 0, "Ev Idx": 3776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685186737.692, "dur": 184.839, + "args": { + "External id": 123158,"Record function id": 0, "Ev Idx": 3777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186744.162, "dur": 11.370, + "args": { + "External id": 123159,"Record function id": 0, "Ev Idx": 3778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685186756.212, "dur": 162.259, + "args": { + "External id": 123160,"Record function id": 0, "Ev Idx": 3779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186808.692, "dur": 14.080, + "args": { + "External id": 123161,"Record function id": 0, "Ev Idx": 3780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685186824.412, "dur": 92.810, + "args": { + "External id": 123162,"Record function id": 0, "Ev Idx": 3781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 
6302685186868.322, "dur": 34.589, + "args": { + "External id": 123163,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186931.702, "dur": 5.240, + "args": { + "External id": 123164,"Record function id": 0, "Ev Idx": 3783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685186937.831, "dur": 18.651, + "args": { + "External id": 123165,"Record function id": 0, "Ev Idx": 3784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186974.681, "dur": 8.560, + "args": { + "External id": 123166,"Record function id": 0, "Ev Idx": 3785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685186984.121, "dur": 268.970, + "args": { + "External id": 123167,"Record function id": 0, "Ev Idx": 3786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685186988.961, "dur": 1.930, + "args": { + "External 
id": 123168,"Record function id": 0, "Ev Idx": 3787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685186991.521, "dur": 260.050, + "args": { + "External id": 123169,"Record function id": 0, "Ev Idx": 3788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685187008.131, "dur": 241.700, + "args": { + "External id": 123170,"Sequence number": 2575769, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3789 + } + }, + { + "ph": "s", "id": 297, "pid": 5717, "tid": 5717, "ts": 6302685187008.131, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685187014.361, "dur": 7.270, + "args": { + "External id": 123171,"Record function id": 0, "Ev Idx": 3790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685187023.061, "dur": 218.080, + "args": { + "External id": 123172,"Record function id": 0, "Ev Idx": 3791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685187050.701, "dur": 3.450, + "args": { + "External id": 123173,"Record function id": 0, "Ev Idx": 3792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685187055.101, "dur": 158.210, + "args": { + "External id": 123174,"Record function id": 0, "Ev Idx": 3793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, 
"tid": 5717, + "ts": 6302685187061.791, "dur": 3.940, + "args": { + "External id": 123175,"Record function id": 0, "Ev Idx": 3794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685187066.561, "dur": 144.290, + "args": { + "External id": 123176,"Record function id": 0, "Ev Idx": 3795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685187113.481, "dur": 7.470, + "args": { + "External id": 123177,"Record function id": 0, "Ev Idx": 3796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685187122.141, "dur": 87.630, + "args": { + "External id": 123178,"Record function id": 0, "Ev Idx": 3797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685187163.331, "dur": 31.940, + "args": { + "External id": 123179,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685187219.801, "dur": 3.950, + 
"args": { + "External id": 123180,"Record function id": 0, "Ev Idx": 3799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685187224.731, "dur": 15.630, + "args": { + "External id": 123181,"Record function id": 0, "Ev Idx": 3800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685187266.551, "dur": 26.510, + "args": { + "External id": 123182,"Record function id": 0, "Ev Idx": 3801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685187294.921, "dur": 396.109, + "args": { + "External id": 123183,"Record function id": 0, "Ev Idx": 3802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685187340.890, "dur": 336.300, + "args": { + "External id": 123184,"Sequence number": 2575770, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 3803 + } + }, + { + "ph": "s", "id": 296, "pid": 5717, "tid": 5717, "ts": 6302685187340.890, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685187389.490, "dur": 183.870, + "args": { + "External id": 123185,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], 
[]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 3804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685187456.080, "dur": 16.840, + "args": { + "External id": 123186,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685187459.670, "dur": 11.850, + "args": { + "External id": 123187,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685187476.140, "dur": 8.350, + "args": { + "External id": 123188,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685187486.490, "dur": 3.740, + "args": { + "External id": 123189,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685187495.270, "dur": 4.380, + "args": { + "External id": 
123190,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685187595.720, "dur": 38.740, + "args": { + "External id": 123191,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685187704.830, "dur": 52.460, + "args": { + "External id": 123192,"Record function id": 0, "Ev Idx": 3811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685187758.490, "dur": 514.858, + "args": { + "External id": 123193,"Record function id": 0, "Ev Idx": 3812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685187794.420, "dur": 462.888, + "args": { + "External id": 123194,"Sequence number": 2575771, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 3813 + } + }, + { + "ph": "s", "id": 295, "pid": 5717, "tid": 5717, "ts": 6302685187794.420, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685187864.329, "dur": 
38.800, + "args": { + "External id": 123195,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 3814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685187924.169, "dur": 40.630, + "args": { + "External id": 123196,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685187980.859, "dur": 22.630, + "args": { + "External id": 123197,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685188036.649, "dur": 32.610, + "args": { + "External id": 123198,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": 
"/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 3817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685188087.669, "dur": 45.700, + "args": { + "External id": 123199,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685188169.979, "dur": 32.709, + "args": { + "External id": 123200,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3819 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.0)", "pid": 5717, "tid": 5717, + "ts": 6302685188345.978, "dur": 81.050, + "args": { + "External id": 123201,"Record function id": 0, "Ev Idx": 3820 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685188516.348, "dur": 61.660, + "args": { + "External id": 123202,"Record function id": 0, "Ev Idx": 3821 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward 
(model.layers.1)", "pid": 5717, "tid": 5717, + "ts": 6302685188590.098, "dur": 1629.796, + "args": { + "External id": 123203,"Record function id": 0, "Ev Idx": 3822 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 5717, "tid": 5717, + "ts": 6302685188600.968, "dur": 791.948, + "args": { + "External id": 123204,"Record function id": 0, "Ev Idx": 3823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685188691.507, "dur": 11.340, + "args": { + "External id": 123205,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685188717.638, "dur": 35.459, + "args": { + "External id": 123206,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188724.787, "dur": 2.271, + "args": { + "External id": 123207,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188729.338, "dur": 0.289, + "args": { + "External id": 123208,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], 
"Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188730.678, "dur": 0.340, + "args": { + "External id": 123209,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188734.587, "dur": 1.451, + "args": { + "External id": 123210,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188736.967, "dur": 0.251, + "args": { + "External id": 123211,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188738.187, "dur": 1.680, + "args": { + "External id": 123212,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188742.077, "dur": 0.300, + "args": { + "External id": 123213,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188743.297, "dur": 0.260, + "args": { + "External id": 123214,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188747.177, "dur": 0.390, + "args": { + "External id": 123215,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685188763.837, "dur": 31.470, + "args": { + "External id": 123216,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685188838.297, "dur": 146.200, + "args": { + "External id": 123217,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", 
"Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685188852.417, "dur": 9.260, + "args": { + "External id": 123218,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685188868.107, "dur": 11.450, + "args": { + "External id": 123219,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685188871.537, "dur": 7.540, + "args": { + "External id": 123220,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188876.087, "dur": 0.850, + "args": { + "External id": 123221,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", 
"pid": 5717, "tid": 5717, + "ts": 6302685188889.867, "dur": 31.580, + "args": { + "External id": 123222,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188891.947, "dur": 1.770, + "args": { + "External id": 123223,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188896.017, "dur": 0.310, + "args": { + "External id": 123224,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188898.417, "dur": 0.320, + "args": { + "External id": 123225,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188900.767, "dur": 0.300, + "args": { + "External id": 123226,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], 
[], [], []], "Ev Idx": 3845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188902.007, "dur": 0.530, + "args": { + "External id": 123227,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188904.767, "dur": 0.310, + "args": { + "External id": 123228,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188907.217, "dur": 0.230, + "args": { + "External id": 123229,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188908.607, "dur": 4.170, + "args": { + "External id": 123230,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685188914.737, "dur": 1.490, + "args": { + "External id": 123231,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685188939.867, "dur": 28.590, + "args": { + "External id": 123232,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685189077.717, "dur": 185.229, + "args": { + "External id": 123233,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685189110.997, "dur": 146.079, + "args": { + "External id": 123234,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3853, 
"In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685189141.777, "dur": 107.759, + "args": { + "External id": 123235,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685189284.836, "dur": 6.740, + "args": { + "External id": 123236,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3855, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 5717, "tid": 5717, + "ts": 6302685189425.866, "dur": 460.589, + "args": { + "External id": 123237,"Record function id": 0, "Ev Idx": 3856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189569.776, "dur": 6.420, + "args": { + "External id": 123238,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189582.576, "dur": 1.189, + "args": { + "External id": 123239,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189586.496, "dur": 4.240, + "args": { + "External id": 123240,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189595.225, "dur": 1.331, + "args": { + "External id": 123241,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189598.585, "dur": 0.891, + "args": { + "External id": 123242,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189602.996, "dur": 0.909, + "args": { + "External id": 123243,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189607.716, "dur": 0.849, + "args": { + "External id": 123244,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189610.645, "dur": 3.440, + "args": { + "External id": 123245,"Record function id": 0, 
"Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189616.096, "dur": 0.829, + "args": { + "External id": 123246,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685189620.256, "dur": 0.829, + "args": { + "External id": 123247,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685189653.345, "dur": 180.440, + "args": { + "External id": 123248,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685189700.865, "dur": 126.420, + "args": { + "External id": 123249,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 
1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685189721.985, "dur": 10.320, + "args": { + "External id": 123250,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685189735.875, "dur": 56.360, + "args": { + "External id": 123251,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685189740.155, "dur": 51.500, + "args": { + "External id": 123252,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685189744.905, "dur": 11.950, + "args": { + "External id": 123253,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3872 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685189758.905, "dur": 31.870, + "args": { + "External id": 123254,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685190094.214, "dur": 56.340, + "args": { + "External id": 123255,"Sequence number": 2575772, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3874 + } + }, + { + "ph": "s", "id": 294, "pid": 5717, "tid": 5717, "ts": 6302685190094.214, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685190124.424, "dur": 15.800, + "args": { + "External id": 123256,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685190130.034, "dur": 9.440, + "args": { + "External id": 123257,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685190291.984, "dur": 33.630, + "args": { + "External id": 123258,"Record function id": 0, "Ev Idx": 3877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 
6302685190327.864, "dur": 3673.642, + "args": { + "External id": 123259,"Record function id": 0, "Ev Idx": 3878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685190367.344, "dur": 245.509, + "args": { + "External id": 123260,"Sequence number": 2575773, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 3879 + } + }, + { + "ph": "s", "id": 293, "pid": 5717, "tid": 5717, "ts": 6302685190367.344, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685190447.814, "dur": 72.429, + "args": { + "External id": 123261,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 3880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685190555.853, "dur": 11.590, + "args": { + "External id": 123262,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685190559.803, "dur": 7.310, + "args": { + "External id": 123263,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685190663.363, "dur": 20.320, + "args": { + "External id": 123264,"Record function id": 0, "Ev Idx": 3883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685190685.343, "dur": 2106.985, + "args": { + "External id": 123265,"Record function id": 0, "Ev Idx": 3884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685190723.793, "dur": 340.569, + "args": { + "External id": 123266,"Sequence number": 2575774, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 3885 + } + }, + { + "ph": "s", "id": 292, "pid": 5717, "tid": 5717, "ts": 6302685190723.793, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685190771.673, "dur": 59.230, + "args": { + "External id": 123267,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685190853.513, "dur": 31.880, + "args": { + "External id": 123268,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], 
[768, 768], [16384, 768]], "Ev Idx": 3887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685190906.002, "dur": 30.280, + "args": { + "External id": 123269,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685190994.282, "dur": 6.290, + "args": { + "External id": 123270,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685191015.492, "dur": 1.810, + "args": { + "External id": 123271,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685191025.032, "dur": 2.850, + "args": { + "External id": 123272,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191111.392, "dur": 23.120, + "args": { + "External id": 123273,"Record function id": 0, "Ev Idx": 3892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685191137.232, "dur": 
932.358, + "args": { + "External id": 123274,"Record function id": 0, "Ev Idx": 3893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191178.612, "dur": 7.980, + "args": { + "External id": 123275,"Record function id": 0, "Ev Idx": 3894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685191188.212, "dur": 461.989, + "args": { + "External id": 123276,"Record function id": 0, "Ev Idx": 3895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685191218.542, "dur": 429.089, + "args": { + "External id": 123277,"Sequence number": 2575775, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3896 + } + }, + { + "ph": "s", "id": 291, "pid": 5717, "tid": 5717, "ts": 6302685191218.542, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191232.842, "dur": 14.030, + "args": { + "External id": 123278,"Record function id": 0, "Ev Idx": 3897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685191250.642, "dur": 379.569, + "args": { + "External id": 123279,"Record function id": 0, "Ev Idx": 3898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191324.281, "dur": 9.131, + "args": { + "External id": 123280,"Record function id": 0, "Ev Idx": 3899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled 
Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685191335.292, "dur": 244.139, + "args": { + "External id": 123281,"Record function id": 0, "Ev Idx": 3900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191342.612, "dur": 6.849, + "args": { + "External id": 123282,"Record function id": 0, "Ev Idx": 3901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685191350.652, "dur": 223.599, + "args": { + "External id": 123283,"Record function id": 0, "Ev Idx": 3902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191423.421, "dur": 11.380, + "args": { + "External id": 123284,"Record function id": 0, "Ev Idx": 3903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685191436.711, "dur": 135.340, + "args": { + "External id": 123285,"Record function id": 0, "Ev Idx": 3904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685191499.231, "dur": 47.120, + "args": { + "External id": 123286,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 
32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191592.591, "dur": 6.740, + "args": { + "External id": 123287,"Record function id": 0, "Ev Idx": 3906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685191600.681, "dur": 28.110, + "args": { + "External id": 123288,"Record function id": 0, "Ev Idx": 3907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191658.541, "dur": 8.660, + "args": { + "External id": 123289,"Record function id": 0, "Ev Idx": 3908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685191668.541, "dur": 400.239, + "args": { + "External id": 123290,"Record function id": 0, "Ev Idx": 3909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191676.351, "dur": 4.590, + "args": { + "External id": 123291,"Record function id": 0, "Ev Idx": 3910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685191682.131, "dur": 384.769, + "args": { + "External id": 123292,"Record function id": 0, "Ev Idx": 3911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685191708.571, "dur": 356.079, + "args": { + "External id": 123293,"Sequence number": 2575776, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], 
[4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3912 + } + }, + { + "ph": "s", "id": 290, "pid": 5717, "tid": 5717, "ts": 6302685191708.571, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191719.321, "dur": 10.760, + "args": { + "External id": 123294,"Record function id": 0, "Ev Idx": 3913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685191731.591, "dur": 316.879, + "args": { + "External id": 123295,"Record function id": 0, "Ev Idx": 3914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191771.951, "dur": 7.829, + "args": { + "External id": 123296,"Record function id": 0, "Ev Idx": 3915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685191781.380, "dur": 218.030, + "args": { + "External id": 123297,"Record function id": 0, "Ev Idx": 3916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191787.731, "dur": 6.529, + "args": { + "External id": 123298,"Record function id": 0, "Ev Idx": 3917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685191795.511, "dur": 200.059, + "args": { + "External id": 123299,"Record function id": 0, "Ev Idx": 3918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685191858.260, "dur": 11.540, + "args": { + "External id": 123300,"Record function id": 0, "Ev Idx": 3919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685191871.380, "dur": 122.310, + "args": { + "External id": 123301,"Record function id": 0, "Ev Idx": 3920 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685191927.720, "dur": 42.790, + "args": { + "External id": 123302,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685192011.910, "dur": 6.720, + "args": { + "External id": 123303,"Record function id": 0, "Ev Idx": 3922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685192020.150, "dur": 27.000, + "args": { + "External id": 123304,"Record function id": 0, "Ev Idx": 3923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685192091.640, "dur": 28.340, + "args": { + "External id": 123305,"Record function id": 0, "Ev Idx": 3924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685192122.570, "dur": 666.778, + "args": { + "External id": 123306,"Record function id": 0, "Ev Idx": 3925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 
5717, "tid": 5717, + "ts": 6302685192179.739, "dur": 581.839, + "args": { + "External id": 123307,"Sequence number": 2575777, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 3926 + } + }, + { + "ph": "s", "id": 289, "pid": 5717, "tid": 5717, "ts": 6302685192179.739, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685192241.659, "dur": 313.010, + "args": { + "External id": 123308,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 3927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685192348.049, "dur": 31.240, + "args": { + "External id": 123309,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685192355.479, "dur": 20.980, + "args": { + "External id": 123310,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", 
"0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685192383.729, "dur": 13.790, + "args": { + "External id": 123311,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685192400.649, "dur": 6.100, + "args": { + "External id": 123312,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685192414.839, "dur": 10.330, + "args": { + "External id": 123313,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685192601.199, "dur": 78.379, + "args": { + "External id": 123314,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685192815.008, "dur": 50.900, + "args": { + "External id": 
123315,"Record function id": 0, "Ev Idx": 3934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685192868.338, "dur": 1123.068, + "args": { + "External id": 123316,"Record function id": 0, "Ev Idx": 3935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685192934.818, "dur": 1017.328, + "args": { + "External id": 123317,"Sequence number": 2575778, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 3936 + } + }, + { + "ph": "s", "id": 288, "pid": 5717, "tid": 5717, "ts": 6302685192934.818, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685193059.148, "dur": 71.189, + "args": { + "External id": 123318,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 3937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, 
"tid": 5717, + "ts": 6302685193181.887, "dur": 95.370, + "args": { + "External id": 123319,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685193349.357, "dur": 72.570, + "args": { + "External id": 123320,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685193503.607, "dur": 66.549, + "args": { + "External id": 123321,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 3940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685193616.656, "dur": 90.060, + "args": { + "External id": 123322,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 
5717, + "ts": 6302685193777.516, "dur": 57.680, + "args": { + "External id": 123323,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3942 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.1)", "pid": 5717, "tid": 5717, + "ts": 6302685194148.585, "dur": 222.580, + "args": { + "External id": 123324,"Record function id": 0, "Ev Idx": 3943 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685194702.984, "dur": 256.139, + "args": { + "External id": 123325,"Record function id": 0, "Ev Idx": 3944 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.2)", "pid": 5717, "tid": 5717, + "ts": 6302685195007.773, "dur": 9554.169, + "args": { + "External id": 123326,"Record function id": 0, "Ev Idx": 3945 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 5717, "tid": 5717, + "ts": 6302685195050.613, "dur": 4742.479, + "args": { + "External id": 123327,"Record function id": 0, "Ev Idx": 3946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685195602.512, "dur": 74.200, + "args": { + "External id": 123328,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3947 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685195770.491, "dur": 157.220, + "args": { + "External id": 123329,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195801.702, "dur": 10.999, + "args": { + "External id": 123330,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195828.211, "dur": 1.870, + "args": { + "External id": 123331,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195836.581, "dur": 1.400, + "args": { + "External id": 123332,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195847.411, "dur": 8.100, + "args": { + "External id": 123333,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195861.611, "dur": 1.540, + "args": { + "External id": 123334,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195869.131, "dur": 1.540, + "args": { + "External id": 123335,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195883.171, "dur": 1.470, + "args": { + "External id": 123336,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195890.291, "dur": 1.320, + "args": { + "External id": 123337,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685195900.721, "dur": 1.330, + "args": { + "External id": 123338,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685195987.571, "dur": 200.019, + "args": { + "External id": 123339,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685196508.800, "dur": 776.908, + "args": { + "External id": 123340,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685196598.869, "dur": 62.800, + "args": { + "External id": 123341,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685196703.699, "dur": 72.090, + "args": { + "External id": 123342,"Record 
function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685196723.409, "dur": 48.830, + "args": { + "External id": 123343,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196748.469, "dur": 9.410, + "args": { + "External id": 123344,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685196845.429, "dur": 121.190, + "args": { + "External id": 123345,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196857.479, "dur": 5.710, + "args": { + "External id": 123346,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5717, "tid": 5717, + "ts": 6302685196874.129, "dur": 1.520, + "args": { + "External id": 123347,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196881.569, "dur": 1.640, + "args": { + "External id": 123348,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196895.469, "dur": 1.800, + "args": { + "External id": 123349,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196902.949, "dur": 1.370, + "args": { + "External id": 123350,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196910.229, "dur": 5.070, + "args": { + "External id": 123351,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3970 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196924.749, "dur": 1.560, + "args": { + "External id": 123352,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196931.989, "dur": 1.470, + "args": { + "External id": 123353,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685196942.979, "dur": 4.940, + "args": { + "External id": 123354,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685197051.268, "dur": 157.100, + "args": { + "External id": 123355,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685197889.227, "dur": 
948.897, + "args": { + "External id": 123356,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685198074.606, "dur": 733.609, + "args": { + "External id": 123357,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3976, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685198227.586, "dur": 522.809, + "args": { + "External id": 123358,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685198958.104, "dur": 36.390, + "args": { + "External id": 123359,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], 
"Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3978, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 5717, "tid": 5717, + "ts": 6302685200043.432, "dur": 2979.533, + "args": { + "External id": 123360,"Record function id": 0, "Ev Idx": 3979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201142.509, "dur": 55.460, + "args": { + "External id": 123361,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201252.579, "dur": 13.120, + "args": { + "External id": 123362,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201288.229, "dur": 5.160, + "args": { + "External id": 123363,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201356.349, "dur": 12.360, + "args": { + "External id": 123364,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + 
"ts": 6302685201385.259, "dur": 4.650, + "args": { + "External id": 123365,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201400.099, "dur": 4.160, + "args": { + "External id": 123366,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201419.069, "dur": 4.610, + "args": { + "External id": 123367,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201434.049, "dur": 10.370, + "args": { + "External id": 123368,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201458.138, "dur": 3.980, + "args": { + "External id": 123369,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685201472.029, "dur": 3.929, + "args": { + "External id": 123370,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 
3989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685201596.208, "dur": 1060.248, + "args": { + "External id": 123371,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685201719.108, "dur": 885.208, + "args": { + "External id": 123372,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685201853.298, "dur": 60.839, + "args": { + "External id": 123373,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685201931.297, "dur": 351.450, + "args": { + "External id": 
123374,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685201942.668, "dur": 330.659, + "args": { + "External id": 123375,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685201963.457, "dur": 57.730, + "args": { + "External id": 123376,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685202033.987, "dur": 234.990, + "args": { + "External id": 123377,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685204091.033, "dur": 188.649, + "args": { + "External id": 123378,"Sequence number": 2575779, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3997 + } + }, + { + "ph": "s", "id": 287, "pid": 5717, "tid": 5717, "ts": 6302685204091.033, + 
"cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685204187.743, "dur": 58.009, + "args": { + "External id": 123379,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685204207.882, "dur": 31.800, + "args": { + "External id": 123380,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685204794.981, "dur": 89.440, + "args": { + "External id": 123381,"Record function id": 0, "Ev Idx": 4000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685204893.321, "dur": 13123.630, + "args": { + "External id": 123382,"Record function id": 0, "Ev Idx": 4001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685205066.881, "dur": 972.657, + "args": { + "External id": 123383,"Sequence number": 2575780, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4002 + } + }, + { + "ph": "s", "id": 286, "pid": 5717, "tid": 5717, "ts": 6302685205066.881, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685205478.940, "dur": 232.499, + "args": { + "External 
id": 123384,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685205832.909, "dur": 35.110, + "args": { + "External id": 123385,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685205842.579, "dur": 23.569, + "args": { + "External id": 123386,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685206259.888, "dur": 141.659, + "args": { + "External id": 123387,"Record function id": 0, "Ev Idx": 4006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685206418.467, "dur": 8230.192, + "args": { + "External id": 123388,"Record function id": 0, "Ev Idx": 4007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685206616.667, "dur": 1627.696, + "args": { + "External id": 123389,"Sequence number": 2575781, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", 
"", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4008 + } + }, + { + "ph": "s", "id": 285, "pid": 5717, "tid": 5717, "ts": 6302685206616.667, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685206843.637, "dur": 263.319, + "args": { + "External id": 123390,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685207204.245, "dur": 183.060, + "args": { + "External id": 123391,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685207478.565, "dur": 138.250, + "args": { + "External id": 123392,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685207951.914, "dur": 27.870, + "args": { + "External id": 123393,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4012 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685208047.334, "dur": 8.060, + "args": { + "External id": 123394,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685208092.274, "dur": 8.809, + "args": { + "External id": 123395,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685208446.353, "dur": 72.769, + "args": { + "External id": 123396,"Record function id": 0, "Ev Idx": 4015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685208527.873, "dur": 3608.701, + "args": { + "External id": 123397,"Record function id": 0, "Ev Idx": 4016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685208668.832, "dur": 25.570, + "args": { + "External id": 123398,"Record function id": 0, "Ev Idx": 4017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685208699.812, "dur": 1809.766, + "args": { + "External id": 123399,"Record function id": 0, "Ev Idx": 4018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685208804.592, "dur": 1694.696, + "args": { + "External id": 123400,"Sequence number": 2575782, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", 
"", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4019 + } + }, + { + "ph": "s", "id": 284, "pid": 5717, "tid": 5717, "ts": 6302685208804.592, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685208843.492, "dur": 42.270, + "args": { + "External id": 123401,"Record function id": 0, "Ev Idx": 4020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685208895.872, "dur": 1534.136, + "args": { + "External id": 123402,"Record function id": 0, "Ev Idx": 4021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685209081.521, "dur": 26.890, + "args": { + "External id": 123403,"Record function id": 0, "Ev Idx": 4022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685209114.461, "dur": 1098.688, + "args": { + "External id": 123404,"Record function id": 0, "Ev Idx": 4023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685209136.671, "dur": 28.500, + "args": { + "External id": 123405,"Record function id": 0, "Ev Idx": 4024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685209169.991, "dur": 1023.848, + "args": { + "External id": 123406,"Record function id": 0, "Ev Idx": 4025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685209582.010, "dur": 72.570, + "args": { + "External id": 123407,"Record function id": 0, "Ev Idx": 4026 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685209663.710, "dur": 521.969, + "args": { + "External id": 123408,"Record function id": 0, "Ev Idx": 4027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685209920.090, "dur": 177.229, + "args": { + "External id": 123409,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685210253.559, "dur": 24.090, + "args": { + "External id": 123410,"Record function id": 0, "Ev Idx": 4029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685210282.918, "dur": 140.620, + "args": { + "External id": 123411,"Record function id": 0, "Ev Idx": 4030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685210537.698, "dur": 36.130, + "args": { + "External id": 123412,"Record function id": 0, "Ev Idx": 4031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685210578.838, "dur": 1554.787, + "args": { + "External id": 123413,"Record function id": 0, "Ev Idx": 4032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685210601.738, "dur": 19.510, + "args": { + "External id": 123414,"Record function id": 0, "Ev Idx": 4033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685210625.618, "dur": 1500.636, + "args": { + "External id": 123415,"Record function id": 0, "Ev Idx": 4034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685210731.417, "dur": 1385.837, + "args": { + "External id": 123416,"Sequence number": 2575783, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4035 + } + }, + { + "ph": "s", "id": 283, "pid": 5717, "tid": 5717, "ts": 6302685210731.417, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685210765.957, "dur": 35.711, + "args": { + "External id": 123417,"Record function id": 0, "Ev Idx": 4036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685210807.317, "dur": 1246.588, + "args": { + "External id": 123418,"Record function id": 0, "Ev Idx": 4037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685210964.167, "dur": 22.460, + "args": { + "External id": 123419,"Record 
function id": 0, "Ev Idx": 4038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685210992.167, "dur": 883.128, + "args": { + "External id": 123420,"Record function id": 0, "Ev Idx": 4039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685211015.197, "dur": 25.230, + "args": { + "External id": 123421,"Record function id": 0, "Ev Idx": 4040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685211045.177, "dur": 812.318, + "args": { + "External id": 123422,"Record function id": 0, "Ev Idx": 4041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685211286.616, "dur": 79.510, + "args": { + "External id": 123423,"Record function id": 0, "Ev Idx": 4042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685211373.586, "dur": 476.939, + "args": { + "External id": 123424,"Record function id": 0, "Ev Idx": 4043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685211595.636, "dur": 164.149, + "args": { + "External id": 123425,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 
1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685211917.415, "dur": 26.150, + "args": { + "External id": 123426,"Record function id": 0, "Ev Idx": 4045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685211949.055, "dur": 99.590, + "args": { + "External id": 123427,"Record function id": 0, "Ev Idx": 4046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685212214.184, "dur": 69.850, + "args": { + "External id": 123428,"Record function id": 0, "Ev Idx": 4047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685212290.134, "dur": 2348.545, + "args": { + "External id": 123429,"Record function id": 0, "Ev Idx": 4048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685212508.293, "dur": 2042.346, + "args": { + "External id": 123430,"Sequence number": 2575784, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4049 + } + }, + { + "ph": "s", "id": 282, "pid": 5717, "tid": 5717, "ts": 6302685212508.293, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685212733.893, "dur": 1135.288, + "args": { + "External id": 123431,"Record function 
id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685213075.952, "dur": 113.350, + "args": { + "External id": 123432,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685213107.212, "dur": 73.280, + "args": { + "External id": 123433,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685213204.022, "dur": 61.030, + "args": { + "External id": 123434,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685213279.732, "dur": 60.780, + "args": { + "External id": 123435,"Record function id": 0, "Concrete Inputs": 
["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685213367.302, "dur": 34.729, + "args": { + "External id": 123436,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685214012.370, "dur": 237.100, + "args": { + "External id": 123437,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685214720.728, "dur": 153.800, + "args": { + "External id": 123438,"Record function id": 0, "Ev Idx": 4057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685214882.848, "dur": 3109.573, + "args": { + "External id": 123439,"Record function id": 0, "Ev Idx": 4058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685215094.588, "dur": 2809.503, + "args": { + "External id": 123440,"Sequence number": 2575785, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 
2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4059 + } + }, + { + "ph": "s", "id": 281, "pid": 5717, "tid": 5717, "ts": 6302685215094.588, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685215526.407, "dur": 296.179, + "args": { + "External id": 123441,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685215948.176, "dur": 420.599, + "args": { + "External id": 123442,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685216477.175, "dur": 150.819, + "args": { + "External id": 123443,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685216801.644, "dur": 143.779, + "args": { + "External id": 123444,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685217032.993, "dur": 193.550, + "args": { + "External id": 123445,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685217428.802, "dur": 137.370, + "args": { + "External id": 123446,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4065 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.2)", "pid": 5717, "tid": 5717, + "ts": 6302685218382.660, "dur": 352.839, + "args": { + "External id": 123447,"Record function id": 
0, "Ev Idx": 4066 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685219128.829, "dur": 370.159, + "args": { + "External id": 123448,"Record function id": 0, "Ev Idx": 4067 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.3)", "pid": 5717, "tid": 5717, + "ts": 6302685219579.308, "dur": 7915.412, + "args": { + "External id": 123449,"Record function id": 0, "Ev Idx": 4068 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 5717, "tid": 5717, + "ts": 6302685219649.237, "dur": 4555.020, + "args": { + "External id": 123450,"Record function id": 0, "Ev Idx": 4069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685220147.506, "dur": 68.240, + "args": { + "External id": 123451,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685220341.506, "dur": 157.369, + "args": { + "External id": 123452,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220368.966, "dur": 10.850, + "args": { + "External id": 123453,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev 
Idx": 4072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220396.156, "dur": 1.770, + "args": { + "External id": 123454,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220404.536, "dur": 1.480, + "args": { + "External id": 123455,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220415.826, "dur": 9.590, + "args": { + "External id": 123456,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220431.716, "dur": 1.760, + "args": { + "External id": 123457,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220443.255, "dur": 1.311, + "args": { + "External id": 123458,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220454.466, "dur": 1.509, + "args": { + "External id": 123459,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220461.715, "dur": 1.380, + "args": { + "External id": 123460,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685220471.955, "dur": 1.531, + "args": { + "External id": 123461,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685220596.415, "dur": 183.390, + "args": { + "External id": 123462,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685221089.164, "dur": 1102.778, + "args": { + "External id": 123463,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685221217.654, "dur": 135.350, + "args": { + "External id": 123464,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685221412.773, "dur": 94.640, + "args": { + "External id": 123465,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685221438.643, "dur": 65.610, + "args": { + "External id": 123466,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221474.743, "dur": 8.530, + "args": { + 
"External id": 123467,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685221600.303, "dur": 141.910, + "args": { + "External id": 123468,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221618.383, "dur": 8.000, + "args": { + "External id": 123469,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221633.893, "dur": 5.670, + "args": { + "External id": 123470,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221645.453, "dur": 1.550, + "args": { + "External id": 123471,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4090 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221656.723, "dur": 1.600, + "args": { + "External id": 123472,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221668.073, "dur": 1.350, + "args": { + "External id": 123473,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221675.183, "dur": 1.290, + "args": { + "External id": 123474,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221685.583, "dur": 1.320, + "args": { + "External id": 123475,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221696.263, "dur": 1.370, + "args": { + "External id": 123476,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 4095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685221706.793, "dur": 5.090, + "args": { + "External id": 123477,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685221853.743, "dur": 233.879, + "args": { + "External id": 123478,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685222719.290, "dur": 719.459, + "args": { + "External id": 123479,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685222829.710, "dur": 569.799, + "args": { + "External id": 123480,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group 
Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4099, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685222908.910, "dur": 430.979, + "args": { + "External id": 123481,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685223722.378, "dur": 42.870, + "args": { + "External id": 123482,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4101, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 5717, "tid": 5717, + "ts": 6302685224400.487, "dur": 1785.706, + "args": { + "External id": 123483,"Record function id": 0, "Ev Idx": 4102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685224992.275, "dur": 28.940, + "args": { + "External id": 123484,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], 
"Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225048.295, "dur": 5.010, + "args": { + "External id": 123485,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225063.655, "dur": 3.900, + "args": { + "External id": 123486,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225078.875, "dur": 3.520, + "args": { + "External id": 123487,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225091.045, "dur": 3.280, + "args": { + "External id": 123488,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225102.495, "dur": 3.340, + "args": { + "External id": 123489,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225114.685, "dur": 3.440, + "args": { + "External id": 123490,"Record 
function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225131.465, "dur": 9.580, + "args": { + "External id": 123491,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225149.395, "dur": 3.580, + "args": { + "External id": 123492,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685225161.245, "dur": 3.030, + "args": { + "External id": 123493,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685225276.355, "dur": 701.548, + "args": { + "External id": 123494,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685225415.594, "dur": 534.709, + "args": { + "External id": 123495,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685225509.204, "dur": 64.670, + "args": { + "External id": 123496,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685225588.494, "dur": 214.969, + "args": { + "External id": 123497,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685225599.284, "dur": 202.270, + "args": { + "External id": 123498,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685225616.144, "dur": 48.850, + "args": { + "External id": 123499,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685225670.944, "dur": 127.850, + "args": { + "External id": 123500,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685227125.720, "dur": 143.180, + "args": { + "External id": 123501,"Sequence number": 2575786, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4120 + } + }, + { + "ph": "s", "id": 280, "pid": 5717, "tid": 5717, "ts": 6302685227125.720, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685227197.940, "dur": 44.730, + "args": { + "External id": 123502,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685227214.280, "dur": 24.700, + "args": { + "External id": 123503,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], 
"Input Dims": [[8, 2048, 768], []], "Ev Idx": 4122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685227667.169, "dur": 66.390, + "args": { + "External id": 123504,"Record function id": 0, "Ev Idx": 4123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685227739.479, "dur": 6192.066, + "args": { + "External id": 123505,"Record function id": 0, "Ev Idx": 4124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685227873.949, "dur": 732.158, + "args": { + "External id": 123506,"Sequence number": 2575787, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4125 + } + }, + { + "ph": "s", "id": 279, "pid": 5717, "tid": 5717, "ts": 6302685227873.949, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685228196.488, "dur": 195.649, + "args": { + "External id": 123507,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685228472.908, "dur": 26.629, + "args": { + 
"External id": 123508,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685228480.288, "dur": 18.029, + "args": { + "External id": 123509,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685228729.997, "dur": 62.660, + "args": { + "External id": 123510,"Record function id": 0, "Ev Idx": 4129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685228798.517, "dur": 4319.130, + "args": { + "External id": 123511,"Record function id": 0, "Ev Idx": 4130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685228926.476, "dur": 1270.117, + "args": { + "External id": 123512,"Sequence number": 2575788, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4131 + } + }, + { + "ph": "s", "id": 278, "pid": 5717, "tid": 5717, "ts": 6302685228926.476, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685229086.986, "dur": 190.249, + "args": { + "External id": 123513,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], 
[16384, 768]], "Ev Idx": 4132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685229386.395, "dur": 171.430, + "args": { + "External id": 123514,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685229627.335, "dur": 97.190, + "args": { + "External id": 123515,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685229932.104, "dur": 51.750, + "args": { + "External id": 123516,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685230048.774, "dur": 5.720, + "args": { + "External id": 123517,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685230084.544, "dur": 6.640, + "args": { + "External id": 123518,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": 
[[8, 2048, 12, 64], []], "Ev Idx": 4137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685230352.243, "dur": 43.050, + "args": { + "External id": 123519,"Record function id": 0, "Ev Idx": 4138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685230399.403, "dur": 1891.346, + "args": { + "External id": 123520,"Record function id": 0, "Ev Idx": 4139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685230471.643, "dur": 13.210, + "args": { + "External id": 123521,"Record function id": 0, "Ev Idx": 4140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685230487.463, "dur": 1001.618, + "args": { + "External id": 123522,"Record function id": 0, "Ev Idx": 4141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685230542.343, "dur": 942.927, + "args": { + "External id": 123523,"Sequence number": 2575789, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4142 + } + }, + { + "ph": "s", "id": 277, "pid": 5717, "tid": 5717, "ts": 6302685230542.343, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685230564.893, "dur": 126.319, + "args": { + "External id": 123524,"Record function id": 0, "Ev Idx": 4143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, 
+ "ts": 6302685230694.822, "dur": 762.699, + "args": { + "External id": 123525,"Record function id": 0, "Ev Idx": 4144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685230791.372, "dur": 14.340, + "args": { + "External id": 123526,"Record function id": 0, "Ev Idx": 4145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685230809.002, "dur": 563.169, + "args": { + "External id": 123527,"Record function id": 0, "Ev Idx": 4146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685230821.472, "dur": 15.200, + "args": { + "External id": 123528,"Record function id": 0, "Ev Idx": 4147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685230839.362, "dur": 522.129, + "args": { + "External id": 123529,"Record function id": 0, "Ev Idx": 4148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231011.432, "dur": 24.339, + "args": { + "External id": 123530,"Record function id": 0, "Ev Idx": 4149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685231040.642, "dur": 315.699, + "args": { + "External id": 123531,"Record function id": 0, "Ev Idx": 4150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685231186.261, "dur": 93.890, + "args": { + "External id": 123532,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", 
"kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231396.871, "dur": 12.400, + "args": { + "External id": 123533,"Record function id": 0, "Ev Idx": 4152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685231412.181, "dur": 42.940, + "args": { + "External id": 123534,"Record function id": 0, "Ev Idx": 4153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231500.090, "dur": 14.000, + "args": { + "External id": 123535,"Record function id": 0, "Ev Idx": 4154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685231516.121, "dur": 773.138, + "args": { + "External id": 123536,"Record function id": 0, "Ev Idx": 4155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231527.470, "dur": 7.880, + "args": { + "External id": 123537,"Record function id": 0, "Ev Idx": 4156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685231537.061, "dur": 749.428, + "args": { + "External id": 123538,"Record function id": 0, "Ev Idx": 4157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685231680.890, 
"dur": 601.899, + "args": { + "External id": 123539,"Sequence number": 2575790, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4158 + } + }, + { + "ph": "s", "id": 276, "pid": 5717, "tid": 5717, "ts": 6302685231680.890, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231695.150, "dur": 15.540, + "args": { + "External id": 123540,"Record function id": 0, "Ev Idx": 4159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685231713.200, "dur": 545.819, + "args": { + "External id": 123541,"Record function id": 0, "Ev Idx": 4160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231777.270, "dur": 8.970, + "args": { + "External id": 123542,"Record function id": 0, "Ev Idx": 4161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685231788.820, "dur": 401.679, + "args": { + "External id": 123543,"Record function id": 0, "Ev Idx": 4162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231798.430, "dur": 10.180, + "args": { + "External id": 123544,"Record function id": 0, "Ev Idx": 4163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685231810.590, "dur": 373.639, + "args": { + "External id": 123545,"Record function id": 0, "Ev Idx": 4164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685231943.540, "dur": 16.960, + "args": { + "External id": 123546,"Record function id": 0, "Ev Idx": 4165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685231963.109, "dur": 218.250, + "args": { + "External id": 123547,"Record function id": 0, "Ev Idx": 4166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685232050.529, "dur": 76.610, + "args": { + "External id": 123548,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685232205.989, "dur": 9.610, + "args": { + "External id": 123549,"Record function id": 0, "Ev Idx": 4168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685232217.819, "dur": 39.060, + "args": { + "External id": 123550,"Record function id": 0, "Ev Idx": 4169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + 
"ts": 6302685232334.909, "dur": 24.430, + "args": { + "External id": 123551,"Record function id": 0, "Ev Idx": 4170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685232361.588, "dur": 752.979, + "args": { + "External id": 123552,"Record function id": 0, "Ev Idx": 4171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685232416.659, "dur": 672.368, + "args": { + "External id": 123553,"Sequence number": 2575791, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4172 + } + }, + { + "ph": "s", "id": 275, "pid": 5717, "tid": 5717, "ts": 6302685232416.659, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685232604.528, "dur": 273.139, + "args": { + "External id": 123554,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685232683.778, "dur": 31.450, + "args": { + "External id": 123555,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", 
"", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685232691.408, "dur": 21.380, + "args": { + "External id": 123556,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685232719.648, "dur": 13.870, + "args": { + "External id": 123557,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685232737.918, "dur": 6.010, + "args": { + "External id": 123558,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685232749.958, "dur": 8.950, + "args": { + "External id": 123559,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685232918.237, "dur": 72.630, + "args": { + "External id": 123560,"Record 
function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685233139.077, "dur": 47.800, + "args": { + "External id": 123561,"Record function id": 0, "Ev Idx": 4180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685233189.267, "dur": 735.928, + "args": { + "External id": 123562,"Record function id": 0, "Ev Idx": 4181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685233255.457, "dur": 647.378, + "args": { + "External id": 123563,"Sequence number": 2575792, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4182 + } + }, + { + "ph": "s", "id": 274, "pid": 5717, "tid": 5717, "ts": 6302685233255.457, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685233371.606, "dur": 55.560, + "args": { + "External id": 123564,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685233457.226, "dur": 54.320, + "args": { + "External id": 123565,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685233534.466, "dur": 35.790, + "args": { + "External id": 123566,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685233611.906, "dur": 36.940, + "args": { + "External id": 123567,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 
6302685233671.996, "dur": 63.020, + "args": { + "External id": 123568,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685233792.615, "dur": 40.300, + "args": { + "External id": 123569,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4188 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.3)", "pid": 5717, "tid": 5717, + "ts": 6302685234014.275, "dur": 106.100, + "args": { + "External id": 123570,"Record function id": 0, "Ev Idx": 4189 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685234261.114, "dur": 157.110, + "args": { + "External id": 123571,"Record function id": 0, "Ev Idx": 4190 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.4)", "pid": 5717, "tid": 5717, + "ts": 6302685234437.234, "dur": 1610.126, + "args": { + "External id": 123572,"Record function id": 0, "Ev Idx": 4191 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 5717, "tid": 5717, + "ts": 6302685234451.324, "dur": 833.018, + "args": { + "External id": 123573,"Record function id": 0, "Ev Idx": 4192 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685234563.964, "dur": 15.290, + "args": { + "External id": 123574,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685234596.703, "dur": 38.060, + "args": { + "External id": 123575,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234602.814, "dur": 2.289, + "args": { + "External id": 123576,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234608.903, "dur": 0.440, + "args": { + "External id": 123577,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234610.803, "dur": 0.351, + "args": { + "External id": 123578,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234613.934, "dur": 2.929, + "args": { + "External id": 123579,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234618.134, "dur": 0.329, + "args": { + "External id": 123580,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234621.214, "dur": 0.320, + "args": { + "External id": 123581,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234624.063, "dur": 0.480, + "args": { + "External id": 123582,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234625.734, "dur": 0.269, + "args": { + "External id": 123583,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234628.794, "dur": 0.289, + "args": { + "External id": 123584,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685234648.014, "dur": 40.969, + "args": { + "External id": 123585,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685234742.933, "dur": 158.660, + "args": { + "External id": 123586,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685234761.103, "dur": 11.690, + "args": { + "External id": 123587,"Record 
function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685234780.233, "dur": 14.240, + "args": { + "External id": 123588,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685234784.303, "dur": 9.510, + "args": { + "External id": 123589,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234788.703, "dur": 2.290, + "args": { + "External id": 123590,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685234807.403, "dur": 32.630, + "args": { + "External id": 123591,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5717, "tid": 5717, + "ts": 6302685234811.253, "dur": 1.780, + "args": { + "External id": 123592,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234814.733, "dur": 0.380, + "args": { + "External id": 123593,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234816.323, "dur": 0.390, + "args": { + "External id": 123594,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234820.613, "dur": 0.350, + "args": { + "External id": 123595,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234822.243, "dur": 0.320, + "args": { + "External id": 123596,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4215 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234823.833, "dur": 1.810, + "args": { + "External id": 123597,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234828.353, "dur": 0.320, + "args": { + "External id": 123598,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234829.863, "dur": 0.650, + "args": { + "External id": 123599,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685234834.243, "dur": 1.750, + "args": { + "External id": 123600,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685234856.943, "dur": 29.980, + "args": { + "External id": 123601,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], 
[1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685234992.763, "dur": 150.549, + "args": { + "External id": 123602,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685235022.633, "dur": 114.249, + "args": { + "External id": 123603,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4222, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685235044.273, "dur": 84.959, + "args": { + "External id": 123604,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 
5717, "tid": 5717, + "ts": 6302685235173.122, "dur": 6.940, + "args": { + "External id": 123605,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4224, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 5717, "tid": 5717, + "ts": 6302685235339.502, "dur": 481.529, + "args": { + "External id": 123606,"Record function id": 0, "Ev Idx": 4225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235475.361, "dur": 6.191, + "args": { + "External id": 123607,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235487.381, "dur": 1.231, + "args": { + "External id": 123608,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235491.101, "dur": 0.911, + "args": { + "External id": 123609,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4228 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235494.472, "dur": 0.809, + "args": { + "External id": 123610,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235497.092, "dur": 0.809, + "args": { + "External id": 123611,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235501.032, "dur": 111.319, + "args": { + "External id": 123612,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235614.361, "dur": 0.930, + "args": { + "External id": 123613,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235617.481, "dur": 2.840, + "args": { + "External id": 123614,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235622.161, "dur": 0.900, + "args": { + "External id": 123615,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235626.201, "dur": 0.880, + "args": { + "External id": 123616,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685235647.581, "dur": 133.120, + "args": { + "External id": 123617,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685235663.801, "dur": 111.650, + "args": { + "External id": 123618,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685235684.871, "dur": 9.090, + "args": { + "External id": 
123619,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685235697.091, "dur": 47.310, + "args": { + "External id": 123620,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685235699.081, "dur": 44.800, + "args": { + "External id": 123621,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685235703.751, "dur": 9.220, + "args": { + "External id": 123622,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685235714.781, "dur": 28.180, + "args": { + "External id": 123623,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 
5717, "tid": 5717, + "ts": 6302685235974.911, "dur": 34.289, + "args": { + "External id": 123624,"Sequence number": 2575793, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4243 + } + }, + { + "ph": "s", "id": 273, "pid": 5717, "tid": 5717, "ts": 6302685235974.911, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685235991.240, "dur": 12.330, + "args": { + "External id": 123625,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685235997.500, "dur": 5.270, + "args": { + "External id": 123626,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236089.380, "dur": 16.020, + "args": { + "External id": 123627,"Record function id": 0, "Ev Idx": 4246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685236107.040, "dur": 2023.586, + "args": { + "External id": 123628,"Record function id": 0, "Ev Idx": 4247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685236140.220, "dur": 198.019, + "args": { + "External id": 123629,"Sequence number": 2575794, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4248 + } + }, + { + "ph": "s", "id": 272, "pid": 5717, "tid": 5717, "ts": 6302685236140.220, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685236217.740, "dur": 46.230, + "args": { + "External id": 123630,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685236286.100, "dur": 8.410, + "args": { + "External id": 123631,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685236288.270, "dur": 5.770, + "args": { + "External id": 123632,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236371.430, "dur": 16.080, + "args": { + "External id": 123633,"Record function id": 0, "Ev Idx": 4252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + 
"ts": 6302685236388.919, "dur": 1301.717, + "args": { + "External id": 123634,"Record function id": 0, "Ev Idx": 4253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685236427.870, "dur": 252.859, + "args": { + "External id": 123635,"Sequence number": 2575795, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4254 + } + }, + { + "ph": "s", "id": 271, "pid": 5717, "tid": 5717, "ts": 6302685236427.870, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685236472.409, "dur": 41.920, + "args": { + "External id": 123636,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685236529.709, "dur": 23.970, + "args": { + "External id": 123637,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685236567.639, "dur": 20.700, + "args": { + "External id": 123638,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4257 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685236631.169, "dur": 5.140, + "args": { + "External id": 123639,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685236646.529, "dur": 1.220, + "args": { + "External id": 123640,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685236653.759, "dur": 2.150, + "args": { + "External id": 123641,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236706.379, "dur": 11.250, + "args": { + "External id": 123642,"Record function id": 0, "Ev Idx": 4261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685236718.799, "dur": 643.308, + "args": { + "External id": 123643,"Record function id": 0, "Ev Idx": 4262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236741.159, "dur": 4.180, + "args": { + "External id": 123644,"Record function id": 0, "Ev Idx": 4263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685236746.159, "dur": 335.989, 
+ "args": { + "External id": 123645,"Record function id": 0, "Ev Idx": 4264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685236763.369, "dur": 317.249, + "args": { + "External id": 123646,"Sequence number": 2575796, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4265 + } + }, + { + "ph": "s", "id": 270, "pid": 5717, "tid": 5717, "ts": 6302685236763.369, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236772.169, "dur": 8.220, + "args": { + "External id": 123647,"Record function id": 0, "Ev Idx": 4266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685236781.538, "dur": 287.420, + "args": { + "External id": 123648,"Record function id": 0, "Ev Idx": 4267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236810.629, "dur": 8.460, + "args": { + "External id": 123649,"Record function id": 0, "Ev Idx": 4268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685236820.918, "dur": 215.450, + "args": { + "External id": 123650,"Record function id": 0, "Ev Idx": 4269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236825.409, "dur": 4.980, + "args": { + "External id": 123651,"Record function id": 0, "Ev Idx": 4270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 
7/0", "pid": 5717, "tid": 5717, + "ts": 6302685236831.298, "dur": 201.400, + "args": { + "External id": 123652,"Record function id": 0, "Ev Idx": 4271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685236902.968, "dur": 9.920, + "args": { + "External id": 123653,"Record function id": 0, "Ev Idx": 4272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685236915.948, "dur": 113.830, + "args": { + "External id": 123654,"Record function id": 0, "Ev Idx": 4273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685236965.538, "dur": 35.590, + "args": { + "External id": 123655,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237044.678, "dur": 5.170, + "args": { + "External id": 123656,"Record function id": 0, "Ev Idx": 4275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 
6302685237050.708, "dur": 17.360, + "args": { + "External id": 123657,"Record function id": 0, "Ev Idx": 4276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237086.918, "dur": 5.820, + "args": { + "External id": 123658,"Record function id": 0, "Ev Idx": 4277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685237093.608, "dur": 267.959, + "args": { + "External id": 123659,"Record function id": 0, "Ev Idx": 4278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237097.438, "dur": 3.160, + "args": { + "External id": 123660,"Record function id": 0, "Ev Idx": 4279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685237101.328, "dur": 259.139, + "args": { + "External id": 123661,"Record function id": 0, "Ev Idx": 4280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685237119.948, "dur": 239.219, + "args": { + "External id": 123662,"Sequence number": 2575797, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4281 + } + }, + { + "ph": "s", "id": 269, "pid": 5717, "tid": 5717, "ts": 6302685237119.948, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237127.318, "dur": 6.290, + "args": { + "External id": 123663,"Record function id": 0, "Ev Idx": 4282 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685237134.478, "dur": 214.689, + "args": { + "External id": 123664,"Record function id": 0, "Ev Idx": 4283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237161.438, "dur": 3.540, + "args": { + "External id": 123665,"Record function id": 0, "Ev Idx": 4284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685237165.948, "dur": 154.489, + "args": { + "External id": 123666,"Record function id": 0, "Ev Idx": 4285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237172.248, "dur": 4.060, + "args": { + "External id": 123667,"Record function id": 0, "Ev Idx": 4286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685237177.098, "dur": 140.119, + "args": { + "External id": 123668,"Record function id": 0, "Ev Idx": 4287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237219.837, "dur": 6.551, + "args": { + "External id": 123669,"Record function id": 0, "Ev Idx": 4288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685237227.468, "dur": 88.469, + "args": { + "External id": 123670,"Record function id": 0, "Ev Idx": 4289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685237263.777, "dur": 28.491, + "args": { + "External id": 123671,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": 
"/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237327.887, "dur": 4.390, + "args": { + "External id": 123672,"Record function id": 0, "Ev Idx": 4291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685237333.247, "dur": 15.200, + "args": { + "External id": 123673,"Record function id": 0, "Ev Idx": 4292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237373.897, "dur": 10.740, + "args": { + "External id": 123674,"Record function id": 0, "Ev Idx": 4293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685237385.437, "dur": 303.839, + "args": { + "External id": 123675,"Record function id": 0, "Ev Idx": 4294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685237411.887, "dur": 264.820, + "args": { + "External id": 123676,"Sequence number": 2575798, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 
12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4295 + } + }, + { + "ph": "s", "id": 268, "pid": 5717, "tid": 5717, "ts": 6302685237411.887, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685237441.547, "dur": 140.710, + "args": { + "External id": 123677,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685237483.897, "dur": 15.320, + "args": { + "External id": 123678,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685237487.527, "dur": 10.500, + "args": { + "External id": 123679,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685237501.357, "dur": 7.810, + "args": { + "External id": 123680,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", 
"6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685237511.947, "dur": 2.920, + "args": { + "External id": 123681,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685237517.857, "dur": 5.610, + "args": { + "External id": 123682,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685237602.747, "dur": 36.180, + "args": { + "External id": 123683,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685237701.116, "dur": 23.951, + "args": { + "External id": 123684,"Record function id": 0, "Ev Idx": 4303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685237726.156, "dur": 400.790, + "args": { + "External id": 123685,"Record function id": 0, "Ev Idx": 4304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685237758.096, "dur": 356.179, + "args": { 
+ "External id": 123686,"Sequence number": 2575799, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4305 + } + }, + { + "ph": "s", "id": 267, "pid": 5717, "tid": 5717, "ts": 6302685237758.096, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685237815.746, "dur": 33.040, + "args": { + "External id": 123687,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685237867.516, "dur": 32.590, + "args": { + "External id": 123688,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + 
"ts": 6302685237913.356, "dur": 22.030, + "args": { + "External id": 123689,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685237963.106, "dur": 22.640, + "args": { + "External id": 123690,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685238000.206, "dur": 29.340, + "args": { + "External id": 123691,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685238055.566, "dur": 19.130, + "args": { + "External id": 123692,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", 
"c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4311 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.4)", "pid": 5717, "tid": 5717, + "ts": 6302685238181.706, "dur": 68.219, + "args": { + "External id": 123693,"Record function id": 0, "Ev Idx": 4312 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685238338.255, "dur": 58.410, + "args": { + "External id": 123694,"Record function id": 0, "Ev Idx": 4313 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.5)", "pid": 5717, "tid": 5717, + "ts": 6302685238409.345, "dur": 1185.157, + "args": { + "External id": 123695,"Record function id": 0, "Ev Idx": 4314 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 5717, "tid": 5717, + "ts": 6302685238421.035, "dur": 630.198, + "args": { + "External id": 123696,"Record function id": 0, "Ev Idx": 4315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685238506.995, "dur": 11.260, + "args": { + "External id": 123697,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685238530.925, "dur": 29.920, + "args": { + "External id": 123698,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4317 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238535.825, "dur": 1.680, + "args": { + "External id": 123699,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238540.695, "dur": 0.250, + "args": { + "External id": 123700,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238542.254, "dur": 0.280, + "args": { + "External id": 123701,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238544.605, "dur": 2.660, + "args": { + "External id": 123702,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238548.234, "dur": 0.220, + "args": { + "External id": 123703,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 4322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238550.565, "dur": 0.220, + "args": { + "External id": 123704,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238552.734, "dur": 0.331, + "args": { + "External id": 123705,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238553.954, "dur": 0.220, + "args": { + "External id": 123706,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238556.314, "dur": 0.311, + "args": { + "External id": 123707,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685238571.694, "dur": 30.000, + "args": { + "External id": 123708,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input 
Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685238642.274, "dur": 128.140, + "args": { + "External id": 123709,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685238656.234, "dur": 9.090, + "args": { + "External id": 123710,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685238671.034, "dur": 11.050, + "args": { + "External id": 123711,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685238674.154, "dur": 7.390, + "args": { + "External id": 123712,"Record function id": 0, "Concrete Inputs": 
["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238677.354, "dur": 2.080, + "args": { + "External id": 123713,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685238691.094, "dur": 30.060, + "args": { + "External id": 123714,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238694.404, "dur": 1.620, + "args": { + "External id": 123715,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238697.304, "dur": 0.290, + "args": { + "External id": 123716,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + 
"ts": 6302685238701.664, "dur": 0.320, + "args": { + "External id": 123717,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238705.664, "dur": 0.310, + "args": { + "External id": 123718,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238706.884, "dur": 0.240, + "args": { + "External id": 123719,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238708.204, "dur": 1.400, + "args": { + "External id": 123720,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238711.744, "dur": 0.230, + "args": { + "External id": 123721,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4340 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238712.854, "dur": 0.340, + "args": { + "External id": 123722,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685238716.624, "dur": 1.560, + "args": { + "External id": 123723,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685238735.954, "dur": 23.740, + "args": { + "External id": 123724,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685238841.634, "dur": 117.890, + "args": { + "External id": 123725,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685238866.644, "dur": 88.840, 
+ "args": { + "External id": 123726,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4345, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685238881.484, "dur": 68.340, + "args": { + "External id": 123727,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685238979.284, "dur": 4.409, + "args": { + "External id": 123728,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4347, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 5717, "tid": 5717, + "ts": 6302685239076.833, "dur": 325.870, + "args": { + 
"External id": 123729,"Record function id": 0, "Ev Idx": 4348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239189.083, "dur": 5.140, + "args": { + "External id": 123730,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239200.093, "dur": 1.020, + "args": { + "External id": 123731,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239203.283, "dur": 0.810, + "args": { + "External id": 123732,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239206.133, "dur": 0.730, + "args": { + "External id": 123733,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239208.513, "dur": 0.780, + "args": { + "External id": 123734,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239212.693, "dur": 0.760, + "args": { + "External id": 123735,"Record function id": 0, 
"Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239215.153, "dur": 0.760, + "args": { + "External id": 123736,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239217.753, "dur": 2.750, + "args": { + "External id": 123737,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239222.063, "dur": 0.840, + "args": { + "External id": 123738,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239225.723, "dur": 0.570, + "args": { + "External id": 123739,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685239243.953, "dur": 123.250, + "args": { + "External id": 123740,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 
1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685239257.483, "dur": 104.890, + "args": { + "External id": 123741,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685239272.723, "dur": 8.050, + "args": { + "External id": 123742,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685239283.263, "dur": 51.050, + "args": { + "External id": 123743,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685239284.993, "dur": 48.890, + "args": { + "External id": 123744,"Record 
function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685239288.963, "dur": 18.240, + "args": { + "External id": 123745,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685239308.943, "dur": 24.170, + "args": { + "External id": 123746,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685239536.432, "dur": 26.380, + "args": { + "External id": 123747,"Sequence number": 2575800, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4366 + } + }, + { + "ph": "s", "id": 266, "pid": 5717, "tid": 5717, "ts": 6302685239536.432, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685239550.122, "dur": 8.120, + "args": { + "External id": 123748,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4367 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685239553.072, "dur": 4.500, + "args": { + "External id": 123749,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685239631.342, "dur": 13.020, + "args": { + "External id": 123750,"Record function id": 0, "Ev Idx": 4369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685239645.702, "dur": 1896.826, + "args": { + "External id": 123751,"Record function id": 0, "Ev Idx": 4370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685239673.242, "dur": 137.800, + "args": { + "External id": 123752,"Sequence number": 2575801, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4371 + } + }, + { + "ph": "s", "id": 265, "pid": 5717, "tid": 5717, "ts": 6302685239673.242, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685239731.762, "dur": 33.930, + "args": { + "External id": 123753,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 
768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685239782.322, "dur": 5.960, + "args": { + "External id": 123754,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685239783.712, "dur": 4.140, + "args": { + "External id": 123755,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685239837.602, "dur": 12.900, + "args": { + "External id": 123756,"Record function id": 0, "Ev Idx": 4375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685239851.762, "dur": 1210.437, + "args": { + "External id": 123757,"Record function id": 0, "Ev Idx": 4376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685239878.091, "dur": 236.590, + "args": { + "External id": 123758,"Sequence number": 2575802, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4377 + } + }, + { + "ph": "s", "id": 264, "pid": 5717, "tid": 5717, "ts": 6302685239878.091, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 
6302685239911.091, "dur": 40.391, + "args": { + "External id": 123759,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685239966.521, "dur": 22.840, + "args": { + "External id": 123760,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685240004.311, "dur": 20.370, + "args": { + "External id": 123761,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685240067.101, "dur": 5.280, + "args": { + "External id": 123762,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685240082.431, "dur": 1.130, + "args": { + "External id": 123763,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", 
"pid": 5717, "tid": 5717, + "ts": 6302685240089.641, "dur": 2.090, + "args": { + "External id": 123764,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240139.521, "dur": 11.260, + "args": { + "External id": 123765,"Record function id": 0, "Ev Idx": 4384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685240151.821, "dur": 564.339, + "args": { + "External id": 123766,"Record function id": 0, "Ev Idx": 4385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240173.691, "dur": 3.780, + "args": { + "External id": 123767,"Record function id": 0, "Ev Idx": 4386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685240178.221, "dur": 273.849, + "args": { + "External id": 123768,"Record function id": 0, "Ev Idx": 4387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685240194.081, "dur": 256.479, + "args": { + "External id": 123769,"Sequence number": 2575803, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4388 + } + }, + { + "ph": "s", "id": 263, "pid": 5717, "tid": 5717, "ts": 6302685240194.081, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240204.481, "dur": 6.180, + "args": { + "External id": 123770,"Record function id": 0, "Ev Idx": 4389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685240211.691, "dur": 228.039, + "args": { + "External id": 123771,"Record function id": 0, "Ev Idx": 4390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240239.641, "dur": 4.160, + "args": { + "External id": 123772,"Record function id": 0, "Ev Idx": 4391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685240244.721, "dur": 165.389, + "args": { + "External id": 123773,"Record function id": 0, "Ev Idx": 4392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240248.611, "dur": 4.440, + "args": { + "External id": 123774,"Record function id": 0, "Ev Idx": 4393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685240253.971, "dur": 152.459, + "args": { + "External id": 123775,"Record function id": 0, "Ev Idx": 4394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240311.010, "dur": 7.171, + "args": { + "External id": 123776,"Record function id": 0, "Ev Idx": 4395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685240319.781, "dur": 85.469, + "args": { + "External id": 123777,"Record function id": 0, "Ev Idx": 4396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685240359.101, "dur": 31.760, + "args": { + "External id": 123778,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", 
"Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240417.260, "dur": 4.150, + "args": { + "External id": 123779,"Record function id": 0, "Ev Idx": 4398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685240422.350, "dur": 16.500, + "args": { + "External id": 123780,"Record function id": 0, "Ev Idx": 4399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240456.070, "dur": 5.440, + "args": { + "External id": 123781,"Record function id": 0, "Ev Idx": 4400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685240462.330, "dur": 253.210, + "args": { + "External id": 123782,"Record function id": 0, "Ev Idx": 4401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240466.290, "dur": 2.910, + "args": { + "External id": 123783,"Record function id": 0, "Ev Idx": 4402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 
5717, "tid": 5717, + "ts": 6302685240469.900, "dur": 244.470, + "args": { + "External id": 123784,"Record function id": 0, "Ev Idx": 4403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685240487.730, "dur": 225.290, + "args": { + "External id": 123785,"Sequence number": 2575804, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4404 + } + }, + { + "ph": "s", "id": 262, "pid": 5717, "tid": 5717, "ts": 6302685240487.730, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240493.470, "dur": 5.970, + "args": { + "External id": 123786,"Record function id": 0, "Ev Idx": 4405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685240500.460, "dur": 202.730, + "args": { + "External id": 123787,"Record function id": 0, "Ev Idx": 4406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240525.550, "dur": 3.650, + "args": { + "External id": 123788,"Record function id": 0, "Ev Idx": 4407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685240530.130, "dur": 141.270, + "args": { + "External id": 123789,"Record function id": 0, "Ev Idx": 4408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240534.910, "dur": 3.970, + "args": { + "External id": 123790,"Record function id": 0, "Ev Idx": 4409 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685240539.610, "dur": 126.000, + "args": { + "External id": 123791,"Record function id": 0, "Ev Idx": 4410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240579.090, "dur": 6.010, + "args": { + "External id": 123792,"Record function id": 0, "Ev Idx": 4411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685240586.070, "dur": 78.550, + "args": { + "External id": 123793,"Record function id": 0, "Ev Idx": 4412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685240623.390, "dur": 27.660, + "args": { + "External id": 123794,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240680.510, "dur": 4.750, + "args": { + "External id": 123795,"Record function id": 0, "Ev Idx": 4414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 
10/0", "pid": 5717, "tid": 5717, + "ts": 6302685240686.120, "dur": 16.130, + "args": { + "External id": 123796,"Record function id": 0, "Ev Idx": 4415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685240728.110, "dur": 11.290, + "args": { + "External id": 123797,"Record function id": 0, "Ev Idx": 4416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685240740.290, "dur": 320.369, + "args": { + "External id": 123798,"Record function id": 0, "Ev Idx": 4417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685240766.680, "dur": 280.899, + "args": { + "External id": 123799,"Sequence number": 2575805, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4418 + } + }, + { + "ph": "s", "id": 261, "pid": 5717, "tid": 5717, "ts": 6302685240766.680, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685240799.149, "dur": 151.660, + "args": { + "External id": 123800,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4419 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685240839.800, "dur": 15.509, + "args": { + "External id": 123801,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685240843.500, "dur": 10.549, + "args": { + "External id": 123802,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685240857.439, "dur": 10.730, + "args": { + "External id": 123803,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685240871.949, "dur": 3.200, + "args": { + "External id": 123804,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685240878.369, "dur": 4.970, + "args": { + "External id": 123805,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": 
[[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685240971.909, "dur": 36.430, + "args": { + "External id": 123806,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685241072.779, "dur": 24.070, + "args": { + "External id": 123807,"Record function id": 0, "Ev Idx": 4426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685241098.029, "dur": 440.739, + "args": { + "External id": 123808,"Record function id": 0, "Ev Idx": 4427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685241129.919, "dur": 394.999, + "args": { + "External id": 123809,"Sequence number": 2575806, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4428 + } + }, + { + "ph": "s", "id": 260, "pid": 5717, "tid": 5717, "ts": 6302685241129.919, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685241192.319, "dur": 35.660, + "args": { + "External id": 123810,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function 
id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685241247.368, "dur": 33.340, + "args": { + "External id": 123811,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685241294.858, "dur": 34.590, + "args": { + "External id": 123812,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685241362.268, "dur": 26.250, + "args": { + "External id": 123813,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], 
"Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685241404.398, "dur": 32.000, + "args": { + "External id": 123814,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685241461.728, "dur": 21.150, + "args": { + "External id": 123815,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4434 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.5)", "pid": 5717, "tid": 5717, + "ts": 6302685241594.958, "dur": 69.900, + "args": { + "External id": 123816,"Record function id": 0, "Ev Idx": 4435 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685241742.637, "dur": 63.400, + "args": { + "External id": 123817,"Record function id": 0, "Ev Idx": 4436 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.6)", "pid": 5717, "tid": 5717, + "ts": 6302685241820.727, "dur": 1290.337, + "args": { + "External id": 123818,"Record function id": 0, "Ev Idx": 4437 + } + }, + { + "ph": "X", "cat": 
"user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 5717, "tid": 5717, + "ts": 6302685241833.617, "dur": 720.679, + "args": { + "External id": 123819,"Record function id": 0, "Ev Idx": 4438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685241941.037, "dur": 11.880, + "args": { + "External id": 123820,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685241966.387, "dur": 32.410, + "args": { + "External id": 123821,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241971.317, "dur": 1.830, + "args": { + "External id": 123822,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241976.817, "dur": 0.300, + "args": { + "External id": 123823,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685241978.137, "dur": 0.280, + "args": { + "External id": 123824,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241980.817, "dur": 3.160, + "args": { + "External id": 123825,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241985.017, "dur": 0.320, + "args": { + "External id": 123826,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241987.557, "dur": 0.290, + "args": { + "External id": 123827,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241990.187, "dur": 0.380, + "args": { + "External id": 123828,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4447 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241991.477, "dur": 0.290, + "args": { + "External id": 123829,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685241993.907, "dur": 0.290, + "args": { + "External id": 123830,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685242009.227, "dur": 30.560, + "args": { + "External id": 123831,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685242080.897, "dur": 141.739, + "args": { + "External id": 123832,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], 
[393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685242095.257, "dur": 18.549, + "args": { + "External id": 123833,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685242119.997, "dur": 11.829, + "args": { + "External id": 123834,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685242123.186, "dur": 8.131, + "args": { + "External id": 123835,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242126.717, "dur": 2.340, + "args": { + "External id": 123836,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685242141.806, "dur": 28.011, + "args": { + "External id": 123837,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 
393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242144.886, "dur": 2.280, + "args": { + "External id": 123838,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242148.497, "dur": 0.340, + "args": { + "External id": 123839,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242149.746, "dur": 0.240, + "args": { + "External id": 123840,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242153.666, "dur": 0.391, + "args": { + "External id": 123841,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242154.986, "dur": 0.331, + "args": { + "External id": 
123842,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242156.386, "dur": 1.571, + "args": { + "External id": 123843,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242160.466, "dur": 0.211, + "args": { + "External id": 123844,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242161.626, "dur": 0.231, + "args": { + "External id": 123845,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242165.177, "dur": 1.609, + "args": { + "External id": 123846,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 
5717, + "ts": 6302685242186.416, "dur": 24.720, + "args": { + "External id": 123847,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685242294.606, "dur": 130.990, + "args": { + "External id": 123848,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685242326.976, "dur": 94.070, + "args": { + "External id": 123849,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4468, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685242342.666, "dur": 67.960, + "args": { + 
"External id": 123850,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685242450.156, "dur": 8.310, + "args": { + "External id": 123851,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4470, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 5717, "tid": 5717, + "ts": 6302685242581.365, "dur": 326.270, + "args": { + "External id": 123852,"Record function id": 0, "Ev Idx": 4471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242697.385, "dur": 5.480, + "args": { + "External id": 123853,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242708.155, "dur": 1.110, + "args": { + "External id": 123854,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242711.305, "dur": 0.810, + 
"args": { + "External id": 123855,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242714.235, "dur": 0.890, + "args": { + "External id": 123856,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242716.795, "dur": 0.700, + "args": { + "External id": 123857,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242719.155, "dur": 0.790, + "args": { + "External id": 123858,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242721.635, "dur": 0.840, + "args": { + "External id": 123859,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242725.395, "dur": 2.380, + "args": { + "External id": 123860,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4479 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242729.505, "dur": 0.810, + "args": { + "External id": 123861,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685242731.975, "dur": 0.620, + "args": { + "External id": 123862,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685242750.605, "dur": 120.740, + "args": { + "External id": 123863,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685242765.015, "dur": 101.230, + "args": { + "External id": 123864,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 
147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685242783.935, "dur": 8.500, + "args": { + "External id": 123865,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685242796.275, "dur": 41.990, + "args": { + "External id": 123866,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685242798.125, "dur": 39.600, + "args": { + "External id": 123867,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685242802.285, "dur": 7.980, + "args": { + "External id": 123868,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685242811.755, "dur": 25.300, + "args": { + "External id": 123869,"Record function id": 0, "Concrete 
Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685243046.355, "dur": 27.239, + "args": { + "External id": 123870,"Sequence number": 2575807, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4489 + } + }, + { + "ph": "s", "id": 259, "pid": 5717, "tid": 5717, "ts": 6302685243046.355, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685243060.514, "dur": 8.280, + "args": { + "External id": 123871,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685243063.564, "dur": 4.580, + "args": { + "External id": 123872,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685243157.534, "dur": 15.090, + "args": { + "External id": 123873,"Record function id": 0, "Ev Idx": 4492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685243173.984, "dur": 2170.085, + "args": { + "External id": 123874,"Record function id": 0, "Ev Idx": 4493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, 
+ "ts": 6302685243203.444, "dur": 160.620, + "args": { + "External id": 123875,"Sequence number": 2575808, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4494 + } + }, + { + "ph": "s", "id": 258, "pid": 5717, "tid": 5717, "ts": 6302685243203.444, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685243264.804, "dur": 45.800, + "args": { + "External id": 123876,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685243334.254, "dur": 6.690, + "args": { + "External id": 123877,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685243336.104, "dur": 4.530, + "args": { + "External id": 123878,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 
5717, "tid": 5717, + "ts": 6302685243392.834, "dur": 14.169, + "args": { + "External id": 123879,"Record function id": 0, "Ev Idx": 4498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685243408.263, "dur": 1428.367, + "args": { + "External id": 123880,"Record function id": 0, "Ev Idx": 4499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685243436.983, "dur": 361.890, + "args": { + "External id": 123881,"Sequence number": 2575809, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4500 + } + }, + { + "ph": "s", "id": 257, "pid": 5717, "tid": 5717, "ts": 6302685243436.983, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685243474.063, "dur": 57.780, + "args": { + "External id": 123882,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685243555.153, "dur": 37.560, + "args": { + "External id": 123883,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685243620.123, "dur": 33.800, + "args": { + "External id": 
123884,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685243721.343, "dur": 6.700, + "args": { + "External id": 123885,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685243744.323, "dur": 2.850, + "args": { + "External id": 123886,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685243756.843, "dur": 4.010, + "args": { + "External id": 123887,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685243839.083, "dur": 18.139, + "args": { + "External id": 123888,"Record function id": 0, "Ev Idx": 4507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685243859.873, "dur": 632.358, + "args": { + "External id": 123889,"Record function id": 0, "Ev Idx": 4508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 
6302685243894.522, "dur": 5.691, + "args": { + "External id": 123890,"Record function id": 0, "Ev Idx": 4509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685243902.602, "dur": 283.680, + "args": { + "External id": 123891,"Record function id": 0, "Ev Idx": 4510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685243928.853, "dur": 255.809, + "args": { + "External id": 123892,"Sequence number": 2575810, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4511 + } + }, + { + "ph": "s", "id": 256, "pid": 5717, "tid": 5717, "ts": 6302685243928.853, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685243941.612, "dur": 8.370, + "args": { + "External id": 123893,"Record function id": 0, "Ev Idx": 4512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685243951.002, "dur": 222.540, + "args": { + "External id": 123894,"Record function id": 0, "Ev Idx": 4513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685243981.012, "dur": 4.920, + "args": { + "External id": 123895,"Record function id": 0, "Ev Idx": 4514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685243986.862, "dur": 156.300, + "args": { + "External id": 123896,"Record function id": 0, "Ev Idx": 4515 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685243990.302, "dur": 4.820, + "args": { + "External id": 123897,"Record function id": 0, "Ev Idx": 4516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685243996.062, "dur": 143.440, + "args": { + "External id": 123898,"Record function id": 0, "Ev Idx": 4517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244043.302, "dur": 6.690, + "args": { + "External id": 123899,"Record function id": 0, "Ev Idx": 4518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685244051.432, "dur": 86.770, + "args": { + "External id": 123900,"Record function id": 0, "Ev Idx": 4519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685244092.052, "dur": 30.330, + "args": { + "External id": 123901,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, 
"tid": 5717, + "ts": 6302685244150.212, "dur": 4.450, + "args": { + "External id": 123902,"Record function id": 0, "Ev Idx": 4521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685244155.562, "dur": 17.040, + "args": { + "External id": 123903,"Record function id": 0, "Ev Idx": 4522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244190.382, "dur": 5.760, + "args": { + "External id": 123904,"Record function id": 0, "Ev Idx": 4523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685244212.702, "dur": 278.889, + "args": { + "External id": 123905,"Record function id": 0, "Ev Idx": 4524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244217.392, "dur": 3.400, + "args": { + "External id": 123906,"Record function id": 0, "Ev Idx": 4525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685244221.522, "dur": 268.939, + "args": { + "External id": 123907,"Record function id": 0, "Ev Idx": 4526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685244239.982, "dur": 249.039, + "args": { + "External id": 123908,"Sequence number": 2575811, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4527 + } + }, + { + "ph": "s", "id": 255, "pid": 5717, "tid": 5717, "ts": 6302685244239.982, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244247.522, "dur": 6.330, + "args": { + "External id": 123909,"Record function id": 0, "Ev Idx": 4528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685244254.782, "dur": 222.919, + "args": { + "External id": 123910,"Record function id": 0, "Ev Idx": 4529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244282.202, "dur": 5.259, + "args": { + "External id": 123911,"Record function id": 0, "Ev Idx": 4530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685244288.461, "dur": 161.080, + "args": { + "External id": 123912,"Record function id": 0, "Ev Idx": 4531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244293.392, "dur": 15.140, + "args": { + "External id": 123913,"Record function id": 0, "Ev Idx": 4532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685244309.541, "dur": 136.960, + "args": { + "External id": 123914,"Record function id": 0, "Ev Idx": 4533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244355.012, "dur": 6.489, + "args": { + "External id": 123915,"Record function id": 0, "Ev Idx": 4534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685244362.581, "dur": 82.830, + "args": { + "External id": 123916,"Record function id": 0, "Ev Idx": 4535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685244401.261, "dur": 29.720, + "args": { + "External id": 123917,"kernel_hash": 
"csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244456.241, "dur": 4.230, + "args": { + "External id": 123918,"Record function id": 0, "Ev Idx": 4537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685244461.281, "dur": 15.610, + "args": { + "External id": 123919,"Record function id": 0, "Ev Idx": 4538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244505.561, "dur": 11.700, + "args": { + "External id": 123920,"Record function id": 0, "Ev Idx": 4539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685244518.231, "dur": 316.839, + "args": { + "External id": 123921,"Record function id": 0, "Ev Idx": 4540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685244546.461, "dur": 275.170, + "args": { + "External id": 123922,"Sequence number": 2575812, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4541 + } + }, + { + "ph": "s", "id": 254, "pid": 5717, "tid": 5717, "ts": 6302685244546.461, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685244576.771, "dur": 145.390, + "args": { + "External id": 123923,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685244620.231, "dur": 15.960, + "args": { + "External id": 123924,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685244624.081, "dur": 10.690, + "args": { + "External id": 123925,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], 
"Ev Idx": 4544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685244638.511, "dur": 7.720, + "args": { + "External id": 123926,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685244649.381, "dur": 3.240, + "args": { + "External id": 123927,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685244655.691, "dur": 4.750, + "args": { + "External id": 123928,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685244743.460, "dur": 37.991, + "args": { + "External id": 123929,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685244847.870, "dur": 24.990, + "args": { + "External id": 123930,"Record function id": 0, "Ev Idx": 4549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 
6302685244874.160, "dur": 465.369, + "args": { + "External id": 123931,"Record function id": 0, "Ev Idx": 4550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685244907.510, "dur": 386.879, + "args": { + "External id": 123932,"Sequence number": 2575813, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4551 + } + }, + { + "ph": "s", "id": 253, "pid": 5717, "tid": 5717, "ts": 6302685244907.510, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685244970.520, "dur": 34.420, + "args": { + "External id": 123933,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685245025.060, "dur": 33.590, + "args": { + "External id": 123934,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685245073.620, "dur": 23.420, + "args": { + "External id": 123935,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685245125.520, "dur": 24.980, + "args": { + "External id": 123936,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685245165.580, "dur": 33.530, + "args": { + "External id": 123937,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685245226.670, "dur": 21.709, + "args": { + "External id": 123938,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": 
"grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4557 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.6)", "pid": 5717, "tid": 5717, + "ts": 6302685245401.019, "dur": 72.590, + "args": { + "External id": 123939,"Record function id": 0, "Ev Idx": 4558 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685245556.439, "dur": 61.339, + "args": { + "External id": 123940,"Record function id": 0, "Ev Idx": 4559 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.7)", "pid": 5717, "tid": 5717, + "ts": 6302685245630.729, "dur": 1459.766, + "args": { + "External id": 123941,"Record function id": 0, "Ev Idx": 4560 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 5717, "tid": 5717, + "ts": 6302685245641.938, "dur": 781.329, + "args": { + "External id": 123942,"Record function id": 0, "Ev Idx": 4561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685245733.798, "dur": 12.250, + "args": { + "External id": 123943,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685245764.668, "dur": 45.890, + "args": { + "External id": 123944,"Record function id": 0, 
"Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245769.878, "dur": 1.980, + "args": { + "External id": 123945,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245775.098, "dur": 0.510, + "args": { + "External id": 123946,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245776.668, "dur": 0.250, + "args": { + "External id": 123947,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245779.028, "dur": 1.730, + "args": { + "External id": 123948,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685245782.998, "dur": 0.280, + "args": { + "External id": 123949,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245787.458, "dur": 0.320, + "args": { + "External id": 123950,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245790.388, "dur": 1.640, + "args": { + "External id": 123951,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245794.818, "dur": 0.260, + "args": { + "External id": 123952,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245803.768, "dur": 0.330, + "args": { + "External id": 123953,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4572 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685245828.478, "dur": 52.020, + "args": { + "External id": 123954,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685245936.328, "dur": 141.200, + "args": { + "External id": 123955,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685245951.858, "dur": 10.710, + "args": { + "External id": 123956,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685245968.448, "dur": 11.970, + "args": { + "External id": 123957,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685245971.698, "dur": 8.170, + "args": { + "External id": 123958,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245975.198, "dur": 2.340, + "args": { + "External id": 123959,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685245990.358, "dur": 29.040, + "args": { + "External id": 123960,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245993.958, "dur": 1.670, + "args": { + "External id": 123961,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245997.078, "dur": 0.340, + "args": { + "External id": 123962,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", 
"5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685245998.408, "dur": 0.340, + "args": { + "External id": 123963,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246002.418, "dur": 0.260, + "args": { + "External id": 123964,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246003.768, "dur": 0.220, + "args": { + "External id": 123965,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246005.018, "dur": 1.420, + "args": { + "External id": 123966,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246008.808, "dur": 0.220, + "args": { + "External id": 
123967,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246009.978, "dur": 0.290, + "args": { + "External id": 123968,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246014.068, "dur": 1.980, + "args": { + "External id": 123969,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685246037.108, "dur": 24.740, + "args": { + "External id": 123970,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685246156.607, "dur": 125.280, + "args": { + "External id": 123971,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685246182.637, "dur": 94.840, + "args": { + "External id": 123972,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4591, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685246198.737, "dur": 72.430, + "args": { + "External id": 123973,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685246313.577, "dur": 5.340, + "args": { + "External id": 123974,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group 
size": 1, "Ev Idx": 4593, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 5717, "tid": 5717, + "ts": 6302685246465.377, "dur": 419.989, + "args": { + "External id": 123975,"Record function id": 0, "Ev Idx": 4594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246639.796, "dur": 5.800, + "args": { + "External id": 123976,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246655.216, "dur": 3.030, + "args": { + "External id": 123977,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246663.496, "dur": 2.560, + "args": { + "External id": 123978,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246671.606, "dur": 0.780, + "args": { + "External id": 123979,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246674.156, "dur": 0.810, + "args": { + "External id": 123980,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246678.226, "dur": 0.700, + "args": { + "External id": 123981,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246680.776, "dur": 0.810, + "args": { + "External id": 123982,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246683.566, "dur": 2.760, + "args": { + "External id": 123983,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246688.076, "dur": 0.870, + "args": { + "External id": 123984,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685246692.006, "dur": 0.810, + "args": { + "External id": 123985,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685246712.016, "dur": 
134.750, + "args": { + "External id": 123986,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685246731.176, "dur": 110.570, + "args": { + "External id": 123987,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685246754.106, "dur": 8.900, + "args": { + "External id": 123988,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685246765.786, "dur": 45.160, + "args": { + "External id": 123989,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", 
""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685246767.666, "dur": 42.730, + "args": { + "External id": 123990,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685246771.716, "dur": 9.720, + "args": { + "External id": 123991,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685246782.996, "dur": 26.670, + "args": { + "External id": 123992,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685247028.435, "dur": 28.230, + "args": { + "External id": 123993,"Sequence number": 2575814, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4612 + } + }, + { + "ph": "s", "id": 252, "pid": 5717, "tid": 5717, "ts": 6302685247028.435, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685247042.915, "dur": 
8.810, + "args": { + "External id": 123994,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685247045.975, "dur": 5.050, + "args": { + "External id": 123995,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247129.085, "dur": 14.610, + "args": { + "External id": 123996,"Record function id": 0, "Ev Idx": 4615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685247145.145, "dur": 2115.915, + "args": { + "External id": 123997,"Record function id": 0, "Ev Idx": 4616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685247174.745, "dur": 165.780, + "args": { + "External id": 123998,"Sequence number": 2575815, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4617 + } + }, + { + "ph": "s", "id": 251, "pid": 5717, "tid": 5717, "ts": 6302685247174.745, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685247239.845, "dur": 37.220, + "args": { + "External id": 123999,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": 
["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685247296.005, "dur": 17.240, + "args": { + "External id": 124000,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685247307.535, "dur": 5.280, + "args": { + "External id": 124001,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247371.715, "dur": 15.230, + "args": { + "External id": 124002,"Record function id": 0, "Ev Idx": 4621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685247388.305, "dur": 1362.757, + "args": { + "External id": 124003,"Record function id": 0, "Ev Idx": 4622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685247424.745, "dur": 277.009, + "args": { + "External id": 124004,"Sequence number": 2575816, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], 
"Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4623 + } + }, + { + "ph": "s", "id": 250, "pid": 5717, "tid": 5717, "ts": 6302685247424.745, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685247467.365, "dur": 47.269, + "args": { + "External id": 124005,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685247532.074, "dur": 26.550, + "args": { + "External id": 124006,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685247573.554, "dur": 22.850, + "args": { + "External id": 124007,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685247646.574, "dur": 4.190, + "args": { + "External id": 124008,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685247661.824, "dur": 2.720, + "args": { + "External 
id": 124009,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685247671.024, "dur": 2.270, + "args": { + "External id": 124010,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247729.614, "dur": 13.220, + "args": { + "External id": 124011,"Record function id": 0, "Ev Idx": 4630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685247744.164, "dur": 625.558, + "args": { + "External id": 124012,"Record function id": 0, "Ev Idx": 4631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247768.864, "dur": 4.690, + "args": { + "External id": 124013,"Record function id": 0, "Ev Idx": 4632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685247774.574, "dur": 294.199, + "args": { + "External id": 124014,"Record function id": 0, "Ev Idx": 4633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685247793.474, "dur": 273.609, + "args": { + "External id": 124015,"Sequence number": 2575817, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], 
[32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4634 + } + }, + { + "ph": "s", "id": 249, "pid": 5717, "tid": 5717, "ts": 6302685247793.474, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247802.224, "dur": 7.510, + "args": { + "External id": 124016,"Record function id": 0, "Ev Idx": 4635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685247810.834, "dur": 244.349, + "args": { + "External id": 124017,"Record function id": 0, "Ev Idx": 4636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247843.313, "dur": 5.231, + "args": { + "External id": 124018,"Record function id": 0, "Ev Idx": 4637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685247849.673, "dur": 172.290, + "args": { + "External id": 124019,"Record function id": 0, "Ev Idx": 4638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247854.013, "dur": 5.520, + "args": { + "External id": 124020,"Record function id": 0, "Ev Idx": 4639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685247860.593, "dur": 157.310, + "args": { + "External id": 124021,"Record function id": 0, "Ev Idx": 4640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685247912.484, "dur": 7.889, + "args": { + "External id": 124022,"Record function id": 0, "Ev Idx": 4641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685247921.873, "dur": 94.710, + "args": { + "External id": 
124023,"Record function id": 0, "Ev Idx": 4642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685247966.203, "dur": 33.120, + "args": { + "External id": 124024,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248030.023, "dur": 4.970, + "args": { + "External id": 124025,"Record function id": 0, "Ev Idx": 4644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685248035.993, "dur": 18.150, + "args": { + "External id": 124026,"Record function id": 0, "Ev Idx": 4645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248073.783, "dur": 6.170, + "args": { + "External id": 124027,"Record function id": 0, "Ev Idx": 4646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685248080.843, "dur": 288.199, + "args": { + "External id": 124028,"Record function id": 0, "Ev Idx": 4647 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248085.333, "dur": 3.410, + "args": { + "External id": 124029,"Record function id": 0, "Ev Idx": 4648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685248089.593, "dur": 278.209, + "args": { + "External id": 124030,"Record function id": 0, "Ev Idx": 4649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685248109.693, "dur": 256.449, + "args": { + "External id": 124031,"Sequence number": 2575818, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4650 + } + }, + { + "ph": "s", "id": 248, "pid": 5717, "tid": 5717, "ts": 6302685248109.693, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248116.203, "dur": 6.790, + "args": { + "External id": 124032,"Record function id": 0, "Ev Idx": 4651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685248123.963, "dur": 229.289, + "args": { + "External id": 124033,"Record function id": 0, "Ev Idx": 4652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248152.023, "dur": 4.110, + "args": { + "External id": 124034,"Record function id": 0, "Ev Idx": 4653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685248157.223, "dur": 164.200, + "args": { + 
"External id": 124035,"Record function id": 0, "Ev Idx": 4654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248162.073, "dur": 4.700, + "args": { + "External id": 124036,"Record function id": 0, "Ev Idx": 4655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685248167.623, "dur": 150.480, + "args": { + "External id": 124037,"Record function id": 0, "Ev Idx": 4656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248213.333, "dur": 7.150, + "args": { + "External id": 124038,"Record function id": 0, "Ev Idx": 4657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685248221.733, "dur": 94.990, + "args": { + "External id": 124039,"Record function id": 0, "Ev Idx": 4658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685248260.472, "dur": 29.740, + "args": { + "External id": 124040,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 
4659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248329.323, "dur": 4.800, + "args": { + "External id": 124041,"Record function id": 0, "Ev Idx": 4660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685248335.103, "dur": 17.229, + "args": { + "External id": 124042,"Record function id": 0, "Ev Idx": 4661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248383.152, "dur": 12.910, + "args": { + "External id": 124043,"Record function id": 0, "Ev Idx": 4662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685248397.162, "dur": 352.149, + "args": { + "External id": 124044,"Record function id": 0, "Ev Idx": 4663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685248427.052, "dur": 308.030, + "args": { + "External id": 124045,"Sequence number": 2575819, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4664 + } + }, + { + "ph": "s", "id": 247, "pid": 5717, "tid": 5717, "ts": 6302685248427.052, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685248461.612, "dur": 160.810, + "args": { + "External id": 124046,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", 
"Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685248507.212, "dur": 19.430, + "args": { + "External id": 124047,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685248512.882, "dur": 12.380, + "args": { + "External id": 124048,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685248529.272, "dur": 8.780, + "args": { + "External id": 124049,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685248541.132, "dur": 3.380, + "args": { + "External id": 124050,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4669 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685248547.912, "dur": 6.620, + "args": { + "External id": 124051,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685248645.822, "dur": 41.890, + "args": { + "External id": 124052,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685248762.771, "dur": 27.111, + "args": { + "External id": 124053,"Record function id": 0, "Ev Idx": 4672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685248791.382, "dur": 465.258, + "args": { + "External id": 124054,"Record function id": 0, "Ev Idx": 4673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685248827.341, "dur": 414.289, + "args": { + "External id": 124055,"Sequence number": 2575820, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4674 + } + }, + { + "ph": "s", "id": 246, "pid": 5717, "tid": 5717, "ts": 6302685248827.341, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685248896.311, "dur": 38.250, + "args": { + "External id": 124056,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685248955.081, "dur": 36.490, + "args": { + "External id": 124057,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685249006.921, "dur": 25.340, + "args": { + "External id": 124058,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685249064.951, "dur": 26.100, + "args": { + "External id": 124059,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": 
"grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685249107.821, "dur": 33.569, + "args": { + "External id": 124060,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685249169.150, "dur": 23.760, + "args": { + "External id": 124061,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4680 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.7)", "pid": 5717, "tid": 5717, + "ts": 6302685249331.550, "dur": 76.950, + "args": { + "External id": 124062,"Record function id": 0, "Ev Idx": 4681 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685249502.520, "dur": 82.600, + "args": { + "External id": 124063,"Record function 
id": 0, "Ev Idx": 4682 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.8)", "pid": 5717, "tid": 5717, + "ts": 6302685249599.320, "dur": 1366.646, + "args": { + "External id": 124064,"Record function id": 0, "Ev Idx": 4683 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 5717, "tid": 5717, + "ts": 6302685249612.869, "dur": 724.809, + "args": { + "External id": 124065,"Record function id": 0, "Ev Idx": 4684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685249709.919, "dur": 13.300, + "args": { + "External id": 124066,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685249738.789, "dur": 33.510, + "args": { + "External id": 124067,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249744.049, "dur": 2.020, + "args": { + "External id": 124068,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249748.059, "dur": 1.520, + "args": { + "External id": 124069,"Record function id": 0, "Concrete Inputs": ["", 
"[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249750.779, "dur": 0.340, + "args": { + "External id": 124070,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249753.669, "dur": 0.370, + "args": { + "External id": 124071,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249756.339, "dur": 1.740, + "args": { + "External id": 124072,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249760.789, "dur": 0.240, + "args": { + "External id": 124073,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249762.079, "dur": 0.250, + "args": { + 
"External id": 124074,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249764.639, "dur": 0.260, + "args": { + "External id": 124075,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249767.359, "dur": 0.260, + "args": { + "External id": 124076,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685249784.189, "dur": 34.390, + "args": { + "External id": 124077,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685249862.939, "dur": 137.040, + "args": { + "External id": 124078,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 
393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685249878.989, "dur": 9.880, + "args": { + "External id": 124079,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + "ts": 6302685249895.129, "dur": 11.000, + "args": { + "External id": 124080,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685249898.639, "dur": 6.930, + "args": { + "External id": 124081,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249902.289, "dur": 1.060, + "args": { + "External id": 124082,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], 
[], []], "Ev Idx": 4701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685249916.439, "dur": 28.400, + "args": { + "External id": 124083,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249920.039, "dur": 0.500, + "args": { + "External id": 124084,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249923.169, "dur": 1.640, + "args": { + "External id": 124085,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249925.879, "dur": 0.300, + "args": { + "External id": 124086,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249928.929, "dur": 1.540, + "args": { + "External id": 124087,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249931.559, "dur": 0.260, + "args": { + "External id": 124088,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249932.859, "dur": 0.310, + "args": { + "External id": 124089,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249936.839, "dur": 0.370, + "args": { + "External id": 124090,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249938.219, "dur": 0.260, + "args": { + "External id": 124091,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685249940.779, "dur": 0.360, + "args": { + "External id": 124092,"Record function id": 0, "Concrete 
Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685249960.969, "dur": 25.510, + "args": { + "External id": 124093,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685250080.368, "dur": 137.750, + "args": { + "External id": 124094,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685250108.408, "dur": 105.020, + "args": { + "External id": 124095,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 
4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4714, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685250126.918, "dur": 79.560, + "args": { + "External id": 124096,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685250240.528, "dur": 5.200, + "args": { + "External id": 124097,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4716, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 5717, "tid": 5717, + "ts": 6302685250368.828, "dur": 371.369, + "args": { + "External id": 124098,"Record function id": 0, "Ev Idx": 4717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250498.887, "dur": 6.120, + "args": { + "External id": 124099,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250511.058, "dur": 1.120, + "args": { + "External id": 124100,"Record function 
id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250514.578, "dur": 1.100, + "args": { + "External id": 124101,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250518.067, "dur": 0.891, + "args": { + "External id": 124102,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250520.778, "dur": 0.769, + "args": { + "External id": 124103,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250523.318, "dur": 1.260, + "args": { + "External id": 124104,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250526.698, "dur": 0.880, + "args": { + "External id": 124105,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + 
"ts": 6302685250531.267, "dur": 3.311, + "args": { + "External id": 124106,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250536.478, "dur": 0.860, + "args": { + "External id": 124107,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250539.107, "dur": 0.691, + "args": { + "External id": 124108,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685250560.298, "dur": 137.739, + "args": { + "External id": 124109,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685250577.117, "dur": 115.230, + "args": { + "External id": 124110,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": 
["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685250595.527, "dur": 9.690, + "args": { + "External id": 124111,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685250609.507, "dur": 49.530, + "args": { + "External id": 124112,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685250611.707, "dur": 46.830, + "args": { + "External id": 124113,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685250616.197, "dur": 10.510, + "args": { + "External id": 124114,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685250628.387, "dur": 29.420, + "args": { + "External id": 124115,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685250898.617, "dur": 29.969, + "args": { + "External id": 124116,"Sequence number": 2575821, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4735 + } + }, + { + "ph": "s", "id": 245, "pid": 5717, "tid": 5717, "ts": 6302685250898.617, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685250913.297, "dur": 9.600, + "args": { + "External id": 124117,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685250916.597, "dur": 5.449, + "args": { + "External id": 124118,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251008.766, "dur": 16.830, + "args": { + "External id": 124119,"Record function id": 0, "Ev Idx": 4738 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685251027.356, "dur": 2225.115, + "args": { + "External id": 124120,"Record function id": 0, "Ev Idx": 4739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685251060.926, "dur": 163.740, + "args": { + "External id": 124121,"Sequence number": 2575822, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4740 + } + }, + { + "ph": "s", "id": 244, "pid": 5717, "tid": 5717, "ts": 6302685251060.926, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685251129.896, "dur": 39.190, + "args": { + "External id": 124122,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685251188.626, "dur": 7.350, + "args": { + "External id": 124123,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685251190.556, 
"dur": 5.060, + "args": { + "External id": 124124,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251255.846, "dur": 16.160, + "args": { + "External id": 124125,"Record function id": 0, "Ev Idx": 4744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685251273.486, "dur": 1451.556, + "args": { + "External id": 124126,"Record function id": 0, "Ev Idx": 4745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685251325.226, "dur": 292.759, + "args": { + "External id": 124127,"Sequence number": 2575823, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4746 + } + }, + { + "ph": "s", "id": 243, "pid": 5717, "tid": 5717, "ts": 6302685251325.226, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685251370.625, "dur": 50.731, + "args": { + "External id": 124128,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685251440.305, "dur": 26.840, + "args": { + "External id": 124129,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685251482.575, "dur": 24.470, + "args": { + "External id": 124130,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685251560.705, "dur": 4.330, + "args": { + "External id": 124131,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685251576.665, "dur": 2.840, + "args": { + "External id": 124132,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685251588.485, "dur": 2.580, + "args": { + "External id": 124133,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251648.175, "dur": 13.680, + "args": { + "External id": 124134,"Record function id": 0, "Ev Idx": 4753 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685251663.255, "dur": 654.459, + "args": { + "External id": 124135,"Record function id": 0, "Ev Idx": 4754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251689.515, "dur": 4.880, + "args": { + "External id": 124136,"Record function id": 0, "Ev Idx": 4755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685251695.395, "dur": 307.509, + "args": { + "External id": 124137,"Record function id": 0, "Ev Idx": 4756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685251714.865, "dur": 286.159, + "args": { + "External id": 124138,"Sequence number": 2575824, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4757 + } + }, + { + "ph": "s", "id": 242, "pid": 5717, "tid": 5717, "ts": 6302685251714.865, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251724.025, "dur": 7.990, + "args": { + "External id": 124139,"Record function id": 0, "Ev Idx": 4758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685251733.185, "dur": 254.929, + "args": { + "External id": 124140,"Record function id": 0, "Ev Idx": 4759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251767.825, "dur": 4.670, + "args": { + "External id": 124141,"Record 
function id": 0, "Ev Idx": 4760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685251773.695, "dur": 179.829, + "args": { + "External id": 124142,"Record function id": 0, "Ev Idx": 4761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251777.725, "dur": 5.110, + "args": { + "External id": 124143,"Record function id": 0, "Ev Idx": 4762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685251785.375, "dur": 164.019, + "args": { + "External id": 124144,"Record function id": 0, "Ev Idx": 4763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251837.484, "dur": 7.931, + "args": { + "External id": 124145,"Record function id": 0, "Ev Idx": 4764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685251847.015, "dur": 100.749, + "args": { + "External id": 124146,"Record function id": 0, "Ev Idx": 4765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685251894.224, "dur": 35.420, + "args": { + "External id": 124147,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 
1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685251961.554, "dur": 5.140, + "args": { + "External id": 124148,"Record function id": 0, "Ev Idx": 4767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685251968.144, "dur": 18.930, + "args": { + "External id": 124149,"Record function id": 0, "Ev Idx": 4768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252007.924, "dur": 6.470, + "args": { + "External id": 124150,"Record function id": 0, "Ev Idx": 4769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685252015.404, "dur": 301.739, + "args": { + "External id": 124151,"Record function id": 0, "Ev Idx": 4770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252019.744, "dur": 3.530, + "args": { + "External id": 124152,"Record function id": 0, "Ev Idx": 4771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685252025.564, "dur": 290.279, + "args": { + "External id": 124153,"Record function id": 0, "Ev Idx": 4772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685252045.314, "dur": 268.729, + "args": { + "External id": 124154,"Sequence number": 2575825, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": 
[[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4773 + } + }, + { + "ph": "s", "id": 241, "pid": 5717, "tid": 5717, "ts": 6302685252045.314, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252052.014, "dur": 6.760, + "args": { + "External id": 124155,"Record function id": 0, "Ev Idx": 4774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685252059.944, "dur": 230.470, + "args": { + "External id": 124156,"Record function id": 0, "Ev Idx": 4775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252089.384, "dur": 4.510, + "args": { + "External id": 124157,"Record function id": 0, "Ev Idx": 4776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685252096.254, "dur": 163.060, + "args": { + "External id": 124158,"Record function id": 0, "Ev Idx": 4777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252100.274, "dur": 4.860, + "args": { + "External id": 124159,"Record function id": 0, "Ev Idx": 4778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685252106.094, "dur": 149.660, + "args": { + "External id": 124160,"Record function id": 0, "Ev Idx": 4779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252155.174, "dur": 7.430, + "args": { + "External id": 124161,"Record function id": 0, "Ev Idx": 4780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685252163.844, "dur": 90.530, + 
"args": { + "External id": 124162,"Record function id": 0, "Ev Idx": 4781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685252206.884, "dur": 31.710, + "args": { + "External id": 124163,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252266.623, "dur": 4.260, + "args": { + "External id": 124164,"Record function id": 0, "Ev Idx": 4783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685252271.843, "dur": 17.540, + "args": { + "External id": 124165,"Record function id": 0, "Ev Idx": 4784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685252332.023, "dur": 13.840, + "args": { + "External id": 124166,"Record function id": 0, "Ev Idx": 4785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685252346.913, "dur": 376.389, + "args": { + "External id": 124167,"Record function id": 
0, "Ev Idx": 4786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685252378.923, "dur": 329.730, + "args": { + "External id": 124168,"Sequence number": 2575826, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4787 + } + }, + { + "ph": "s", "id": 240, "pid": 5717, "tid": 5717, "ts": 6302685252378.923, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685252414.473, "dur": 165.410, + "args": { + "External id": 124169,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685252461.693, "dur": 20.020, + "args": { + "External id": 124170,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685252467.463, "dur": 12.820, + "args": { + "External id": 
124171,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685252485.863, "dur": 8.420, + "args": { + "External id": 124172,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685252496.073, "dur": 3.660, + "args": { + "External id": 124173,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685252503.453, "dur": 6.430, + "args": { + "External id": 124174,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685252605.103, "dur": 58.920, + "args": { + "External id": 124175,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", 
"pid": 5717, "tid": 5717, + "ts": 6302685252737.893, "dur": 28.120, + "args": { + "External id": 124176,"Record function id": 0, "Ev Idx": 4795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685252767.533, "dur": 480.728, + "args": { + "External id": 124177,"Record function id": 0, "Ev Idx": 4796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685252805.362, "dur": 428.109, + "args": { + "External id": 124178,"Sequence number": 2575827, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4797 + } + }, + { + "ph": "s", "id": 239, "pid": 5717, "tid": 5717, "ts": 6302685252805.362, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685252879.882, "dur": 40.150, + "args": { + "External id": 124179,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], 
"Ev Idx": 4798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685252941.662, "dur": 39.570, + "args": { + "External id": 124180,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685252997.552, "dur": 24.820, + "args": { + "External id": 124181,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685253055.422, "dur": 27.750, + "args": { + "External id": 124182,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685253099.742, "dur": 34.659, + "args": { + "External id": 124183,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4802 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685253161.972, "dur": 23.160, + "args": { + "External id": 124184,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4803 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.8)", "pid": 5717, "tid": 5717, + "ts": 6302685253321.481, "dur": 73.940, + "args": { + "External id": 124185,"Record function id": 0, "Ev Idx": 4804 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5717, "tid": 5717, + "ts": 6302685253478.971, "dur": 61.550, + "args": { + "External id": 124186,"Record function id": 0, "Ev Idx": 4805 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.9)", "pid": 5717, "tid": 5717, + "ts": 6302685253553.241, "dur": 1327.987, + "args": { + "External id": 124187,"Record function id": 0, "Ev Idx": 4806 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.9)", "pid": 5717, "tid": 5717, + "ts": 6302685253565.111, "dur": 687.908, + "args": { + "External id": 124188,"Record function id": 0, "Ev Idx": 4807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685253660.890, "dur": 12.490, + "args": { + "External id": 124189,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 4808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685253687.090, "dur": 36.160, + "args": { + "External id": 124190,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253692.630, "dur": 1.860, + "args": { + "External id": 124191,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253697.740, "dur": 0.390, + "args": { + "External id": 124192,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253700.720, "dur": 0.470, + "args": { + "External id": 124193,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253702.320, "dur": 0.310, + "args": { + "External id": 124194,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253704.970, "dur": 0.270, + "args": { + "External id": 124195,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253707.780, "dur": 0.300, + "args": { + "External id": 124196,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253709.150, "dur": 2.420, + "args": { + "External id": 124197,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253715.300, "dur": 0.360, + "args": { + "External id": 124198,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253718.060, "dur": 0.250, + "args": { + "External id": 124199,"Record function id": 0, 
"Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685253734.020, "dur": 33.000, + "args": { + "External id": 124200,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5717, "tid": 5717, + "ts": 6302685253812.430, "dur": 134.070, + "args": { + "External id": 124201,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "3", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685253826.700, "dur": 9.820, + "args": { + "External id": 124202,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5717, "tid": 5717, + 
"ts": 6302685253842.540, "dur": 12.180, + "args": { + "External id": 124203,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685253846.000, "dur": 8.130, + "args": { + "External id": 124204,"Record function id": 0, "Concrete Inputs": ["", "0", "5309568", "7079424", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253849.630, "dur": 2.090, + "args": { + "External id": 124205,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5717, "tid": 5717, + "ts": 6302685253864.900, "dur": 27.980, + "args": { + "External id": 124206,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253868.390, "dur": 0.440, + "args": { + "External id": 124207,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5309568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 
4826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253870.260, "dur": 0.290, + "args": { + "External id": 124208,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5309760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253871.660, "dur": 0.400, + "args": { + "External id": 124209,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5457216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253875.650, "dur": 1.450, + "args": { + "External id": 124210,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5604672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253878.170, "dur": 0.230, + "args": { + "External id": 124211,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "5752128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253881.200, "dur": 1.520, + "args": { + "External id": 124212,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "5899584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253883.730, "dur": 0.220, + "args": { + "External id": 124213,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "5899776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253885.020, "dur": 0.220, + "args": { + "External id": 124214,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6292992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685253889.290, "dur": 0.340, + "args": { + "External id": 124215,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6686208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5717, "tid": 5717, + "ts": 6302685253909.510, "dur": 25.260, + "args": { + "External id": 124216,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"c10d::_allgather_base_", "pid": 5717, "tid": 5717, + "ts": 6302685254023.910, "dur": 127.359, + "args": { + "External id": 124217,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685254049.599, "dur": 97.300, + "args": { + "External id": 124218,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 3, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4837, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5717, "tid": 5717, + "ts": 6302685254067.110, "dur": 73.439, + "args": { + "External id": 124219,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685254172.659, "dur": 4.950, + "args": { + "External id": 124220,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": 
["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4839, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.9)", "pid": 5717, "tid": 5717, + "ts": 6302685254281.049, "dur": 384.539, + "args": { + "External id": 124221,"Record function id": 0, "Ev Idx": 4840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254417.499, "dur": 6.000, + "args": { + "External id": 124222,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254429.549, "dur": 1.190, + "args": { + "External id": 124223,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254433.139, "dur": 2.130, + "args": { + "External id": 124224,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254437.489, "dur": 1.040, + "args": { + "External id": 124225,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4844 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254440.378, "dur": 1.151, + "args": { + "External id": 124226,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254444.638, "dur": 0.940, + "args": { + "External id": 124227,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254447.498, "dur": 0.851, + "args": { + "External id": 124228,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254450.409, "dur": 1.089, + "args": { + "External id": 124229,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254453.229, "dur": 0.849, + "args": { + "External id": 124230,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254457.118, "dur": 0.871, + "args": { + "External id": 124231,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685254477.409, "dur": 148.369, + "args": { + "External id": 124232,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5717, "tid": 5717, + "ts": 6302685254493.449, "dur": 127.019, + "args": { + "External id": 124233,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685254527.129, "dur": 10.549, + "args": { + "External id": 124234,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 
5717, + "ts": 6302685254540.988, "dur": 47.500, + "args": { + "External id": 124235,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685254543.028, "dur": 44.920, + "args": { + "External id": 124236,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685254546.948, "dur": 10.660, + "args": { + "External id": 124237,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685254559.268, "dur": 27.830, + "args": { + "External id": 124238,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5717, "tid": 5717, + "ts": 6302685254817.118, "dur": 28.560, + "args": { + "External id": 124239,"Sequence number": 2575828, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4858 + } + }, + { + 
"ph": "s", "id": 238, "pid": 5717, "tid": 5717, "ts": 6302685254817.118, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685254831.088, "dur": 9.280, + "args": { + "External id": 124240,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685254834.458, "dur": 5.090, + "args": { + "External id": 124241,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685254921.288, "dur": 15.829, + "args": { + "External id": 124242,"Record function id": 0, "Ev Idx": 4861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5717, "tid": 5717, + "ts": 6302685254938.628, "dur": 2288.034, + "args": { + "External id": 124243,"Record function id": 0, "Ev Idx": 4862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685254969.417, "dur": 181.320, + "args": { + "External id": 124244,"Sequence number": 2575829, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4863 + } + }, + { + "ph": "s", "id": 237, "pid": 5717, "tid": 5717, "ts": 6302685254969.417, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 
5717, + "ts": 6302685255038.767, "dur": 52.120, + "args": { + "External id": 124245,"kernel_hash": "cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/dl/cdliclg6pyemfhymmmir4tet3kso37luiwh2ahaalh4wfe7rxxvs.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685255114.997, "dur": 6.870, + "args": { + "External id": 124246,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685255116.967, "dur": 4.560, + "args": { + "External id": 124247,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255198.357, "dur": 16.080, + "args": { + "External id": 124248,"Record function id": 0, "Ev Idx": 4867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5717, "tid": 5717, + "ts": 6302685255215.877, "dur": 1463.227, + "args": { + "External id": 124249,"Record function id": 0, "Ev Idx": 4868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685255248.777, "dur": 314.219, + "args": { + "External id": 124250,"Sequence number": 2575830, "Fwd 
thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4869 + } + }, + { + "ph": "s", "id": 236, "pid": 5717, "tid": 5717, "ts": 6302685255248.777, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685255291.287, "dur": 61.360, + "args": { + "External id": 124251,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685255387.036, "dur": 27.040, + "args": { + "External id": 124252,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685255430.186, "dur": 24.310, + "args": { + "External id": 124253,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685255506.906, "dur": 6.330, + "args": { + "External id": 124254,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 
1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685255525.246, "dur": 1.390, + "args": { + "External id": 124255,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685255535.386, "dur": 1.530, + "args": { + "External id": 124256,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255592.646, "dur": 13.250, + "args": { + "External id": 124257,"Record function id": 0, "Ev Idx": 4876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5717, "tid": 5717, + "ts": 6302685255607.196, "dur": 652.289, + "args": { + "External id": 124258,"Record function id": 0, "Ev Idx": 4877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255633.496, "dur": 4.770, + "args": { + "External id": 124259,"Record function id": 0, "Ev Idx": 4878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685255639.216, "dur": 307.549, + "args": { + "External id": 124260,"Record function id": 0, "Ev Idx": 4879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685255659.066, "dur": 285.899, + "args": { + "External id": 124261,"Sequence number": 2575831, "Fwd thread id": 0, "Record function 
id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4880 + } + }, + { + "ph": "s", "id": 235, "pid": 5717, "tid": 5717, "ts": 6302685255659.066, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255668.886, "dur": 7.640, + "args": { + "External id": 124262,"Record function id": 0, "Ev Idx": 4881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685255677.666, "dur": 254.579, + "args": { + "External id": 124263,"Record function id": 0, "Ev Idx": 4882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255713.116, "dur": 5.210, + "args": { + "External id": 124264,"Record function id": 0, "Ev Idx": 4883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685255719.466, "dur": 177.879, + "args": { + "External id": 124265,"Record function id": 0, "Ev Idx": 4884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255723.626, "dur": 5.330, + "args": { + "External id": 124266,"Record function id": 0, "Ev Idx": 4885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685255731.056, "dur": 162.239, + "args": { + "External id": 124267,"Record function id": 0, "Ev Idx": 4886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255784.615, "dur": 8.240, + "args": { + "External id": 
124268,"Record function id": 0, "Ev Idx": 4887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685255794.426, "dur": 97.379, + "args": { + "External id": 124269,"Record function id": 0, "Ev Idx": 4888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685255839.506, "dur": 35.229, + "args": { + "External id": 124270,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255905.725, "dur": 4.880, + "args": { + "External id": 124271,"Record function id": 0, "Ev Idx": 4890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685255911.575, "dur": 19.760, + "args": { + "External id": 124272,"Record function id": 0, "Ev Idx": 4891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255951.805, "dur": 6.630, + "args": { + "External id": 124273,"Record function id": 0, "Ev Idx": 4892 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5717, "tid": 5717, + "ts": 6302685255959.555, "dur": 299.239, + "args": { + "External id": 124274,"Record function id": 0, "Ev Idx": 4893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255964.045, "dur": 3.490, + "args": { + "External id": 124275,"Record function id": 0, "Ev Idx": 4894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5717, "tid": 5717, + "ts": 6302685255970.235, "dur": 287.199, + "args": { + "External id": 124276,"Record function id": 0, "Ev Idx": 4895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685255989.775, "dur": 265.950, + "args": { + "External id": 124277,"Sequence number": 2575832, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4896 + } + }, + { + "ph": "s", "id": 234, "pid": 5717, "tid": 5717, "ts": 6302685255989.775, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685255996.945, "dur": 6.940, + "args": { + "External id": 124278,"Record function id": 0, "Ev Idx": 4897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5717, "tid": 5717, + "ts": 6302685256004.955, "dur": 239.390, + "args": { + "External id": 124279,"Record function id": 0, "Ev Idx": 4898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685256035.395, "dur": 4.680, + "args": { + 
"External id": 124280,"Record function id": 0, "Ev Idx": 4899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5717, "tid": 5717, + "ts": 6302685256042.635, "dur": 167.739, + "args": { + "External id": 124281,"Record function id": 0, "Ev Idx": 4900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685256046.815, "dur": 4.950, + "args": { + "External id": 124282,"Record function id": 0, "Ev Idx": 4901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5717, "tid": 5717, + "ts": 6302685256052.685, "dur": 154.109, + "args": { + "External id": 124283,"Record function id": 0, "Ev Idx": 4902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685256100.825, "dur": 7.830, + "args": { + "External id": 124284,"Record function id": 0, "Ev Idx": 4903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5717, "tid": 5717, + "ts": 6302685256109.985, "dur": 95.350, + "args": { + "External id": 124285,"Record function id": 0, "Ev Idx": 4904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5717, "tid": 5717, + "ts": 6302685256157.595, "dur": 31.580, + "args": { + "External id": 124286,"kernel_hash": "csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/sh/csh2mxwepbsbbsomifqhcjskuoaxgn52f5tk6r22rd5ytdp2bfol.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], 
[32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685256218.594, "dur": 5.020, + "args": { + "External id": 124287,"Record function id": 0, "Ev Idx": 4906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5717, "tid": 5717, + "ts": 6302685256224.754, "dur": 18.560, + "args": { + "External id": 124288,"Record function id": 0, "Ev Idx": 4907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685256273.594, "dur": 13.411, + "args": { + "External id": 124289,"Record function id": 0, "Ev Idx": 4908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5717, "tid": 5717, + "ts": 6302685256288.254, "dur": 389.190, + "args": { + "External id": 124290,"Record function id": 0, "Ev Idx": 4909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685256333.434, "dur": 328.239, + "args": { + "External id": 124291,"Sequence number": 2575833, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4910 + } + }, + { + "ph": "s", "id": 233, "pid": 5717, "tid": 5717, "ts": 6302685256333.434, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685256371.294, "dur": 171.400, + "args": { + "External id": 
124292,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685256421.454, "dur": 21.080, + "args": { + "External id": 124293,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685256425.984, "dur": 14.860, + "args": { + "External id": 124294,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685256447.024, "dur": 7.610, + "args": { + "External id": 124295,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685256456.594, "dur": 3.720, + "args": { + "External id": 124296,"Record function id": 0, 
"Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685256464.084, "dur": 5.800, + "args": { + "External id": 124297,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685256569.194, "dur": 45.180, + "args": { + "External id": 124298,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685256691.964, "dur": 29.069, + "args": { + "External id": 124299,"Record function id": 0, "Ev Idx": 4918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5717, "tid": 5717, + "ts": 6302685256722.684, "dur": 499.518, + "args": { + "External id": 124300,"Record function id": 0, "Ev Idx": 4919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685256761.923, "dur": 445.169, + "args": { + "External id": 124301,"Sequence number": 2575834, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 
768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4920 + } + }, + { + "ph": "s", "id": 232, "pid": 5717, "tid": 5717, "ts": 6302685256761.923, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685256837.133, "dur": 40.930, + "args": { + "External id": 124302,"kernel_hash": "cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/qk/cqk7t66x3tuygfsczwipcxgbyzectlzeqflhwfxzcaaeckjddpb4.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685256899.743, "dur": 41.340, + "args": { + "External id": 124303,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685256958.273, "dur": 25.860, + "args": { + "External id": 124304,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_poi_fused__to_copy_mul_silu_1", "pid": 5717, "tid": 5717, + "ts": 6302685257016.423, "dur": 28.910, + "args": { + "External id": 124305,"kernel_hash": "cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/s2/cs2c633uwfdafd3xgydl7sfevorfbgpxiiklcuqzwgqhu2fbtfer.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685257063.273, "dur": 36.699, + "args": { + "External id": 124306,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5717, "tid": 5717, + "ts": 6302685257130.392, "dur": 24.831, + "args": { + "External id": 124307,"kernel_hash": "c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/52/c52anwhrrv4s7fjrx22cblet6def4pjp4gnxa7amcie6vhmvvx7a.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4926 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.9)", "pid": 5717, "tid": 5717, + "ts": 6302685257284.972, "dur": 53.400, + "args": { + "External id": 124308,"Record function id": 0, 
"Ev Idx": 4927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685257456.652, "dur": 424.289, + "args": { + "External id": 124309,"Sequence number": 2575835, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 4928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685257463.122, "dur": 88.220, + "args": { + "External id": 124310,"Sequence number": 2575835, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 4929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685257466.982, "dur": 83.809, + "args": { + "External id": 124311,"Sequence number": 2575835, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 4930 + } + }, + { + "ph": "s", "id": 231, "pid": 5717, "tid": 5717, "ts": 6302685257466.982, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685257480.222, "dur": 19.810, + "args": { + "External id": 124312,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], 
[], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685257502.182, "dur": 43.109, + "args": { + "External id": 124313,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685257556.631, "dur": 68.820, + "args": { + "External id": 124314,"Sequence number": 2575836, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4933 + } + }, + { + "ph": "s", "id": 230, "pid": 5717, "tid": 5717, "ts": 6302685257556.631, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685257569.751, "dur": 1.500, + "args": { + "External id": 124315,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685257573.842, "dur": 0.240, + "args": { + "External id": 124316,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 4935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685257630.891, "dur": 55.730, + "args": { + "External id": 124317,"Sequence number": 2575837, "Fwd thread id": 
0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 4936 + } + }, + { + "ph": "s", "id": 229, "pid": 5717, "tid": 5717, "ts": 6302685257630.891, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685257690.331, "dur": 40.940, + "args": { + "External id": 124318,"Sequence number": 2575838, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 4937 + } + }, + { + "ph": "s", "id": 228, "pid": 5717, "tid": 5717, "ts": 6302685257690.331, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685257702.461, "dur": 24.960, + "args": { + "External id": 124319,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 4938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685257735.741, "dur": 34.670, + "args": { + "External id": 124320,"Sequence number": 2575839, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 4939 + } + }, + { + "ph": "s", "id": 227, "pid": 5717, "tid": 5717, "ts": 6302685257735.741, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685257776.781, "dur": 31.050, + "args": { + "External id": 124321,"Sequence number": 2575840, "Fwd thread id": 0, "Record 
function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 4940 + } + }, + { + "ph": "s", "id": 226, "pid": 5717, "tid": 5717, "ts": 6302685257776.781, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685257810.821, "dur": 41.180, + "args": { + "External id": 124322,"Sequence number": 2575841, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685257813.231, "dur": 38.370, + "args": { + "External id": 124323,"Sequence number": 2575841, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 4942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685257814.691, "dur": 36.500, + "args": { + "External id": 124324,"Sequence number": 2575841, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 4943 + } + }, + { + "ph": "s", "id": 225, "pid": 5717, "tid": 5717, "ts": 6302685257814.691, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + 
"ts": 6302685257820.901, "dur": 7.590, + "args": { + "External id": 124325,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685257830.191, "dur": 19.470, + "args": { + "External id": 124326,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685257855.091, "dur": 25.080, + "args": { + "External id": 124327,"Sequence number": 2575842, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4946 + } + }, + { + "ph": "s", "id": 224, "pid": 5717, "tid": 5717, "ts": 6302685257855.091, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685257924.301, "dur": 109.900, + "args": { + "External id": 124328,"Sequence number": 2575843, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685257927.031, "dur": 16.470, + "args": { + "External id": 124329,"Sequence number": 2575843, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4948 + } + }, + { + "ph": "s", "id": 223, "pid": 5717, "tid": 5717, "ts": 6302685257927.031, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685257933.211, "dur": 8.010, + "args": { + "External id": 124330,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685257938.101, "dur": 2.360, + "args": { + "External id": 124331,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685257946.021, "dur": 87.469, + "args": { + "External id": 124332,"Sequence number": 2575844, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685257949.871, "dur": 8.590, + "args": { + "External id": 124333,"Sequence number": 2575844, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 
6302685257951.461, "dur": 6.570, + "args": { + "External id": 124334,"Sequence number": 2575844, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4953 + } + }, + { + "ph": "s", "id": 222, "pid": 5717, "tid": 5717, "ts": 6302685257951.461, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685257961.511, "dur": 61.699, + "args": { + "External id": 124335,"Sequence number": 2575845, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4954 + } + }, + { + "ph": "s", "id": 221, "pid": 5717, "tid": 5717, "ts": 6302685257961.511, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685258027.881, "dur": 4.129, + "args": { + "External id": 124336,"Sequence number": 2575846, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4955 + } + }, + { + "ph": "s", "id": 220, "pid": 5717, "tid": 5717, "ts": 6302685258027.881, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685258055.630, "dur": 74.040, + "args": { + "External id": 124337,"Sequence number": 2575847, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4956 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685258056.721, "dur": 10.920, + "args": { + "External id": 124338,"Sequence number": 2575847, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4957 + } + }, + { + "ph": "s", "id": 219, "pid": 5717, "tid": 5717, "ts": 6302685258056.721, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685258059.270, "dur": 6.780, + "args": { + "External id": 124339,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685258064.150, "dur": 1.411, + "args": { + "External id": 124340,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685258069.001, "dur": 60.149, + "args": { + "External id": 124341,"Sequence number": 2575848, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685258072.070, "dur": 4.340, + "args": { + "External id": 124342,"Sequence number": 2575848, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685258073.190, "dur": 2.970, + "args": { + "External id": 124343,"Sequence number": 2575848, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4962 + } + }, + { + "ph": "s", "id": 218, "pid": 5717, "tid": 5717, "ts": 6302685258073.190, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685258077.520, "dur": 43.120, + "args": { + "External id": 124344,"Sequence number": 2575849, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4963 + } + }, + { + "ph": "s", "id": 217, "pid": 5717, "tid": 5717, "ts": 6302685258077.520, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685258123.910, "dur": 4.100, + "args": { + "External id": 124345,"Sequence number": 2575850, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4964 + } + }, + { + "ph": "s", "id": 216, "pid": 5717, "tid": 5717, "ts": 6302685258123.910, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685258146.530, "dur": 69.070, + "args": { + "External id": 124346,"Sequence number": 2575851, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", 
""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685258147.520, "dur": 8.320, + "args": { + "External id": 124347,"Sequence number": 2575851, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4966 + } + }, + { + "ph": "s", "id": 215, "pid": 5717, "tid": 5717, "ts": 6302685258147.520, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685258149.660, "dur": 4.740, + "args": { + "External id": 124348,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685258152.820, "dur": 1.090, + "args": { + "External id": 124349,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685258158.410, "dur": 56.700, + "args": { + "External id": 124350,"Sequence number": 2575852, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 
6302685258159.800, "dur": 4.060, + "args": { + "External id": 124351,"Sequence number": 2575852, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685258160.720, "dur": 2.880, + "args": { + "External id": 124352,"Sequence number": 2575852, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4971 + } + }, + { + "ph": "s", "id": 214, "pid": 5717, "tid": 5717, "ts": 6302685258160.720, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685258164.840, "dur": 40.900, + "args": { + "External id": 124353,"Sequence number": 2575853, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4972 + } + }, + { + "ph": "s", "id": 213, "pid": 5717, "tid": 5717, "ts": 6302685258164.840, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685258210.260, "dur": 3.740, + "args": { + "External id": 124354,"Sequence number": 2575854, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4973 + } + }, + { + "ph": "s", "id": 212, "pid": 5717, "tid": 5717, "ts": 6302685258210.260, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685258242.950, "dur": 4.560, + "args": { + "External id": 124355,"Sequence number": 2575855, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685258243.880, "dur": 3.320, + "args": { + "External id": 124356,"Sequence number": 2575855, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4975 + } + }, + { + "ph": "s", "id": 211, "pid": 5717, "tid": 5717, "ts": 6302685258243.880, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685258259.180, "dur": 6.340, + "args": { + "External id": 124357,"Sequence number": 2575856, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685258259.980, "dur": 5.230, + "args": { + "External id": 124358,"Sequence number": 2575856, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4977 + } + }, + { + "ph": "s", "id": 210, "pid": 5717, "tid": 5717, "ts": 6302685258259.980, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + 
"ts": 6302685258273.770, "dur": 5.280, + "args": { + "External id": 124359,"Sequence number": 2575857, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685258275.950, "dur": 2.850, + "args": { + "External id": 124360,"Sequence number": 2575857, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4979 + } + }, + { + "ph": "s", "id": 209, "pid": 5717, "tid": 5717, "ts": 6302685258275.950, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685258333.240, "dur": 270.339, + "args": { + "External id": 124361,"Sequence number": 2575858, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4980 + } + }, + { + "ph": "s", "id": 208, "pid": 5717, "tid": 5717, "ts": 6302685258333.240, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685258362.100, "dur": 15.140, + "args": { + "External id": 124362,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 
2048, 12, 64], [], [], [], [], []], "Ev Idx": 4981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685258365.140, "dur": 10.920, + "args": { + "External id": 124363,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685258626.439, "dur": 215.850, + "args": { + "External id": 124364,"Sequence number": 2575859, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4983 + } + }, + { + "ph": "s", "id": 207, "pid": 5717, "tid": 5717, "ts": 6302685258626.439, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685258651.199, "dur": 15.570, + "args": { + "External id": 124365,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685258654.609, "dur": 11.300, + "args": { + "External id": 124366,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5717, "tid": 5717, + "ts": 6302685258893.999, "dur": 283.849, + "args": { + "External id": 124367,"Sequence number": 2575860, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 4986 + } + }, + { + "ph": "s", "id": 206, "pid": 5717, "tid": 5717, "ts": 6302685258893.999, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685258932.988, "dur": 200.570, + "args": { + "External id": 124368,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685259006.748, "dur": 24.880, + "args": { + "External id": 124369,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input 
Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685259014.518, "dur": 15.690, + "args": { + "External id": 124370,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685259034.148, "dur": 7.290, + "args": { + "External id": 124371,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685259043.088, "dur": 3.850, + "args": { + "External id": 124372,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685259056.148, "dur": 10.040, + "args": { + "External id": 124373,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 5717, + "ts": 6302685259153.318, "dur": 7.400, + "args": { + "External id": 124374,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 4993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685259187.298, "dur": 7.740, + "args": { + "External id": 124375,"Sequence number": 2575861, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685259188.678, "dur": 5.920, + "args": { + "External id": 124376,"Sequence number": 2575861, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4995 + } + }, + { + "ph": "s", "id": 205, "pid": 5717, "tid": 5717, "ts": 6302685259188.678, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685259214.668, "dur": 99.599, + "args": { + "External id": 124377,"Sequence number": 2575862, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685259217.188, "dur": 8.910, + "args": { + "External id": 124378,"Sequence number": 2575862, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4997 + } + }, + { + "ph": "s", "id": 204, "pid": 5717, 
"tid": 5717, "ts": 6302685259217.188, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685259219.958, "dur": 4.680, + "args": { + "External id": 124379,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685259222.288, "dur": 1.800, + "args": { + "External id": 124380,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685259227.508, "dur": 86.199, + "args": { + "External id": 124381,"Sequence number": 2575863, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685259229.558, "dur": 6.080, + "args": { + "External id": 124382,"Sequence number": 2575863, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685259232.888, "dur": 2.470, + "args": { + "External id": 124383,"Sequence number": 2575863, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], 
"Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5002 + } + }, + { + "ph": "s", "id": 203, "pid": 5717, "tid": 5717, "ts": 6302685259232.888, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685259236.678, "dur": 55.880, + "args": { + "External id": 124384,"Sequence number": 2575864, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5003 + } + }, + { + "ph": "s", "id": 202, "pid": 5717, "tid": 5717, "ts": 6302685259236.678, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685259296.208, "dur": 15.950, + "args": { + "External id": 124385,"Sequence number": 2575865, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5004 + } + }, + { + "ph": "s", "id": 201, "pid": 5717, "tid": 5717, "ts": 6302685259296.208, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685259334.267, "dur": 38.200, + "args": { + "External id": 124386,"Sequence number": 2575866, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5005 + } + }, + { + "ph": "s", "id": 200, "pid": 5717, "tid": 5717, "ts": 6302685259334.267, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 
5717, "tid": 5717, + "ts": 6302685259402.047, "dur": 259.650, + "args": { + "External id": 124387,"Sequence number": 2575867, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685259404.867, "dur": 43.910, + "args": { + "External id": 124388,"Sequence number": 2575867, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685259406.587, "dur": 41.770, + "args": { + "External id": 124389,"Sequence number": 2575867, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5008 + } + }, + { + "ph": "s", "id": 199, "pid": 5717, "tid": 5717, "ts": 6302685259406.587, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685259411.517, "dur": 10.720, + "args": { + "External id": 124390,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5009 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685259424.217, "dur": 21.840, + "args": { + "External id": 124391,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685259450.907, "dur": 31.190, + "args": { + "External id": 124392,"Sequence number": 2575868, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5011 + } + }, + { + "ph": "s", "id": 198, "pid": 5717, "tid": 5717, "ts": 6302685259450.907, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685259455.257, "dur": 0.590, + "args": { + "External id": 124393,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685259456.997, "dur": 0.160, + "args": { + "External id": 124394,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685259485.617, "dur": 27.300, + "args": { + "External id": 124395,"Sequence number": 2575869, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": 
["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5014 + } + }, + { + "ph": "s", "id": 197, "pid": 5717, "tid": 5717, "ts": 6302685259485.617, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685259515.327, "dur": 26.790, + "args": { + "External id": 124396,"Sequence number": 2575870, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5015 + } + }, + { + "ph": "s", "id": 196, "pid": 5717, "tid": 5717, "ts": 6302685259515.327, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685259523.597, "dur": 15.530, + "args": { + "External id": 124397,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685259543.967, "dur": 21.950, + "args": { + "External id": 124398,"Sequence number": 2575871, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5017 + } + }, + { + "ph": "s", "id": 195, "pid": 5717, "tid": 5717, "ts": 6302685259543.967, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685259570.817, "dur": 21.520, + "args": { + "External id": 124399,"Sequence number": 2575872, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input 
Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5018 + } + }, + { + "ph": "s", "id": 194, "pid": 5717, "tid": 5717, "ts": 6302685259570.817, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685259595.527, "dur": 36.210, + "args": { + "External id": 124400,"Sequence number": 2575873, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685259597.827, "dur": 33.500, + "args": { + "External id": 124401,"Sequence number": 2575873, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685259599.247, "dur": 31.710, + "args": { + "External id": 124402,"Sequence number": 2575873, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5021 + } + }, + { + "ph": "s", "id": 193, "pid": 5717, "tid": 5717, "ts": 6302685259599.247, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685259606.707, "dur": 6.720, + "args": { + "External id": 124403,"Record 
function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685259614.877, "dur": 14.590, + "args": { + "External id": 124404,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685259635.627, "dur": 25.110, + "args": { + "External id": 124405,"Sequence number": 2575874, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5024 + } + }, + { + "ph": "s", "id": 192, "pid": 5717, "tid": 5717, "ts": 6302685259635.627, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685259710.467, "dur": 102.339, + "args": { + "External id": 124406,"Sequence number": 2575875, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685259713.307, "dur": 20.550, + "args": { + "External id": 124407,"Sequence number": 2575875, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 
1]], "Input Dims": [[2048, 768]], "Ev Idx": 5026 + } + }, + { + "ph": "s", "id": 191, "pid": 5717, "tid": 5717, "ts": 6302685259713.307, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685259719.657, "dur": 11.930, + "args": { + "External id": 124408,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685259727.237, "dur": 2.300, + "args": { + "External id": 124409,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685259736.777, "dur": 75.500, + "args": { + "External id": 124410,"Sequence number": 2575876, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685259740.506, "dur": 9.531, + "args": { + "External id": 124411,"Sequence number": 2575876, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685259743.017, "dur": 6.629, + "args": { + "External id": 124412,"Sequence 
number": 2575876, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5031 + } + }, + { + "ph": "s", "id": 190, "pid": 5717, "tid": 5717, "ts": 6302685259743.017, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685259751.226, "dur": 51.340, + "args": { + "External id": 124413,"Sequence number": 2575877, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5032 + } + }, + { + "ph": "s", "id": 189, "pid": 5717, "tid": 5717, "ts": 6302685259751.226, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685259805.986, "dur": 5.020, + "args": { + "External id": 124414,"Sequence number": 2575878, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5033 + } + }, + { + "ph": "s", "id": 188, "pid": 5717, "tid": 5717, "ts": 6302685259805.986, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685259830.977, "dur": 88.859, + "args": { + "External id": 124415,"Sequence number": 2575879, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 
6302685259832.166, "dur": 26.930, + "args": { + "External id": 124416,"Sequence number": 2575879, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5035 + } + }, + { + "ph": "s", "id": 187, "pid": 5717, "tid": 5717, "ts": 6302685259832.166, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685259850.706, "dur": 6.710, + "args": { + "External id": 124417,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685259855.476, "dur": 1.480, + "args": { + "External id": 124418,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685259860.426, "dur": 58.910, + "args": { + "External id": 124419,"Sequence number": 2575880, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685259862.046, "dur": 4.810, + "args": { + "External id": 124420,"Sequence number": 2575880, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input 
Dims": [[8, 2048, 768], []], "Ev Idx": 5039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685259863.026, "dur": 3.530, + "args": { + "External id": 124421,"Sequence number": 2575880, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5040 + } + }, + { + "ph": "s", "id": 186, "pid": 5717, "tid": 5717, "ts": 6302685259863.026, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685259867.826, "dur": 40.900, + "args": { + "External id": 124422,"Sequence number": 2575881, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5041 + } + }, + { + "ph": "s", "id": 185, "pid": 5717, "tid": 5717, "ts": 6302685259867.826, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685259912.856, "dur": 5.320, + "args": { + "External id": 124423,"Sequence number": 2575882, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5042 + } + }, + { + "ph": "s", "id": 184, "pid": 5717, "tid": 5717, "ts": 6302685259912.856, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5717, "tid": 5717, + "ts": 6302685259959.256, "dur": 204.960, + "args": { + "External id": 124424,"Sequence number": 2575883, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5043 + } + }, + { + "ph": "s", "id": 183, "pid": 5717, "tid": 5717, "ts": 6302685259959.256, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685260005.826, "dur": 10.880, + "args": { + "External id": 124425,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685260066.476, "dur": 74.930, + "args": { + "External id": 124426,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685260067.866, "dur": 10.920, + "args": { + "External id": 124427,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685260070.186, "dur": 6.740, + "args": { + "External id": 124428,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260074.186, "dur": 2.040, + "args": { + "External 
id": 124429,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685260080.056, "dur": 60.610, + "args": { + "External id": 124430,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260082.556, "dur": 5.930, + "args": { + "External id": 124431,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260084.866, "dur": 3.370, + "args": { + "External id": 124432,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685260089.436, "dur": 44.980, + "args": { + "External id": 124433,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685260137.456, "dur": 1.630, + "args": { + "External id": 124434,"Record function id": 0, "Concrete Inputs": ["", 
"[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685260173.276, "dur": 30.849, + "args": { + "External id": 124435,"Sequence number": 2575884, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5054 + } + }, + { + "ph": "s", "id": 182, "pid": 5717, "tid": 5717, "ts": 6302685260173.276, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685260243.445, "dur": 266.590, + "args": { + "External id": 124436,"Sequence number": 2575885, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685260245.876, "dur": 46.649, + "args": { + "External id": 124437,"Sequence number": 2575885, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685260247.696, "dur": 44.349, + "args": { + "External id": 124438,"Sequence number": 2575885, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", 
"False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5057 + } + }, + { + "ph": "s", "id": 181, "pid": 5717, "tid": 5717, "ts": 6302685260247.696, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260252.816, "dur": 10.840, + "args": { + "External id": 124439,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685260266.665, "dur": 23.090, + "args": { + "External id": 124440,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685260294.435, "dur": 41.230, + "args": { + "External id": 124441,"Sequence number": 2575886, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5060 + } + }, + { + "ph": "s", "id": 180, "pid": 5717, "tid": 5717, "ts": 6302685260294.435, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685260309.795, "dur": 0.660, + "args": { + "External id": 124442,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], 
"Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685260311.405, "dur": 0.220, + "args": { + "External id": 124443,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685260338.195, "dur": 26.560, + "args": { + "External id": 124444,"Sequence number": 2575887, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5063 + } + }, + { + "ph": "s", "id": 179, "pid": 5717, "tid": 5717, "ts": 6302685260338.195, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685260366.965, "dur": 24.120, + "args": { + "External id": 124445,"Sequence number": 2575888, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5064 + } + }, + { + "ph": "s", "id": 178, "pid": 5717, "tid": 5717, "ts": 6302685260366.965, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685260372.765, "dur": 15.450, + "args": { + "External id": 124446,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 
5065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685260394.015, "dur": 24.890, + "args": { + "External id": 124447,"Sequence number": 2575889, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5066 + } + }, + { + "ph": "s", "id": 177, "pid": 5717, "tid": 5717, "ts": 6302685260394.015, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685260423.955, "dur": 23.720, + "args": { + "External id": 124448,"Sequence number": 2575890, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5067 + } + }, + { + "ph": "s", "id": 176, "pid": 5717, "tid": 5717, "ts": 6302685260423.955, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685260449.625, "dur": 34.030, + "args": { + "External id": 124449,"Sequence number": 2575891, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685260451.365, "dur": 31.850, + "args": { + "External id": 124450,"Sequence number": 2575891, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5069 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685260452.815, "dur": 29.980, + "args": { + "External id": 124451,"Sequence number": 2575891, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5070 + } + }, + { + "ph": "s", "id": 175, "pid": 5717, "tid": 5717, "ts": 6302685260452.815, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260458.315, "dur": 7.060, + "args": { + "External id": 124452,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685260466.955, "dur": 14.310, + "args": { + "External id": 124453,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685260486.885, "dur": 22.230, + "args": { + "External id": 124454,"Sequence number": 2575892, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5073 + } + }, + { + "ph": "s", "id": 174, "pid": 5717, "tid": 5717, "ts": 6302685260486.885, + 
"cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685260546.075, "dur": 83.609, + "args": { + "External id": 124455,"Sequence number": 2575893, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685260547.275, "dur": 12.160, + "args": { + "External id": 124456,"Sequence number": 2575893, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5075 + } + }, + { + "ph": "s", "id": 173, "pid": 5717, "tid": 5717, "ts": 6302685260547.275, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685260550.305, "dur": 6.980, + "args": { + "External id": 124457,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260554.625, "dur": 2.110, + "args": { + "External id": 124458,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685260560.615, "dur": 68.560, + "args": { + "External id": 124459,"Sequence number": 2575894, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260562.705, "dur": 5.200, + "args": { + "External id": 124460,"Sequence number": 2575894, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260563.795, "dur": 3.810, + "args": { + "External id": 124461,"Sequence number": 2575894, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5080 + } + }, + { + "ph": "s", "id": 172, "pid": 5717, "tid": 5717, "ts": 6302685260563.795, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685260568.915, "dur": 50.920, + "args": { + "External id": 124462,"Sequence number": 2575895, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5081 + } + }, + { + "ph": "s", "id": 171, "pid": 5717, "tid": 5717, "ts": 6302685260568.915, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685260624.584, "dur": 3.291, + "args": { + "External id": 124463,"Sequence number": 2575896, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5082 + } + }, + { + "ph": "s", "id": 170, "pid": 5717, "tid": 5717, "ts": 6302685260624.584, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685260647.575, "dur": 90.659, + "args": { + "External id": 124464,"Sequence number": 2575897, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685260648.615, "dur": 9.820, + "args": { + "External id": 124465,"Sequence number": 2575897, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5084 + } + }, + { + "ph": "s", "id": 169, "pid": 5717, "tid": 5717, "ts": 6302685260648.615, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685260652.304, "dur": 4.511, + "args": { + "External id": 124466,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260654.104, "dur": 2.200, + "args": { + "External id": 124467,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5086 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685260660.964, "dur": 76.690, + "args": { + "External id": 124468,"Sequence number": 2575898, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260662.435, "dur": 3.289, + "args": { + "External id": 124469,"Sequence number": 2575898, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260663.375, "dur": 2.100, + "args": { + "External id": 124470,"Sequence number": 2575898, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5089 + } + }, + { + "ph": "s", "id": 168, "pid": 5717, "tid": 5717, "ts": 6302685260663.375, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685260666.604, "dur": 52.660, + "args": { + "External id": 124471,"Sequence number": 2575899, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5090 + } + }, + { + "ph": "s", "id": 167, "pid": 5717, "tid": 5717, "ts": 6302685260666.604, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + 
"ts": 6302685260724.444, "dur": 10.250, + "args": { + "External id": 124472,"Sequence number": 2575900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5091 + } + }, + { + "ph": "s", "id": 166, "pid": 5717, "tid": 5717, "ts": 6302685260724.444, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685260759.584, "dur": 74.610, + "args": { + "External id": 124473,"Sequence number": 2575901, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685260760.654, "dur": 7.790, + "args": { + "External id": 124474,"Sequence number": 2575901, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5093 + } + }, + { + "ph": "s", "id": 165, "pid": 5717, "tid": 5717, "ts": 6302685260760.654, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685260762.884, "dur": 3.870, + "args": { + "External id": 124475,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260764.904, "dur": 1.300, + "args": { + "External id": 124476,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", 
""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685260769.634, "dur": 63.980, + "args": { + "External id": 124477,"Sequence number": 2575902, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260773.514, "dur": 4.980, + "args": { + "External id": 124478,"Sequence number": 2575902, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260774.574, "dur": 3.660, + "args": { + "External id": 124479,"Sequence number": 2575902, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5098 + } + }, + { + "ph": "s", "id": 164, "pid": 5717, "tid": 5717, "ts": 6302685260774.574, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685260779.434, "dur": 46.800, + "args": { + "External id": 124480,"Sequence number": 2575903, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5099 + } + }, + { + "ph": 
"s", "id": 163, "pid": 5717, "tid": 5717, "ts": 6302685260779.434, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685260829.674, "dur": 2.880, + "args": { + "External id": 124481,"Sequence number": 2575904, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5100 + } + }, + { + "ph": "s", "id": 162, "pid": 5717, "tid": 5717, "ts": 6302685260829.674, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260860.044, "dur": 6.330, + "args": { + "External id": 124482,"Sequence number": 2575905, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260862.574, "dur": 3.510, + "args": { + "External id": 124483,"Sequence number": 2575905, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5102 + } + }, + { + "ph": "s", "id": 161, "pid": 5717, "tid": 5717, "ts": 6302685260862.574, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260883.674, "dur": 5.440, + "args": { + "External id": 124484,"Sequence number": 2575906, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], 
"Input Dims": [[8, 2048, 768], []], "Ev Idx": 5103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260884.454, "dur": 4.350, + "args": { + "External id": 124485,"Sequence number": 2575906, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5104 + } + }, + { + "ph": "s", "id": 160, "pid": 5717, "tid": 5717, "ts": 6302685260884.454, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685260901.334, "dur": 7.340, + "args": { + "External id": 124486,"Sequence number": 2575907, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685260903.394, "dur": 4.930, + "args": { + "External id": 124487,"Sequence number": 2575907, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5106 + } + }, + { + "ph": "s", "id": 159, "pid": 5717, "tid": 5717, "ts": 6302685260903.394, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685260952.794, "dur": 203.540, + "args": { + "External id": 124488,"Sequence number": 2575908, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", 
"Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5107 + } + }, + { + "ph": "s", "id": 158, "pid": 5717, "tid": 5717, "ts": 6302685260952.794, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685260978.364, "dur": 18.470, + "args": { + "External id": 124489,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685260982.724, "dur": 13.060, + "args": { + "External id": 124490,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685261180.073, "dur": 198.900, + "args": { + "External id": 124491,"Sequence number": 2575909, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5110 + } + }, + { + "ph": "s", "id": 157, "pid": 5717, "tid": 5717, "ts": 6302685261180.073, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685261203.423, "dur": 19.140, + "args": { + "External id": 124492,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685261209.013, "dur": 12.650, + "args": { + "External id": 124493,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5717, "tid": 5717, + "ts": 6302685261422.643, "dur": 241.159, + "args": { + "External id": 124494,"Sequence number": 2575910, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5113 + } + }, + { + "ph": "s", "id": 156, "pid": 5717, "tid": 5717, "ts": 6302685261422.643, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685261451.873, "dur": 173.849, + "args": { + "External id": 124495,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", 
"False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685261505.693, "dur": 19.429, + "args": { + "External id": 124496,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685261510.833, "dur": 12.949, + "args": { + "External id": 124497,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685261529.033, "dur": 8.749, + "args": { + "External id": 124498,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685261539.433, "dur": 4.809, + "args": { + "External id": 124499,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685261548.033, "dur": 6.589, + "args": { + "External id": 124500,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 5717, + "ts": 6302685261641.232, "dur": 5.850, + "args": { + "External id": 124501,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685261672.032, "dur": 10.220, + "args": { + "External id": 124502,"Sequence number": 2575911, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685261674.812, "dur": 7.100, + "args": { + "External id": 124503,"Sequence number": 2575911, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5122 + } + }, + { + "ph": "s", "id": 155, "pid": 5717, "tid": 5717, "ts": 6302685261674.812, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685261701.462, "dur": 98.050, + "args": { + "External id": 
124504,"Sequence number": 2575912, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685261704.152, "dur": 9.130, + "args": { + "External id": 124505,"Sequence number": 2575912, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5124 + } + }, + { + "ph": "s", "id": 154, "pid": 5717, "tid": 5717, "ts": 6302685261704.152, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685261707.042, "dur": 4.720, + "args": { + "External id": 124506,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685261709.322, "dur": 1.850, + "args": { + "External id": 124507,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685261714.682, "dur": 84.140, + "args": { + "External id": 124508,"Sequence number": 2575913, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5127 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685261716.722, "dur": 5.290, + "args": { + "External id": 124509,"Sequence number": 2575913, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685261719.322, "dur": 2.420, + "args": { + "External id": 124510,"Sequence number": 2575913, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5129 + } + }, + { + "ph": "s", "id": 153, "pid": 5717, "tid": 5717, "ts": 6302685261719.322, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685261723.062, "dur": 66.130, + "args": { + "External id": 124511,"Sequence number": 2575914, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5130 + } + }, + { + "ph": "s", "id": 152, "pid": 5717, "tid": 5717, "ts": 6302685261723.062, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685261793.562, "dur": 3.780, + "args": { + "External id": 124512,"Sequence number": 2575915, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5131 + } + }, + { + "ph": "s", "id": 151, "pid": 5717, "tid": 5717, "ts": 6302685261793.562, 
+ "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685261817.502, "dur": 38.830, + "args": { + "External id": 124513,"Sequence number": 2575916, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5132 + } + }, + { + "ph": "s", "id": 150, "pid": 5717, "tid": 5717, "ts": 6302685261817.502, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685261893.112, "dur": 263.749, + "args": { + "External id": 124514,"Sequence number": 2575917, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685261898.792, "dur": 46.380, + "args": { + "External id": 124515,"Sequence number": 2575917, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685261900.702, "dur": 44.010, + "args": { + "External id": 124516,"Sequence number": 2575917, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], 
[], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5135 + } + }, + { + "ph": "s", "id": 149, "pid": 5717, "tid": 5717, "ts": 6302685261900.702, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685261907.602, "dur": 9.970, + "args": { + "External id": 124517,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685261919.342, "dur": 23.130, + "args": { + "External id": 124518,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685261947.201, "dur": 30.031, + "args": { + "External id": 124519,"Sequence number": 2575918, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5138 + } + }, + { + "ph": "s", "id": 148, "pid": 5717, "tid": 5717, "ts": 6302685261947.201, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685261951.412, "dur": 0.640, + "args": { + "External id": 124520,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5139 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685261953.052, "dur": 0.189, + "args": { + "External id": 124521,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685261980.721, "dur": 29.891, + "args": { + "External id": 124522,"Sequence number": 2575919, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5141 + } + }, + { + "ph": "s", "id": 147, "pid": 5717, "tid": 5717, "ts": 6302685261980.721, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685262013.112, "dur": 26.260, + "args": { + "External id": 124523,"Sequence number": 2575920, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5142 + } + }, + { + "ph": "s", "id": 146, "pid": 5717, "tid": 5717, "ts": 6302685262013.112, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685262020.412, "dur": 15.809, + "args": { + "External id": 124524,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685262041.012, 
"dur": 21.809, + "args": { + "External id": 124525,"Sequence number": 2575921, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5144 + } + }, + { + "ph": "s", "id": 145, "pid": 5717, "tid": 5717, "ts": 6302685262041.012, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685262067.861, "dur": 22.610, + "args": { + "External id": 124526,"Sequence number": 2575922, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5145 + } + }, + { + "ph": "s", "id": 144, "pid": 5717, "tid": 5717, "ts": 6302685262067.861, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685262093.901, "dur": 36.020, + "args": { + "External id": 124527,"Sequence number": 2575923, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685262095.791, "dur": 33.780, + "args": { + "External id": 124528,"Sequence number": 2575923, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685262098.471, "dur": 
30.600, + "args": { + "External id": 124529,"Sequence number": 2575923, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5148 + } + }, + { + "ph": "s", "id": 143, "pid": 5717, "tid": 5717, "ts": 6302685262098.471, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262104.201, "dur": 6.820, + "args": { + "External id": 124530,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685262112.601, "dur": 14.920, + "args": { + "External id": 124531,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685262133.401, "dur": 22.630, + "args": { + "External id": 124532,"Sequence number": 2575924, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5151 + } + }, + { + "ph": "s", "id": 142, "pid": 5717, "tid": 5717, "ts": 6302685262133.401, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, 
+ "ts": 6302685262188.901, "dur": 85.360, + "args": { + "External id": 124533,"Sequence number": 2575925, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685262190.291, "dur": 11.960, + "args": { + "External id": 124534,"Sequence number": 2575925, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5153 + } + }, + { + "ph": "s", "id": 141, "pid": 5717, "tid": 5717, "ts": 6302685262190.291, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685262194.261, "dur": 5.720, + "args": { + "External id": 124535,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262197.281, "dur": 2.050, + "args": { + "External id": 124536,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685262203.661, "dur": 70.070, + "args": { + "External id": 124537,"Sequence number": 2575926, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 
768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685262205.711, "dur": 8.310, + "args": { + "External id": 124538,"Sequence number": 2575926, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685262209.751, "dur": 3.970, + "args": { + "External id": 124539,"Sequence number": 2575926, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5158 + } + }, + { + "ph": "s", "id": 140, "pid": 5717, "tid": 5717, "ts": 6302685262209.751, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685262215.151, "dur": 50.360, + "args": { + "External id": 124540,"Sequence number": 2575927, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5159 + } + }, + { + "ph": "s", "id": 139, "pid": 5717, "tid": 5717, "ts": 6302685262215.151, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685262269.011, "dur": 3.220, + "args": { + "External id": 124541,"Sequence number": 2575928, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5160 + } + 
}, + { + "ph": "s", "id": 138, "pid": 5717, "tid": 5717, "ts": 6302685262269.011, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685262291.501, "dur": 80.510, + "args": { + "External id": 124542,"Sequence number": 2575929, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685262292.501, "dur": 18.970, + "args": { + "External id": 124543,"Sequence number": 2575929, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5162 + } + }, + { + "ph": "s", "id": 137, "pid": 5717, "tid": 5717, "ts": 6302685262292.501, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685262295.931, "dur": 13.920, + "args": { + "External id": 124544,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262307.931, "dur": 1.390, + "args": { + "External id": 124545,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685262312.751, "dur": 58.840, + "args": { + 
"External id": 124546,"Sequence number": 2575930, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685262314.371, "dur": 5.860, + "args": { + "External id": 124547,"Sequence number": 2575930, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685262315.211, "dur": 4.690, + "args": { + "External id": 124548,"Sequence number": 2575930, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5167 + } + }, + { + "ph": "s", "id": 136, "pid": 5717, "tid": 5717, "ts": 6302685262315.211, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685262321.241, "dur": 42.360, + "args": { + "External id": 124549,"Sequence number": 2575931, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5168 + } + }, + { + "ph": "s", "id": 135, "pid": 5717, "tid": 5717, "ts": 6302685262321.241, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685262366.591, "dur": 3.880, + "args": { + "External id": 124550,"Sequence number": 2575932, "Fwd 
thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5169 + } + }, + { + "ph": "s", "id": 134, "pid": 5717, "tid": 5717, "ts": 6302685262366.591, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5717, "tid": 5717, + "ts": 6302685262411.040, "dur": 185.470, + "args": { + "External id": 124551,"Sequence number": 2575933, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5170 + } + }, + { + "ph": "s", "id": 133, "pid": 5717, "tid": 5717, "ts": 6302685262411.040, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685262449.680, "dur": 12.371, + "args": { + "External id": 124552,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685262501.840, "dur": 75.210, + "args": { + "External id": 124553,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685262504.560, "dur": 9.260, + "args": { + "External id": 124554,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685262507.140, "dur": 4.960, + "args": { + "External id": 124555,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262509.490, "dur": 1.800, + "args": { + "External id": 124556,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685262515.140, "dur": 61.210, + "args": { + "External id": 124557,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685262518.590, "dur": 5.760, + "args": { + "External id": 124558,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685262520.990, "dur": 3.080, + "args": { + "External id": 124559,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 
1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685262525.290, "dur": 44.700, + "args": { + "External id": 124560,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685262573.010, "dur": 1.820, + "args": { + "External id": 124561,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685262605.220, "dur": 30.710, + "args": { + "External id": 124562,"Sequence number": 2575934, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5181 + } + }, + { + "ph": "s", "id": 132, "pid": 5717, "tid": 5717, "ts": 6302685262605.220, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685262675.280, "dur": 245.939, + "args": { + "External id": 124563,"Sequence number": 2575935, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 
6302685262677.760, "dur": 43.450, + "args": { + "External id": 124564,"Sequence number": 2575935, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685262679.630, "dur": 41.120, + "args": { + "External id": 124565,"Sequence number": 2575935, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5184 + } + }, + { + "ph": "s", "id": 131, "pid": 5717, "tid": 5717, "ts": 6302685262679.630, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262684.470, "dur": 9.300, + "args": { + "External id": 124566,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685262696.630, "dur": 21.930, + "args": { + "External id": 124567,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685262723.090, "dur": 
28.200, + "args": { + "External id": 124568,"Sequence number": 2575936, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5187 + } + }, + { + "ph": "s", "id": 130, "pid": 5717, "tid": 5717, "ts": 6302685262723.090, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685262726.890, "dur": 0.620, + "args": { + "External id": 124569,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685262728.520, "dur": 0.200, + "args": { + "External id": 124570,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685262753.560, "dur": 26.840, + "args": { + "External id": 124571,"Sequence number": 2575937, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5190 + } + }, + { + "ph": "s", "id": 129, "pid": 5717, "tid": 5717, "ts": 6302685262753.560, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685262782.630, "dur": 25.010, + "args": { + "External id": 124572,"Sequence number": 2575938, "Fwd thread id": 0, "Record function id": 0, "Concrete 
Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5191 + } + }, + { + "ph": "s", "id": 128, "pid": 5717, "tid": 5717, "ts": 6302685262782.630, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685262789.880, "dur": 14.770, + "args": { + "External id": 124573,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685262810.480, "dur": 22.639, + "args": { + "External id": 124574,"Sequence number": 2575939, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5193 + } + }, + { + "ph": "s", "id": 127, "pid": 5717, "tid": 5717, "ts": 6302685262810.480, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685262837.999, "dur": 22.620, + "args": { + "External id": 124575,"Sequence number": 2575940, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5194 + } + }, + { + "ph": "s", "id": 126, "pid": 5717, "tid": 5717, "ts": 6302685262837.999, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685262862.510, "dur": 34.520, + "args": { + "External id": 124576,"Sequence number": 2575941, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", 
"c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685262864.279, "dur": 32.420, + "args": { + "External id": 124577,"Sequence number": 2575941, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685262865.630, "dur": 30.649, + "args": { + "External id": 124578,"Sequence number": 2575941, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5197 + } + }, + { + "ph": "s", "id": 125, "pid": 5717, "tid": 5717, "ts": 6302685262865.630, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262872.370, "dur": 6.540, + "args": { + "External id": 124579,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685262880.459, "dur": 14.211, + "args": { + "External id": 124580,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": 
["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685262900.030, "dur": 20.500, + "args": { + "External id": 124581,"Sequence number": 2575942, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5200 + } + }, + { + "ph": "s", "id": 124, "pid": 5717, "tid": 5717, "ts": 6302685262900.030, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685262956.369, "dur": 86.400, + "args": { + "External id": 124582,"Sequence number": 2575943, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685262957.719, "dur": 13.460, + "args": { + "External id": 124583,"Sequence number": 2575943, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5202 + } + }, + { + "ph": "s", "id": 123, "pid": 5717, "tid": 5717, "ts": 6302685262957.719, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685262960.909, "dur": 8.210, + "args": { + "External id": 124584,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], 
"Ev Idx": 5203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685262966.539, "dur": 2.010, + "args": { + "External id": 124585,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685262972.429, "dur": 69.810, + "args": { + "External id": 124586,"Sequence number": 2575944, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685262974.579, "dur": 6.810, + "args": { + "External id": 124587,"Sequence number": 2575944, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685262975.649, "dur": 5.440, + "args": { + "External id": 124588,"Sequence number": 2575944, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5207 + } + }, + { + "ph": "s", "id": 122, "pid": 5717, "tid": 5717, "ts": 6302685262975.649, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685262982.379, "dur": 51.770, + "args": { + "External id": 
124589,"Sequence number": 2575945, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5208 + } + }, + { + "ph": "s", "id": 121, "pid": 5717, "tid": 5717, "ts": 6302685262982.379, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685263037.469, "dur": 3.420, + "args": { + "External id": 124590,"Sequence number": 2575946, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5209 + } + }, + { + "ph": "s", "id": 120, "pid": 5717, "tid": 5717, "ts": 6302685263037.469, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685263059.529, "dur": 69.960, + "args": { + "External id": 124591,"Sequence number": 2575947, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685263060.509, "dur": 8.250, + "args": { + "External id": 124592,"Sequence number": 2575947, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5211 + } + }, + { + "ph": "s", "id": 119, "pid": 5717, "tid": 5717, "ts": 6302685263060.509, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685263062.269, "dur": 5.180, + "args": { 
+ "External id": 124593,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685263065.629, "dur": 1.300, + "args": { + "External id": 124594,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685263071.349, "dur": 57.620, + "args": { + "External id": 124595,"Sequence number": 2575948, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685263072.719, "dur": 4.510, + "args": { + "External id": 124596,"Sequence number": 2575948, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685263073.709, "dur": 3.220, + "args": { + "External id": 124597,"Sequence number": 2575948, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5216 + } + }, + { + "ph": "s", "id": 118, "pid": 5717, "tid": 5717, "ts": 
6302685263073.709, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685263078.199, "dur": 38.320, + "args": { + "External id": 124598,"Sequence number": 2575949, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5217 + } + }, + { + "ph": "s", "id": 117, "pid": 5717, "tid": 5717, "ts": 6302685263078.199, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685263120.999, "dur": 6.740, + "args": { + "External id": 124599,"Sequence number": 2575950, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5218 + } + }, + { + "ph": "s", "id": 116, "pid": 5717, "tid": 5717, "ts": 6302685263120.999, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685263145.689, "dur": 76.380, + "args": { + "External id": 124600,"Sequence number": 2575951, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685263146.829, "dur": 8.160, + "args": { + "External id": 124601,"Sequence number": 2575951, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5220 + } + }, + { + "ph": "s", "id": 115, "pid": 5717, "tid": 
5717, "ts": 6302685263146.829, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685263149.269, "dur": 4.040, + "args": { + "External id": 124602,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685263151.469, "dur": 1.360, + "args": { + "External id": 124603,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685263156.489, "dur": 64.830, + "args": { + "External id": 124604,"Sequence number": 2575952, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685263159.609, "dur": 4.110, + "args": { + "External id": 124605,"Sequence number": 2575952, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685263160.669, "dur": 2.700, + "args": { + "External id": 124606,"Sequence number": 2575952, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5225 + } + }, + { + "ph": "s", "id": 114, "pid": 5717, "tid": 5717, "ts": 6302685263160.669, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685263164.889, "dur": 48.850, + "args": { + "External id": 124607,"Sequence number": 2575953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5226 + } + }, + { + "ph": "s", "id": 113, "pid": 5717, "tid": 5717, "ts": 6302685263164.889, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685263217.069, "dur": 2.920, + "args": { + "External id": 124608,"Sequence number": 2575954, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5227 + } + }, + { + "ph": "s", "id": 112, "pid": 5717, "tid": 5717, "ts": 6302685263217.069, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685263247.529, "dur": 5.170, + "args": { + "External id": 124609,"Sequence number": 2575955, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685263248.639, "dur": 3.670, + "args": { + "External id": 124610,"Sequence number": 2575955, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5229 + } + }, + { + "ph": "s", "id": 111, "pid": 5717, "tid": 5717, "ts": 6302685263248.639, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685263266.009, "dur": 4.070, + "args": { + "External id": 124611,"Sequence number": 2575956, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685263266.989, "dur": 2.780, + "args": { + "External id": 124612,"Sequence number": 2575956, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5231 + } + }, + { + "ph": "s", "id": 110, "pid": 5717, "tid": 5717, "ts": 6302685263266.989, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685263279.878, "dur": 8.520, + "args": { + "External id": 124613,"Sequence number": 2575957, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685263282.689, "dur": 5.320, + "args": { + "External id": 124614,"Sequence number": 2575957, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], 
"Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5233 + } + }, + { + "ph": "s", "id": 109, "pid": 5717, "tid": 5717, "ts": 6302685263282.689, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685263346.598, "dur": 290.330, + "args": { + "External id": 124615,"Sequence number": 2575958, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5234 + } + }, + { + "ph": "s", "id": 108, "pid": 5717, "tid": 5717, "ts": 6302685263346.598, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685263373.048, "dur": 16.800, + "args": { + "External id": 124616,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685263377.468, "dur": 11.430, + "args": { + "External id": 124617,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 
5717, + "ts": 6302685263658.068, "dur": 158.859, + "args": { + "External id": 124618,"Sequence number": 2575959, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5237 + } + }, + { + "ph": "s", "id": 107, "pid": 5717, "tid": 5717, "ts": 6302685263658.068, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685263678.178, "dur": 14.810, + "args": { + "External id": 124619,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685263681.178, "dur": 11.040, + "args": { + "External id": 124620,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5717, "tid": 5717, + "ts": 6302685263854.367, "dur": 212.980, + "args": { + "External id": 124621,"Sequence number": 2575960, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", 
"Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5240 + } + }, + { + "ph": "s", "id": 106, "pid": 5717, "tid": 5717, "ts": 6302685263854.367, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685263879.537, "dur": 153.270, + "args": { + "External id": 124622,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685263926.477, "dur": 15.870, + "args": { + "External id": 124623,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685263929.957, "dur": 11.240, + "args": { + "External id": 124624,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5243 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685263945.977, "dur": 6.630, + "args": { + "External id": 124625,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685263955.177, "dur": 3.300, + "args": { + "External id": 124626,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685263962.787, "dur": 7.190, + "args": { + "External id": 124627,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 5717, + "ts": 6302685264046.297, "dur": 5.020, + "args": { + "External id": 124628,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685264075.147, "dur": 6.620, + "args": { + "External id": 124629,"Sequence number": 2575961, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5248 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685264076.447, "dur": 5.060, + "args": { + "External id": 124630,"Sequence number": 2575961, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5249 + } + }, + { + "ph": "s", "id": 105, "pid": 5717, "tid": 5717, "ts": 6302685264076.447, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685264098.017, "dur": 115.210, + "args": { + "External id": 124631,"Sequence number": 2575962, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685264099.277, "dur": 11.170, + "args": { + "External id": 124632,"Sequence number": 2575962, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5251 + } + }, + { + "ph": "s", "id": 104, "pid": 5717, "tid": 5717, "ts": 6302685264099.277, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685264103.877, "dur": 5.340, + "args": { + "External id": 124633,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685264107.097, "dur": 1.620, + "args": { + "External id": 
124634,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685264111.657, "dur": 100.710, + "args": { + "External id": 124635,"Sequence number": 2575963, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685264113.517, "dur": 4.410, + "args": { + "External id": 124636,"Sequence number": 2575963, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685264114.227, "dur": 3.470, + "args": { + "External id": 124637,"Sequence number": 2575963, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5256 + } + }, + { + "ph": "s", "id": 103, "pid": 5717, "tid": 5717, "ts": 6302685264114.227, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685264118.977, "dur": 76.759, + "args": { + "External id": 124638,"Sequence number": 2575964, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 
768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5257 + } + }, + { + "ph": "s", "id": 102, "pid": 5717, "tid": 5717, "ts": 6302685264118.977, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685264203.067, "dur": 6.089, + "args": { + "External id": 124639,"Sequence number": 2575965, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5258 + } + }, + { + "ph": "s", "id": 101, "pid": 5717, "tid": 5717, "ts": 6302685264203.067, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685264231.976, "dur": 34.420, + "args": { + "External id": 124640,"Sequence number": 2575966, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5259 + } + }, + { + "ph": "s", "id": 100, "pid": 5717, "tid": 5717, "ts": 6302685264231.976, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685264295.166, "dur": 314.769, + "args": { + "External id": 124641,"Sequence number": 2575967, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685264332.366, "dur": 52.730, + "args": { + "External id": 124642,"Sequence number": 
2575967, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685264336.506, "dur": 48.200, + "args": { + "External id": 124643,"Sequence number": 2575967, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5262 + } + }, + { + "ph": "s", "id": 99, "pid": 5717, "tid": 5717, "ts": 6302685264336.506, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685264344.916, "dur": 11.050, + "args": { + "External id": 124644,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685264357.806, "dur": 24.210, + "args": { + "External id": 124645,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685264387.076, "dur": 32.500, + "args": { + "External id": 124646,"Sequence number": 2575968, "Fwd thread id": 
0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5265 + } + }, + { + "ph": "s", "id": 98, "pid": 5717, "tid": 5717, "ts": 6302685264387.076, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685264390.926, "dur": 0.570, + "args": { + "External id": 124647,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685264392.306, "dur": 0.180, + "args": { + "External id": 124648,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685264423.216, "dur": 37.450, + "args": { + "External id": 124649,"Sequence number": 2575969, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5268 + } + }, + { + "ph": "s", "id": 97, "pid": 5717, "tid": 5717, "ts": 6302685264423.216, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685264462.696, "dur": 25.510, + "args": { + "External id": 124650,"Sequence number": 2575970, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input 
Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5269 + } + }, + { + "ph": "s", "id": 96, "pid": 5717, "tid": 5717, "ts": 6302685264462.696, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685264472.446, "dur": 13.240, + "args": { + "External id": 124651,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685264489.556, "dur": 20.370, + "args": { + "External id": 124652,"Sequence number": 2575971, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5271 + } + }, + { + "ph": "s", "id": 95, "pid": 5717, "tid": 5717, "ts": 6302685264489.556, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685264517.296, "dur": 31.130, + "args": { + "External id": 124653,"Sequence number": 2575972, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5272 + } + }, + { + "ph": "s", "id": 94, "pid": 5717, "tid": 5717, "ts": 6302685264517.296, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685264551.766, "dur": 36.290, + "args": { + "External id": 124654,"Sequence number": 2575973, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 
768], [8, 2048, 768]], "Ev Idx": 5273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685264555.806, "dur": 31.930, + "args": { + "External id": 124655,"Sequence number": 2575973, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685264557.056, "dur": 30.320, + "args": { + "External id": 124656,"Sequence number": 2575973, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5275 + } + }, + { + "ph": "s", "id": 93, "pid": 5717, "tid": 5717, "ts": 6302685264557.056, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685264563.616, "dur": 8.830, + "args": { + "External id": 124657,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685264573.616, "dur": 12.510, + "args": { + "External id": 124658,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input 
Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685264590.956, "dur": 18.419, + "args": { + "External id": 124659,"Sequence number": 2575974, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5278 + } + }, + { + "ph": "s", "id": 92, "pid": 5717, "tid": 5717, "ts": 6302685264590.956, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685264637.435, "dur": 74.100, + "args": { + "External id": 124660,"Sequence number": 2575975, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685264638.366, "dur": 9.980, + "args": { + "External id": 124661,"Sequence number": 2575975, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5280 + } + }, + { + "ph": "s", "id": 91, "pid": 5717, "tid": 5717, "ts": 6302685264638.366, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685264641.946, "dur": 4.660, + "args": { + "External id": 124662,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
5717, + "ts": 6302685264644.406, "dur": 1.729, + "args": { + "External id": 124663,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685264649.486, "dur": 61.569, + "args": { + "External id": 124664,"Sequence number": 2575976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685264651.215, "dur": 6.720, + "args": { + "External id": 124665,"Sequence number": 2575976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685264652.126, "dur": 5.520, + "args": { + "External id": 124666,"Sequence number": 2575976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5285 + } + }, + { + "ph": "s", "id": 90, "pid": 5717, "tid": 5717, "ts": 6302685264652.126, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685264660.315, "dur": 41.670, + "args": { + "External id": 124667,"Sequence number": 2575977, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5286 + } + }, + { + "ph": "s", "id": 89, "pid": 5717, "tid": 5717, "ts": 6302685264660.315, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685264704.945, "dur": 5.070, + "args": { + "External id": 124668,"Sequence number": 2575978, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5287 + } + }, + { + "ph": "s", "id": 88, "pid": 5717, "tid": 5717, "ts": 6302685264704.945, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685264725.775, "dur": 56.550, + "args": { + "External id": 124669,"Sequence number": 2575979, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685264726.565, "dur": 6.530, + "args": { + "External id": 124670,"Sequence number": 2575979, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5289 + } + }, + { + "ph": "s", "id": 87, "pid": 5717, "tid": 5717, "ts": 6302685264726.565, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685264728.135, "dur": 3.890, + "args": { + "External id": 124671,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685264730.745, "dur": 0.910, + "args": { + "External id": 124672,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685264734.195, "dur": 47.720, + "args": { + "External id": 124673,"Sequence number": 2575980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685264735.405, "dur": 3.680, + "args": { + "External id": 124674,"Sequence number": 2575980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685264736.635, "dur": 2.220, + "args": { + "External id": 124675,"Sequence number": 2575980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5294 + } + }, + { + "ph": "s", "id": 86, "pid": 5717, "tid": 5717, "ts": 6302685264736.635, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685264741.065, "dur": 34.130, + "args": { + "External id": 124676,"Sequence number": 2575981, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5295 + } + }, + { + "ph": "s", "id": 85, "pid": 5717, "tid": 5717, "ts": 6302685264741.065, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685264777.705, "dur": 3.240, + "args": { + "External id": 124677,"Sequence number": 2575982, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5296 + } + }, + { + "ph": "s", "id": 84, "pid": 5717, "tid": 5717, "ts": 6302685264777.705, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5717, "tid": 5717, + "ts": 6302685264813.025, "dur": 174.580, + "args": { + "External id": 124678,"Sequence number": 2575983, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5297 + } + }, + { + "ph": "s", "id": 83, "pid": 5717, "tid": 5717, "ts": 6302685264813.025, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685264844.095, "dur": 9.030, + "args": { + "External id": 124679,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": 
[[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685264895.395, "dur": 75.580, + "args": { + "External id": 124680,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685264896.575, "dur": 8.980, + "args": { + "External id": 124681,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685264898.655, "dur": 5.310, + "args": { + "External id": 124682,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685264901.985, "dur": 1.430, + "args": { + "External id": 124683,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685264906.745, "dur": 63.660, + "args": { + "External id": 124684,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5303 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685264908.855, "dur": 3.940, + "args": { + "External id": 124685,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685264909.865, "dur": 2.580, + "args": { + "External id": 124686,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685264914.895, "dur": 49.980, + "args": { + "External id": 124687,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685264967.725, "dur": 1.390, + "args": { + "External id": 124688,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685264995.225, "dur": 25.920, + "args": { + "External id": 124689,"Sequence number": 2575984, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5308 + } + }, + { + "ph": "s", "id": 82, "pid": 5717, 
"tid": 5717, "ts": 6302685264995.225, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685265053.445, "dur": 230.589, + "args": { + "External id": 124690,"Sequence number": 2575985, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685265055.585, "dur": 36.229, + "args": { + "External id": 124691,"Sequence number": 2575985, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685265057.094, "dur": 34.351, + "args": { + "External id": 124692,"Sequence number": 2575985, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5311 + } + }, + { + "ph": "s", "id": 81, "pid": 5717, "tid": 5717, "ts": 6302685265057.094, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265061.725, "dur": 7.909, + "args": { + "External id": 124693,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", 
"Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685265071.185, "dur": 18.429, + "args": { + "External id": 124694,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685265094.574, "dur": 23.620, + "args": { + "External id": 124695,"Sequence number": 2575986, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5314 + } + }, + { + "ph": "s", "id": 80, "pid": 5717, "tid": 5717, "ts": 6302685265094.574, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685265097.814, "dur": 0.511, + "args": { + "External id": 124696,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685265099.174, "dur": 0.180, + "args": { + "External id": 124697,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685265120.494, "dur": 26.500, + "args": { + "External 
id": 124698,"Sequence number": 2575987, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5317 + } + }, + { + "ph": "s", "id": 79, "pid": 5717, "tid": 5717, "ts": 6302685265120.494, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685265149.304, "dur": 22.650, + "args": { + "External id": 124699,"Sequence number": 2575988, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5318 + } + }, + { + "ph": "s", "id": 78, "pid": 5717, "tid": 5717, "ts": 6302685265149.304, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685265154.824, "dur": 14.310, + "args": { + "External id": 124700,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685265173.624, "dur": 23.880, + "args": { + "External id": 124701,"Sequence number": 2575989, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5320 + } + }, + { + "ph": "s", "id": 77, "pid": 5717, "tid": 5717, "ts": 6302685265173.624, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685265202.364, "dur": 22.060, + "args": { + "External id": 
124702,"Sequence number": 2575990, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5321 + } + }, + { + "ph": "s", "id": 76, "pid": 5717, "tid": 5717, "ts": 6302685265202.364, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685265226.094, "dur": 35.470, + "args": { + "External id": 124703,"Sequence number": 2575991, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685265227.904, "dur": 33.320, + "args": { + "External id": 124704,"Sequence number": 2575991, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685265229.334, "dur": 31.530, + "args": { + "External id": 124705,"Sequence number": 2575991, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5324 + } + }, + { + "ph": "s", "id": 75, "pid": 5717, "tid": 5717, "ts": 6302685265229.334, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265236.844, "dur": 6.510, + "args": { + "External id": 124706,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685265244.774, "dur": 14.690, + "args": { + "External id": 124707,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685265264.424, "dur": 19.010, + "args": { + "External id": 124708,"Sequence number": 2575992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5327 + } + }, + { + "ph": "s", "id": 74, "pid": 5717, "tid": 5717, "ts": 6302685265264.424, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685265327.204, "dur": 66.740, + "args": { + "External id": 124709,"Sequence number": 2575993, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685265328.204, "dur": 9.880, + "args": { + "External id": 124710,"Sequence number": 
2575993, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5329 + } + }, + { + "ph": "s", "id": 73, "pid": 5717, "tid": 5717, "ts": 6302685265328.204, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685265330.614, "dur": 5.820, + "args": { + "External id": 124711,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265334.274, "dur": 1.640, + "args": { + "External id": 124712,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685265339.104, "dur": 54.400, + "args": { + "External id": 124713,"Sequence number": 2575994, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685265340.874, "dur": 4.060, + "args": { + "External id": 124714,"Sequence number": 2575994, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685265341.824, "dur": 2.880, + "args": { + "External id": 124715,"Sequence number": 2575994, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5334 + } + }, + { + "ph": "s", "id": 72, "pid": 5717, "tid": 5717, "ts": 6302685265341.824, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685265345.614, "dur": 41.580, + "args": { + "External id": 124716,"Sequence number": 2575995, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5335 + } + }, + { + "ph": "s", "id": 71, "pid": 5717, "tid": 5717, "ts": 6302685265345.614, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685265390.044, "dur": 2.510, + "args": { + "External id": 124717,"Sequence number": 2575996, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5336 + } + }, + { + "ph": "s", "id": 70, "pid": 5717, "tid": 5717, "ts": 6302685265390.044, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685265407.184, "dur": 164.000, + "args": { + "External id": 124718,"Sequence number": 2575997, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev 
Idx": 5337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685265408.064, "dur": 5.980, + "args": { + "External id": 124719,"Sequence number": 2575997, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5338 + } + }, + { + "ph": "s", "id": 69, "pid": 5717, "tid": 5717, "ts": 6302685265408.064, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685265409.614, "dur": 3.380, + "args": { + "External id": 124720,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265411.564, "dur": 1.030, + "args": { + "External id": 124721,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685265416.224, "dur": 154.629, + "args": { + "External id": 124722,"Sequence number": 2575998, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685265417.614, "dur": 2.780, + "args": { + "External id": 124723,"Sequence number": 2575998, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 
768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685265418.404, "dur": 1.780, + "args": { + "External id": 124724,"Sequence number": 2575998, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5343 + } + }, + { + "ph": "s", "id": 68, "pid": 5717, "tid": 5717, "ts": 6302685265418.404, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685265421.204, "dur": 30.420, + "args": { + "External id": 124725,"Sequence number": 2575999, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5344 + } + }, + { + "ph": "s", "id": 67, "pid": 5717, "tid": 5717, "ts": 6302685265421.204, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685265454.214, "dur": 115.810, + "args": { + "External id": 124726,"Sequence number": 2576000, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5345 + } + }, + { + "ph": "s", "id": 66, "pid": 5717, "tid": 5717, "ts": 6302685265454.214, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685265583.213, "dur": 50.750, + "args": { + "External id": 124727,"Sequence number": 2576001, "Fwd thread id": 0, "Record 
function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685265584.073, "dur": 6.640, + "args": { + "External id": 124728,"Sequence number": 2576001, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5347 + } + }, + { + "ph": "s", "id": 65, "pid": 5717, "tid": 5717, "ts": 6302685265584.073, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685265585.563, "dur": 4.100, + "args": { + "External id": 124729,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265587.483, "dur": 1.840, + "args": { + "External id": 124730,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685265591.613, "dur": 41.990, + "args": { + "External id": 124731,"Sequence number": 2576002, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 
5717, "tid": 5717, + "ts": 6302685265593.903, "dur": 3.920, + "args": { + "External id": 124732,"Sequence number": 2576002, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685265594.893, "dur": 2.640, + "args": { + "External id": 124733,"Sequence number": 2576002, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5352 + } + }, + { + "ph": "s", "id": 64, "pid": 5717, "tid": 5717, "ts": 6302685265594.893, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685265598.523, "dur": 28.680, + "args": { + "External id": 124734,"Sequence number": 2576003, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5353 + } + }, + { + "ph": "s", "id": 63, "pid": 5717, "tid": 5717, "ts": 6302685265598.523, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685265629.693, "dur": 3.120, + "args": { + "External id": 124735,"Sequence number": 2576004, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5354 + } + }, + { + "ph": "s", "id": 62, "pid": 5717, "tid": 5717, "ts": 6302685265629.693, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685265649.943, "dur": 3.420, + "args": { + "External id": 124736,"Sequence number": 2576005, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685265650.793, "dur": 2.350, + "args": { + "External id": 124737,"Sequence number": 2576005, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5356 + } + }, + { + "ph": "s", "id": 61, "pid": 5717, "tid": 5717, "ts": 6302685265650.793, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685265661.713, "dur": 3.130, + "args": { + "External id": 124738,"Sequence number": 2576006, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685265662.793, "dur": 1.860, + "args": { + "External id": 124739,"Sequence number": 2576006, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5358 + } + }, + { + "ph": "s", "id": 60, "pid": 5717, "tid": 5717, "ts": 6302685265662.793, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, 
"tid": 5717, + "ts": 6302685265670.723, "dur": 3.250, + "args": { + "External id": 124740,"Sequence number": 2576007, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685265672.593, "dur": 1.190, + "args": { + "External id": 124741,"Sequence number": 2576007, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5360 + } + }, + { + "ph": "s", "id": 59, "pid": 5717, "tid": 5717, "ts": 6302685265672.593, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685265706.483, "dur": 151.670, + "args": { + "External id": 124742,"Sequence number": 2576008, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5361 + } + }, + { + "ph": "s", "id": 58, "pid": 5717, "tid": 5717, "ts": 6302685265706.483, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685265725.133, "dur": 13.980, + "args": { + "External id": 124743,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], 
"Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265727.763, "dur": 10.560, + "args": { + "External id": 124744,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5717, "tid": 5717, + "ts": 6302685265875.613, "dur": 133.490, + "args": { + "External id": 124745,"Sequence number": 2576009, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5364 + } + }, + { + "ph": "s", "id": 57, "pid": 5717, "tid": 5717, "ts": 6302685265875.613, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685265892.203, "dur": 12.530, + "args": { + "External id": 124746,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685265894.893, "dur": 9.120, + "args": { + "External id": 124747,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", 
"False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5717, "tid": 5717, + "ts": 6302685266040.622, "dur": 190.370, + "args": { + "External id": 124748,"Sequence number": 2576010, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5367 + } + }, + { + "ph": "s", "id": 56, "pid": 5717, "tid": 5717, "ts": 6302685266040.622, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5717, "tid": 5717, + "ts": 6302685266061.642, "dur": 136.160, + "args": { + "External id": 124749,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685266098.892, "dur": 13.580, + "args": { + "External id": 124750,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", 
"", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266101.972, "dur": 9.530, + "args": { + "External id": 124751,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685266116.182, "dur": 7.040, + "args": { + "External id": 124752,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685266124.802, "dur": 3.130, + "args": { + "External id": 124753,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685266131.332, "dur": 5.720, + "args": { + "External id": 124754,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5717, "tid": 5717, + "ts": 6302685266211.902, "dur": 4.840, + "args": { + "External id": 124755,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685266238.152, "dur": 8.220, + "args": { + "External id": 124756,"Sequence number": 2576011, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685266239.502, "dur": 6.500, + "args": { + "External id": 124757,"Sequence number": 2576011, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5376 + } + }, + { + "ph": "s", "id": 55, "pid": 5717, "tid": 5717, "ts": 6302685266239.502, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685266262.952, "dur": 83.410, + "args": { + "External id": 124758,"Sequence number": 2576012, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685266264.242, "dur": 8.870, + "args": { + "External id": 124759,"Sequence number": 2576012, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5378 + } + }, + { + "ph": "s", "id": 54, 
"pid": 5717, "tid": 5717, "ts": 6302685266264.242, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685266267.722, "dur": 4.290, + "args": { + "External id": 124760,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266269.942, "dur": 1.650, + "args": { + "External id": 124761,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685266274.332, "dur": 71.610, + "args": { + "External id": 124762,"Sequence number": 2576013, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685266275.992, "dur": 4.580, + "args": { + "External id": 124763,"Sequence number": 2576013, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685266278.082, "dur": 2.310, + "args": { + "External id": 124764,"Sequence number": 2576013, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", 
"[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5383 + } + }, + { + "ph": "s", "id": 53, "pid": 5717, "tid": 5717, "ts": 6302685266278.082, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685266281.562, "dur": 55.950, + "args": { + "External id": 124765,"Sequence number": 2576014, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5384 + } + }, + { + "ph": "s", "id": 52, "pid": 5717, "tid": 5717, "ts": 6302685266281.562, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685266341.092, "dur": 3.880, + "args": { + "External id": 124766,"Sequence number": 2576015, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5385 + } + }, + { + "ph": "s", "id": 51, "pid": 5717, "tid": 5717, "ts": 6302685266341.092, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685266357.812, "dur": 22.959, + "args": { + "External id": 124767,"Sequence number": 2576016, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5386 + } + }, + { + "ph": "s", "id": 50, "pid": 5717, "tid": 5717, "ts": 6302685266357.812, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::rms_norm", "pid": 5717, "tid": 5717, + "ts": 6302685266399.322, "dur": 193.319, + "args": { + "External id": 124768,"Sequence number": 2576017, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685266402.091, "dur": 31.171, + "args": { + "External id": 124769,"Sequence number": 2576017, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685266403.351, "dur": 29.611, + "args": { + "External id": 124770,"Sequence number": 2576017, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5389 + } + }, + { + "ph": "s", "id": 49, "pid": 5717, "tid": 5717, "ts": 6302685266403.351, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266408.682, "dur": 6.369, + "args": { + "External id": 124771,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 5390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685266416.342, "dur": 15.040, + "args": { + "External id": 124772,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685266434.591, "dur": 20.911, + "args": { + "External id": 124773,"Sequence number": 2576018, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5392 + } + }, + { + "ph": "s", "id": 48, "pid": 5717, "tid": 5717, "ts": 6302685266434.591, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685266437.502, "dur": 0.440, + "args": { + "External id": 124774,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685266438.682, "dur": 0.160, + "args": { + "External id": 124775,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5717, "tid": 5717, + "ts": 6302685266457.231, "dur": 22.590, + "args": { + "External id": 124776,"Sequence number": 2576019, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", 
""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5395 + } + }, + { + "ph": "s", "id": 47, "pid": 5717, "tid": 5717, "ts": 6302685266457.231, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685266481.511, "dur": 28.710, + "args": { + "External id": 124777,"Sequence number": 2576020, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5396 + } + }, + { + "ph": "s", "id": 46, "pid": 5717, "tid": 5717, "ts": 6302685266481.511, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685266486.661, "dur": 18.680, + "args": { + "External id": 124778,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5717, "tid": 5717, + "ts": 6302685266511.351, "dur": 17.580, + "args": { + "External id": 124779,"Sequence number": 2576021, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5398 + } + }, + { + "ph": "s", "id": 45, "pid": 5717, "tid": 5717, "ts": 6302685266511.351, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685266532.271, "dur": 15.760, + "args": { + "External id": 124780,"Sequence number": 2576022, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", 
"float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5399 + } + }, + { + "ph": "s", "id": 44, "pid": 5717, "tid": 5717, "ts": 6302685266532.271, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5717, "tid": 5717, + "ts": 6302685266549.581, "dur": 24.650, + "args": { + "External id": 124781,"Sequence number": 2576023, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685266552.241, "dur": 21.740, + "args": { + "External id": 124782,"Sequence number": 2576023, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685266553.251, "dur": 20.450, + "args": { + "External id": 124783,"Sequence number": 2576023, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5402 + } + }, + { + "ph": "s", "id": 43, "pid": 5717, "tid": 5717, "ts": 6302685266553.251, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266556.831, "dur": 4.520, + "args": { + "External id": 
124784,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685266562.321, "dur": 10.230, + "args": { + "External id": 124785,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685266576.311, "dur": 15.800, + "args": { + "External id": 124786,"Sequence number": 2576024, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5405 + } + }, + { + "ph": "s", "id": 42, "pid": 5717, "tid": 5717, "ts": 6302685266576.311, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685266614.251, "dur": 58.380, + "args": { + "External id": 124787,"Sequence number": 2576025, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685266615.081, "dur": 8.790, + "args": { + "External id": 124788,"Sequence number": 2576025, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5407 + } + }, + { + "ph": "s", "id": 41, "pid": 5717, "tid": 5717, "ts": 6302685266615.081, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685266618.231, "dur": 4.090, + "args": { + "External id": 124789,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266620.491, "dur": 1.440, + "args": { + "External id": 124790,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685266624.711, "dur": 47.520, + "args": { + "External id": 124791,"Sequence number": 2576026, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685266626.291, "dur": 4.750, + "args": { + "External id": 124792,"Sequence number": 2576026, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685266627.241, "dur": 3.560, + "args": { + "External id": 
124793,"Sequence number": 2576026, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5412 + } + }, + { + "ph": "s", "id": 40, "pid": 5717, "tid": 5717, "ts": 6302685266627.241, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685266631.741, "dur": 34.670, + "args": { + "External id": 124794,"Sequence number": 2576027, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5413 + } + }, + { + "ph": "s", "id": 39, "pid": 5717, "tid": 5717, "ts": 6302685266631.741, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685266669.041, "dur": 2.290, + "args": { + "External id": 124795,"Sequence number": 2576028, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5414 + } + }, + { + "ph": "s", "id": 38, "pid": 5717, "tid": 5717, "ts": 6302685266669.041, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685266696.961, "dur": 51.700, + "args": { + "External id": 124796,"Sequence number": 2576029, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + 
"ts": 6302685266697.781, "dur": 8.590, + "args": { + "External id": 124797,"Sequence number": 2576029, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5416 + } + }, + { + "ph": "s", "id": 37, "pid": 5717, "tid": 5717, "ts": 6302685266697.781, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685266700.101, "dur": 5.290, + "args": { + "External id": 124798,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266703.201, "dur": 1.800, + "args": { + "External id": 124799,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685266707.211, "dur": 41.170, + "args": { + "External id": 124800,"Sequence number": 2576030, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685266708.301, "dur": 2.780, + "args": { + "External id": 124801,"Sequence number": 2576030, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], 
"Input Dims": [[8, 2048, 768], []], "Ev Idx": 5420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685266709.061, "dur": 1.700, + "args": { + "External id": 124802,"Sequence number": 2576030, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5421 + } + }, + { + "ph": "s", "id": 36, "pid": 5717, "tid": 5717, "ts": 6302685266709.061, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685266712.831, "dur": 28.110, + "args": { + "External id": 124803,"Sequence number": 2576031, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5422 + } + }, + { + "ph": "s", "id": 35, "pid": 5717, "tid": 5717, "ts": 6302685266712.831, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685266743.351, "dur": 4.160, + "args": { + "External id": 124804,"Sequence number": 2576032, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5423 + } + }, + { + "ph": "s", "id": 34, "pid": 5717, "tid": 5717, "ts": 6302685266743.351, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5717, "tid": 5717, + "ts": 6302685266774.731, "dur": 145.969, + "args": { + "External id": 124805,"Sequence number": 2576033, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5424 + } + }, + { + "ph": "s", "id": 33, "pid": 5717, "tid": 5717, "ts": 6302685266774.731, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685266801.381, "dur": 8.829, + "args": { + "External id": 124806,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685266838.970, "dur": 67.800, + "args": { + "External id": 124807,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685266839.921, "dur": 6.849, + "args": { + "External id": 124808,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685266841.550, "dur": 3.951, + "args": { + "External id": 124809,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685266843.601, "dur": 1.420, + "args": { + "External id": 
124810,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685266847.630, "dur": 58.650, + "args": { + "External id": 124811,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5717, "tid": 5717, + "ts": 6302685266849.541, "dur": 3.289, + "args": { + "External id": 124812,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685266850.430, "dur": 2.211, + "args": { + "External id": 124813,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685266854.461, "dur": 46.980, + "args": { + "External id": 124814,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5717, "tid": 5717, + "ts": 6302685266903.990, "dur": 1.220, + "args": { + "External id": 124815,"Record function id": 0, "Concrete Inputs": ["", "[8, 
2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685266929.590, "dur": 24.040, + "args": { + "External id": 124816,"Sequence number": 2576034, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5435 + } + }, + { + "ph": "s", "id": 32, "pid": 5717, "tid": 5717, "ts": 6302685266929.590, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5717, "tid": 5717, + "ts": 6302685266970.990, "dur": 64.270, + "args": { + "External id": 124817,"Sequence number": 2576035, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "-2"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[[1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1]], []], "Input Dims": [[[8, 2048, 768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 768]], []], "Ev Idx": 5436 + } + }, + { + "ph": "s", "id": 31, "pid": 5717, "tid": 5717, "ts": 6302685266970.990, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::cat", "pid": 5717, "tid": 5717, + "ts": 6302685266978.370, "dur": 47.980, + "args": { + "External id": 124818,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[[1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1]], []], "Input Dims": [[[8, 2048, 768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 768]], []], "Ev Idx": 5437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685267030.250, "dur": 2.620, + "args": { + "External id": 
124819,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 3072], []], "Ev Idx": 5438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5717, "tid": 5717, + "ts": 6302685267068.550, "dur": 27.380, + "args": { + "External id": 124820,"Record function id": 0, "Ev Idx": 5439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/1", "pid": 5717, "tid": 5717, + "ts": 6302685267097.510, "dur": 226.759, + "args": { + "External id": 124821,"Record function id": 0, "Ev Idx": 5440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5717, "tid": 5717, + "ts": 6302685267174.510, "dur": 133.739, + "args": { + "External id": 124822,"Sequence number": 2576036, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "8", "2048", "4", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "c10::BFloat16"], "Input Strides": [[1], [], [], [], [6291456, 3072, 768, 1]], "Input Dims": [[768], [], [], [], [8, 2048, 4, 768]], "Ev Idx": 5441 + } + }, + { + "ph": "s", "id": 30, "pid": 5717, "tid": 5717, "ts": 6302685267174.510, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5717, "tid": 5717, + "ts": 6302685267235.650, "dur": 28.810, + "args": { + "External id": 124823,"kernel_hash": "cr3d6ghn4frfo7p75kyvvnm5sxvfgy2cg6lzbkpbj2cwqsdz3dyv", "grid": "grid(65536,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "65536", "768"], "kernel_file": "/tmp/torchinductor_root/r3/cr3d6ghn4frfo7p75kyvvnm5sxvfgy2cg6lzbkpbj2cwqsdz3dyv.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[8192, 4, 1, 1], [6291456, 3072, 768, 1], [1], [6291456, 3072, 768, 
1], [], []], "Input Dims": [[8, 2048, 4, 1], [8, 2048, 4, 768], [768], [8, 2048, 4, 768], [], []], "Ev Idx": 5442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685267393.339, "dur": 43.070, + "args": { + "External id": 124824,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685267396.839, "dur": 9.060, + "args": { + "External id": 124825,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685267408.839, "dur": 27.170, + "args": { + "External id": 124826,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685267411.909, "dur": 23.040, + "args": { + "External id": 124827,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685267443.249, "dur": 19.270, + "args": { + "External id": 124828,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, 
"tid": 5717, + "ts": 6302685267445.519, "dur": 4.900, + "args": { + "External id": 124829,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685267451.189, "dur": 11.060, + "args": { + "External id": 124830,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685267452.269, "dur": 9.080, + "args": { + "External id": 124831,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685267466.569, "dur": 17.730, + "args": { + "External id": 124832,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685267467.649, "dur": 4.400, + "args": { + "External id": 124833,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685267472.759, "dur": 11.270, + "args": { + "External id": 124834,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input 
Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685267474.889, "dur": 8.200, + "args": { + "External id": 124835,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685267494.869, "dur": 0.350, + "args": { + "External id": 124836,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], [], []], "Ev Idx": 5455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5717, "tid": 5717, + "ts": 6302685267502.629, "dur": 7.970, + "args": { + "External id": 124837,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "5", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 5456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267507.229, "dur": 1.700, + "args": { + "External id": 124838,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 5457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267516.319, "dur": 5.800, + "args": { + "External id": 124839,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete 
Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267519.859, "dur": 0.640, + "args": { + "External id": 124840,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267523.399, "dur": 2.820, + "args": { + "External id": 124841,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267525.129, "dur": 0.330, + "args": { + "External id": 124842,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267527.859, "dur": 2.370, + "args": { + "External id": 124843,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "1", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], 
"Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267529.479, "dur": 0.260, + "args": { + "External id": 124844,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267534.549, "dur": 3.530, + "args": { + "External id": 124845,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 4], [], [], [], []], "Ev Idx": 5464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267536.299, "dur": 1.230, + "args": { + "External id": 124846,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 4], [], [], []], "Ev Idx": 5465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267539.119, "dur": 2.560, + "args": { + "External id": 124847,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 4], [], [], [], []], "Ev Idx": 5466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267540.729, 
"dur": 0.320, + "args": { + "External id": 124848,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 4], [], [], []], "Ev Idx": 5467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267542.739, "dur": 2.460, + "args": { + "External id": 124849,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 4], [], [], [], []], "Ev Idx": 5468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267544.419, "dur": 0.270, + "args": { + "External id": 124850,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 2048, 4], [], [], []], "Ev Idx": 5469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685267549.009, "dur": 8.160, + "args": { + "External id": 124851,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "2"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 2048, 4], [], []], "Ev Idx": 5470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267555.929, "dur": 0.420, + "args": { + "External id": 124852,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1, 1], [], [], 
[]], "Input Dims": [[8, 2048, 4], [], [], []], "Ev Idx": 5471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267563.799, "dur": 2.880, + "args": { + "External id": 124853,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 5472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267565.649, "dur": 0.390, + "args": { + "External id": 124854,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685267569.549, "dur": 7.640, + "args": { + "External id": 124855,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 5474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267575.499, "dur": 0.460, + "args": { + "External id": 124856,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267578.469, "dur": 2.110, + "args": { + "External id": 124857,"Sequence 
number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 5476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267579.809, "dur": 0.260, + "args": { + "External id": 124858,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 5477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267584.519, "dur": 6.480, + "args": { + "External id": 124859,"Sequence number": 2576037, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5478 + } + }, + { + "ph": "s", "id": 29, "pid": 5717, "tid": 5717, "ts": 6302685267584.519, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267588.489, "dur": 0.530, + "args": { + "External id": 124860,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685267592.069, "dur": 3.930, + "args": { + "External id": 124861,"Sequence number": 
2576038, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5480 + } + }, + { + "ph": "s", "id": 28, "pid": 5717, "tid": 5717, "ts": 6302685267592.069, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267594.099, "dur": 1.130, + "args": { + "External id": 124862,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685267597.479, "dur": 4.660, + "args": { + "External id": 124863,"Sequence number": 2576039, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 5482 + } + }, + { + "ph": "s", "id": 27, "pid": 5717, "tid": 5717, "ts": 6302685267597.479, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267600.939, "dur": 0.380, + "args": { + "External id": 124864,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, 
"tid": 5717, + "ts": 6302685267603.319, "dur": 4.700, + "args": { + "External id": 124865,"Sequence number": 2576040, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5484 + } + }, + { + "ph": "s", "id": 26, "pid": 5717, "tid": 5717, "ts": 6302685267603.319, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267606.789, "dur": 0.490, + "args": { + "External id": 124866,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685267612.529, "dur": 39.620, + "args": { + "External id": 124867,"Sequence number": 2576041, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685267614.449, "dur": 37.350, + "args": { + "External id": 124868,"Sequence number": 2576041, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685267617.119, "dur": 9.950, + "args": { + "External id": 124869,"Record function id": 0, "Concrete Inputs": ["", 
"4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 5488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685267618.889, "dur": 7.520, + "args": { + "External id": 124870,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685267628.149, "dur": 22.930, + "args": { + "External id": 124871,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 5490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685267683.079, "dur": 5.800, + "args": { + "External id": 124872,"Sequence number": 2576041, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5491 + } + }, + { + "ph": "s", "id": 25, "pid": 5717, "tid": 5717, "ts": 6302685267683.079, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685267691.599, "dur": 1.060, + "args": { + "External id": 124873,"Sequence number": 2576042, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"FusedLinearCrossEntropyFunction", "pid": 5717, "tid": 5717, + "ts": 6302685267717.119, "dur": 13463.579, + "args": { + "External id": 124874,"Sequence number": 2576042, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 5493 + } + }, + { + "ph": "s", "id": 24, "pid": 5717, "tid": 5717, "ts": 6302685267717.119, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685267730.839, "dur": 35.500, + "args": { + "External id": 124875,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685267731.639, "dur": 34.389, + "args": { + "External id": 124876,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685267733.219, "dur": 10.269, + "args": { + "External id": 124877,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685267736.048, "dur": 6.851, + "args": { + "External id": 124878,"Record function id": 0, 
"Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685267744.439, "dur": 21.009, + "args": { + "External id": 124879,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 5498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685267787.428, "dur": 28.800, + "args": { + "External id": 124880,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685267788.748, "dur": 9.800, + "args": { + "External id": 124881,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267791.658, "dur": 6.490, + "args": { + "External id": 124882,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685267799.438, "dur": 16.530, + "args": { + "External id": 124883,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685267800.888, "dur": 14.150, + "args": { + "External id": 124884,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685267820.708, "dur": 20.690, + "args": { + "External id": 124885,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685267821.598, "dur": 7.960, + "args": { + "External id": 124886,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267823.518, "dur": 5.700, + "args": { + "External id": 124887,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685267830.238, "dur": 10.920, + "args": { + "External id": 124888,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685267831.138, "dur": 9.130, + "args": { + "External id": 124889,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 5508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685267847.558, "dur": 18.780, + "args": { + "External id": 124890,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685267849.138, "dur": 4.360, + "args": { + "External id": 124891,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685267854.218, "dur": 11.820, + "args": { + "External id": 124892,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 5511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685267856.628, "dur": 8.540, + "args": { + "External id": 124893,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input 
Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 5717, + "ts": 6302685267872.558, "dur": 33.220, + "args": { + "External id": 124894,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685267910.698, "dur": 55.560, + "args": { + "External id": 124895,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685267914.208, "dur": 51.500, + "args": { + "External id": 124896,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267922.208, "dur": 1.030, + "args": { + "External id": 124897,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685267924.378, "dur": 24.180, + "args": { + "External id": 124898,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 
6302685267925.498, "dur": 22.780, + "args": { + "External id": 124899,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 5518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685267927.878, "dur": 4.250, + "args": { + "External id": 124900,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685267933.338, "dur": 14.430, + "args": { + "External id": 124901,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 5520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 5717, + "ts": 6302685267972.418, "dur": 7427.723, + "args": { + "External id": 124902,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 5717, + "ts": 6302685267974.618, "dur": 7424.563, + "args": { + "External id": 124903,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685275411.821, "dur": 8.440, + "args": { + "External id": 124904,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], 
"Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685275417.111, "dur": 1.160, + "args": { + "External id": 124905,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685275424.701, "dur": 53.540, + "args": { + "External id": 124906,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685275425.751, "dur": 5.470, + "args": { + "External id": 124907,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685275427.591, "dur": 2.910, + "args": { + "External id": 124908,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685275429.331, "dur": 0.820, + "args": { + "External id": 124909,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685275432.211, "dur": 45.220, + "args": { + "External id": 124910,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685275434.491, "dur": 41.940, + "args": { + "External id": 124911,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685275482.991, "dur": 5.770, + "args": { + "External id": 124912,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685275485.771, "dur": 1.710, + "args": { + "External id": 124913,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685275496.661, "dur": 2.500, + "args": { + "External id": 124914,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 
32000], []], "Ev Idx": 5533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685275508.251, "dur": 10.460, + "args": { + "External id": 124915,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685275511.241, "dur": 7.050, + "args": { + "External id": 124916,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685275674.101, "dur": 250.089, + "args": { + "External id": 124917,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685275678.210, "dur": 6.640, + "args": { + "External id": 124918,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685275687.941, "dur": 235.519, + "args": { + "External id": 124919,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input 
Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685275690.701, "dur": 0.640, + "args": { + "External id": 124920,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685275693.410, "dur": 31.120, + "args": { + "External id": 124921,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685275727.241, "dur": 4.629, + "args": { + "External id": 124922,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685275730.090, "dur": 1.100, + "args": { + "External id": 124923,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685275734.040, "dur": 37.880, + "args": { + "External id": 124924,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685275737.990, "dur": 4.140, + "args": { + "External id": 124925,"Record 
function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685275745.750, "dur": 25.810, + "args": { + "External id": 124926,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685275752.130, "dur": 5.570, + "args": { + "External id": 124927,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685275774.390, "dur": 23.030, + "args": { + "External id": 124928,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685275810.480, "dur": 18.230, + "args": { + "External id": 124929,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685275832.470, "dur": 16.730, + "args": { + "External id": 124930,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5549 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685275851.740, "dur": 12.890, + "args": { + "External id": 124931,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685275867.500, "dur": 29.100, + "args": { + "External id": 124932,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685275870.290, "dur": 2.630, + "args": { + "External id": 124933,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685275880.000, "dur": 1.780, + "args": { + "External id": 124934,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685275899.590, "dur": 13.040, + "args": { + "External id": 124935,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685275913.830, "dur": 8.080, + "args": { + "External id": 124936,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": 
["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685275934.200, "dur": 3.200, + "args": { + "External id": 124937,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685275946.010, "dur": 4.780, + "args": { + "External id": 124938,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685275949.120, "dur": 0.610, + "args": { + "External id": 124939,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685276035.500, "dur": 62.060, + "args": { + "External id": 124940,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685276106.409, "dur": 8.271, + "args": { + "External id": 124941,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": 
[[16384, 768], [], [], [], []], "Ev Idx": 5560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276111.620, "dur": 1.209, + "args": { + "External id": 124942,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685276116.160, "dur": 37.889, + "args": { + "External id": 124943,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685276162.509, "dur": 117.760, + "args": { + "External id": 124944,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685276164.040, "dur": 115.419, + "args": { + "External id": 124945,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276166.280, "dur": 112.819, + "args": { + "External id": 124946,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 
5565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685276284.009, "dur": 68.940, + "args": { + "External id": 124947,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685276286.039, "dur": 65.670, + "args": { + "External id": 124948,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685276366.709, "dur": 19.200, + "args": { + "External id": 124949,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685276395.169, "dur": 6.290, + "args": { + "External id": 124950,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276398.719, "dur": 1.180, + "args": { + "External id": 124951,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5570 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685276405.969, "dur": 45.490, + "args": { + "External id": 124952,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685276406.879, "dur": 7.720, + "args": { + "External id": 124953,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685276408.329, "dur": 5.540, + "args": { + "External id": 124954,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276411.909, "dur": 1.640, + "args": { + "External id": 124955,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685276415.509, "dur": 35.290, + "args": { + "External id": 124956,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685276416.549, "dur": 
33.450, + "args": { + "External id": 124957,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685276457.739, "dur": 4.650, + "args": { + "External id": 124958,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276460.289, "dur": 0.800, + "args": { + "External id": 124959,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685276469.789, "dur": 1.970, + "args": { + "External id": 124960,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685276480.719, "dur": 8.830, + "args": { + "External id": 124961,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685276482.589, "dur": 
6.580, + "args": { + "External id": 124962,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685276578.479, "dur": 176.559, + "args": { + "External id": 124963,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685276580.908, "dur": 4.900, + "args": { + "External id": 124964,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685276587.639, "dur": 166.719, + "args": { + "External id": 124965,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685276590.168, "dur": 0.300, + "args": { + "External id": 124966,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685276593.008, "dur": 23.551, + "args": { + "External id": 124967,"Record function id": 0, "Concrete Inputs": 
["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685276618.358, "dur": 3.760, + "args": { + "External id": 124968,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276620.528, "dur": 1.050, + "args": { + "External id": 124969,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685276623.118, "dur": 23.180, + "args": { + "External id": 124970,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685276625.528, "dur": 3.260, + "args": { + "External id": 124971,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685276629.818, "dur": 16.150, + "args": { + "External id": 124972,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685276633.118, "dur": 3.290, + "args": { + "External id": 124973,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685276648.008, "dur": 21.140, + "args": { + "External id": 124974,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685276671.088, "dur": 12.040, + "args": { + "External id": 124975,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685276686.298, "dur": 12.250, + "args": { + "External id": 124976,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685276700.138, "dur": 8.640, + "args": { + "External id": 124977,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685276711.918, "dur": 20.630, + "args": { + "External id": 124978,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": 
[[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685276714.228, "dur": 2.900, + "args": { + "External id": 124979,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276719.398, "dur": 0.770, + "args": { + "External id": 124980,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685276734.738, "dur": 8.980, + "args": { + "External id": 124981,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685276744.878, "dur": 7.910, + "args": { + "External id": 124982,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685276763.988, "dur": 3.060, + "args": { + "External id": 124983,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685276778.598, "dur": 4.850, + "args": { + "External id": 
124984,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276781.768, "dur": 0.580, + "args": { + "External id": 124985,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685276855.408, "dur": 46.680, + "args": { + "External id": 124986,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685276909.998, "dur": 8.340, + "args": { + "External id": 124987,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276914.338, "dur": 2.190, + "args": { + "External id": 124988,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685276919.668, "dur": 
21.330, + "args": { + "External id": 124989,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685276948.268, "dur": 6.910, + "args": { + "External id": 124990,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685276949.808, "dur": 4.520, + "args": { + "External id": 124991,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685276952.908, "dur": 1.090, + "args": { + "External id": 124992,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685276957.908, "dur": 34.170, + "args": { + "External id": 124993,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685276958.818, "dur": 32.309, + "args": { + "External id": 124994,"Record function id": 0, "Concrete Inputs": ["", ""], 
"Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685276996.958, "dur": 14.980, + "args": { + "External id": 124995,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685277020.278, "dur": 5.860, + "args": { + "External id": 124996,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277023.778, "dur": 0.900, + "args": { + "External id": 124997,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685277031.638, "dur": 40.039, + "args": { + "External id": 124998,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685277032.618, "dur": 4.209, + "args": { + "External id": 124999,"Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685277033.807, "dur": 2.451, + "args": { + "External id": 125000,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277035.358, "dur": 0.629, + "args": { + "External id": 125001,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685277037.707, "dur": 33.380, + "args": { + "External id": 125002,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685277038.718, "dur": 31.689, + "args": { + "External id": 125003,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685277077.937, "dur": 4.510, + "args": { + "External id": 125004,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], 
"Input Dims": [[16384], [], [], [], []], "Ev Idx": 5623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277080.587, "dur": 0.620, + "args": { + "External id": 125005,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685277091.017, "dur": 1.900, + "args": { + "External id": 125006,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685277099.867, "dur": 9.770, + "args": { + "External id": 125007,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685277101.677, "dur": 7.540, + "args": { + "External id": 125008,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685277192.777, "dur": 283.500, + "args": { + "External id": 125009,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input 
Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685277195.147, "dur": 5.860, + "args": { + "External id": 125010,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685277202.837, "dur": 272.889, + "args": { + "External id": 125011,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685277205.337, "dur": 0.200, + "args": { + "External id": 125012,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685277206.567, "dur": 138.990, + "args": { + "External id": 125013,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685277348.507, "dur": 3.820, + "args": { + "External id": 125014,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
5717, + "ts": 6302685277350.607, "dur": 1.080, + "args": { + "External id": 125015,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685277353.087, "dur": 23.480, + "args": { + "External id": 125016,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685277354.217, "dur": 3.150, + "args": { + "External id": 125017,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685277358.497, "dur": 17.750, + "args": { + "External id": 125018,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685277362.777, "dur": 3.950, + "args": { + "External id": 125019,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685277378.157, "dur": 17.340, + "args": { + "External id": 125020,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": 
[[2048], []], "Ev Idx": 5639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685277397.337, "dur": 10.269, + "args": { + "External id": 125021,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685277410.546, "dur": 12.011, + "args": { + "External id": 125022,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685277425.146, "dur": 8.500, + "args": { + "External id": 125023,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685277435.526, "dur": 19.311, + "args": { + "External id": 125024,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685277437.897, "dur": 2.809, + "args": { + "External id": 125025,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277442.917, "dur": 0.889, + "args": { + "External id": 
125026,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685277456.717, "dur": 8.320, + "args": { + "External id": 125027,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685277466.177, "dur": 7.909, + "args": { + "External id": 125028,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685277486.666, "dur": 3.060, + "args": { + "External id": 125029,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685277500.186, "dur": 4.440, + "args": { + "External id": 125030,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277502.986, "dur": 0.620, + "args": { + "External id": 125031,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[16384], [], [], []], "Ev Idx": 5650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685277576.786, "dur": 45.370, + "args": { + "External id": 125032,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685277629.796, "dur": 6.830, + "args": { + "External id": 125033,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277633.746, "dur": 1.240, + "args": { + "External id": 125034,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685277637.906, "dur": 20.750, + "args": { + "External id": 125035,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685277666.076, "dur": 7.810, + "args": { + "External id": 125036,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5655 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685277668.486, "dur": 4.560, + "args": { + "External id": 125037,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277671.806, "dur": 0.870, + "args": { + "External id": 125038,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685277676.556, "dur": 33.630, + "args": { + "External id": 125039,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685277677.456, "dur": 31.770, + "args": { + "External id": 125040,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685277714.856, "dur": 14.870, + "args": { + "External id": 125041,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 
5717, "tid": 5717, + "ts": 6302685277738.016, "dur": 5.460, + "args": { + "External id": 125042,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277741.206, "dur": 0.830, + "args": { + "External id": 125043,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685277748.736, "dur": 42.270, + "args": { + "External id": 125044,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685277749.666, "dur": 3.990, + "args": { + "External id": 125045,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685277750.806, "dur": 2.300, + "args": { + "External id": 125046,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277752.226, "dur": 
0.590, + "args": { + "External id": 125047,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685277754.616, "dur": 35.150, + "args": { + "External id": 125048,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685277755.546, "dur": 31.420, + "args": { + "External id": 125049,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685277799.936, "dur": 8.010, + "args": { + "External id": 125050,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277804.386, "dur": 2.010, + "args": { + "External id": 125051,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685277821.056, "dur": 3.380, + "args": { + "External 
id": 125052,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685277835.326, "dur": 12.470, + "args": { + "External id": 125053,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685277837.116, "dur": 10.300, + "args": { + "External id": 125054,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685277936.916, "dur": 170.979, + "args": { + "External id": 125055,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685277940.265, "dur": 5.930, + "args": { + "External id": 125056,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685277948.925, "dur": 
158.360, + "args": { + "External id": 125057,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685277950.185, "dur": 0.220, + "args": { + "External id": 125058,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685277951.555, "dur": 22.060, + "args": { + "External id": 125059,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685277976.905, "dur": 5.030, + "args": { + "External id": 125060,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685277980.275, "dur": 1.120, + "args": { + "External id": 125061,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685277982.815, "dur": 22.010, + "args": { + "External id": 125062,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5681 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685277983.985, "dur": 2.970, + "args": { + "External id": 125063,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685277988.015, "dur": 16.490, + "args": { + "External id": 125064,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685277991.005, "dur": 4.520, + "args": { + "External id": 125065,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685278007.255, "dur": 17.950, + "args": { + "External id": 125066,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685278026.865, "dur": 10.860, + "args": { + "External id": 125067,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685278040.935, "dur": 11.600, + "args": { + "External id": 125068,"Record function id": 0, "Concrete Inputs": ["", "", 
"1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685278054.095, "dur": 8.490, + "args": { + "External id": 125069,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685278064.465, "dur": 20.460, + "args": { + "External id": 125070,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685278067.965, "dur": 2.710, + "args": { + "External id": 125071,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278072.855, "dur": 0.790, + "args": { + "External id": 125072,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685278087.195, "dur": 8.190, + "args": { + "External id": 125073,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, 
"tid": 5717, + "ts": 6302685278097.745, "dur": 8.100, + "args": { + "External id": 125074,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685278116.865, "dur": 2.920, + "args": { + "External id": 125075,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685278129.875, "dur": 4.730, + "args": { + "External id": 125076,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278133.045, "dur": 0.550, + "args": { + "External id": 125077,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685278205.105, "dur": 45.850, + "args": { + "External id": 125078,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685278258.455, "dur": 7.150, + "args": { + "External id": 125079,"Record function id": 0, "Concrete 
Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278262.785, "dur": 1.180, + "args": { + "External id": 125080,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685278267.105, "dur": 21.570, + "args": { + "External id": 125081,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685278305.435, "dur": 7.200, + "args": { + "External id": 125082,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685278307.204, "dur": 4.480, + "args": { + "External id": 125083,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278309.424, "dur": 1.860, + "args": { + "External id": 125084,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 
32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685278315.704, "dur": 35.011, + "args": { + "External id": 125085,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685278316.615, "dur": 33.089, + "args": { + "External id": 125086,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685278355.744, "dur": 15.020, + "args": { + "External id": 125087,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685278379.195, "dur": 7.089, + "args": { + "External id": 125088,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278383.974, "dur": 0.850, + "args": { + "External id": 125089,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685278390.354, "dur": 40.790, + "args": { + "External id": 125090,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685278391.204, "dur": 3.980, + "args": { + "External id": 125091,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685278392.304, "dur": 2.260, + "args": { + "External id": 125092,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278393.724, "dur": 0.540, + "args": { + "External id": 125093,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685278396.044, "dur": 34.510, + "args": { + "External id": 125094,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input 
Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685278398.354, "dur": 31.380, + "args": { + "External id": 125095,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685278437.894, "dur": 4.750, + "args": { + "External id": 125096,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278440.714, "dur": 0.700, + "args": { + "External id": 125097,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685278450.274, "dur": 2.020, + "args": { + "External id": 125098,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685278458.904, "dur": 8.500, + "args": { + "External id": 125099,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": 
[[2048, 32000], [], [], [], [], []], "Ev Idx": 5718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685278460.714, "dur": 6.310, + "args": { + "External id": 125100,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685278551.214, "dur": 170.270, + "args": { + "External id": 125101,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685278553.324, "dur": 5.920, + "args": { + "External id": 125102,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685278561.714, "dur": 159.040, + "args": { + "External id": 125103,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685278563.254, "dur": 0.210, + "args": { + "External id": 125104,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5723 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685278564.494, "dur": 21.850, + "args": { + "External id": 125105,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685278588.024, "dur": 4.560, + "args": { + "External id": 125106,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278590.054, "dur": 1.980, + "args": { + "External id": 125107,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685278593.424, "dur": 22.280, + "args": { + "External id": 125108,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685278594.794, "dur": 2.960, + "args": { + "External id": 125109,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685278598.824, "dur": 16.570, + "args": { + "External id": 125110,"Record function id": 0, "Concrete Inputs": 
["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685278602.964, "dur": 3.710, + "args": { + "External id": 125111,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685278617.174, "dur": 17.090, + "args": { + "External id": 125112,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685278637.064, "dur": 11.550, + "args": { + "External id": 125113,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685278651.824, "dur": 11.910, + "args": { + "External id": 125114,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685278665.014, "dur": 8.440, + "args": { + "External id": 125115,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685278675.204, "dur": 19.820, + "args": { + "External 
id": 125116,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685278677.494, "dur": 2.520, + "args": { + "External id": 125117,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278683.174, "dur": 0.800, + "args": { + "External id": 125118,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685278696.854, "dur": 8.520, + "args": { + "External id": 125119,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685278706.504, "dur": 12.050, + "args": { + "External id": 125120,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685278730.794, "dur": 3.249, + "args": { + "External id": 125121,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 
5740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685278744.694, "dur": 4.669, + "args": { + "External id": 125122,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278747.643, "dur": 0.671, + "args": { + "External id": 125123,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685278828.293, "dur": 46.230, + "args": { + "External id": 125124,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685278885.303, "dur": 8.840, + "args": { + "External id": 125125,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278890.863, "dur": 1.420, + "args": { + "External id": 125126,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": 
[[16384, 768], [], [], []], "Ev Idx": 5745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685278895.463, "dur": 28.990, + "args": { + "External id": 125127,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685278936.283, "dur": 11.100, + "args": { + "External id": 125128,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685278940.363, "dur": 6.090, + "args": { + "External id": 125129,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685278943.903, "dur": 2.190, + "args": { + "External id": 125130,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685278952.773, "dur": 52.730, + "args": { + "External id": 125131,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685278953.803, "dur": 50.690, + "args": { + "External id": 125132,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685279011.553, "dur": 18.140, + "args": { + "External id": 125133,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685279039.693, "dur": 5.860, + "args": { + "External id": 125134,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279043.033, "dur": 1.040, + "args": { + "External id": 125135,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685279051.313, "dur": 69.410, + "args": { + "External id": 125136,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5755 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685279053.413, "dur": 8.100, + "args": { + "External id": 125137,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685279054.753, "dur": 6.190, + "args": { + "External id": 125138,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279058.593, "dur": 1.980, + "args": { + "External id": 125139,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685279063.603, "dur": 55.290, + "args": { + "External id": 125140,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685279064.683, "dur": 53.270, + "args": { + "External id": 125141,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685279131.053, "dur": 7.590, + "args": { + "External id": 125142,"Record function 
id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279135.203, "dur": 0.780, + "args": { + "External id": 125143,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685279146.583, "dur": 2.000, + "args": { + "External id": 125144,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685279156.593, "dur": 8.770, + "args": { + "External id": 125145,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685279158.483, "dur": 6.530, + "args": { + "External id": 125146,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685279248.762, "dur": 
176.930, + "args": { + "External id": 125147,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685279252.053, "dur": 6.149, + "args": { + "External id": 125148,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685279259.813, "dur": 165.279, + "args": { + "External id": 125149,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685279261.162, "dur": 0.180, + "args": { + "External id": 125150,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685279262.522, "dur": 22.400, + "args": { + "External id": 125151,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685279286.522, "dur": 4.800, + "args": { + "External id": 125152,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], 
"Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279289.842, "dur": 0.940, + "args": { + "External id": 125153,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685279293.432, "dur": 31.580, + "args": { + "External id": 125154,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685279294.542, "dur": 11.150, + "args": { + "External id": 125155,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685279307.072, "dur": 17.610, + "args": { + "External id": 125156,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685279310.512, "dur": 3.710, + "args": { + "External id": 125157,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685279326.502, "dur": 18.030, + 
"args": { + "External id": 125158,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685279346.192, "dur": 10.690, + "args": { + "External id": 125159,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685279359.792, "dur": 11.550, + "args": { + "External id": 125160,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685279372.732, "dur": 8.550, + "args": { + "External id": 125161,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685279383.122, "dur": 21.150, + "args": { + "External id": 125162,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685279387.712, "dur": 2.720, + "args": { + "External id": 125163,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 
5782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279392.642, "dur": 0.870, + "args": { + "External id": 125164,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685279406.312, "dur": 8.410, + "args": { + "External id": 125165,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685279415.862, "dur": 7.820, + "args": { + "External id": 125166,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685279434.642, "dur": 3.180, + "args": { + "External id": 125167,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685279447.962, "dur": 4.490, + "args": { + "External id": 125168,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279450.812, "dur": 0.570, + "args": { + "External id": 125169,"Record function 
id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685279524.072, "dur": 45.810, + "args": { + "External id": 125170,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685279577.552, "dur": 7.050, + "args": { + "External id": 125171,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279581.842, "dur": 1.160, + "args": { + "External id": 125172,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685279585.922, "dur": 21.030, + "args": { + "External id": 125173,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685279614.121, "dur": 8.140, + "args": { + "External id": 
125174,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685279615.652, "dur": 5.769, + "args": { + "External id": 125175,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279620.032, "dur": 1.040, + "args": { + "External id": 125176,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685279624.752, "dur": 33.189, + "args": { + "External id": 125177,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685279625.632, "dur": 31.440, + "args": { + "External id": 125178,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685279662.601, "dur": 14.640, + "args": { + "External id": 125179,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], 
"Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685279685.501, "dur": 5.400, + "args": { + "External id": 125180,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279688.561, "dur": 0.891, + "args": { + "External id": 125181,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685279695.352, "dur": 41.089, + "args": { + "External id": 125182,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685279697.392, "dur": 4.309, + "args": { + "External id": 125183,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685279698.581, "dur": 2.551, + "args": { + "External id": 125184,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], 
"Input Dims": [[32000, 768], [], []], "Ev Idx": 5803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279700.181, "dur": 0.631, + "args": { + "External id": 125185,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685279702.581, "dur": 33.290, + "args": { + "External id": 125186,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685279703.421, "dur": 31.670, + "args": { + "External id": 125187,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685279742.481, "dur": 4.640, + "args": { + "External id": 125188,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279745.161, "dur": 0.690, + "args": { + "External id": 125189,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], 
[], [], []], "Ev Idx": 5808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685279754.771, "dur": 1.920, + "args": { + "External id": 125190,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685279764.561, "dur": 8.190, + "args": { + "External id": 125191,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685279766.141, "dur": 6.250, + "args": { + "External id": 125192,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685279852.821, "dur": 193.840, + "args": { + "External id": 125193,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685279854.811, "dur": 6.390, + "args": { + "External id": 125194,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 5813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685279865.421, "dur": 180.630, + "args": { + "External id": 125195,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685279867.071, "dur": 0.230, + "args": { + "External id": 125196,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685279869.641, "dur": 25.720, + "args": { + "External id": 125197,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685279898.331, "dur": 7.710, + "args": { + "External id": 125198,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685279902.921, "dur": 2.560, + "args": { + "External id": 125199,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685279906.901, "dur": 35.390, 
+ "args": { + "External id": 125200,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685279909.471, "dur": 4.740, + "args": { + "External id": 125201,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685279916.591, "dur": 25.390, + "args": { + "External id": 125202,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685279921.081, "dur": 6.440, + "args": { + "External id": 125203,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685279945.101, "dur": 19.460, + "args": { + "External id": 125204,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685279966.741, "dur": 10.670, + "args": { + "External id": 125205,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5824 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685279980.281, "dur": 11.910, + "args": { + "External id": 125206,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685279994.721, "dur": 8.500, + "args": { + "External id": 125207,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685280005.131, "dur": 20.380, + "args": { + "External id": 125208,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685280007.571, "dur": 2.710, + "args": { + "External id": 125209,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280012.421, "dur": 1.900, + "args": { + "External id": 125210,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685280027.201, "dur": 8.190, + "args": { + "External id": 125211,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685280036.571, "dur": 7.960, + "args": { + "External id": 125212,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685280057.320, "dur": 3.051, + "args": { + "External id": 125213,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685280070.700, "dur": 4.591, + "args": { + "External id": 125214,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280073.660, "dur": 0.520, + "args": { + "External id": 125215,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685280147.591, "dur": 50.439, + "args": { + "External id": 125216,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5835 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685280206.730, "dur": 7.580, + "args": { + "External id": 125217,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280211.200, "dur": 1.300, + "args": { + "External id": 125218,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685280215.630, "dur": 20.480, + "args": { + "External id": 125219,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685280243.550, "dur": 7.380, + "args": { + "External id": 125220,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685280245.030, "dur": 5.020, + "args": { + "External id": 125221,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280248.650, "dur": 1.040, + "args": { + "External id": 125222,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685280253.570, "dur": 33.400, + "args": { + "External id": 125223,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685280254.530, "dur": 31.470, + "args": { + "External id": 125224,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685280291.900, "dur": 36.510, + "args": { + "External id": 125225,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685280337.940, "dur": 6.290, + "args": { + "External id": 125226,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280341.690, "dur": 1.070, + "args": { + "External id": 125227,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685280349.950, "dur": 43.130, + "args": { + "External id": 125228,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685280350.850, "dur": 5.540, + "args": { + "External id": 125229,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685280351.980, "dur": 3.660, + "args": { + "External id": 125230,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280354.540, "dur": 0.740, + "args": { + "External id": 125231,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 
6302685280357.450, "dur": 35.080, + "args": { + "External id": 125232,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685280358.310, "dur": 33.370, + "args": { + "External id": 125233,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685280400.110, "dur": 4.520, + "args": { + "External id": 125234,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280402.650, "dur": 0.630, + "args": { + "External id": 125235,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685280412.270, "dur": 1.880, + "args": { + "External id": 125236,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685280421.080, "dur": 8.270, + "args": { + "External id": 125237,"Record 
function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685280422.810, "dur": 6.190, + "args": { + "External id": 125238,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685280511.790, "dur": 165.789, + "args": { + "External id": 125239,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685280514.110, "dur": 5.849, + "args": { + "External id": 125240,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685280522.730, "dur": 154.229, + "args": { + "External id": 125241,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685280525.039, "dur": 
0.300, + "args": { + "External id": 125242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685280526.510, "dur": 22.240, + "args": { + "External id": 125243,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685280550.410, "dur": 3.749, + "args": { + "External id": 125244,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280552.470, "dur": 1.049, + "args": { + "External id": 125245,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685280554.939, "dur": 23.160, + "args": { + "External id": 125246,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685280557.330, "dur": 2.949, + "args": { + "External id": 125247,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 5866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685280561.470, "dur": 16.300, + "args": { + "External id": 125248,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685280564.550, "dur": 4.140, + "args": { + "External id": 125249,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685280580.599, "dur": 17.420, + "args": { + "External id": 125250,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685280599.659, "dur": 10.240, + "args": { + "External id": 125251,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685280612.839, "dur": 11.960, + "args": { + "External id": 125252,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685280626.269, "dur": 8.340, + "args": { + "External id": 125253,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], 
"Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685280636.499, "dur": 18.940, + "args": { + "External id": 125254,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685280638.909, "dur": 2.550, + "args": { + "External id": 125255,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280643.619, "dur": 0.840, + "args": { + "External id": 125256,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685280658.319, "dur": 8.450, + "args": { + "External id": 125257,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685280667.919, "dur": 7.550, + "args": { + "External id": 125258,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 
6302685280686.389, "dur": 3.020, + "args": { + "External id": 125259,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685280699.159, "dur": 4.490, + "args": { + "External id": 125260,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280702.039, "dur": 0.550, + "args": { + "External id": 125261,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685280813.359, "dur": 74.730, + "args": { + "External id": 125262,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685280899.819, "dur": 10.870, + "args": { + "External id": 125263,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280906.429, "dur": 2.430, + "args": { + 
"External id": 125264,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685280914.379, "dur": 34.650, + "args": { + "External id": 125265,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685280960.329, "dur": 10.269, + "args": { + "External id": 125266,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685280963.189, "dur": 6.580, + "args": { + "External id": 125267,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685280966.918, "dur": 2.371, + "args": { + "External id": 125268,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685280974.449, "dur": 53.229, + "args": { + "External id": 125269,"Record function id": 
0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685280976.658, "dur": 49.960, + "args": { + "External id": 125270,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685281036.558, "dur": 21.400, + "args": { + "External id": 125271,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685281064.918, "dur": 28.760, + "args": { + "External id": 125272,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685281067.638, "dur": 25.570, + "args": { + "External id": 125273,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281075.958, "dur": 0.830, + "args": { + "External id": 125274,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], 
[], []], "Ev Idx": 5893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685281100.378, "dur": 29.460, + "args": { + "External id": 125275,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685281101.868, "dur": 27.420, + "args": { + "External id": 125276,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 5895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281105.468, "dur": 5.830, + "args": { + "External id": 125277,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685281112.588, "dur": 15.800, + "args": { + "External id": 125278,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685281142.448, "dur": 6.830, + "args": { + "External id": 125279,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685281145.398, "dur": 3.560, + "args": { + "External id": 125280,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685281150.398, "dur": 2.480, + "args": { + "External id": 125281,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685281152.018, "dur": 0.680, + "args": { + "External id": 125282,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685281203.468, "dur": 29.170, + "args": { + "External id": 125283,"Sequence number": 2576043, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 5902 + } + }, + { + "ph": "s", "id": 23, "pid": 5717, "tid": 5717, "ts": 6302685281203.468, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685281240.498, "dur": 7.890, + "args": { + "External id": 125284,"Sequence number": 2576044, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], 
[], []], "Ev Idx": 5903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281244.948, "dur": 1.550, + "args": { + "External id": 125285,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685281251.848, "dur": 4.970, + "args": { + "External id": 125286,"Sequence number": 2576044, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "1"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 5905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281255.248, "dur": 0.380, + "args": { + "External id": 125287,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "2"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685281258.258, "dur": 2.450, + "args": { + "External id": 125288,"Sequence number": 2576044, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 5907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281259.898, "dur": 0.280, + "args": { + "External id": 125289,"Record function id": 0, "Concrete Inputs": ["", 
"[8, 2048]", "[4096, 1]", "2"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 5908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685281266.468, "dur": 6.200, + "args": { + "External id": 125290,"Sequence number": 2576044, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5909 + } + }, + { + "ph": "s", "id": 22, "pid": 5717, "tid": 5717, "ts": 6302685281266.468, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281270.728, "dur": 0.560, + "args": { + "External id": 125291,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685281273.768, "dur": 3.200, + "args": { + "External id": 125292,"Sequence number": 2576045, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5911 + } + }, + { + "ph": "s", "id": 21, "pid": 5717, "tid": 5717, "ts": 6302685281273.768, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
5717, + "ts": 6302685281275.838, "dur": 0.350, + "args": { + "External id": 125293,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685281278.088, "dur": 5.020, + "args": { + "External id": 125294,"Sequence number": 2576046, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 5913 + } + }, + { + "ph": "s", "id": 20, "pid": 5717, "tid": 5717, "ts": 6302685281278.088, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281281.838, "dur": 0.460, + "args": { + "External id": 125295,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685281284.718, "dur": 5.520, + "args": { + "External id": 125296,"Sequence number": 2576047, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5915 + } + }, + { + "ph": "s", "id": 19, "pid": 5717, "tid": 5717, "ts": 6302685281284.718, + "cat": "fwdbwd", 
"name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281287.018, "dur": 2.450, + "args": { + "External id": 125297,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685281294.428, "dur": 46.650, + "args": { + "External id": 125298,"Sequence number": 2576048, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685281295.508, "dur": 45.130, + "args": { + "External id": 125299,"Sequence number": 2576048, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685281305.158, "dur": 10.250, + "args": { + "External id": 125300,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 5919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685281307.068, "dur": 7.710, + "args": { + "External id": 125301,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685281316.578, "dur": 23.340, + "args": { + "External id": 125302,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 5921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685281372.788, "dur": 5.929, + "args": { + "External id": 125303,"Sequence number": 2576048, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5922 + } + }, + { + "ph": "s", "id": 18, "pid": 5717, "tid": 5717, "ts": 6302685281372.788, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685281382.728, "dur": 2.100, + "args": { + "External id": 125304,"Sequence number": 2576049, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5717, "tid": 5717, + "ts": 6302685281410.477, "dur": 20238.655, + "args": { + "External id": 125305,"Sequence number": 2576049, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 
5924 + } + }, + { + "ph": "s", "id": 17, "pid": 5717, "tid": 5717, "ts": 6302685281410.477, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685281424.368, "dur": 34.000, + "args": { + "External id": 125306,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685281425.337, "dur": 32.771, + "args": { + "External id": 125307,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685281427.008, "dur": 9.720, + "args": { + "External id": 125308,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685281428.877, "dur": 7.091, + "args": { + "External id": 125309,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685281437.728, "dur": 19.820, + "args": { + "External id": 125310,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": 
[[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 5929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685281474.197, "dur": 28.750, + "args": { + "External id": 125311,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685281475.327, "dur": 8.780, + "args": { + "External id": 125312,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281477.377, "dur": 6.290, + "args": { + "External id": 125313,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685281485.217, "dur": 17.460, + "args": { + "External id": 125314,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685281488.137, "dur": 13.610, + "args": { + "External id": 125315,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], 
"Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685281507.007, "dur": 19.200, + "args": { + "External id": 125316,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685281507.927, "dur": 6.120, + "args": { + "External id": 125317,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281509.437, "dur": 4.280, + "args": { + "External id": 125318,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685281514.697, "dur": 11.290, + "args": { + "External id": 125319,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685281515.737, "dur": 9.380, + "args": { + "External id": 125320,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 
1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 5939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685281532.167, "dur": 22.490, + "args": { + "External id": 125321,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685281533.547, "dur": 6.770, + "args": { + "External id": 125322,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685281541.067, "dur": 12.980, + "args": { + "External id": 125323,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 5942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685281542.127, "dur": 8.350, + "args": { + "External id": 125324,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 5717, + "ts": 6302685281560.197, "dur": 23.940, + "args": { + "External id": 125325,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685281588.277, "dur": 49.380, + "args": { + "External 
id": 125326,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685281590.187, "dur": 46.920, + "args": { + "External id": 125327,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281596.157, "dur": 1.070, + "args": { + "External id": 125328,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685281598.297, "dur": 25.090, + "args": { + "External id": 125329,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685281599.477, "dur": 23.610, + "args": { + "External id": 125330,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 5949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685281603.067, "dur": 4.500, + "args": { + "External id": 125331,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", 
"", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685281608.727, "dur": 13.900, + "args": { + "External id": 125332,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 5951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 5717, + "ts": 6302685281643.577, "dur": 15423.785, + "args": { + "External id": 125333,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 5717, + "ts": 6302685281645.197, "dur": 15421.425, + "args": { + "External id": 125334,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297076.412, "dur": 7.350, + "args": { + "External id": 125335,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297081.102, "dur": 0.980, + "args": { + "External id": 125336,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 
768], [], [], []], "Ev Idx": 5955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685297087.462, "dur": 45.090, + "args": { + "External id": 125337,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685297088.322, "dur": 5.160, + "args": { + "External id": 125338,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685297090.022, "dur": 2.790, + "args": { + "External id": 125339,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297091.662, "dur": 0.850, + "args": { + "External id": 125340,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685297094.582, "dur": 37.300, + "args": { + "External id": 125341,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, 
+ "ts": 6302685297095.602, "dur": 35.510, + "args": { + "External id": 125342,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297138.182, "dur": 4.340, + "args": { + "External id": 125343,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297140.732, "dur": 0.640, + "args": { + "External id": 125344,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685297148.952, "dur": 1.800, + "args": { + "External id": 125345,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685297157.122, "dur": 7.530, + "args": { + "External id": 125346,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 
6302685297158.642, "dur": 5.680, + "args": { + "External id": 125347,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685297245.062, "dur": 162.109, + "args": { + "External id": 125348,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685297247.902, "dur": 4.280, + "args": { + "External id": 125349,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685297253.892, "dur": 152.779, + "args": { + "External id": 125350,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685297255.062, "dur": 0.180, + "args": { + "External id": 125351,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685297256.282, "dur": 22.430, + "args": { + "External id": 125352,"Record function 
id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685297280.301, "dur": 4.300, + "args": { + "External id": 125353,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297283.201, "dur": 0.851, + "args": { + "External id": 125354,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685297285.352, "dur": 28.129, + "args": { + "External id": 125355,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685297287.452, "dur": 2.540, + "args": { + "External id": 125356,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685297290.912, "dur": 22.249, + "args": { + "External id": 125357,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5976 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685297293.761, "dur": 10.271, + "args": { + "External id": 125358,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685297314.932, "dur": 16.769, + "args": { + "External id": 125359,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685297333.321, "dur": 9.860, + "args": { + "External id": 125360,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685297346.012, "dur": 10.620, + "args": { + "External id": 125361,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685297357.941, "dur": 7.671, + "args": { + "External id": 125362,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685297367.321, "dur": 20.010, + "args": { + "External id": 125363,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], 
"Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685297370.861, "dur": 2.480, + "args": { + "External id": 125364,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297376.561, "dur": 0.780, + "args": { + "External id": 125365,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685297389.191, "dur": 7.720, + "args": { + "External id": 125366,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685297397.951, "dur": 7.220, + "args": { + "External id": 125367,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685297415.221, "dur": 2.740, + "args": { + "External id": 125368,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297425.241, "dur": 4.060, + "args": { + 
"External id": 125369,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297427.861, "dur": 0.480, + "args": { + "External id": 125370,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685297500.711, "dur": 40.100, + "args": { + "External id": 125371,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297548.971, "dur": 6.830, + "args": { + "External id": 125372,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297552.901, "dur": 1.050, + "args": { + "External id": 125373,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685297557.111, "dur": 
19.740, + "args": { + "External id": 125374,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685297583.401, "dur": 6.340, + "args": { + "External id": 125375,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685297584.831, "dur": 4.140, + "args": { + "External id": 125376,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297586.801, "dur": 1.830, + "args": { + "External id": 125377,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685297593.251, "dur": 29.970, + "args": { + "External id": 125378,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685297594.301, "dur": 28.140, + "args": { + "External id": 125379,"Record function id": 0, "Concrete Inputs": ["", ""], 
"Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685297627.411, "dur": 13.190, + "args": { + "External id": 125380,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297648.091, "dur": 4.900, + "args": { + "External id": 125381,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297650.841, "dur": 0.890, + "args": { + "External id": 125382,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685297656.601, "dur": 37.120, + "args": { + "External id": 125383,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685297657.441, "dur": 5.130, + "args": { + "External id": 125384,"Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685297659.821, "dur": 2.180, + "args": { + "External id": 125385,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297661.161, "dur": 0.500, + "args": { + "External id": 125386,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685297663.411, "dur": 29.780, + "args": { + "External id": 125387,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685297664.191, "dur": 28.330, + "args": { + "External id": 125388,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297699.151, "dur": 5.330, + "args": { + "External id": 125389,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], 
"Input Dims": [[16384], [], [], [], []], "Ev Idx": 6008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297701.781, "dur": 1.650, + "args": { + "External id": 125390,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685297712.171, "dur": 1.750, + "args": { + "External id": 125391,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685297720.040, "dur": 7.571, + "args": { + "External id": 125392,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685297721.880, "dur": 5.400, + "args": { + "External id": 125393,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685297801.160, "dur": 148.420, + "args": { + "External id": 125394,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input 
Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685297803.120, "dur": 4.400, + "args": { + "External id": 125395,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685297810.031, "dur": 139.019, + "args": { + "External id": 125396,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685297811.291, "dur": 0.200, + "args": { + "External id": 125397,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685297813.810, "dur": 19.850, + "args": { + "External id": 125398,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685297835.190, "dur": 4.590, + "args": { + "External id": 125399,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
5717, + "ts": 6302685297837.140, "dur": 2.170, + "args": { + "External id": 125400,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685297840.550, "dur": 18.450, + "args": { + "External id": 125401,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685297841.630, "dur": 2.640, + "args": { + "External id": 125402,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685297845.250, "dur": 13.520, + "args": { + "External id": 125403,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685297847.900, "dur": 3.050, + "args": { + "External id": 125404,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685297860.360, "dur": 16.890, + "args": { + "External id": 125405,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": 
[[2048], []], "Ev Idx": 6024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685297878.840, "dur": 9.140, + "args": { + "External id": 125406,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685297890.730, "dur": 11.110, + "args": { + "External id": 125407,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685297904.030, "dur": 7.530, + "args": { + "External id": 125408,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685297913.220, "dur": 17.100, + "args": { + "External id": 125409,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685297915.310, "dur": 2.610, + "args": { + "External id": 125410,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297919.950, "dur": 0.720, + "args": { + "External id": 
125411,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685297932.170, "dur": 7.650, + "args": { + "External id": 125412,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685297940.860, "dur": 6.870, + "args": { + "External id": 125413,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685297957.360, "dur": 2.650, + "args": { + "External id": 125414,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685297969.840, "dur": 4.150, + "args": { + "External id": 125415,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685297972.530, "dur": 0.540, + "args": { + "External id": 125416,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[16384], [], [], []], "Ev Idx": 6035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685298035.280, "dur": 40.280, + "args": { + "External id": 125417,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685298082.620, "dur": 6.190, + "args": { + "External id": 125418,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298086.210, "dur": 1.140, + "args": { + "External id": 125419,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685298089.970, "dur": 18.300, + "args": { + "External id": 125420,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685298114.570, "dur": 6.240, + "args": { + "External id": 125421,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6040 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685298117.110, "dur": 2.990, + "args": { + "External id": 125422,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298118.910, "dur": 0.870, + "args": { + "External id": 125423,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685298123.000, "dur": 28.970, + "args": { + "External id": 125424,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685298123.880, "dur": 27.310, + "args": { + "External id": 125425,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685298156.160, "dur": 12.770, + "args": { + "External id": 125426,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 
5717, "tid": 5717, + "ts": 6302685298176.319, "dur": 6.151, + "args": { + "External id": 125427,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298180.370, "dur": 0.880, + "args": { + "External id": 125428,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685298186.139, "dur": 36.140, + "args": { + "External id": 125429,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685298186.850, "dur": 4.840, + "args": { + "External id": 125430,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685298187.839, "dur": 3.311, + "args": { + "External id": 125431,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298189.170, "dur": 
1.680, + "args": { + "External id": 125432,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685298192.579, "dur": 29.111, + "args": { + "External id": 125433,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685298193.339, "dur": 27.640, + "args": { + "External id": 125434,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685298227.910, "dur": 4.089, + "args": { + "External id": 125435,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298230.259, "dur": 0.531, + "args": { + "External id": 125436,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685298239.590, "dur": 1.580, + "args": { + "External 
id": 125437,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685298247.219, "dur": 7.451, + "args": { + "External id": 125438,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685298248.810, "dur": 5.529, + "args": { + "External id": 125439,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685298336.149, "dur": 163.110, + "args": { + "External id": 125440,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685298353.089, "dur": 4.870, + "args": { + "External id": 125441,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685298360.679, "dur": 138.030, 
+ "args": { + "External id": 125442,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685298361.809, "dur": 0.200, + "args": { + "External id": 125443,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685298364.019, "dur": 20.680, + "args": { + "External id": 125444,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685298386.149, "dur": 3.300, + "args": { + "External id": 125445,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298388.089, "dur": 0.880, + "args": { + "External id": 125446,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685298390.079, "dur": 20.100, + "args": { + "External id": 125447,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6066 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685298391.049, "dur": 3.780, + "args": { + "External id": 125448,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685298395.759, "dur": 14.130, + "args": { + "External id": 125449,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685298398.489, "dur": 3.230, + "args": { + "External id": 125450,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685298411.309, "dur": 16.430, + "args": { + "External id": 125451,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685298429.219, "dur": 9.020, + "args": { + "External id": 125452,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685298441.919, "dur": 10.620, + "args": { + "External id": 125453,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input 
type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685298453.739, "dur": 7.630, + "args": { + "External id": 125454,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685298463.019, "dur": 16.220, + "args": { + "External id": 125455,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685298465.099, "dur": 2.120, + "args": { + "External id": 125456,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298469.089, "dur": 0.730, + "args": { + "External id": 125457,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685298480.899, "dur": 7.260, + "args": { + "External id": 125458,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, 
+ "ts": 6302685298489.359, "dur": 7.050, + "args": { + "External id": 125459,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685298506.899, "dur": 2.620, + "args": { + "External id": 125460,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685298518.629, "dur": 7.000, + "args": { + "External id": 125461,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298524.109, "dur": 0.520, + "args": { + "External id": 125462,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685298588.829, "dur": 41.269, + "args": { + "External id": 125463,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685298636.958, "dur": 5.980, + "args": { + "External id": 125464,"Record function id": 0, "Concrete Inputs": ["", 
"0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298640.518, "dur": 1.020, + "args": { + "External id": 125465,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685298644.358, "dur": 18.060, + "args": { + "External id": 125466,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685298669.698, "dur": 5.271, + "args": { + "External id": 125467,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685298670.989, "dur": 3.209, + "args": { + "External id": 125468,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298672.998, "dur": 0.891, + "args": { + "External id": 125469,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685298677.309, "dur": 29.349, + "args": { + "External id": 125470,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685298678.209, "dur": 27.709, + "args": { + "External id": 125471,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685298711.018, "dur": 13.240, + "args": { + "External id": 125472,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685298731.608, "dur": 5.840, + "args": { + "External id": 125473,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298735.478, "dur": 0.770, + "args": { + "External id": 125474,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685298741.028, "dur": 37.250, + "args": { + "External id": 125475,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685298741.818, "dur": 4.880, + "args": { + "External id": 125476,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685298742.718, "dur": 3.500, + "args": { + "External id": 125477,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298744.298, "dur": 1.590, + "args": { + "External id": 125478,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685298747.478, "dur": 30.260, + "args": { + "External id": 125479,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 
768], [768, 32000]], "Ev Idx": 6098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685298749.268, "dur": 27.740, + "args": { + "External id": 125480,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685298783.448, "dur": 4.140, + "args": { + "External id": 125481,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298785.958, "dur": 0.510, + "args": { + "External id": 125482,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685298794.078, "dur": 1.700, + "args": { + "External id": 125483,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685298801.468, "dur": 8.770, + "args": { + "External id": 125484,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], 
[], [], [], []], "Ev Idx": 6103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685298804.098, "dur": 5.820, + "args": { + "External id": 125485,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685298881.228, "dur": 147.530, + "args": { + "External id": 125486,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685298883.388, "dur": 5.260, + "args": { + "External id": 125487,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685298890.028, "dur": 138.190, + "args": { + "External id": 125488,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685298891.148, "dur": 0.130, + "args": { + "External id": 125489,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685298893.418, "dur": 19.830, + "args": { + "External id": 125490,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685298914.768, "dur": 3.260, + "args": { + "External id": 125491,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298916.598, "dur": 0.870, + "args": { + "External id": 125492,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685298918.778, "dur": 22.320, + "args": { + "External id": 125493,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685298920.818, "dur": 4.190, + "args": { + "External id": 125494,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685298926.018, "dur": 14.820, + "args": { + "External id": 125495,"Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685298928.648, "dur": 4.370, + "args": { + "External id": 125496,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685298942.238, "dur": 15.230, + "args": { + "External id": 125497,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685298959.068, "dur": 8.840, + "args": { + "External id": 125498,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685298970.568, "dur": 10.420, + "args": { + "External id": 125499,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685298982.178, "dur": 7.730, + "args": { + "External id": 125500,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685298991.508, "dur": 18.400, + "args": { + "External id": 
125501,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685298993.778, "dur": 2.470, + "args": { + "External id": 125502,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685298999.318, "dur": 0.740, + "args": { + "External id": 125503,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685299011.528, "dur": 7.340, + "args": { + "External id": 125504,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685299019.908, "dur": 6.930, + "args": { + "External id": 125505,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685299036.558, "dur": 2.680, + "args": { + "External id": 125506,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6125 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299047.957, "dur": 3.911, + "args": { + "External id": 125507,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299050.477, "dur": 0.511, + "args": { + "External id": 125508,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685299113.248, "dur": 39.729, + "args": { + "External id": 125509,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299160.917, "dur": 7.000, + "args": { + "External id": 125510,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299164.297, "dur": 2.010, + "args": { + "External id": 125511,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 
768], [], [], []], "Ev Idx": 6130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685299169.127, "dur": 17.440, + "args": { + "External id": 125512,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685299192.827, "dur": 5.220, + "args": { + "External id": 125513,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685299194.147, "dur": 3.170, + "args": { + "External id": 125514,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299196.107, "dur": 0.890, + "args": { + "External id": 125515,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685299201.387, "dur": 29.070, + "args": { + "External id": 125516,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", 
"pid": 5717, "tid": 5717, + "ts": 6302685299202.197, "dur": 27.400, + "args": { + "External id": 125517,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685299234.697, "dur": 13.040, + "args": { + "External id": 125518,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299255.017, "dur": 4.610, + "args": { + "External id": 125519,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299257.687, "dur": 0.810, + "args": { + "External id": 125520,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685299263.117, "dur": 43.670, + "args": { + "External id": 125521,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", 
"pid": 5717, "tid": 5717, + "ts": 6302685299263.867, "dur": 4.430, + "args": { + "External id": 125522,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685299265.957, "dur": 1.900, + "args": { + "External id": 125523,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299267.177, "dur": 0.450, + "args": { + "External id": 125524,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685299269.097, "dur": 37.140, + "args": { + "External id": 125525,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685299269.847, "dur": 35.120, + "args": { + "External id": 125526,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299313.147, "dur": 4.380, + "args": { + "External id": 125527,"Record function id": 0, "Concrete 
Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299315.657, "dur": 0.640, + "args": { + "External id": 125528,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685299324.207, "dur": 1.720, + "args": { + "External id": 125529,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685299332.607, "dur": 8.360, + "args": { + "External id": 125530,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685299334.027, "dur": 6.590, + "args": { + "External id": 125531,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685299412.437, "dur": 146.710, + "args": { + 
"External id": 125532,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685299414.667, "dur": 4.330, + "args": { + "External id": 125533,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685299420.317, "dur": 138.270, + "args": { + "External id": 125534,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685299421.567, "dur": 0.170, + "args": { + "External id": 125535,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685299422.777, "dur": 20.650, + "args": { + "External id": 125536,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685299446.347, "dur": 3.490, + "args": { + "External id": 125537,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 
1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299448.507, "dur": 0.860, + "args": { + "External id": 125538,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685299451.607, "dur": 19.580, + "args": { + "External id": 125539,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685299452.557, "dur": 2.580, + "args": { + "External id": 125540,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685299455.997, "dur": 14.880, + "args": { + "External id": 125541,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685299459.537, "dur": 3.280, + "args": { + "External id": 125542,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685299472.347, "dur": 15.829, + "args": { + "External 
id": 125543,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685299489.596, "dur": 8.840, + "args": { + "External id": 125544,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685299501.367, "dur": 10.429, + "args": { + "External id": 125545,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685299513.056, "dur": 7.400, + "args": { + "External id": 125546,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685299522.016, "dur": 18.160, + "args": { + "External id": 125547,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685299524.307, "dur": 2.440, + "args": { + "External id": 125548,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6167 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299529.787, "dur": 0.720, + "args": { + "External id": 125549,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685299542.036, "dur": 7.491, + "args": { + "External id": 125550,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685299550.547, "dur": 6.840, + "args": { + "External id": 125551,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685299566.676, "dur": 2.660, + "args": { + "External id": 125552,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299577.747, "dur": 3.999, + "args": { + "External id": 125553,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299580.296, "dur": 0.570, + "args": { + "External id": 125554,"Record function id": 0, "Concrete Inputs": 
["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685299642.236, "dur": 46.150, + "args": { + "External id": 125555,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299701.716, "dur": 8.660, + "args": { + "External id": 125556,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299707.646, "dur": 1.140, + "args": { + "External id": 125557,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685299711.526, "dur": 18.720, + "args": { + "External id": 125558,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685299736.606, "dur": 6.480, + "args": { + "External id": 125559,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685299739.086, "dur": 3.280, + "args": { + "External id": 125560,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299741.226, "dur": 0.800, + "args": { + "External id": 125561,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685299746.606, "dur": 32.320, + "args": { + "External id": 125562,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685299748.956, "dur": 29.010, + "args": { + "External id": 125563,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685299783.126, "dur": 15.140, + "args": { + "External id": 125564,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 
1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299807.186, "dur": 5.220, + "args": { + "External id": 125565,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299810.186, "dur": 0.840, + "args": { + "External id": 125566,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685299816.056, "dur": 58.340, + "args": { + "External id": 125567,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685299816.976, "dur": 7.620, + "args": { + "External id": 125568,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685299820.496, "dur": 3.550, + "args": { + "External id": 125569,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], 
"Ev Idx": 6188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299822.086, "dur": 1.640, + "args": { + "External id": 125570,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685299825.526, "dur": 48.370, + "args": { + "External id": 125571,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685299827.486, "dur": 45.610, + "args": { + "External id": 125572,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685299879.886, "dur": 5.110, + "args": { + "External id": 125573,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685299882.366, "dur": 1.480, + "args": { + "External id": 125574,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6193 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685299893.026, "dur": 1.810, + "args": { + "External id": 125575,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685299900.986, "dur": 7.490, + "args": { + "External id": 125576,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685299902.386, "dur": 5.780, + "args": { + "External id": 125577,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685299993.726, "dur": 199.969, + "args": { + "External id": 125578,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685299995.755, "dur": 6.100, + "args": { + "External id": 125579,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 
6198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685300003.506, "dur": 189.589, + "args": { + "External id": 125580,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685300008.355, "dur": 0.200, + "args": { + "External id": 125581,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685300009.575, "dur": 34.290, + "args": { + "External id": 125582,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685300045.505, "dur": 5.830, + "args": { + "External id": 125583,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300048.715, "dur": 2.140, + "args": { + "External id": 125584,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685300052.145, "dur": 22.140, + "args": { + "External id": 
125585,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685300054.235, "dur": 2.810, + "args": { + "External id": 125586,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685300058.215, "dur": 15.800, + "args": { + "External id": 125587,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685300062.065, "dur": 4.180, + "args": { + "External id": 125588,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685300075.525, "dur": 18.520, + "args": { + "External id": 125589,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685300095.885, "dur": 9.300, + "args": { + "External id": 125590,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 
5717, "tid": 5717, + "ts": 6302685300108.165, "dur": 11.020, + "args": { + "External id": 125591,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685300121.385, "dur": 7.540, + "args": { + "External id": 125592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685300131.455, "dur": 31.010, + "args": { + "External id": 125593,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685300136.315, "dur": 3.610, + "args": { + "External id": 125594,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300143.275, "dur": 2.940, + "args": { + "External id": 125595,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685300164.275, "dur": 12.490, + "args": { + "External id": 125596,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685300179.035, "dur": 11.490, + "args": { + "External id": 125597,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685300208.845, "dur": 2.790, + "args": { + "External id": 125598,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685300221.325, "dur": 4.010, + "args": { + "External id": 125599,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300223.905, "dur": 0.490, + "args": { + "External id": 125600,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685300295.795, "dur": 69.890, + "args": { + "External id": 125601,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685300373.405, "dur": 6.840, + "args": { + "External id": 125602,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300377.445, "dur": 1.080, + "args": { + "External id": 125603,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685300381.554, "dur": 18.660, + "args": { + "External id": 125604,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685300406.665, "dur": 6.280, + "args": { + "External id": 125605,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685300407.954, "dur": 4.260, + "args": { + "External id": 125606,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
5717, + "ts": 6302685300410.945, "dur": 0.960, + "args": { + "External id": 125607,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685300415.345, "dur": 29.820, + "args": { + "External id": 125608,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685300416.185, "dur": 28.189, + "args": { + "External id": 125609,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685300449.345, "dur": 13.169, + "args": { + "External id": 125610,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685300469.964, "dur": 4.720, + "args": { + "External id": 125611,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685300472.694, "dur": 0.820, + "args": { + "External id": 125612,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685300481.794, "dur": 39.720, + "args": { + "External id": 125613,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685300482.634, "dur": 5.460, + "args": { + "External id": 125614,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685300483.714, "dur": 3.700, + "args": { + "External id": 125615,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300486.344, "dur": 0.800, + "args": { + "External id": 125616,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685300488.954, "dur": 32.030, + "args": { + "External id": 
125617,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685300489.824, "dur": 30.310, + "args": { + "External id": 125618,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685300527.564, "dur": 4.010, + "args": { + "External id": 125619,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300529.914, "dur": 0.500, + "args": { + "External id": 125620,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685300539.724, "dur": 1.700, + "args": { + "External id": 125621,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685300547.484, "dur": 7.470, + "args": { + "External id": 125622,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", 
"", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685300548.964, "dur": 5.670, + "args": { + "External id": 125623,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685300627.514, "dur": 169.400, + "args": { + "External id": 125624,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685300629.514, "dur": 4.240, + "args": { + "External id": 125625,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685300635.154, "dur": 161.210, + "args": { + "External id": 125626,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685300638.314, "dur": 0.270, + "args": { + "External id": 125627,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685300639.824, "dur": 20.670, + "args": { + "External id": 125628,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685300662.024, "dur": 3.050, + "args": { + "External id": 125629,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300663.734, "dur": 0.880, + "args": { + "External id": 125630,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685300665.774, "dur": 20.090, + "args": { + "External id": 125631,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685300667.884, "dur": 3.120, + "args": { + "External id": 125632,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685300671.884, "dur": 13.710, + "args": { + "External id": 125633,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685300674.384, "dur": 3.490, + "args": { + "External id": 125634,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685300687.164, "dur": 21.210, + "args": { + "External id": 125635,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685300711.214, "dur": 12.890, + "args": { + "External id": 125636,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685300727.064, "dur": 10.760, + "args": { + "External id": 125637,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685300740.074, "dur": 7.640, + "args": { + "External id": 125638,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 
6257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685300749.374, "dur": 19.340, + "args": { + "External id": 125639,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685300751.514, "dur": 2.570, + "args": { + "External id": 125640,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300756.094, "dur": 1.070, + "args": { + "External id": 125641,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685300770.474, "dur": 12.650, + "args": { + "External id": 125642,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685300785.384, "dur": 9.570, + "args": { + "External id": 125643,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685300805.954, "dur": 2.699, + "args": { + "External id": 125644,"Record 
function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685300823.864, "dur": 5.800, + "args": { + "External id": 125645,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300828.173, "dur": 0.520, + "args": { + "External id": 125646,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685300894.284, "dur": 40.199, + "args": { + "External id": 125647,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685300941.303, "dur": 7.340, + "args": { + "External id": 125648,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300945.893, "dur": 1.190, + "args": { + "External id": 125649,"Record function id": 0, "Concrete Inputs": ["", "[2048, 
768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685300949.903, "dur": 17.920, + "args": { + "External id": 125650,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685300974.303, "dur": 6.200, + "args": { + "External id": 125651,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685300976.743, "dur": 2.950, + "args": { + "External id": 125652,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685300978.533, "dur": 0.840, + "args": { + "External id": 125653,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685300982.683, "dur": 29.120, + "args": { + "External id": 125654,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685300983.483, "dur": 27.550, + "args": { + "External id": 125655,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685301015.773, "dur": 12.970, + "args": { + "External id": 125656,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301035.623, "dur": 5.810, + "args": { + "External id": 125657,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301039.433, "dur": 0.830, + "args": { + "External id": 125658,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685301045.433, "dur": 34.930, + "args": { + "External id": 125659,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685301046.153, "dur": 3.460, + "args": { + "External id": 125660,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685301047.143, "dur": 1.990, + "args": { + "External id": 125661,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301048.333, "dur": 0.540, + "args": { + "External id": 125662,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685301050.363, "dur": 29.530, + "args": { + "External id": 125663,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685301051.213, "dur": 27.960, + "args": { + "External id": 125664,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6283 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301086.473, "dur": 4.930, + "args": { + "External id": 125665,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301088.703, "dur": 1.480, + "args": { + "External id": 125666,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685301097.943, "dur": 1.900, + "args": { + "External id": 125667,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685301105.683, "dur": 7.040, + "args": { + "External id": 125668,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685301107.103, "dur": 5.320, + "args": { + "External id": 125669,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], []], "Ev Idx": 6288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685301184.213, "dur": 157.570, + "args": { + "External id": 125670,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685301186.153, "dur": 5.120, + "args": { + "External id": 125671,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685301192.853, "dur": 148.370, + "args": { + "External id": 125672,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685301193.993, "dur": 0.140, + "args": { + "External id": 125673,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685301195.073, "dur": 20.740, + "args": { + "External id": 125674,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 
6302685301217.373, "dur": 3.200, + "args": { + "External id": 125675,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301219.303, "dur": 0.840, + "args": { + "External id": 125676,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685301221.273, "dur": 21.050, + "args": { + "External id": 125677,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685301222.363, "dur": 2.710, + "args": { + "External id": 125678,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685301225.963, "dur": 16.110, + "args": { + "External id": 125679,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685301230.883, "dur": 3.490, + "args": { + "External id": 125680,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input 
Dims": [[0], [], []], "Ev Idx": 6299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685301243.593, "dur": 15.350, + "args": { + "External id": 125681,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685301260.463, "dur": 9.209, + "args": { + "External id": 125682,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685301272.652, "dur": 10.920, + "args": { + "External id": 125683,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685301284.903, "dur": 7.920, + "args": { + "External id": 125684,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685301294.372, "dur": 26.720, + "args": { + "External id": 125685,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685301304.263, "dur": 2.960, + "args": { + "External id": 125686,"Record 
function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301309.432, "dur": 0.811, + "args": { + "External id": 125687,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685301324.123, "dur": 7.769, + "args": { + "External id": 125688,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685301333.012, "dur": 6.991, + "args": { + "External id": 125689,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685301349.952, "dur": 2.570, + "args": { + "External id": 125690,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301361.122, "dur": 4.060, + "args": { + "External id": 125691,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6310 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301363.762, "dur": 0.540, + "args": { + "External id": 125692,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685301426.512, "dur": 39.970, + "args": { + "External id": 125693,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301473.382, "dur": 7.190, + "args": { + "External id": 125694,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301478.062, "dur": 1.000, + "args": { + "External id": 125695,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685301481.762, "dur": 20.260, + "args": { + "External id": 125696,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev 
Idx": 6315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685301508.422, "dur": 5.150, + "args": { + "External id": 125697,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685301509.862, "dur": 2.990, + "args": { + "External id": 125698,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301511.752, "dur": 0.810, + "args": { + "External id": 125699,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685301515.972, "dur": 29.070, + "args": { + "External id": 125700,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685301516.742, "dur": 27.530, + "args": { + "External id": 125701,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 
6302685301550.272, "dur": 13.020, + "args": { + "External id": 125702,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685301568.402, "dur": 22.520, + "args": { + "External id": 125703,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685301570.082, "dur": 20.430, + "args": { + "External id": 125704,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301576.352, "dur": 0.790, + "args": { + "External id": 125705,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685301595.622, "dur": 22.340, + "args": { + "External id": 125706,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685301596.832, "dur": 20.890, + "args": { + "External id": 125707,"Record function id": 0, 
"Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 6326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301600.042, "dur": 5.280, + "args": { + "External id": 125708,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685301606.342, "dur": 10.830, + "args": { + "External id": 125709,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685301627.472, "dur": 5.910, + "args": { + "External id": 125710,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685301629.742, "dur": 3.350, + "args": { + "External id": 125711,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685301634.332, "dur": 1.090, + "args": { + "External id": 125712,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685301634.792, "dur": 0.480, + "args": { + "External id": 125713,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685301661.162, "dur": 18.900, + "args": { + "External id": 125714,"Sequence number": 2576050, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685301682.382, "dur": 10.229, + "args": { + "External id": 125715,"Sequence number": 2576051, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6334 + } + }, + { + "ph": "s", "id": 16, "pid": 5717, "tid": 5717, "ts": 6302685301682.382, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301698.702, "dur": 6.780, + "args": { + "External id": 125716,"Sequence number": 2576052, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 6335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301702.551, "dur": 1.280, + "args": { + "External id": 125717,"Record function id": 0, "Concrete Inputs": ["", "[8, 
4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685301708.662, "dur": 4.040, + "args": { + "External id": 125718,"Sequence number": 2576052, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "2"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 6337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301711.471, "dur": 0.300, + "args": { + "External id": 125719,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "3"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301714.222, "dur": 1.969, + "args": { + "External id": 125720,"Sequence number": 2576052, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 6339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301715.522, "dur": 0.220, + "args": { + "External id": 125721,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "3"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 6340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301721.431, "dur": 5.120, + "args": { + "External id": 125722,"Sequence number": 2576052, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6341 + } + }, + { + "ph": "s", "id": 15, "pid": 5717, "tid": 5717, "ts": 6302685301721.431, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301724.911, "dur": 0.471, + "args": { + "External id": 125723,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301727.691, "dur": 3.791, + "args": { + "External id": 125724,"Sequence number": 2576053, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6343 + } + }, + { + "ph": "s", "id": 14, "pid": 5717, "tid": 5717, "ts": 6302685301727.691, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301730.482, "dur": 0.320, + "args": { + "External id": 125725,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685301732.471, "dur": 4.320, + "args": { + "External id": 125726,"Sequence number": 2576054, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 6345 + } + }, + { + "ph": "s", "id": 13, "pid": 5717, "tid": 5717, "ts": 6302685301732.471, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301735.662, "dur": 0.400, + "args": { + "External id": 125727,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685301737.791, "dur": 3.631, + "args": { + "External id": 125728,"Sequence number": 2576055, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 6347 + } + }, + { + "ph": "s", "id": 12, "pid": 5717, "tid": 5717, "ts": 6302685301737.791, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301739.422, "dur": 1.369, + "args": { + "External id": 125729,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 
768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 6348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685301744.871, "dur": 29.251, + "args": { + "External id": 125730,"Sequence number": 2576056, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685301745.711, "dur": 28.091, + "args": { + "External id": 125731,"Sequence number": 2576056, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685301747.131, "dur": 9.391, + "args": { + "External id": 125732,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 6351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685301748.642, "dur": 7.320, + "args": { + "External id": 125733,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685301757.511, "dur": 15.840, + "args": { + 
"External id": 125734,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 6353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685301797.561, "dur": 4.420, + "args": { + "External id": 125735,"Sequence number": 2576056, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 6354 + } + }, + { + "ph": "s", "id": 11, "pid": 5717, "tid": 5717, "ts": 6302685301797.561, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685301805.491, "dur": 1.020, + "args": { + "External id": 125736,"Sequence number": 2576057, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5717, "tid": 5717, + "ts": 6302685301825.321, "dur": 22157.500, + "args": { + "External id": 125737,"Sequence number": 2576057, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 6356 + } + }, + { + "ph": "s", "id": 10, "pid": 5717, "tid": 5717, "ts": 6302685301825.321, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 
6302685301836.351, "dur": 28.110, + "args": { + "External id": 125738,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685301837.121, "dur": 27.100, + "args": { + "External id": 125739,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685301838.401, "dur": 7.900, + "args": { + "External id": 125740,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685301839.741, "dur": 5.930, + "args": { + "External id": 125741,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685301847.291, "dur": 16.400, + "args": { + "External id": 125742,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 6361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685301877.541, "dur": 24.660, + 
"args": { + "External id": 125743,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685301878.451, "dur": 7.230, + "args": { + "External id": 125744,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301880.071, "dur": 5.180, + "args": { + "External id": 125745,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685301886.631, "dur": 15.330, + "args": { + "External id": 125746,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685301889.471, "dur": 11.590, + "args": { + "External id": 125747,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685301905.911, "dur": 16.010, + "args": { + 
"External id": 125748,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685301906.601, "dur": 4.960, + "args": { + "External id": 125749,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301907.701, "dur": 3.570, + "args": { + "External id": 125750,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685301912.161, "dur": 9.530, + "args": { + "External id": 125751,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685301913.051, "dur": 7.830, + "args": { + "External id": 125752,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 6371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685301927.381, "dur": 17.250, + "args": { + "External id": 125753,"Record 
function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 6372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685301929.591, "dur": 4.220, + "args": { + "External id": 125754,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685301934.931, "dur": 9.430, + "args": { + "External id": 125755,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 6374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685301935.811, "dur": 7.720, + "args": { + "External id": 125756,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 5717, + "ts": 6302685301948.431, "dur": 18.750, + "args": { + "External id": 125757,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685301969.901, "dur": 42.340, + "args": { + "External id": 125758,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 
5717, "tid": 5717, + "ts": 6302685301972.631, "dur": 39.040, + "args": { + "External id": 125759,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301977.891, "dur": 0.940, + "args": { + "External id": 125760,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685301979.731, "dur": 20.550, + "args": { + "External id": 125761,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685301980.641, "dur": 19.410, + "args": { + "External id": 125762,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 6381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685301983.251, "dur": 3.640, + "args": { + "External id": 125763,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6382 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685301987.891, "dur": 11.680, + "args": { + "External id": 125764,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 6383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 5717, + "ts": 6302685302016.411, "dur": 16750.652, + "args": { + "External id": 125765,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 5717, + "ts": 6302685302017.671, "dur": 16748.532, + "args": { + "External id": 125766,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685318778.413, "dur": 7.510, + "args": { + "External id": 125767,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685318782.683, "dur": 1.230, + "args": { + "External id": 125768,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685318791.443, "dur": 60.430, + "args": { + "External id": 125769,"Record function id": 0, 
"Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685318792.433, "dur": 5.350, + "args": { + "External id": 125770,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685318794.283, "dur": 2.750, + "args": { + "External id": 125771,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685318795.763, "dur": 0.920, + "args": { + "External id": 125772,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685318798.783, "dur": 52.200, + "args": { + "External id": 125773,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685318800.083, "dur": 49.540, + "args": { + "External id": 125774,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input 
Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685318857.503, "dur": 4.930, + "args": { + "External id": 125775,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685318860.293, "dur": 0.760, + "args": { + "External id": 125776,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685318870.823, "dur": 2.160, + "args": { + "External id": 125777,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685318880.563, "dur": 9.070, + "args": { + "External id": 125778,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685318882.173, "dur": 7.090, + "args": { + "External id": 125779,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685318976.752, "dur": 213.380, + "args": { + "External id": 125780,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685318980.812, "dur": 5.060, + "args": { + "External id": 125781,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685318989.063, "dur": 200.339, + "args": { + "External id": 125782,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685318990.303, "dur": 0.200, + "args": { + "External id": 125783,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685318991.732, "dur": 24.560, + "args": { + "External id": 125784,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6403 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685319018.122, "dur": 12.650, + "args": { + "External id": 125785,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319028.832, "dur": 1.220, + "args": { + "External id": 125786,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685319031.632, "dur": 24.050, + "args": { + "External id": 125787,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685319033.222, "dur": 3.880, + "args": { + "External id": 125788,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685319038.172, "dur": 17.180, + "args": { + "External id": 125789,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685319041.372, "dur": 3.810, + "args": { + "External id": 125790,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": 
["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685319058.522, "dur": 17.860, + "args": { + "External id": 125791,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685319078.222, "dur": 15.210, + "args": { + "External id": 125792,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685319098.012, "dur": 20.540, + "args": { + "External id": 125793,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685319119.982, "dur": 14.310, + "args": { + "External id": 125794,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685319137.612, "dur": 24.270, + "args": { + "External id": 125795,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 
6302685319141.462, "dur": 4.370, + "args": { + "External id": 125796,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319148.122, "dur": 1.760, + "args": { + "External id": 125797,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685319164.872, "dur": 11.760, + "args": { + "External id": 125798,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685319178.322, "dur": 9.220, + "args": { + "External id": 125799,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685319203.912, "dur": 4.590, + "args": { + "External id": 125800,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685319217.402, "dur": 4.860, + "args": { + "External id": 125801,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], 
"Input Dims": [[16384], [], [], [], []], "Ev Idx": 6420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319220.412, "dur": 0.690, + "args": { + "External id": 125802,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685319304.892, "dur": 51.570, + "args": { + "External id": 125803,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685319366.422, "dur": 8.400, + "args": { + "External id": 125804,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319371.342, "dur": 1.269, + "args": { + "External id": 125805,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685319377.522, "dur": 23.020, + "args": { + "External id": 125806,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 
1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685319408.242, "dur": 5.920, + "args": { + "External id": 125807,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685319409.822, "dur": 3.500, + "args": { + "External id": 125808,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319411.951, "dur": 1.020, + "args": { + "External id": 125809,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685319417.362, "dur": 44.039, + "args": { + "External id": 125810,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685319418.371, "dur": 41.900, + "args": { + "External id": 125811,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685319466.621, "dur": 20.140, + "args": { + "External id": 125812,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685319504.791, "dur": 7.420, + "args": { + "External id": 125813,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319509.421, "dur": 1.260, + "args": { + "External id": 125814,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685319516.611, "dur": 45.270, + "args": { + "External id": 125815,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685319517.621, "dur": 5.810, + "args": { + "External id": 125816,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + 
"ts": 6302685319518.771, "dur": 4.050, + "args": { + "External id": 125817,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319521.761, "dur": 0.770, + "args": { + "External id": 125818,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685319525.521, "dur": 35.730, + "args": { + "External id": 125819,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685319526.411, "dur": 33.970, + "args": { + "External id": 125820,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685319568.371, "dur": 5.740, + "args": { + "External id": 125821,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319571.121, "dur": 1.610, + "args": { + 
"External id": 125822,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685319582.131, "dur": 1.970, + "args": { + "External id": 125823,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685319591.481, "dur": 10.170, + "args": { + "External id": 125824,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685319594.631, "dur": 6.650, + "args": { + "External id": 125825,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685319686.721, "dur": 167.500, + "args": { + "External id": 125826,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685319689.951, "dur": 4.990, + 
"args": { + "External id": 125827,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685319696.521, "dur": 157.089, + "args": { + "External id": 125828,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685319698.851, "dur": 0.300, + "args": { + "External id": 125829,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685319700.451, "dur": 22.530, + "args": { + "External id": 125830,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685319724.761, "dur": 3.960, + "args": { + "External id": 125831,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319727.171, "dur": 1.020, + "args": { + "External id": 125832,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], 
"Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685319729.741, "dur": 23.880, + "args": { + "External id": 125833,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685319732.881, "dur": 3.180, + "args": { + "External id": 125834,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685319737.131, "dur": 16.180, + "args": { + "External id": 125835,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685319740.271, "dur": 4.210, + "args": { + "External id": 125836,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685319754.931, "dur": 18.290, + "args": { + "External id": 125837,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685319775.001, "dur": 10.270, + "args": { + "External id": 
125838,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685319788.681, "dur": 11.940, + "args": { + "External id": 125839,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685319801.981, "dur": 8.549, + "args": { + "External id": 125840,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685319812.401, "dur": 20.189, + "args": { + "External id": 125841,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685319814.701, "dur": 2.680, + "args": { + "External id": 125842,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319820.530, "dur": 0.871, + "args": { + "External id": 125843,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": 
[[2048], [], [], []], "Ev Idx": 6462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685319834.570, "dur": 8.400, + "args": { + "External id": 125844,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685319844.261, "dur": 7.780, + "args": { + "External id": 125845,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685319863.250, "dur": 3.091, + "args": { + "External id": 125846,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685319876.570, "dur": 4.580, + "args": { + "External id": 125847,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685319879.570, "dur": 0.591, + "args": { + "External id": 125848,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685319951.760, "dur": 47.040, + "args": { + "External 
id": 125849,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320008.000, "dur": 7.110, + "args": { + "External id": 125850,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320012.170, "dur": 1.220, + "args": { + "External id": 125851,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685320016.440, "dur": 20.250, + "args": { + "External id": 125852,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685320043.940, "dur": 6.040, + "args": { + "External id": 125853,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685320045.600, "dur": 3.610, + "args": { + "External id": 125854,"Record function id": 0, 
"Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320047.770, "dur": 1.010, + "args": { + "External id": 125855,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685320053.570, "dur": 34.070, + "args": { + "External id": 125856,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685320054.530, "dur": 32.200, + "args": { + "External id": 125857,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685320092.270, "dur": 20.860, + "args": { + "External id": 125858,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320121.480, "dur": 5.430, + "args": { + "External id": 125859,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320124.660, "dur": 0.860, + "args": { + "External id": 125860,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685320131.350, "dur": 42.330, + "args": { + "External id": 125861,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685320132.290, "dur": 6.950, + "args": { + "External id": 125862,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685320134.960, "dur": 3.710, + "args": { + "External id": 125863,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320136.670, "dur": 1.640, + "args": { + "External id": 125864,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", 
""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685320140.170, "dur": 32.890, + "args": { + "External id": 125865,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685320141.030, "dur": 31.180, + "args": { + "External id": 125866,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320179.960, "dur": 4.730, + "args": { + "External id": 125867,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320182.720, "dur": 0.570, + "args": { + "External id": 125868,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685320192.200, "dur": 1.910, + "args": { + "External id": 125869,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], 
[]], "Ev Idx": 6488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685320201.880, "dur": 8.330, + "args": { + "External id": 125870,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685320203.570, "dur": 6.290, + "args": { + "External id": 125871,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685320291.309, "dur": 174.140, + "args": { + "External id": 125872,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685320293.749, "dur": 13.280, + "args": { + "External id": 125873,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685320308.849, "dur": 156.060, + "args": { + "External id": 125874,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 
1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685320310.089, "dur": 0.200, + "args": { + "External id": 125875,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685320312.509, "dur": 24.250, + "args": { + "External id": 125876,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685320338.629, "dur": 3.920, + "args": { + "External id": 125877,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320340.929, "dur": 1.090, + "args": { + "External id": 125878,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685320344.419, "dur": 23.120, + "args": { + "External id": 125879,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685320345.819, "dur": 4.350, + "args": { + "External id": 125880,"Record function id": 0, 
"Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685320351.419, "dur": 15.810, + "args": { + "External id": 125881,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685320354.409, "dur": 3.670, + "args": { + "External id": 125882,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685320368.969, "dur": 17.620, + "args": { + "External id": 125883,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685320388.269, "dur": 9.850, + "args": { + "External id": 125884,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685320400.929, "dur": 11.660, + "args": { + "External id": 125885,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6504 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685320413.879, "dur": 8.460, + "args": { + "External id": 125886,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685320424.329, "dur": 20.110, + "args": { + "External id": 125887,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685320427.689, "dur": 2.520, + "args": { + "External id": 125888,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320432.449, "dur": 0.800, + "args": { + "External id": 125889,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685320446.309, "dur": 8.420, + "args": { + "External id": 125890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685320455.989, "dur": 7.690, + "args": { + "External id": 125891,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", 
"Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685320474.629, "dur": 3.000, + "args": { + "External id": 125892,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320488.239, "dur": 4.620, + "args": { + "External id": 125893,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320491.319, "dur": 0.540, + "args": { + "External id": 125894,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685320564.199, "dur": 45.240, + "args": { + "External id": 125895,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320618.589, "dur": 6.940, + "args": { + "External id": 125896,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 
768], [], [], [], []], "Ev Idx": 6515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320622.679, "dur": 1.160, + "args": { + "External id": 125897,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685320626.999, "dur": 21.830, + "args": { + "External id": 125898,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685320656.429, "dur": 5.740, + "args": { + "External id": 125899,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685320657.929, "dur": 3.390, + "args": { + "External id": 125900,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320660.079, "dur": 0.910, + "args": { + "External id": 125901,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6520 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685320666.079, "dur": 35.580, + "args": { + "External id": 125902,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685320666.989, "dur": 31.579, + "args": { + "External id": 125903,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685320707.599, "dur": 16.489, + "args": { + "External id": 125904,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320733.099, "dur": 6.100, + "args": { + "External id": 125905,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320736.739, "dur": 1.069, + "args": { + "External id": 125906,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6525 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685320743.319, "dur": 43.519, + "args": { + "External id": 125907,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685320744.308, "dur": 5.731, + "args": { + "External id": 125908,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685320746.639, "dur": 2.869, + "args": { + "External id": 125909,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320748.319, "dur": 0.769, + "args": { + "External id": 125910,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685320750.928, "dur": 35.260, + "args": { + "External id": 125911,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685320751.979, "dur": 33.379, + 
"args": { + "External id": 125912,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685320793.258, "dur": 4.660, + "args": { + "External id": 125913,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320795.898, "dur": 0.720, + "args": { + "External id": 125914,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685320806.928, "dur": 1.860, + "args": { + "External id": 125915,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685320815.698, "dur": 8.600, + "args": { + "External id": 125916,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685320817.328, "dur": 6.590, + 
"args": { + "External id": 125917,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685320916.978, "dur": 169.190, + "args": { + "External id": 125918,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685320920.578, "dur": 6.370, + "args": { + "External id": 125919,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685320928.618, "dur": 156.960, + "args": { + "External id": 125920,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685320931.448, "dur": 0.210, + "args": { + "External id": 125921,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685320932.858, "dur": 22.530, + "args": { + "External id": 125922,"Record function id": 0, "Concrete Inputs": ["", 
"[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685320957.418, "dur": 4.690, + "args": { + "External id": 125923,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685320960.658, "dur": 0.930, + "args": { + "External id": 125924,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685320963.028, "dur": 21.420, + "args": { + "External id": 125925,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685320964.138, "dur": 3.080, + "args": { + "External id": 125926,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685320968.388, "dur": 15.720, + "args": { + "External id": 125927,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", 
"pid": 5717, "tid": 5717, + "ts": 6302685320971.368, "dur": 3.870, + "args": { + "External id": 125928,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685320985.768, "dur": 17.630, + "args": { + "External id": 125929,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685321005.088, "dur": 10.090, + "args": { + "External id": 125930,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685321019.428, "dur": 11.800, + "args": { + "External id": 125931,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685321032.538, "dur": 8.700, + "args": { + "External id": 125932,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685321043.108, "dur": 20.300, + "args": { + "External id": 125933,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], 
[], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685321046.648, "dur": 2.750, + "args": { + "External id": 125934,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321051.558, "dur": 0.850, + "args": { + "External id": 125935,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685321065.258, "dur": 8.790, + "args": { + "External id": 125936,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685321076.368, "dur": 7.720, + "args": { + "External id": 125937,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685321095.288, "dur": 3.000, + "args": { + "External id": 125938,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685321108.348, "dur": 4.450, + "args": { + "External id": 125939,"Record 
function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321111.238, "dur": 0.520, + "args": { + "External id": 125940,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685321194.167, "dur": 45.460, + "args": { + "External id": 125941,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685321247.527, "dur": 7.340, + "args": { + "External id": 125942,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321252.077, "dur": 1.060, + "args": { + "External id": 125943,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685321256.317, "dur": 20.300, + "args": 
{ + "External id": 125944,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685321285.047, "dur": 7.320, + "args": { + "External id": 125945,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685321286.557, "dur": 4.960, + "args": { + "External id": 125946,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321290.087, "dur": 1.070, + "args": { + "External id": 125947,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685321294.877, "dur": 43.470, + "args": { + "External id": 125948,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685321295.947, "dur": 41.310, + "args": { + "External id": 125949,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685321343.747, "dur": 15.510, + "args": { + "External id": 125950,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685321367.957, "dur": 7.040, + "args": { + "External id": 125951,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321372.437, "dur": 1.020, + "args": { + "External id": 125952,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685321391.947, "dur": 42.920, + "args": { + "External id": 125953,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685321392.937, "dur": 5.680, + "args": { + "External id": 125954,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685321394.197, "dur": 3.590, + "args": { + "External id": 125955,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321396.827, "dur": 0.630, + "args": { + "External id": 125956,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685321399.467, "dur": 34.800, + "args": { + "External id": 125957,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685321401.457, "dur": 31.950, + "args": { + "External id": 125958,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685321441.727, "dur": 5.780, + "args": { + "External id": 125959,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": 
[[16384], [], [], [], []], "Ev Idx": 6578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321444.447, "dur": 1.770, + "args": { + "External id": 125960,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685321454.967, "dur": 1.980, + "args": { + "External id": 125961,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685321463.487, "dur": 8.340, + "args": { + "External id": 125962,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685321465.087, "dur": 6.370, + "args": { + "External id": 125963,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685321573.377, "dur": 199.679, + "args": { + "External id": 125964,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], 
[], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685321575.686, "dur": 5.091, + "args": { + "External id": 125965,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685321583.637, "dur": 188.669, + "args": { + "External id": 125966,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685321584.977, "dur": 0.169, + "args": { + "External id": 125967,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685321586.186, "dur": 35.820, + "args": { + "External id": 125968,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685321623.986, "dur": 3.811, + "args": { + "External id": 125969,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685321626.206, "dur": 1.051, + "args": { + "External id": 125970,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685321628.717, "dur": 26.640, + "args": { + "External id": 125971,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685321630.017, "dur": 3.009, + "args": { + "External id": 125972,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685321634.017, "dur": 20.989, + "args": { + "External id": 125973,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685321640.337, "dur": 4.649, + "args": { + "External id": 125974,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685321659.446, "dur": 23.570, + "args": { + "External id": 125975,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], 
"Ev Idx": 6594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685321684.846, "dur": 10.450, + "args": { + "External id": 125976,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685321698.676, "dur": 12.270, + "args": { + "External id": 125977,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685321712.426, "dur": 8.640, + "args": { + "External id": 125978,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685321723.046, "dur": 25.880, + "args": { + "External id": 125979,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685321725.396, "dur": 4.840, + "args": { + "External id": 125980,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321735.176, "dur": 0.890, + "args": { + "External id": 
125981,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685321751.076, "dur": 8.820, + "args": { + "External id": 125982,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685321762.336, "dur": 8.220, + "args": { + "External id": 125983,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685321782.366, "dur": 2.970, + "args": { + "External id": 125984,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685321795.726, "dur": 4.600, + "args": { + "External id": 125985,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321798.666, "dur": 0.610, + "args": { + "External id": 125986,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[16384], [], [], []], "Ev Idx": 6605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685321872.916, "dur": 46.120, + "args": { + "External id": 125987,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685321927.026, "dur": 7.090, + "args": { + "External id": 125988,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321931.316, "dur": 1.110, + "args": { + "External id": 125989,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685321936.556, "dur": 20.500, + "args": { + "External id": 125990,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685321964.416, "dur": 7.210, + "args": { + "External id": 125991,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6610 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685321967.066, "dur": 3.610, + "args": { + "External id": 125992,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685321969.256, "dur": 0.950, + "args": { + "External id": 125993,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685321974.256, "dur": 33.769, + "args": { + "External id": 125994,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685321975.186, "dur": 31.890, + "args": { + "External id": 125995,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685322012.816, "dur": 14.669, + "args": { + "External id": 125996,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 
5717, "tid": 5717, + "ts": 6302685322036.916, "dur": 5.509, + "args": { + "External id": 125997,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322040.196, "dur": 0.829, + "args": { + "External id": 125998,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685322046.425, "dur": 40.891, + "args": { + "External id": 125999,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685322047.256, "dur": 4.040, + "args": { + "External id": 126000,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685322048.385, "dur": 2.400, + "args": { + "External id": 126001,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322049.845, 
"dur": 0.551, + "args": { + "External id": 126002,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685322053.205, "dur": 33.551, + "args": { + "External id": 126003,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685322054.085, "dur": 31.800, + "args": { + "External id": 126004,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685322097.176, "dur": 5.109, + "args": { + "External id": 126005,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322100.076, "dur": 0.849, + "args": { + "External id": 126006,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685322110.125, "dur": 2.020, + "args": { + 
"External id": 126007,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685322120.785, "dur": 16.780, + "args": { + "External id": 126008,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685322125.195, "dur": 10.820, + "args": { + "External id": 126009,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685322255.605, "dur": 177.130, + "args": { + "External id": 126010,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685322258.885, "dur": 6.420, + "args": { + "External id": 126011,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685322266.975, 
"dur": 165.200, + "args": { + "External id": 126012,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685322268.335, "dur": 0.160, + "args": { + "External id": 126013,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685322269.635, "dur": 22.080, + "args": { + "External id": 126014,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685322293.455, "dur": 13.320, + "args": { + "External id": 126015,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322305.065, "dur": 1.130, + "args": { + "External id": 126016,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685322307.685, "dur": 24.520, + "args": { + "External id": 126017,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 
6636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685322310.035, "dur": 3.270, + "args": { + "External id": 126018,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685322314.255, "dur": 17.640, + "args": { + "External id": 126019,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685322317.635, "dur": 4.100, + "args": { + "External id": 126020,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685322333.555, "dur": 17.590, + "args": { + "External id": 126021,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685322353.075, "dur": 10.230, + "args": { + "External id": 126022,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685322366.225, "dur": 12.330, + "args": { + "External id": 126023,"Record function id": 0, "Concrete Inputs": 
["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685322379.955, "dur": 8.520, + "args": { + "External id": 126024,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685322390.285, "dur": 20.110, + "args": { + "External id": 126025,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685322393.635, "dur": 2.640, + "args": { + "External id": 126026,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322398.655, "dur": 0.770, + "args": { + "External id": 126027,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685322413.725, "dur": 8.320, + "args": { + "External id": 126028,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", 
"pid": 5717, "tid": 5717, + "ts": 6302685322423.205, "dur": 7.820, + "args": { + "External id": 126029,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685322441.845, "dur": 2.919, + "args": { + "External id": 126030,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685322454.975, "dur": 4.640, + "args": { + "External id": 126031,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322457.924, "dur": 0.591, + "args": { + "External id": 126032,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685322532.335, "dur": 45.339, + "args": { + "External id": 126033,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685322585.584, "dur": 8.240, + "args": { + "External id": 126034,"Record function id": 
0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322590.904, "dur": 1.250, + "args": { + "External id": 126035,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685322595.174, "dur": 20.270, + "args": { + "External id": 126036,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685322622.914, "dur": 6.920, + "args": { + "External id": 126037,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685322624.394, "dur": 4.630, + "args": { + "External id": 126038,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322626.514, "dur": 2.120, + "args": { + "External id": 126039,"Record function id": 0, "Concrete Inputs": ["", "[32000, 
2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685322632.234, "dur": 32.860, + "args": { + "External id": 126040,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685322633.164, "dur": 31.050, + "args": { + "External id": 126041,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685322671.234, "dur": 14.690, + "args": { + "External id": 126042,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685322694.574, "dur": 5.890, + "args": { + "External id": 126043,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322697.844, "dur": 1.050, + "args": { + "External id": 126044,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 
1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685322704.964, "dur": 41.010, + "args": { + "External id": 126045,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685322705.794, "dur": 5.700, + "args": { + "External id": 126046,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685322706.994, "dur": 3.930, + "args": { + "External id": 126047,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322710.014, "dur": 0.580, + "args": { + "External id": 126048,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685322712.394, "dur": 32.990, + "args": { + "External id": 126049,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 
768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685322713.254, "dur": 31.310, + "args": { + "External id": 126050,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685322752.344, "dur": 4.650, + "args": { + "External id": 126051,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322754.974, "dur": 0.830, + "args": { + "External id": 126052,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685322764.394, "dur": 1.950, + "args": { + "External id": 126053,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685322772.814, "dur": 9.430, + "args": { + "External id": 126054,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], 
"Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685322775.494, "dur": 6.390, + "args": { + "External id": 126055,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685322862.314, "dur": 169.699, + "args": { + "External id": 126056,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685322864.494, "dur": 4.850, + "args": { + "External id": 126057,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685322872.184, "dur": 159.269, + "args": { + "External id": 126058,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685322873.504, "dur": 0.210, + "args": { + "External id": 126059,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6678 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685322875.014, "dur": 23.069, + "args": { + "External id": 126060,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685322899.834, "dur": 6.880, + "args": { + "External id": 126061,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322905.083, "dur": 1.100, + "args": { + "External id": 126062,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685322908.563, "dur": 22.320, + "args": { + "External id": 126063,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685322909.603, "dur": 2.991, + "args": { + "External id": 126064,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685322913.623, "dur": 16.940, + "args": { + "External id": 126065,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685322916.694, "dur": 4.920, + "args": { + "External id": 126066,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685322932.183, "dur": 17.211, + "args": { + "External id": 126067,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685322951.374, "dur": 10.000, + "args": { + "External id": 126068,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685322964.374, "dur": 11.909, + "args": { + "External id": 126069,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685322977.694, "dur": 8.599, + "args": { + "External id": 126070,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685322989.283, "dur": 21.240, + 
"args": { + "External id": 126071,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685322993.413, "dur": 2.810, + "args": { + "External id": 126072,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685322998.663, "dur": 0.770, + "args": { + "External id": 126073,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685323012.433, "dur": 8.440, + "args": { + "External id": 126074,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685323022.113, "dur": 7.840, + "args": { + "External id": 126075,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685323041.083, "dur": 3.030, + "args": { + "External id": 126076,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": 
[[2048], []], "Ev Idx": 6695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685323054.303, "dur": 4.580, + "args": { + "External id": 126077,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323057.233, "dur": 0.570, + "args": { + "External id": 126078,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685323130.143, "dur": 44.750, + "args": { + "External id": 126079,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685323182.793, "dur": 7.220, + "args": { + "External id": 126080,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323187.043, "dur": 1.230, + "args": { + "External id": 126081,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], 
[], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685323191.303, "dur": 20.100, + "args": { + "External id": 126082,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685323218.653, "dur": 8.080, + "args": { + "External id": 126083,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685323220.093, "dur": 5.810, + "args": { + "External id": 126084,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323223.333, "dur": 2.190, + "args": { + "External id": 126085,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685323229.233, "dur": 33.420, + "args": { + "External id": 126086,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6705 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685323230.163, "dur": 31.580, + "args": { + "External id": 126087,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685323267.353, "dur": 14.790, + "args": { + "External id": 126088,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685323290.253, "dur": 5.710, + "args": { + "External id": 126089,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323293.633, "dur": 0.910, + "args": { + "External id": 126090,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685323308.213, "dur": 43.709, + "args": { + "External id": 126091,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6710 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685323310.563, "dur": 4.610, + "args": { + "External id": 126092,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685323311.763, "dur": 2.840, + "args": { + "External id": 126093,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323313.603, "dur": 0.670, + "args": { + "External id": 126094,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685323316.053, "dur": 35.289, + "args": { + "External id": 126095,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685323316.963, "dur": 33.559, + "args": { + "External id": 126096,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685323361.293, "dur": 6.540, + "args": { + "External id": 
126097,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323364.842, "dur": 1.591, + "args": { + "External id": 126098,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685323376.662, "dur": 1.900, + "args": { + "External id": 126099,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685323385.502, "dur": 8.400, + "args": { + "External id": 126100,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685323387.122, "dur": 6.420, + "args": { + "External id": 126101,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 
6302685323475.032, "dur": 165.260, + "args": { + "External id": 126102,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685323477.202, "dur": 6.000, + "args": { + "External id": 126103,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685323484.972, "dur": 154.750, + "args": { + "External id": 126104,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685323487.282, "dur": 0.160, + "args": { + "External id": 126105,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685323488.442, "dur": 23.380, + "args": { + "External id": 126106,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685323513.732, "dur": 3.780, + "args": { + "External id": 126107,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": 
["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323515.932, "dur": 1.050, + "args": { + "External id": 126108,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685323518.242, "dur": 22.750, + "args": { + "External id": 126109,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685323519.312, "dur": 3.150, + "args": { + "External id": 126110,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685323523.582, "dur": 17.130, + "args": { + "External id": 126111,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685323527.542, "dur": 4.350, + "args": { + "External id": 126112,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 
6302685323542.322, "dur": 17.870, + "args": { + "External id": 126113,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685323561.872, "dur": 10.350, + "args": { + "External id": 126114,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685323576.452, "dur": 11.850, + "args": { + "External id": 126115,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685323589.632, "dur": 8.660, + "args": { + "External id": 126116,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685323600.032, "dur": 19.100, + "args": { + "External id": 126117,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685323602.382, "dur": 2.740, + "args": { + "External id": 126118,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input 
Dims": [[0], [], []], "Ev Idx": 6737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323607.262, "dur": 0.900, + "args": { + "External id": 126119,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685323621.052, "dur": 8.190, + "args": { + "External id": 126120,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685323630.582, "dur": 7.760, + "args": { + "External id": 126121,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685323650.132, "dur": 3.010, + "args": { + "External id": 126122,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685323662.772, "dur": 4.420, + "args": { + "External id": 126123,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323665.582, "dur": 0.540, + "args": { + 
"External id": 126124,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685323737.132, "dur": 44.720, + "args": { + "External id": 126125,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685323789.372, "dur": 8.109, + "args": { + "External id": 126126,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323793.581, "dur": 2.160, + "args": { + "External id": 126127,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685323798.761, "dur": 19.691, + "args": { + "External id": 126128,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685323826.561, "dur": 
5.960, + "args": { + "External id": 126129,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685323828.212, "dur": 3.480, + "args": { + "External id": 126130,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323830.292, "dur": 1.049, + "args": { + "External id": 126131,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685323835.021, "dur": 32.560, + "args": { + "External id": 126132,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685323835.912, "dur": 30.749, + "args": { + "External id": 126133,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685323872.131, "dur": 14.660, + "args": { + "External id": 126134,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": 
["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685323891.991, "dur": 28.090, + "args": { + "External id": 126135,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685323895.371, "dur": 24.190, + "args": { + "External id": 126136,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323902.261, "dur": 2.020, + "args": { + "External id": 126137,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685323925.081, "dur": 24.550, + "args": { + "External id": 126138,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685323926.341, "dur": 22.990, + "args": { + "External id": 126139,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], 
[], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 6758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685323929.891, "dur": 5.870, + "args": { + "External id": 126140,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685323936.851, "dur": 11.950, + "args": { + "External id": 126141,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685323960.281, "dur": 5.150, + "args": { + "External id": 126142,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685323962.851, "dur": 2.240, + "args": { + "External id": 126143,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685323966.461, "dur": 1.090, + "args": { + "External id": 126144,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + 
"ts": 6302685323966.861, "dur": 0.510, + "args": { + "External id": 126145,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685323997.741, "dur": 21.670, + "args": { + "External id": 126146,"Sequence number": 2576058, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685324022.091, "dur": 10.820, + "args": { + "External id": 126147,"Sequence number": 2576059, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6766 + } + }, + { + "ph": "s", "id": 9, "pid": 5717, "tid": 5717, "ts": 6302685324022.091, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685324040.151, "dur": 7.800, + "args": { + "External id": 126148,"Sequence number": 2576060, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 6767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324044.641, "dur": 1.510, + "args": { + "External id": 126149,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], 
"Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685324051.221, "dur": 4.540, + "args": { + "External id": 126150,"Sequence number": 2576060, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "3"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 6769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324054.301, "dur": 0.360, + "args": { + "External id": 126151,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "4"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685324057.331, "dur": 2.490, + "args": { + "External id": 126152,"Sequence number": 2576060, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 6771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324058.991, "dur": 0.270, + "args": { + "External id": 126153,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "4"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 6772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685324065.761, "dur": 5.430, + "args": { + "External id": 126154,"Sequence number": 2576060, "Fwd 
thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6773 + } + }, + { + "ph": "s", "id": 8, "pid": 5717, "tid": 5717, "ts": 6302685324065.761, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324069.291, "dur": 0.580, + "args": { + "External id": 126155,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685324072.311, "dur": 2.980, + "args": { + "External id": 126156,"Sequence number": 2576061, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6775 + } + }, + { + "ph": "s", "id": 7, "pid": 5717, "tid": 5717, "ts": 6302685324072.311, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324074.241, "dur": 0.280, + "args": { + "External id": 126157,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6776 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::select", "pid": 5717, "tid": 5717, + "ts": 6302685324076.381, "dur": 4.820, + "args": { + "External id": 126158,"Sequence number": 2576062, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 6777 + } + }, + { + "ph": "s", "id": 6, "pid": 5717, "tid": 5717, "ts": 6302685324076.381, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324079.931, "dur": 0.460, + "args": { + "External id": 126159,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685324082.441, "dur": 3.390, + "args": { + "External id": 126160,"Sequence number": 2576063, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 6779 + } + }, + { + "ph": "s", "id": 5, "pid": 5717, "tid": 5717, "ts": 6302685324082.441, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324084.731, "dur": 0.380, + "args": { + "External id": 126161,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], 
[]], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 6780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685324089.761, "dur": 35.410, + "args": { + "External id": 126162,"Sequence number": 2576064, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685324090.671, "dur": 34.170, + "args": { + "External id": 126163,"Sequence number": 2576064, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685324092.401, "dur": 11.450, + "args": { + "External id": 126164,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 6783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685324095.271, "dur": 8.040, + "args": { + "External id": 126165,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685324104.931, "dur": 19.310, + "args": { + "External id": 126166,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": 
[[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 6785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685324151.761, "dur": 5.200, + "args": { + "External id": 126167,"Sequence number": 2576064, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 6786 + } + }, + { + "ph": "s", "id": 4, "pid": 5717, "tid": 5717, "ts": 6302685324151.761, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685324160.881, "dur": 0.930, + "args": { + "External id": 126168,"Sequence number": 2576065, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5717, "tid": 5717, + "ts": 6302685324183.501, "dur": 21510.101, + "args": { + "External id": 126169,"Sequence number": 2576065, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 6788 + } + }, + { + "ph": "s", "id": 3, "pid": 5717, "tid": 5717, "ts": 6302685324183.501, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5717, "tid": 5717, + "ts": 6302685324197.731, "dur": 32.349, + "args": { + "External id": 126170,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], 
"Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685324198.501, "dur": 31.290, + "args": { + "External id": 126171,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685324200.001, "dur": 9.660, + "args": { + "External id": 126172,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685324201.621, "dur": 7.380, + "args": { + "External id": 126173,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685324210.601, "dur": 18.639, + "args": { + "External id": 126174,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 6793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685324244.980, "dur": 26.891, + "args": { + "External id": 126175,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], 
"Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685324246.240, "dur": 8.451, + "args": { + "External id": 126176,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324248.151, "dur": 6.080, + "args": { + "External id": 126177,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685324255.711, "dur": 15.840, + "args": { + "External id": 126178,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685324258.351, "dur": 12.260, + "args": { + "External id": 126179,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5717, "tid": 5717, + "ts": 6302685324276.120, "dur": 29.131, + "args": { + "External id": 126180,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], 
"Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685324277.000, "dur": 7.160, + "args": { + "External id": 126181,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324278.671, "dur": 5.140, + "args": { + "External id": 126182,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685324285.151, "dur": 11.100, + "args": { + "External id": 126183,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685324286.080, "dur": 9.291, + "args": { + "External id": 126184,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 6803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685324312.370, "dur": 19.760, + "args": { + "External id": 126185,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], 
[], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 6804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685324313.770, "dur": 5.420, + "args": { + "External id": 126186,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685324320.070, "dur": 11.680, + "args": { + "External id": 126187,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 6806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685324321.110, "dur": 9.850, + "args": { + "External id": 126188,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 5717, + "ts": 6302685324336.350, "dur": 20.510, + "args": { + "External id": 126189,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685324360.960, "dur": 47.020, + "args": { + "External id": 126190,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685324362.810, "dur": 44.650, + "args": { + "External id": 126191,"Record function id": 0, "Concrete Inputs": ["", "[]", 
"False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324368.810, "dur": 2.000, + "args": { + "External id": 126192,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685324371.940, "dur": 22.610, + "args": { + "External id": 126193,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685324373.140, "dur": 21.150, + "args": { + "External id": 126194,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 6813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685324375.490, "dur": 4.440, + "args": { + "External id": 126195,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685324380.920, "dur": 12.900, + "args": { + "External id": 126196,"Record function id": 0, 
"Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 6815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 5717, + "ts": 6302685324412.850, "dur": 16076.954, + "args": { + "External id": 126197,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 5717, + "ts": 6302685324414.450, "dur": 16073.364, + "args": { + "External id": 126198,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685340505.644, "dur": 10.340, + "args": { + "External id": 126199,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685340512.264, "dur": 1.630, + "args": { + "External id": 126200,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685340521.184, "dur": 59.199, + "args": { + "External id": 126201,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 
768], [32000, 768], []], "Ev Idx": 6820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685340522.244, "dur": 6.220, + "args": { + "External id": 126202,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685340524.484, "dur": 3.200, + "args": { + "External id": 126203,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685340526.384, "dur": 0.940, + "args": { + "External id": 126204,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685340529.714, "dur": 49.849, + "args": { + "External id": 126205,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685340530.994, "dur": 47.629, + "args": { + "External id": 126206,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 
6302685340587.323, "dur": 5.271, + "args": { + "External id": 126207,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685340590.654, "dur": 0.680, + "args": { + "External id": 126208,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685340600.474, "dur": 2.300, + "args": { + "External id": 126209,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685340610.254, "dur": 10.960, + "args": { + "External id": 126210,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685340612.063, "dur": 8.711, + "args": { + "External id": 126211,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685340727.353, "dur": 179.430, + "args": { + "External id": 126212,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685340729.733, "dur": 5.360, + "args": { + "External id": 126213,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685340736.723, "dur": 169.440, + "args": { + "External id": 126214,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685340738.533, "dur": 0.220, + "args": { + "External id": 126215,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685340740.033, "dur": 26.870, + "args": { + "External id": 126216,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685340770.023, "dur": 3.980, + "args": { + "External id": 126217,"Record function 
id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685340772.463, "dur": 1.000, + "args": { + "External id": 126218,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685340774.743, "dur": 25.230, + "args": { + "External id": 126219,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685340776.023, "dur": 3.080, + "args": { + "External id": 126220,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685340780.193, "dur": 19.460, + "args": { + "External id": 126221,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685340785.783, "dur": 3.910, + "args": { + "External id": 126222,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685340801.353, "dur": 18.820, + "args": { + "External id": 126223,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685340822.173, "dur": 10.790, + "args": { + "External id": 126224,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685340836.183, "dur": 13.020, + "args": { + "External id": 126225,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685340850.763, "dur": 9.340, + "args": { + "External id": 126226,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685340862.203, "dur": 20.900, + "args": { + "External id": 126227,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685340864.853, "dur": 3.030, + "args": { + "External id": 126228,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", 
"ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685340870.553, "dur": 0.810, + "args": { + "External id": 126229,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685340886.143, "dur": 8.810, + "args": { + "External id": 126230,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685340896.353, "dur": 8.150, + "args": { + "External id": 126231,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685340916.033, "dur": 2.970, + "args": { + "External id": 126232,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685340927.683, "dur": 4.740, + "args": { + "External id": 126233,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685340930.813, "dur": 0.550, + "args": { + "External id": 126234,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685341006.602, "dur": 47.820, + "args": { + "External id": 126235,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341062.493, "dur": 9.109, + "args": { + "External id": 126236,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341067.973, "dur": 1.189, + "args": { + "External id": 126237,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685341073.342, "dur": 23.550, + "args": { + "External id": 126238,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + 
"ts": 6302685341104.672, "dur": 7.210, + "args": { + "External id": 126239,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685341107.402, "dur": 3.610, + "args": { + "External id": 126240,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341109.662, "dur": 0.980, + "args": { + "External id": 126241,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685341114.812, "dur": 35.740, + "args": { + "External id": 126242,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685341115.932, "dur": 33.640, + "args": { + "External id": 126243,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685341156.722, "dur": 15.160, + "args": { + "External id": 126244,"Record function id": 0, "Concrete Inputs": 
["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341180.702, "dur": 5.810, + "args": { + "External id": 126245,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341184.042, "dur": 1.040, + "args": { + "External id": 126246,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685341191.012, "dur": 43.550, + "args": { + "External id": 126247,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685341192.122, "dur": 6.100, + "args": { + "External id": 126248,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685341193.452, "dur": 3.790, + "args": { + "External id": 126249,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341196.302, "dur": 0.650, + "args": { + "External id": 126250,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685341199.252, "dur": 34.610, + "args": { + "External id": 126251,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685341200.152, "dur": 32.870, + "args": { + "External id": 126252,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341240.862, "dur": 6.060, + "args": { + "External id": 126253,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341243.652, "dur": 2.070, + "args": { + "External id": 126254,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685341254.532, "dur": 1.920, + "args": { + "External id": 126255,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685341264.492, "dur": 13.260, + "args": { + "External id": 126256,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685341266.352, "dur": 10.360, + "args": { + "External id": 126257,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685341382.262, "dur": 172.039, + "args": { + "External id": 126258,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685341386.142, "dur": 5.170, + "args": { + "External id": 126259,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", 
"Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685341393.212, "dur": 160.489, + "args": { + "External id": 126260,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685341394.532, "dur": 0.150, + "args": { + "External id": 126261,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685341395.812, "dur": 23.030, + "args": { + "External id": 126262,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685341420.682, "dur": 5.090, + "args": { + "External id": 126263,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341423.892, "dur": 1.280, + "args": { + "External id": 126264,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6883 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685341427.722, "dur": 21.479, + "args": { + "External id": 126265,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685341428.852, "dur": 3.110, + "args": { + "External id": 126266,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685341433.161, "dur": 15.720, + "args": { + "External id": 126267,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685341436.421, "dur": 3.611, + "args": { + "External id": 126268,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685341450.592, "dur": 18.749, + "args": { + "External id": 126269,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685341471.381, "dur": 10.200, + "args": { + "External id": 126270,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input 
Dims": [[2048], [2048], []], "Ev Idx": 6889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685341485.012, "dur": 12.180, + "args": { + "External id": 126271,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685341498.692, "dur": 8.720, + "args": { + "External id": 126272,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685341509.412, "dur": 22.209, + "args": { + "External id": 126273,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685341513.921, "dur": 3.120, + "args": { + "External id": 126274,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341519.521, "dur": 0.820, + "args": { + "External id": 126275,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685341534.101, 
"dur": 8.700, + "args": { + "External id": 126276,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685341544.201, "dur": 8.050, + "args": { + "External id": 126277,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685341563.351, "dur": 3.140, + "args": { + "External id": 126278,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341577.131, "dur": 4.470, + "args": { + "External id": 126279,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341579.991, "dur": 0.580, + "args": { + "External id": 126280,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685341653.921, "dur": 53.020, + "args": { + "External id": 126281,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], 
"Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341716.141, "dur": 7.520, + "args": { + "External id": 126282,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341720.791, "dur": 1.190, + "args": { + "External id": 126283,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685341725.181, "dur": 21.480, + "args": { + "External id": 126284,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685341754.251, "dur": 8.860, + "args": { + "External id": 126285,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685341755.871, "dur": 6.500, + "args": { + "External id": 126286,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], 
[]], "Ev Idx": 6905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341759.311, "dur": 2.660, + "args": { + "External id": 126287,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685341765.821, "dur": 35.210, + "args": { + "External id": 126288,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685341767.091, "dur": 32.930, + "args": { + "External id": 126289,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685341806.121, "dur": 15.100, + "args": { + "External id": 126290,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341830.091, "dur": 5.700, + "args": { + "External id": 126291,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 
6910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341833.341, "dur": 0.960, + "args": { + "External id": 126292,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685341840.251, "dur": 41.400, + "args": { + "External id": 126293,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685341842.351, "dur": 4.380, + "args": { + "External id": 126294,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685341843.591, "dur": 2.510, + "args": { + "External id": 126295,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341845.181, "dur": 0.610, + "args": { + "External id": 126296,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685341847.791, "dur": 33.320, + "args": { + "External id": 126297,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685341848.621, "dur": 31.650, + "args": { + "External id": 126298,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685341888.271, "dur": 4.760, + "args": { + "External id": 126299,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685341890.980, "dur": 0.800, + "args": { + "External id": 126300,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685341900.880, "dur": 1.920, + "args": { + "External id": 126301,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685341911.000, "dur": 
8.971, + "args": { + "External id": 126302,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685341912.880, "dur": 6.671, + "args": { + "External id": 126303,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685342005.290, "dur": 174.100, + "args": { + "External id": 126304,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685342008.790, "dur": 5.120, + "args": { + "External id": 126305,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685342016.660, "dur": 162.140, + "args": { + "External id": 126306,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, 
"tid": 5717, + "ts": 6302685342018.050, "dur": 0.160, + "args": { + "External id": 126307,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685342020.500, "dur": 24.340, + "args": { + "External id": 126308,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685342046.800, "dur": 5.000, + "args": { + "External id": 126309,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342049.320, "dur": 1.930, + "args": { + "External id": 126310,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685342052.670, "dur": 20.980, + "args": { + "External id": 126311,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685342053.880, "dur": 3.050, + "args": { + "External id": 126312,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], 
"Input Dims": [[], [], [], [], [], []], "Ev Idx": 6931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685342058.010, "dur": 15.350, + "args": { + "External id": 126313,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685342061.100, "dur": 3.170, + "args": { + "External id": 126314,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685342075.140, "dur": 19.190, + "args": { + "External id": 126315,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685342096.030, "dur": 10.510, + "args": { + "External id": 126316,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685342111.710, "dur": 11.760, + "args": { + "External id": 126317,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685342124.810, "dur": 8.600, + "args": { + "External id": 126318,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685342136.450, "dur": 20.950, + "args": { + "External id": 126319,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685342140.110, "dur": 2.990, + "args": { + "External id": 126320,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342145.320, "dur": 0.780, + "args": { + "External id": 126321,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685342159.300, "dur": 8.510, + "args": { + "External id": 126322,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685342169.210, "dur": 8.070, + "args": { + "External id": 126323,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685342188.570, "dur": 3.060, + "args": { + "External id": 126324,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685342202.770, "dur": 4.870, + "args": { + "External id": 126325,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342205.880, "dur": 0.640, + "args": { + "External id": 126326,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685342282.510, "dur": 55.749, + "args": { + "External id": 126327,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685342346.779, "dur": 7.831, + "args": { + "External id": 126328,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685342351.419, "dur": 1.320, + "args": { + "External id": 126329,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685342357.139, "dur": 20.740, + "args": { + "External id": 126330,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685342385.659, "dur": 7.180, + "args": { + "External id": 126331,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685342387.039, "dur": 4.851, + "args": { + "External id": 126332,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342390.439, "dur": 1.091, + "args": { + "External id": 126333,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685342395.530, "dur": 34.449, + "args": { 
+ "External id": 126334,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685342396.579, "dur": 32.560, + "args": { + "External id": 126335,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685342435.179, "dur": 14.810, + "args": { + "External id": 126336,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685342459.429, "dur": 5.830, + "args": { + "External id": 126337,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342462.819, "dur": 0.950, + "args": { + "External id": 126338,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685342469.309, "dur": 41.490, + "args": { + "External 
id": 126339,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685342470.259, "dur": 4.250, + "args": { + "External id": 126340,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685342471.489, "dur": 2.480, + "args": { + "External id": 126341,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342473.099, "dur": 0.600, + "args": { + "External id": 126342,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685342476.479, "dur": 33.750, + "args": { + "External id": 126343,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685342477.549, "dur": 31.850, + "args": { + "External id": 126344,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685342516.969, "dur": 4.520, + "args": { + "External id": 126345,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342519.609, "dur": 0.730, + "args": { + "External id": 126346,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685342529.399, "dur": 1.910, + "args": { + "External id": 126347,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685342538.109, "dur": 9.600, + "args": { + "External id": 126348,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685342541.219, "dur": 6.130, + "args": { + "External id": 126349,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": 
["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685342630.089, "dur": 167.589, + "args": { + "External id": 126350,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685342632.409, "dur": 7.190, + "args": { + "External id": 126351,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685342641.689, "dur": 155.420, + "args": { + "External id": 126352,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685342643.029, "dur": 0.190, + "args": { + "External id": 126353,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685342644.499, "dur": 22.470, + "args": { + "External id": 126354,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], 
"Ev Idx": 6973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685342668.779, "dur": 4.760, + "args": { + "External id": 126355,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342671.989, "dur": 1.030, + "args": { + "External id": 126356,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685342675.699, "dur": 21.110, + "args": { + "External id": 126357,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685342677.029, "dur": 3.100, + "args": { + "External id": 126358,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685342681.139, "dur": 15.360, + "args": { + "External id": 126359,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685342684.239, "dur": 3.560, + "args": { + "External id": 126360,"Record function id": 0, "Concrete 
Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685342698.279, "dur": 17.220, + "args": { + "External id": 126361,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685342717.269, "dur": 10.160, + "args": { + "External id": 126362,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685342730.229, "dur": 11.570, + "args": { + "External id": 126363,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685342743.189, "dur": 8.650, + "args": { + "External id": 126364,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685342753.849, "dur": 22.129, + "args": { + "External id": 126365,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", 
"pid": 5717, "tid": 5717, + "ts": 6302685342758.638, "dur": 2.880, + "args": { + "External id": 126366,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342763.798, "dur": 0.780, + "args": { + "External id": 126367,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685342777.849, "dur": 8.700, + "args": { + "External id": 126368,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685342787.889, "dur": 7.809, + "args": { + "External id": 126369,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685342806.478, "dur": 3.031, + "args": { + "External id": 126370,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685342819.469, "dur": 4.640, + "args": { + "External id": 126371,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input 
Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342822.409, "dur": 0.669, + "args": { + "External id": 126372,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685342895.018, "dur": 45.250, + "args": { + "External id": 126373,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685342947.908, "dur": 7.350, + "args": { + "External id": 126374,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342952.198, "dur": 1.220, + "args": { + "External id": 126375,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685342956.828, "dur": 20.490, + "args": { + "External id": 126376,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685342984.708, "dur": 8.380, + "args": { + "External id": 126377,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685342986.378, "dur": 5.850, + "args": { + "External id": 126378,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685342989.848, "dur": 2.010, + "args": { + "External id": 126379,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685342995.598, "dur": 33.450, + "args": { + "External id": 126380,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685342996.628, "dur": 31.600, + "args": { + "External id": 126381,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7000 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685343033.898, "dur": 14.840, + "args": { + "External id": 126382,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685343056.948, "dur": 5.960, + "args": { + "External id": 126383,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343060.108, "dur": 1.010, + "args": { + "External id": 126384,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685343067.108, "dur": 41.890, + "args": { + "External id": 126385,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685343069.218, "dur": 4.500, + "args": { + "External id": 126386,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685343070.698, "dur": 2.500, + "args": { + "External id": 126387,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343072.228, "dur": 0.530, + "args": { + "External id": 126388,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685343074.688, "dur": 33.740, + "args": { + "External id": 126389,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685343075.638, "dur": 31.980, + "args": { + "External id": 126390,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685343115.518, "dur": 4.660, + "args": { + "External id": 126391,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685343118.258, "dur": 0.690, + "args": { + "External id": 126392,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685343127.518, "dur": 1.880, + "args": { + "External id": 126393,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685343137.158, "dur": 8.370, + "args": { + "External id": 126394,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685343138.788, "dur": 6.380, + "args": { + "External id": 126395,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685343227.157, "dur": 193.690, + "args": { + "External id": 126396,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 
5717, + "ts": 6302685343229.448, "dur": 4.880, + "args": { + "External id": 126397,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685343236.897, "dur": 183.320, + "args": { + "External id": 126398,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685343238.288, "dur": 0.180, + "args": { + "External id": 126399,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685343240.908, "dur": 23.809, + "args": { + "External id": 126400,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685343266.628, "dur": 4.680, + "args": { + "External id": 126401,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343268.917, "dur": 1.851, + "args": { + "External id": 126402,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input 
type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685343272.137, "dur": 20.071, + "args": { + "External id": 126403,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685343273.268, "dur": 2.960, + "args": { + "External id": 126404,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685343277.197, "dur": 14.691, + "args": { + "External id": 126405,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685343280.317, "dur": 3.091, + "args": { + "External id": 126406,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685343293.608, "dur": 44.449, + "args": { + "External id": 126407,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685343340.487, "dur": 
10.200, + "args": { + "External id": 126408,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685343353.867, "dur": 11.810, + "args": { + "External id": 126409,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685343366.967, "dur": 8.630, + "args": { + "External id": 126410,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685343377.807, "dur": 21.130, + "args": { + "External id": 126411,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685343381.477, "dur": 2.970, + "args": { + "External id": 126412,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343386.757, "dur": 0.820, + "args": { + "External id": 126413,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": 
[[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685343400.967, "dur": 8.510, + "args": { + "External id": 126414,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685343410.817, "dur": 7.740, + "args": { + "External id": 126415,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685343430.277, "dur": 3.050, + "args": { + "External id": 126416,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685343443.747, "dur": 4.880, + "args": { + "External id": 126417,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343446.977, "dur": 0.640, + "args": { + "External id": 126418,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685343519.397, "dur": 
46.340, + "args": { + "External id": 126419,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685343574.567, "dur": 7.240, + "args": { + "External id": 126420,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343578.797, "dur": 1.280, + "args": { + "External id": 126421,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685343583.297, "dur": 20.750, + "args": { + "External id": 126422,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685343611.057, "dur": 5.760, + "args": { + "External id": 126423,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685343612.597, "dur": 3.450, + "args": { + "External id": 
126424,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343614.687, "dur": 1.000, + "args": { + "External id": 126425,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685343620.427, "dur": 32.969, + "args": { + "External id": 126426,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685343621.437, "dur": 31.090, + "args": { + "External id": 126427,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685343658.116, "dur": 14.820, + "args": { + "External id": 126428,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685343681.187, "dur": 5.520, + "args": { + "External id": 126429,"Record function id": 0, "Concrete Inputs": ["", "0", 
"10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343684.456, "dur": 0.991, + "args": { + "External id": 126430,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685343690.696, "dur": 43.460, + "args": { + "External id": 126431,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685343691.656, "dur": 6.520, + "args": { + "External id": 126432,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685343693.896, "dur": 3.720, + "args": { + "External id": 126433,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343695.636, "dur": 1.651, + "args": { + "External id": 126434,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685343699.127, "dur": 34.509, + "args": { + "External id": 126435,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685343700.196, "dur": 32.591, + "args": { + "External id": 126436,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685343740.386, "dur": 4.660, + "args": { + "External id": 126437,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343743.086, "dur": 0.610, + "args": { + "External id": 126438,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685343753.656, "dur": 1.870, + "args": { + "External id": 126439,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685343762.316, "dur": 8.560, + "args": { + "External id": 126440,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685343764.046, "dur": 6.480, + "args": { + "External id": 126441,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685343850.846, "dur": 168.390, + "args": { + "External id": 126442,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685343853.356, "dur": 6.110, + "args": { + "External id": 126443,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685343861.176, "dur": 157.420, + "args": { + "External id": 126444,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", 
"ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685343863.756, "dur": 0.210, + "args": { + "External id": 126445,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685343866.476, "dur": 22.400, + "args": { + "External id": 126446,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685343890.726, "dur": 3.760, + "args": { + "External id": 126447,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343892.946, "dur": 1.030, + "args": { + "External id": 126448,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685343895.416, "dur": 22.420, + "args": { + "External id": 126449,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685343896.736, "dur": 4.240, + "args": 
{ + "External id": 126450,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685343902.196, "dur": 15.320, + "args": { + "External id": 126451,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685343905.206, "dur": 3.700, + "args": { + "External id": 126452,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685343919.286, "dur": 17.620, + "args": { + "External id": 126453,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685343938.786, "dur": 10.200, + "args": { + "External id": 126454,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685343953.366, "dur": 11.900, + "args": { + "External id": 126455,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 
7074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685343966.656, "dur": 9.110, + "args": { + "External id": 126456,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685343977.716, "dur": 19.100, + "args": { + "External id": 126457,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685343980.256, "dur": 2.740, + "args": { + "External id": 126458,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685343985.136, "dur": 0.870, + "args": { + "External id": 126459,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685343998.786, "dur": 8.170, + "args": { + "External id": 126460,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685344009.246, "dur": 7.980, + "args": { + "External id": 126461,"Record function id": 0, "Concrete Inputs": ["", 
"", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685344028.066, "dur": 3.100, + "args": { + "External id": 126462,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344041.446, "dur": 4.920, + "args": { + "External id": 126463,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344044.406, "dur": 0.600, + "args": { + "External id": 126464,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685344129.406, "dur": 48.840, + "args": { + "External id": 126465,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344186.355, "dur": 7.060, + "args": { + "External id": 126466,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": 
[[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344190.505, "dur": 1.120, + "args": { + "External id": 126467,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685344195.855, "dur": 20.490, + "args": { + "External id": 126468,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685344223.495, "dur": 5.900, + "args": { + "External id": 126469,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685344224.955, "dur": 3.630, + "args": { + "External id": 126470,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344227.185, "dur": 0.960, + "args": { + "External id": 126471,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input 
Dims": [[2048, 32000], [], [], []], "Ev Idx": 7090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685344231.885, "dur": 32.810, + "args": { + "External id": 126472,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685344232.935, "dur": 30.840, + "args": { + "External id": 126473,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685344269.445, "dur": 14.780, + "args": { + "External id": 126474,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344293.495, "dur": 13.860, + "args": { + "External id": 126475,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344304.735, "dur": 1.080, + "args": { + "External id": 126476,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": 
[[16384, 768], [], [], []], "Ev Idx": 7095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685344311.755, "dur": 43.620, + "args": { + "External id": 126477,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685344312.675, "dur": 5.370, + "args": { + "External id": 126478,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685344313.715, "dur": 3.750, + "args": { + "External id": 126479,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344315.275, "dur": 1.830, + "args": { + "External id": 126480,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685344319.975, "dur": 34.870, + "args": { + "External id": 126481,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, 
"tid": 5717, + "ts": 6302685344320.965, "dur": 32.990, + "args": { + "External id": 126482,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344362.165, "dur": 4.500, + "args": { + "External id": 126483,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344364.855, "dur": 0.590, + "args": { + "External id": 126484,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685344374.395, "dur": 1.950, + "args": { + "External id": 126485,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685344383.155, "dur": 8.500, + "args": { + "External id": 126486,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 
5717, "tid": 5717, + "ts": 6302685344384.825, "dur": 6.420, + "args": { + "External id": 126487,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685344475.005, "dur": 164.729, + "args": { + "External id": 126488,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685344477.385, "dur": 5.000, + "args": { + "External id": 126489,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685344483.955, "dur": 155.209, + "args": { + "External id": 126490,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685344485.095, "dur": 0.200, + "args": { + "External id": 126491,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685344487.465, "dur": 23.440, + "args": { + "External 
id": 126492,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685344512.585, "dur": 3.490, + "args": { + "External id": 126493,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344514.625, "dur": 0.950, + "args": { + "External id": 126494,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685344516.795, "dur": 22.510, + "args": { + "External id": 126495,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685344518.065, "dur": 3.930, + "args": { + "External id": 126496,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685344524.114, "dur": 14.840, + "args": { + "External id": 126497,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7116 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685344527.185, "dur": 3.069, + "args": { + "External id": 126498,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685344540.554, "dur": 18.800, + "args": { + "External id": 126499,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685344561.094, "dur": 10.551, + "args": { + "External id": 126500,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685344574.565, "dur": 11.809, + "args": { + "External id": 126501,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685344587.814, "dur": 8.611, + "args": { + "External id": 126502,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685344598.294, "dur": 19.240, + "args": { + "External id": 126503,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", 
"Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685344600.714, "dur": 2.820, + "args": { + "External id": 126504,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344605.774, "dur": 0.791, + "args": { + "External id": 126505,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685344620.414, "dur": 8.390, + "args": { + "External id": 126506,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685344630.044, "dur": 7.720, + "args": { + "External id": 126507,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685344648.394, "dur": 3.010, + "args": { + "External id": 126508,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344661.314, 
"dur": 4.560, + "args": { + "External id": 126509,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344664.314, "dur": 0.550, + "args": { + "External id": 126510,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685344736.184, "dur": 45.460, + "args": { + "External id": 126511,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344789.354, "dur": 8.370, + "args": { + "External id": 126512,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344793.514, "dur": 2.340, + "args": { + "External id": 126513,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, 
"tid": 5717, + "ts": 6302685344802.614, "dur": 25.800, + "args": { + "External id": 126514,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685344835.494, "dur": 5.700, + "args": { + "External id": 126515,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685344836.964, "dur": 3.470, + "args": { + "External id": 126516,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344839.044, "dur": 1.010, + "args": { + "External id": 126517,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685344843.624, "dur": 32.900, + "args": { + "External id": 126518,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685344844.624, "dur": 30.990, + "args": { + "External id": 126519,"Record 
function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685344881.364, "dur": 14.540, + "args": { + "External id": 126520,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344905.324, "dur": 5.490, + "args": { + "External id": 126521,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344908.464, "dur": 0.920, + "args": { + "External id": 126522,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5717, "tid": 5717, + "ts": 6302685344914.714, "dur": 40.650, + "args": { + "External id": 126523,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685344915.524, "dur": 3.910, + "args": { + "External id": 
126524,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685344916.564, "dur": 2.370, + "args": { + "External id": 126525,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344918.124, "dur": 0.520, + "args": { + "External id": 126526,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685344921.324, "dur": 33.480, + "args": { + "External id": 126527,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685344922.214, "dur": 31.750, + "args": { + "External id": 126528,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685344961.374, "dur": 4.630, + "args": { + "External id": 126529,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", 
"Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685344964.104, "dur": 0.680, + "args": { + "External id": 126530,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685344973.593, "dur": 1.940, + "args": { + "External id": 126531,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5717, "tid": 5717, + "ts": 6302685344982.104, "dur": 10.560, + "args": { + "External id": 126532,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685344984.704, "dur": 7.560, + "args": { + "External id": 126533,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685345073.823, "dur": 203.860, + "args": { + "External id": 126534,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], 
"Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685345076.073, "dur": 6.200, + "args": { + "External id": 126535,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5717, "tid": 5717, + "ts": 6302685345083.843, "dur": 193.120, + "args": { + "External id": 126536,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5717, "tid": 5717, + "ts": 6302685345085.183, "dur": 0.180, + "args": { + "External id": 126537,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5717, "tid": 5717, + "ts": 6302685345086.473, "dur": 21.940, + "args": { + "External id": 126538,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5717, "tid": 5717, + "ts": 6302685345111.153, "dur": 3.570, + "args": { + "External id": 126539,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7158 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685345113.273, "dur": 0.960, + "args": { + "External id": 126540,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685345116.673, "dur": 21.850, + "args": { + "External id": 126541,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685345117.733, "dur": 2.970, + "args": { + "External id": 126542,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685345121.723, "dur": 16.480, + "args": { + "External id": 126543,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685345125.773, "dur": 3.760, + "args": { + "External id": 126544,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685345139.853, "dur": 17.630, + "args": { + "External id": 126545,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": 
["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5717, "tid": 5717, + "ts": 6302685345159.243, "dur": 14.740, + "args": { + "External id": 126546,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5717, "tid": 5717, + "ts": 6302685345177.853, "dur": 15.890, + "args": { + "External id": 126547,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5717, "tid": 5717, + "ts": 6302685345196.403, "dur": 14.090, + "args": { + "External id": 126548,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685345214.083, "dur": 35.540, + "args": { + "External id": 126549,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685345219.073, "dur": 5.430, + "args": { + "External id": 126550,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + 
"ts": 6302685345229.283, "dur": 2.080, + "args": { + "External id": 126551,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5717, "tid": 5717, + "ts": 6302685345252.973, "dur": 12.690, + "args": { + "External id": 126552,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685345267.053, "dur": 8.140, + "args": { + "External id": 126553,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685345287.113, "dur": 3.000, + "args": { + "External id": 126554,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685345310.743, "dur": 4.860, + "args": { + "External id": 126555,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685345313.813, "dur": 0.710, + "args": { + "External id": 126556,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685345395.063, "dur": 73.440, + "args": { + "External id": 126557,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5717, "tid": 5717, + "ts": 6302685345480.803, "dur": 12.500, + "args": { + "External id": 126558,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685345487.552, "dur": 2.520, + "args": { + "External id": 126559,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685345494.683, "dur": 28.019, + "args": { + "External id": 126560,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5717, "tid": 5717, + "ts": 6302685345530.462, "dur": 7.290, + "args": { + "External id": 126561,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5717, "tid": 5717, + "ts": 6302685345531.942, "dur": 5.000, + "args": { + "External id": 126562,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685345535.432, "dur": 1.070, + "args": { + "External id": 126563,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5717, "tid": 5717, + "ts": 6302685345540.542, "dur": 34.970, + "args": { + "External id": 126564,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5717, "tid": 5717, + "ts": 6302685345543.022, "dur": 31.640, + "args": { + "External id": 126565,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685345580.312, "dur": 15.050, + "args": { + "External id": 126566,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev 
Idx": 7185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685345600.162, "dur": 26.100, + "args": { + "External id": 126567,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 7186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5717, "tid": 5717, + "ts": 6302685345602.052, "dur": 23.720, + "args": { + "External id": 126568,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685345610.142, "dur": 0.720, + "args": { + "External id": 126569,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685345631.282, "dur": 26.430, + "args": { + "External id": 126570,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 7189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5717, "tid": 5717, + "ts": 6302685345632.502, "dur": 24.930, + "args": { + "External id": 126571,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 7190 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685345637.332, "dur": 5.930, + "args": { + "External id": 126572,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685345644.362, "dur": 12.440, + "args": { + "External id": 126573,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685345668.742, "dur": 3.770, + "args": { + "External id": 126574,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 7193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685345670.052, "dur": 2.120, + "args": { + "External id": 126575,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 7194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5717, "tid": 5717, + "ts": 6302685345673.572, "dur": 1.040, + "args": { + "External id": 126576,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5717, "tid": 5717, + "ts": 6302685345673.942, "dur": 0.530, + "args": { + "External id": 126577,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685345708.822, "dur": 21.750, + "args": { + "External id": 126578,"Sequence number": 2576066, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5717, "tid": 5717, + "ts": 6302685345733.292, "dur": 11.420, + "args": { + "External id": 126579,"Sequence number": 2576067, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7198 + } + }, + { + "ph": "s", "id": 2, "pid": 5717, "tid": 5717, "ts": 6302685345733.292, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward", "pid": 5717, "tid": 5717, + "ts": 6302685345839.522, "dur": 37.429, + "args": { + "External id": 126580,"Record function id": 0, "Ev Idx": 7199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 5717, + "ts": 6302685345984.991, "dur": 36.940, + "args": { + "External id": 126581,"Sequence number": 2576068, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7200 + } + }, + { + "ph": "s", "id": 1, "pid": 5717, "tid": 5717, "ts": 6302685345984.991, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 5717, "tid": 5717, + "ts": 6302685346072.611, "dur": 28.150, + "args": { + "External id": 126582,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "1"], "Input type": 
["float", "", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[1], [], [], [], [], []], "Ev Idx": 7201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685346074.171, "dur": 11.010, + "args": { + "External id": 126583,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "1"], "Input type": ["float", "", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[1], [], [], [], [], []], "Ev Idx": 7202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5717, "tid": 5717, + "ts": 6302685346077.181, "dur": 7.240, + "args": { + "External id": 126584,"Record function id": 0, "Concrete Inputs": ["[1]", "[1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685346086.321, "dur": 14.010, + "args": { + "External id": 126585,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 5717, + "ts": 6302685434102.172, "dur": 430.749, + "args": { + "External id": 126586,"Sequence number": 2576069, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5717, "tid": 5717, + "ts": 6302685434615.221, "dur": 145.479, + "args": { + "External id": 126587,"Sequence number": 2576070, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], 
"Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685434811.000, "dur": 305.300, + "args": { + "External id": 126588,"Sequence number": 2576071, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685437042.205, "dur": 180.300, + "args": { + "External id": 126589,"Sequence number": 2576072, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685437279.515, "dur": 149.679, + "args": { + "External id": 126590,"Sequence number": 2576073, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_norm", "pid": 5717, "tid": 5717, + "ts": 6302685441949.354, "dur": 12275.422, + "args": { + "External id": 126591,"Record function id": 0, "Concrete Inputs": ["", "2.", ""], "Input type": ["TensorList", "Scalar", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_norm", "pid": 5717, "tid": 5717, + "ts": 6302685444192.979, "dur": 3400.412, + "args": { + "External id": 126592,"Record function id": 0, "Concrete Inputs": ["", "2.", ""], "Input type": ["TensorList", "Scalar", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7211 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5717, "tid": 5717, + "ts": 6302685444272.179, "dur": 331.379, + "args": { + "External id": 126593,"Record function id": 0, "Concrete Inputs": ["[12032]", "6", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685444285.729, "dur": 104.099, + "args": { + "External id": 126594,"Record function id": 0, "Concrete Inputs": ["[12032]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5717, "tid": 5717, + "ts": 6302685444404.808, "dur": 194.780, + "args": { + "External id": 126595,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[12032]], "Ev Idx": 7214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5717, "tid": 5717, + "ts": 6302685444417.088, "dur": 171.720, + "args": { + "External id": 126596,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[12032], []], "Ev Idx": 7215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454340.866, "dur": 12.210, + "args": { + "External id": 126597,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454367.586, 
"dur": 2.340, + "args": { + "External id": 126598,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454378.356, "dur": 1.560, + "args": { + "External id": 126599,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454387.296, "dur": 1.530, + "args": { + "External id": 126600,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454396.186, "dur": 1.600, + "args": { + "External id": 126601,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454406.316, "dur": 1.520, + "args": { + "External id": 126602,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", 
""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454414.846, "dur": 1.460, + "args": { + "External id": 126603,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454427.106, "dur": 1.570, + "args": { + "External id": 126604,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454436.116, "dur": 1.510, + "args": { + "External id": 126605,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454445.576, "dur": 1.500, + "args": { + "External id": 126606,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + 
"ts": 6302685454453.986, "dur": 1.460, + "args": { + "External id": 126607,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454462.836, "dur": 1.430, + "args": { + "External id": 126608,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454471.596, "dur": 1.450, + "args": { + "External id": 126609,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454480.406, "dur": 1.470, + "args": { + "External id": 126610,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454488.845, "dur": 1.500, + "args": { + "External id": 126611,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", 
"", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454500.496, "dur": 1.540, + "args": { + "External id": 126612,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454509.016, "dur": 1.560, + "args": { + "External id": 126613,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454518.325, "dur": 1.720, + "args": { + "External id": 126614,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454527.145, "dur": 1.480, + "args": { + "External id": 126615,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", 
"pid": 5717, "tid": 5717, + "ts": 6302685454535.576, "dur": 1.489, + "args": { + "External id": 126616,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454544.236, "dur": 1.509, + "args": { + "External id": 126617,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454552.565, "dur": 1.520, + "args": { + "External id": 126618,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454561.176, "dur": 1.469, + "args": { + "External id": 126619,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454573.025, "dur": 1.540, + "args": { + "External id": 126620,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": 
["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454581.465, "dur": 1.480, + "args": { + "External id": 126621,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454590.765, "dur": 1.460, + "args": { + "External id": 126622,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454599.155, "dur": 1.500, + "args": { + "External id": 126623,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454607.625, "dur": 1.420, + "args": { + "External id": 126624,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7243 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454616.355, "dur": 1.430, + "args": { + "External id": 126625,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454625.025, "dur": 1.430, + "args": { + "External id": 126626,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454633.285, "dur": 1.490, + "args": { + "External id": 126627,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454644.955, "dur": 1.440, + "args": { + "External id": 126628,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454653.395, "dur": 1.500, + "args": { + "External id": 126629,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", 
"False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454662.905, "dur": 1.520, + "args": { + "External id": 126630,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454671.365, "dur": 1.520, + "args": { + "External id": 126631,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454679.765, "dur": 1.530, + "args": { + "External id": 126632,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454688.345, "dur": 1.640, + "args": { + "External id": 126633,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7252 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454696.735, "dur": 1.620, + "args": { + "External id": 126634,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454705.325, "dur": 1.480, + "args": { + "External id": 126635,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454717.395, "dur": 1.420, + "args": { + "External id": 126636,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454726.195, "dur": 1.510, + "args": { + "External id": 126637,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454735.795, "dur": 1.490, + "args": { + "External id": 126638,"Record function id": 0, "Concrete Inputs": ["", 
"6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454744.095, "dur": 1.450, + "args": { + "External id": 126639,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454752.655, "dur": 1.420, + "args": { + "External id": 126640,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454761.125, "dur": 1.430, + "args": { + "External id": 126641,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454769.795, "dur": 1.430, + "args": { + "External id": 126642,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], 
[]], "Ev Idx": 7261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454778.555, "dur": 1.480, + "args": { + "External id": 126643,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454790.295, "dur": 1.450, + "args": { + "External id": 126644,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454798.705, "dur": 1.480, + "args": { + "External id": 126645,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454807.115, "dur": 1.480, + "args": { + "External id": 126646,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454815.525, "dur": 1.510, + "args": { + "External id": 126647,"Record function id": 0, 
"Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454823.845, "dur": 1.510, + "args": { + "External id": 126648,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454832.155, "dur": 1.460, + "args": { + "External id": 126649,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454841.465, "dur": 1.550, + "args": { + "External id": 126650,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454861.315, "dur": 1.580, + "args": { + "External id": 126651,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], [], [], []], "Ev Idx": 7270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454873.955, "dur": 1.550, + "args": { + "External id": 126652,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454882.735, "dur": 1.570, + "args": { + "External id": 126653,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454891.585, "dur": 1.510, + "args": { + "External id": 126654,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454900.125, "dur": 1.520, + "args": { + "External id": 126655,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454908.525, "dur": 1.430, + "args": { + "External id": 
126656,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454916.825, "dur": 1.410, + "args": { + "External id": 126657,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454925.035, "dur": 1.409, + "args": { + "External id": 126658,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454933.255, "dur": 1.480, + "args": { + "External id": 126659,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454947.524, "dur": 1.600, + "args": { + "External id": 126660,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], 
[], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454955.984, "dur": 1.440, + "args": { + "External id": 126661,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454965.004, "dur": 1.520, + "args": { + "External id": 126662,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454973.255, "dur": 1.400, + "args": { + "External id": 126663,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454981.435, "dur": 1.449, + "args": { + "External id": 126664,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454989.455, "dur": 1.429, + 
"args": { + "External id": 126665,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685454997.344, "dur": 1.460, + "args": { + "External id": 126666,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455005.595, "dur": 1.440, + "args": { + "External id": 126667,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455016.655, "dur": 1.460, + "args": { + "External id": 126668,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455024.914, "dur": 1.410, + "args": { + "External id": 126669,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455032.934, "dur": 1.730, + "args": { + "External id": 126670,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455041.584, "dur": 1.420, + "args": { + "External id": 126671,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455049.984, "dur": 1.400, + "args": { + "External id": 126672,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455058.074, "dur": 1.380, + "args": { + "External id": 126673,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 
6302685455067.954, "dur": 1.420, + "args": { + "External id": 126674,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455076.154, "dur": 1.390, + "args": { + "External id": 126675,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455087.344, "dur": 1.410, + "args": { + "External id": 126676,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455095.604, "dur": 1.600, + "args": { + "External id": 126677,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455103.944, "dur": 1.450, + "args": { + "External id": 126678,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", 
"Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455112.084, "dur": 1.430, + "args": { + "External id": 126679,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455120.164, "dur": 1.430, + "args": { + "External id": 126680,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455128.374, "dur": 1.400, + "args": { + "External id": 126681,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455136.604, "dur": 1.440, + "args": { + "External id": 126682,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 
5717, "tid": 5717, + "ts": 6302685455144.844, "dur": 1.400, + "args": { + "External id": 126683,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455156.224, "dur": 1.440, + "args": { + "External id": 126684,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455164.654, "dur": 1.440, + "args": { + "External id": 126685,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455172.874, "dur": 1.450, + "args": { + "External id": 126686,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455181.034, "dur": 1.650, + "args": { + "External id": 126687,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", 
"Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455189.394, "dur": 1.400, + "args": { + "External id": 126688,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455197.624, "dur": 1.430, + "args": { + "External id": 126689,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455206.794, "dur": 1.480, + "args": { + "External id": 126690,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455214.894, "dur": 1.420, + "args": { + "External id": 126691,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7310 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455226.044, "dur": 1.460, + "args": { + "External id": 126692,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455234.114, "dur": 1.480, + "args": { + "External id": 126693,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455242.264, "dur": 1.470, + "args": { + "External id": 126694,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455250.784, "dur": 1.420, + "args": { + "External id": 126695,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455259.014, "dur": 1.440, + "args": { + "External id": 126696,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", 
""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455267.464, "dur": 1.520, + "args": { + "External id": 126697,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455275.834, "dur": 1.400, + "args": { + "External id": 126698,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455283.904, "dur": 1.420, + "args": { + "External id": 126699,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455294.964, "dur": 1.440, + "args": { + "External id": 126700,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7319 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455323.544, "dur": 1.540, + "args": { + "External id": 126701,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455331.764, "dur": 1.460, + "args": { + "External id": 126702,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455339.684, "dur": 1.440, + "args": { + "External id": 126703,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455347.594, "dur": 1.480, + "args": { + "External id": 126704,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455355.754, "dur": 1.440, + "args": { + "External id": 126705,"Record function id": 0, "Concrete Inputs": ["", "6", "0", 
"", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455365.094, "dur": 1.429, + "args": { + "External id": 126706,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455373.143, "dur": 1.440, + "args": { + "External id": 126707,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455384.223, "dur": 1.491, + "args": { + "External id": 126708,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455392.423, "dur": 1.631, + "args": { + "External id": 126709,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev 
Idx": 7328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455400.623, "dur": 1.391, + "args": { + "External id": 126710,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455409.334, "dur": 1.409, + "args": { + "External id": 126711,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455417.383, "dur": 1.440, + "args": { + "External id": 126712,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455425.623, "dur": 1.440, + "args": { + "External id": 126713,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455433.703, "dur": 1.531, + "args": { + "External id": 126714,"Record function id": 0, "Concrete 
Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455441.943, "dur": 1.440, + "args": { + "External id": 126715,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455453.074, "dur": 1.489, + "args": { + "External id": 126716,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455461.174, "dur": 1.439, + "args": { + "External id": 126717,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455469.273, "dur": 1.450, + "args": { + "External id": 126718,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], 
[], [], [], [], []], "Ev Idx": 7337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455477.343, "dur": 1.470, + "args": { + "External id": 126719,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455485.493, "dur": 1.500, + "args": { + "External id": 126720,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455493.753, "dur": 1.470, + "args": { + "External id": 126721,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455501.793, "dur": 1.410, + "args": { + "External id": 126722,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455509.883, "dur": 1.450, + "args": { + "External id": 126723,"Record 
function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685455521.503, "dur": 1.450, + "args": { + "External id": 126724,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5717, "tid": 5717, + "ts": 6302685455725.143, "dur": 4928.519, + "args": { + "External id": 126725,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5717, "tid": 5717, + "ts": 6302685457750.268, "dur": 2539.064, + "args": { + "External id": 126726,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457776.298, "dur": 34.560, + "args": { + "External id": 126727,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457794.618, "dur": 11.900, + "args": { + "External id": 126728,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", 
"ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457815.728, "dur": 12.610, + "args": { + "External id": 126729,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457825.368, "dur": 1.790, + "args": { + "External id": 126730,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457832.608, "dur": 13.830, + "args": { + "External id": 126731,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457839.838, "dur": 5.420, + "args": { + "External id": 126732,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457850.368, "dur": 7.900, + "args": { + "External id": 126733,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457855.698, "dur": 1.460, + "args": 
{ + "External id": 126734,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457861.778, "dur": 13.140, + "args": { + "External id": 126735,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457872.248, "dur": 1.460, + "args": { + "External id": 126736,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457878.418, "dur": 7.100, + "args": { + "External id": 126737,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457883.058, "dur": 1.400, + "args": { + "External id": 126738,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457889.188, "dur": 11.190, + "args": { + "External id": 126739,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7358 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457893.728, "dur": 5.350, + "args": { + "External id": 126740,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457908.098, "dur": 9.260, + "args": { + "External id": 126741,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457914.668, "dur": 1.370, + "args": { + "External id": 126742,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457921.048, "dur": 7.300, + "args": { + "External id": 126743,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457925.898, "dur": 1.350, + "args": { + "External id": 126744,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457931.598, "dur": 10.490, + "args": { + "External id": 126745,"Record function id": 0, "Concrete 
Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457939.318, "dur": 1.660, + "args": { + "External id": 126746,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457945.698, "dur": 12.250, + "args": { + "External id": 126747,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457952.098, "dur": 4.720, + "args": { + "External id": 126748,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457961.158, "dur": 7.080, + "args": { + "External id": 126749,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457965.538, "dur": 1.620, + "args": { + "External id": 126750,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 
5717, "tid": 5717, + "ts": 6302685457971.668, "dur": 12.900, + "args": { + "External id": 126751,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457982.078, "dur": 1.380, + "args": { + "External id": 126752,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457987.868, "dur": 6.770, + "args": { + "External id": 126753,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685457992.238, "dur": 1.310, + "args": { + "External id": 126754,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685457998.258, "dur": 10.460, + "args": { + "External id": 126755,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458002.708, "dur": 4.860, + "args": { + "External id": 126756,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], 
[], []], "Input Dims": [[], [], [], []], "Ev Idx": 7375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458015.697, "dur": 9.051, + "args": { + "External id": 126757,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458022.288, "dur": 1.389, + "args": { + "External id": 126758,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458028.028, "dur": 7.000, + "args": { + "External id": 126759,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458032.637, "dur": 1.331, + "args": { + "External id": 126760,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458038.217, "dur": 10.220, + "args": { + "External id": 126761,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458045.908, "dur": 1.460, + "args": { + "External id": 126762,"Record function 
id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458051.837, "dur": 12.411, + "args": { + "External id": 126763,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458058.248, "dur": 4.860, + "args": { + "External id": 126764,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458067.508, "dur": 6.869, + "args": { + "External id": 126765,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458072.088, "dur": 1.240, + "args": { + "External id": 126766,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458077.588, "dur": 12.609, + "args": { + "External id": 126767,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458087.488, "dur": 1.629, + "args": { + "External id": 126768,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458093.508, "dur": 6.640, + "args": { + "External id": 126769,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458097.737, "dur": 1.331, + "args": { + "External id": 126770,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458103.377, "dur": 13.950, + "args": { + "External id": 126771,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458114.717, "dur": 1.530, + "args": { + "External id": 126772,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458124.457, "dur": 8.640, + "args": { + "External id": 126773,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": 
["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458130.667, "dur": 1.360, + "args": { + "External id": 126774,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458136.577, "dur": 6.890, + "args": { + "External id": 126775,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458141.187, "dur": 1.220, + "args": { + "External id": 126776,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458146.727, "dur": 10.510, + "args": { + "External id": 126777,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458154.377, "dur": 1.840, + "args": { + "External id": 126778,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 
6302685458160.497, "dur": 12.130, + "args": { + "External id": 126779,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458166.957, "dur": 4.540, + "args": { + "External id": 126780,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458175.867, "dur": 9.900, + "args": { + "External id": 126781,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458180.397, "dur": 4.210, + "args": { + "External id": 126782,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458189.177, "dur": 9.420, + "args": { + "External id": 126783,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458196.097, "dur": 1.350, + "args": { + "External id": 126784,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], 
[], [], []], "Ev Idx": 7403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458201.837, "dur": 7.000, + "args": { + "External id": 126785,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458206.537, "dur": 1.270, + "args": { + "External id": 126786,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458212.057, "dur": 9.370, + "args": { + "External id": 126787,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458218.837, "dur": 1.520, + "args": { + "External id": 126788,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458228.967, "dur": 9.050, + "args": { + "External id": 126789,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458235.537, "dur": 1.320, + "args": { + "External id": 126790,"Record function id": 0, "Concrete Inputs": ["", 
"[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458241.137, "dur": 7.090, + "args": { + "External id": 126791,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458245.837, "dur": 1.290, + "args": { + "External id": 126792,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458251.497, "dur": 10.210, + "args": { + "External id": 126793,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458259.187, "dur": 1.440, + "args": { + "External id": 126794,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458264.917, "dur": 11.950, + "args": { + "External id": 126795,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 
5717, + "ts": 6302685458271.487, "dur": 4.220, + "args": { + "External id": 126796,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458280.267, "dur": 10.120, + "args": { + "External id": 126797,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458284.677, "dur": 4.580, + "args": { + "External id": 126798,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458293.917, "dur": 38.620, + "args": { + "External id": 126799,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458328.957, "dur": 1.850, + "args": { + "External id": 126800,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458336.377, "dur": 7.560, + "args": { + "External id": 126801,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], 
[]], "Input Dims": [[], []], "Ev Idx": 7420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458341.617, "dur": 1.240, + "args": { + "External id": 126802,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458347.177, "dur": 10.070, + "args": { + "External id": 126803,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458354.567, "dur": 1.560, + "args": { + "External id": 126804,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458364.397, "dur": 8.780, + "args": { + "External id": 126805,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458370.827, "dur": 1.280, + "args": { + "External id": 126806,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458376.357, "dur": 6.970, + "args": { + 
"External id": 126807,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458380.977, "dur": 1.330, + "args": { + "External id": 126808,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458386.577, "dur": 10.490, + "args": { + "External id": 126809,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458394.687, "dur": 1.300, + "args": { + "External id": 126810,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458400.307, "dur": 12.550, + "args": { + "External id": 126811,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458406.807, "dur": 4.940, + "args": { + "External id": 126812,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7431 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458416.167, "dur": 10.060, + "args": { + "External id": 126813,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458420.587, "dur": 4.500, + "args": { + "External id": 126814,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458429.687, "dur": 8.870, + "args": { + "External id": 126815,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458436.247, "dur": 1.240, + "args": { + "External id": 126816,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458441.927, "dur": 7.360, + "args": { + "External id": 126817,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458446.557, "dur": 1.600, + "args": { + "External id": 126818,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": 
["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458452.557, "dur": 9.899, + "args": { + "External id": 126819,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458459.836, "dur": 1.460, + "args": { + "External id": 126820,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458468.947, "dur": 8.640, + "args": { + "External id": 126821,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458475.196, "dur": 1.251, + "args": { + "External id": 126822,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458480.836, "dur": 6.991, + "args": { + "External id": 126823,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458485.387, 
"dur": 1.340, + "args": { + "External id": 126824,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458491.116, "dur": 11.620, + "args": { + "External id": 126825,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458499.847, "dur": 1.680, + "args": { + "External id": 126826,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458505.987, "dur": 11.080, + "args": { + "External id": 126827,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458511.256, "dur": 4.651, + "args": { + "External id": 126828,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458520.427, "dur": 13.020, + "args": { + "External id": 126829,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev 
Idx": 7448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458527.407, "dur": 4.829, + "args": { + "External id": 126830,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458536.907, "dur": 8.949, + "args": { + "External id": 126831,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458543.256, "dur": 1.340, + "args": { + "External id": 126832,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458549.147, "dur": 8.229, + "args": { + "External id": 126833,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458554.836, "dur": 1.450, + "args": { + "External id": 126834,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458560.566, "dur": 11.760, + "args": { + "External id": 126835,"Record 
function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458569.676, "dur": 1.580, + "args": { + "External id": 126836,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458579.266, "dur": 7.910, + "args": { + "External id": 126837,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458584.876, "dur": 1.230, + "args": { + "External id": 126838,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458590.426, "dur": 8.770, + "args": { + "External id": 126839,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458596.616, "dur": 1.470, + "args": { + "External id": 126840,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458602.406, "dur": 12.140, + "args": { + "External id": 126841,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458612.076, "dur": 1.400, + "args": { + "External id": 126842,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458617.756, "dur": 12.170, + "args": { + "External id": 126843,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458623.806, "dur": 4.970, + "args": { + "External id": 126844,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458633.146, "dur": 11.780, + "args": { + "External id": 126845,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458639.006, "dur": 4.820, + "args": { + "External id": 126846,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], 
"Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458648.296, "dur": 8.220, + "args": { + "External id": 126847,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458654.146, "dur": 1.260, + "args": { + "External id": 126848,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458659.636, "dur": 8.490, + "args": { + "External id": 126849,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458665.676, "dur": 1.400, + "args": { + "External id": 126850,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458671.316, "dur": 11.640, + "args": { + "External id": 126851,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458680.316, "dur": 1.560, + "args": { + "External id": 
126852,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458690.496, "dur": 8.140, + "args": { + "External id": 126853,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458696.106, "dur": 1.440, + "args": { + "External id": 126854,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458702.316, "dur": 8.180, + "args": { + "External id": 126855,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458708.146, "dur": 1.260, + "args": { + "External id": 126856,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458713.706, "dur": 11.780, + "args": { + "External id": 126857,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7476 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458723.086, "dur": 1.270, + "args": { + "External id": 126858,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458728.596, "dur": 11.110, + "args": { + "External id": 126859,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458733.976, "dur": 4.530, + "args": { + "External id": 126860,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458743.026, "dur": 11.200, + "args": { + "External id": 126861,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458748.566, "dur": 4.500, + "args": { + "External id": 126862,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458757.556, "dur": 8.880, + "args": { + "External id": 126863,"Record function id": 0, "Concrete Inputs": ["", "0"], 
"Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458763.836, "dur": 1.450, + "args": { + "External id": 126864,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458769.716, "dur": 7.990, + "args": { + "External id": 126865,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458775.176, "dur": 1.430, + "args": { + "External id": 126866,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458781.046, "dur": 11.650, + "args": { + "External id": 126867,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458789.996, "dur": 1.540, + "args": { + "External id": 126868,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, 
+ "ts": 6302685458799.036, "dur": 7.940, + "args": { + "External id": 126869,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458804.596, "dur": 1.290, + "args": { + "External id": 126870,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458810.276, "dur": 8.060, + "args": { + "External id": 126871,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458816.026, "dur": 1.260, + "args": { + "External id": 126872,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458821.556, "dur": 11.200, + "args": { + "External id": 126873,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458830.326, "dur": 1.340, + "args": { + "External id": 126874,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input 
Dims": [[], [], [], []], "Ev Idx": 7493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458836.396, "dur": 11.550, + "args": { + "External id": 126875,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458842.306, "dur": 4.550, + "args": { + "External id": 126876,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458851.216, "dur": 11.150, + "args": { + "External id": 126877,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458856.836, "dur": 4.410, + "args": { + "External id": 126878,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458865.756, "dur": 8.030, + "args": { + "External id": 126879,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458871.496, "dur": 1.230, + "args": { + "External id": 126880,"Record function id": 0, "Concrete 
Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458876.926, "dur": 8.480, + "args": { + "External id": 126881,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458882.936, "dur": 1.410, + "args": { + "External id": 126882,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458888.646, "dur": 11.109, + "args": { + "External id": 126883,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458897.006, "dur": 1.600, + "args": { + "External id": 126884,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458906.935, "dur": 8.040, + "args": { + "External id": 126885,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 
5717, "tid": 5717, + "ts": 6302685458912.575, "dur": 1.280, + "args": { + "External id": 126886,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458918.255, "dur": 8.471, + "args": { + "External id": 126887,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458924.415, "dur": 1.231, + "args": { + "External id": 126888,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458930.015, "dur": 11.900, + "args": { + "External id": 126889,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458939.166, "dur": 1.329, + "args": { + "External id": 126890,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458945.146, "dur": 11.769, + "args": { + "External id": 126891,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input 
Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458950.795, "dur": 5.000, + "args": { + "External id": 126892,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458960.195, "dur": 11.880, + "args": { + "External id": 126893,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458966.055, "dur": 4.880, + "args": { + "External id": 126894,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458975.695, "dur": 8.251, + "args": { + "External id": 126895,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458981.415, "dur": 1.400, + "args": { + "External id": 126896,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458987.155, "dur": 8.370, + 
"args": { + "External id": 126897,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685458993.146, "dur": 1.309, + "args": { + "External id": 126898,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685458998.835, "dur": 10.990, + "args": { + "External id": 126899,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459007.265, "dur": 1.440, + "args": { + "External id": 126900,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459016.515, "dur": 8.070, + "args": { + "External id": 126901,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459022.185, "dur": 1.300, + "args": { + "External id": 126902,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7521 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459028.075, "dur": 8.970, + "args": { + "External id": 126903,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459034.735, "dur": 1.240, + "args": { + "External id": 126904,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459040.315, "dur": 11.370, + "args": { + "External id": 126905,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459049.335, "dur": 1.270, + "args": { + "External id": 126906,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459054.915, "dur": 11.360, + "args": { + "External id": 126907,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459060.775, "dur": 4.420, + "args": { + "External id": 126908,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input 
type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459069.575, "dur": 11.610, + "args": { + "External id": 126909,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459075.355, "dur": 4.720, + "args": { + "External id": 126910,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459084.575, "dur": 8.060, + "args": { + "External id": 126911,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459090.205, "dur": 1.300, + "args": { + "External id": 126912,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459095.955, "dur": 8.420, + "args": { + "External id": 126913,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 
6302685459102.085, "dur": 1.240, + "args": { + "External id": 126914,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459107.665, "dur": 11.150, + "args": { + "External id": 126915,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459116.265, "dur": 1.480, + "args": { + "External id": 126916,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459125.915, "dur": 7.900, + "args": { + "External id": 126917,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459131.445, "dur": 1.290, + "args": { + "External id": 126918,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459137.155, "dur": 7.980, + "args": { + "External id": 126919,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input 
Dims": [[], []], "Ev Idx": 7538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459142.775, "dur": 1.300, + "args": { + "External id": 126920,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459148.515, "dur": 11.440, + "args": { + "External id": 126921,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459157.625, "dur": 1.260, + "args": { + "External id": 126922,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459163.125, "dur": 11.060, + "args": { + "External id": 126923,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459168.605, "dur": 4.510, + "args": { + "External id": 126924,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459177.555, "dur": 11.500, + "args": { + "External 
id": 126925,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459183.465, "dur": 4.450, + "args": { + "External id": 126926,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459192.705, "dur": 8.010, + "args": { + "External id": 126927,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459198.315, "dur": 1.310, + "args": { + "External id": 126928,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459204.015, "dur": 8.200, + "args": { + "External id": 126929,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459209.835, "dur": 1.300, + "args": { + "External id": 126930,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7549 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459215.535, "dur": 11.820, + "args": { + "External id": 126931,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459224.645, "dur": 1.640, + "args": { + "External id": 126932,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459234.275, "dur": 8.380, + "args": { + "External id": 126933,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459240.205, "dur": 1.290, + "args": { + "External id": 126934,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459246.115, "dur": 8.000, + "args": { + "External id": 126935,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459251.645, "dur": 1.360, + "args": { + "External id": 126936,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", 
"ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459257.375, "dur": 11.430, + "args": { + "External id": 126937,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459266.325, "dur": 1.260, + "args": { + "External id": 126938,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459272.055, "dur": 10.730, + "args": { + "External id": 126939,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459277.335, "dur": 4.310, + "args": { + "External id": 126940,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459286.085, "dur": 34.910, + "args": { + "External id": 126941,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459291.665, "dur": 
4.590, + "args": { + "External id": 126942,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459325.265, "dur": 8.660, + "args": { + "External id": 126943,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459331.325, "dur": 1.470, + "args": { + "External id": 126944,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459337.225, "dur": 8.029, + "args": { + "External id": 126945,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459342.894, "dur": 1.280, + "args": { + "External id": 126946,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459348.474, "dur": 11.331, + "args": { + "External id": 126947,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7566 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459357.294, "dur": 1.420, + "args": { + "External id": 126948,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459366.494, "dur": 8.000, + "args": { + "External id": 126949,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459372.085, "dur": 1.320, + "args": { + "External id": 126950,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459377.654, "dur": 8.080, + "args": { + "External id": 126951,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459383.334, "dur": 1.351, + "args": { + "External id": 126952,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459389.065, "dur": 11.940, + "args": { + "External id": 126953,"Record function id": 0, 
"Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459398.385, "dur": 1.429, + "args": { + "External id": 126954,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459404.234, "dur": 11.940, + "args": { + "External id": 126955,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459409.894, "dur": 5.080, + "args": { + "External id": 126956,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459419.594, "dur": 11.491, + "args": { + "External id": 126957,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459425.385, "dur": 4.549, + "args": { + "External id": 126958,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459434.534, "dur": 8.090, + "args": { + "External id": 126959,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459440.234, "dur": 1.370, + "args": { + "External id": 126960,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459445.914, "dur": 8.740, + "args": { + "External id": 126961,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459452.274, "dur": 1.300, + "args": { + "External id": 126962,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459457.954, "dur": 10.980, + "args": { + "External id": 126963,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459466.294, "dur": 1.530, + "args": { + "External id": 126964,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], 
"Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459476.214, "dur": 8.210, + "args": { + "External id": 126965,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459482.044, "dur": 1.280, + "args": { + "External id": 126966,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459487.674, "dur": 8.060, + "args": { + "External id": 126967,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459493.334, "dur": 1.330, + "args": { + "External id": 126968,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459499.024, "dur": 11.280, + "args": { + "External id": 126969,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459507.664, "dur": 1.340, + "args": { + "External id": 
126970,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459513.564, "dur": 11.200, + "args": { + "External id": 126971,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459519.074, "dur": 4.550, + "args": { + "External id": 126972,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459528.234, "dur": 11.330, + "args": { + "External id": 126973,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459533.904, "dur": 4.530, + "args": { + "External id": 126974,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459543.074, "dur": 7.980, + "args": { + "External id": 126975,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7594 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459548.674, "dur": 1.270, + "args": { + "External id": 126976,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459611.554, "dur": 22.230, + "args": { + "External id": 130561,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459628.934, "dur": 3.730, + "args": { + "External id": 130562,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459644.894, "dur": 17.230, + "args": { + "External id": 130563,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459655.884, "dur": 5.110, + "args": { + "External id": 130564,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5717, "tid": 5717, + "ts": 6302685459679.724, "dur": 12.680, + "args": { + "External id": 130565,"Record function id": 0, "Concrete Inputs": ["", "0"], 
"Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685459689.514, "dur": 1.810, + "args": { + "External id": 130566,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::cat", "pid": 5717, "tid": 5717, + "ts": 6302685459757.514, "dur": 499.258, + "args": { + "External id": 130567,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linalg_vector_norm", "pid": 5717, "tid": 5717, + "ts": 6302685460739.502, "dur": 992.967, + "args": { + "External id": 130568,"Record function id": 0, "Concrete Inputs": ["", "2.", "", "False", ""], "Input type": ["float", "Scalar", "", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[128], [], [], [], []], "Ev Idx": 7603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linalg_vector_norm", "pid": 5717, "tid": 5717, + "ts": 6302685461166.030, "dur": 354.350, + "args": { + "External id": 130569,"Record function id": 0, "Concrete Inputs": ["", "2.", "", "False", ""], "Input type": ["float", "Scalar", "", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[128], [], [], [], []], "Ev Idx": 7604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5717, "tid": 5717, + "ts": 6302685461344.180, "dur": 8.510, + "args": { + "External id": 130570,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 
7605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Redistribute", "pid": 5717, "tid": 5717, + "ts": 6302685462981.136, "dur": 3426.602, + "args": { + "External id": 130571,"Sequence number": 2576074, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "False"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685463205.146, "dur": 326.139, + "args": { + "External id": 130572,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685463220.646, "dur": 5.110, + "args": { + "External id": 130573,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685463236.846, "dur": 2.250, + "args": { + "External id": 130574,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_c10d_functional::all_reduce", "pid": 5717, "tid": 5717, + "ts": 6302685463659.295, "dur": 1270.327, + "args": { + "External id": 130575,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["float", "", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5717, "tid": 5717, + "ts": 6302685463677.815, "dur": 368.129, + "args": { + "External id": 130576,"Record function id": 0, "Concrete 
Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5717, "tid": 5717, + "ts": 6302685463691.775, "dur": 188.349, + "args": { + "External id": 130577,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "0"], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685463825.995, "dur": 48.449, + "args": { + "External id": 130578,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5717, "tid": 5717, + "ts": 6302685463888.984, "dur": 151.880, + "args": { + "External id": 130579,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::allreduce_", "pid": 5717, "tid": 5717, + "ts": 6302685464088.674, "dur": 824.118, + "args": { + "External id": 130580,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "-1"], "Input type": ["TensorList", "", "", "", "Scalar"], "Input Strides": [[[]], [], [], [], []], "Input Dims": [[[]], [], [], [], []], "Ev Idx": 7615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685464248.734, "dur": 628.118, + "args": { + "External id": 130581,"Record function id": 0, "Collective name": "allreduce", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 3, "Input 
Strides": [[[]], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "3", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1, "Process Group Name": "0", "Input type": ["TensorList", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[[]], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 7616, "In msg nelems": 1 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:all_reduce", "pid": 5717, "tid": 5717, + "ts": 6302685464400.703, "dur": 437.739, + "args": { + "External id": 130582,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685465214.541, "dur": 992.278, + "args": { + "External id": 130583,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_c10d_functional::wait_tensor", "pid": 5717, "tid": 5717, + "ts": 6302685465584.731, "dur": 262.359, + "args": { + "External id": 130584,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5717, "tid": 5717, + "ts": 6302685465761.830, "dur": 37.650, + "args": { + "External id": 130585,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete 
Inputs": ["", "", "3", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 7620, "In msg nelems": 0, "Rank": 3, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5717, "tid": 5717, + "ts": 6302685466012.879, "dur": 162.230, + "args": { + "External id": 130586,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685466020.779, "dur": 3.140, + "args": { + "External id": 130587,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685466029.690, "dur": 1.529, + "args": { + "External id": 130588,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_ToTorchTensor", "pid": 5717, "tid": 5717, + "ts": 6302685466478.649, "dur": 84.289, + "args": { + "External id": 130589,"Sequence number": 2576075, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5717, "tid": 5717, + "ts": 6302685466515.178, "dur": 29.370, + "args": { + "External id": 130590,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7625 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5717, "tid": 5717, + "ts": 6302685466526.218, "dur": 16.990, + "args": { + "External id": 130591,"Record function id": 0, "Concrete Inputs": ["", "[]"], "Input type": ["float", "ScalarList"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5717, "tid": 5717, + "ts": 6302685467339.556, "dur": 146.360, + "args": { + "External id": 130592,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reciprocal", "pid": 5717, "tid": 5717, + "ts": 6302685467610.106, "dur": 79.710, + "args": { + "External id": 130593,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5717, "tid": 5717, + "ts": 6302685467718.476, "dur": 77.390, + "args": { + "External id": 130594,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clamp", "pid": 5717, "tid": 5717, + "ts": 6302685467844.555, "dur": 90.300, + "args": { + "External id": 130595,"Record function id": 0, "Concrete Inputs": ["", "", "1."], "Input type": ["float", "", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685467855.035, "dur": 1.610, + "args": { + "External id": 130596,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": 
[[], [], [], [], []], "Ev Idx": 7631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5717, "tid": 5717, + "ts": 6302685467998.835, "dur": 2.040, + "args": { + "External id": 130597,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_mul_", "pid": 5717, "tid": 5717, + "ts": 6302685468344.404, "dur": 2517.235, + "args": { + "External id": 130598,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["TensorList", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_mul_", "pid": 5717, "tid": 5717, + "ts": 6302685470177.210, "dur": 492.529, + "args": { + "External id": 130599,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["TensorList", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::isnan", "pid": 5717, "tid": 5717, + "ts": 6302685471107.988, "dur": 187.210, + "args": { + "External id": 130600,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5717, "tid": 5717, + "ts": 6302685471122.578, "dur": 168.800, + "args": { + "External id": 130601,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5717, "tid": 5717, + "ts": 6302685471360.937, "dur": 140070.283, + "args": { + 
"External id": 130602,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 5717, + "ts": 6302685471370.497, "dur": 140056.673, + "args": { + "External id": 130603,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 5717, + "ts": 6302685471379.637, "dur": 140039.483, + "args": { + "External id": 130604,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::isinf", "pid": 5717, "tid": 5717, + "ts": 6302685611485.070, "dur": 402.609, + "args": { + "External id": 130605,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685611497.530, "dur": 245.930, + "args": { + "External id": 130606,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5717, "tid": 5717, + "ts": 6302685611524.940, "dur": 35.970, + "args": { + "External id": 130607,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7642 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5717, "tid": 5717, + "ts": 6302685611573.990, "dur": 166.590, + "args": { + "External id": 130608,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], [1]], "Input Dims": [[], [0]], "Ev Idx": 7643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5717, "tid": 5717, + "ts": 6302685611604.820, "dur": 28.180, + "args": { + "External id": 130609,"Record function id": 0, "Concrete Inputs": ["", "[]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5717, "tid": 5717, + "ts": 6302685611754.320, "dur": 123.739, + "args": { + "External id": 130610,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5717, "tid": 5717, + "ts": 6302685611914.979, "dur": 222.089, + "args": { + "External id": 130611,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5717, "tid": 5717, + "ts": 6302685611922.349, "dur": 212.590, + "args": { + "External id": 130612,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5717, "tid": 5717, + "ts": 6302685611930.509, "dur": 200.730, + "args": { + "External id": 130613,"Sequence number": 2576076, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], 
"Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7648 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.step#OptimizersContainer.step", "pid": 5717, "tid": 5717, + "ts": 6302685612343.968, "dur": 7174.934, + "args": { + "External id": 130614,"Record function id": 0, "Ev Idx": 7649 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.step#AdamW.step", "pid": 5717, "tid": 5717, + "ts": 6302685612560.548, "dur": 6935.604, + "args": { + "External id": 130615,"Record function id": 0, "Ev Idx": 7650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_add_", "pid": 5717, "tid": 5717, + "ts": 6302685615006.312, "dur": 367.249, + "args": { + "External id": 130616,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615038.312, "dur": 2.150, + "args": { + "External id": 130617,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615044.472, "dur": 0.340, + "args": { + "External id": 130618,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615047.072, "dur": 0.290, + "args": { + "External id": 130619,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615049.312, 
"dur": 0.400, + "args": { + "External id": 130620,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615051.532, "dur": 0.210, + "args": { + "External id": 130621,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615053.662, "dur": 0.210, + "args": { + "External id": 130622,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615055.562, "dur": 0.210, + "args": { + "External id": 130623,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615057.642, "dur": 0.290, + "args": { + "External id": 130624,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615059.702, "dur": 0.200, + "args": { + "External id": 130625,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615061.692, "dur": 0.230, 
+ "args": { + "External id": 130626,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615063.602, "dur": 0.230, + "args": { + "External id": 130627,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615065.482, "dur": 0.220, + "args": { + "External id": 130628,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615067.442, "dur": 0.210, + "args": { + "External id": 130629,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615069.342, "dur": 0.230, + "args": { + "External id": 130630,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615071.272, "dur": 0.230, + "args": { + "External id": 130631,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615073.262, "dur": 0.230, + "args": { + 
"External id": 130632,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615075.192, "dur": 0.220, + "args": { + "External id": 130633,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615077.092, "dur": 0.220, + "args": { + "External id": 130634,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615079.032, "dur": 0.220, + "args": { + "External id": 130635,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615081.012, "dur": 0.220, + "args": { + "External id": 130636,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615082.932, "dur": 0.190, + "args": { + "External id": 130637,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615084.922, "dur": 0.220, + "args": { + "External 
id": 130638,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615086.872, "dur": 0.210, + "args": { + "External id": 130639,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615088.832, "dur": 0.210, + "args": { + "External id": 130640,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615090.782, "dur": 0.210, + "args": { + "External id": 130641,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615092.712, "dur": 0.230, + "args": { + "External id": 130642,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615094.652, "dur": 0.210, + "args": { + "External id": 130643,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615096.612, "dur": 0.210, + "args": { + "External id": 
130644,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615098.652, "dur": 0.230, + "args": { + "External id": 130645,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615100.462, "dur": 0.210, + "args": { + "External id": 130646,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615102.382, "dur": 0.210, + "args": { + "External id": 130647,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615104.452, "dur": 0.230, + "args": { + "External id": 130648,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615106.352, "dur": 0.210, + "args": { + "External id": 130649,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615108.292, "dur": 0.210, + "args": { + "External id": 
130650,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615110.472, "dur": 0.210, + "args": { + "External id": 130651,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615112.402, "dur": 0.220, + "args": { + "External id": 130652,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615114.352, "dur": 0.200, + "args": { + "External id": 130653,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615116.262, "dur": 0.210, + "args": { + "External id": 130654,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615118.482, "dur": 0.200, + "args": { + "External id": 130655,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615120.452, "dur": 0.220, + "args": { + "External id": 
130656,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615122.392, "dur": 0.230, + "args": { + "External id": 130657,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615124.292, "dur": 0.220, + "args": { + "External id": 130658,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615126.242, "dur": 0.210, + "args": { + "External id": 130659,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615128.272, "dur": 0.210, + "args": { + "External id": 130660,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615130.162, "dur": 0.220, + "args": { + "External id": 130661,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615132.062, "dur": 0.200, + "args": { + "External id": 
130662,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615133.972, "dur": 0.210, + "args": { + "External id": 130663,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615136.062, "dur": 0.220, + "args": { + "External id": 130664,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615138.002, "dur": 0.220, + "args": { + "External id": 130665,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615139.962, "dur": 0.220, + "args": { + "External id": 130666,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615141.872, "dur": 0.220, + "args": { + "External id": 130667,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615143.982, "dur": 0.210, + "args": { + "External id": 
130668,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615145.932, "dur": 0.200, + "args": { + "External id": 130669,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615147.862, "dur": 0.210, + "args": { + "External id": 130670,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615149.762, "dur": 0.200, + "args": { + "External id": 130671,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615151.652, "dur": 0.220, + "args": { + "External id": 130672,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615153.732, "dur": 0.210, + "args": { + "External id": 130673,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615155.642, "dur": 0.220, + "args": { + "External id": 
130674,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615157.542, "dur": 0.200, + "args": { + "External id": 130675,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615159.472, "dur": 0.200, + "args": { + "External id": 130676,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615161.422, "dur": 0.220, + "args": { + "External id": 130677,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615163.292, "dur": 0.200, + "args": { + "External id": 130678,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615165.232, "dur": 0.210, + "args": { + "External id": 130679,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615167.232, "dur": 0.200, + "args": { + "External id": 
130680,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615169.112, "dur": 0.220, + "args": { + "External id": 130681,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615170.982, "dur": 0.170, + "args": { + "External id": 130682,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615172.682, "dur": 0.230, + "args": { + "External id": 130683,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615174.412, "dur": 0.200, + "args": { + "External id": 130684,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615176.172, "dur": 0.180, + "args": { + "External id": 130685,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615177.892, "dur": 0.190, + "args": { + "External id": 
130686,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615179.622, "dur": 0.200, + "args": { + "External id": 130687,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615181.302, "dur": 0.200, + "args": { + "External id": 130688,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615183.002, "dur": 0.190, + "args": { + "External id": 130689,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615184.702, "dur": 0.200, + "args": { + "External id": 130690,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615186.392, "dur": 0.200, + "args": { + "External id": 130691,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615187.952, "dur": 0.170, + "args": { + "External id": 
130692,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615189.512, "dur": 0.170, + "args": { + "External id": 130693,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615191.142, "dur": 0.160, + "args": { + "External id": 130694,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615192.652, "dur": 0.170, + "args": { + "External id": 130695,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615194.272, "dur": 0.160, + "args": { + "External id": 130696,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615195.772, "dur": 0.180, + "args": { + "External id": 130697,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615197.302, "dur": 0.150, + "args": { + "External id": 
130698,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615198.722, "dur": 0.160, + "args": { + "External id": 130699,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615200.282, "dur": 0.170, + "args": { + "External id": 130700,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615201.912, "dur": 0.180, + "args": { + "External id": 130701,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615203.572, "dur": 0.160, + "args": { + "External id": 130702,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615205.102, "dur": 0.150, + "args": { + "External id": 130703,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615206.622, "dur": 0.200, + "args": { + "External id": 
130704,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615208.172, "dur": 0.200, + "args": { + "External id": 130705,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615209.722, "dur": 0.150, + "args": { + "External id": 130706,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615211.212, "dur": 0.160, + "args": { + "External id": 130707,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615212.712, "dur": 0.160, + "args": { + "External id": 130708,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615214.392, "dur": 0.170, + "args": { + "External id": 130709,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615215.992, "dur": 0.160, + "args": { + "External id": 
130710,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615217.542, "dur": 0.160, + "args": { + "External id": 130711,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615219.072, "dur": 0.150, + "args": { + "External id": 130712,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615220.561, "dur": 0.171, + "args": { + "External id": 130713,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615222.321, "dur": 0.171, + "args": { + "External id": 130714,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615223.741, "dur": 0.160, + "args": { + "External id": 130715,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615225.241, "dur": 0.180, + "args": { + "External id": 
130716,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615226.861, "dur": 0.160, + "args": { + "External id": 130717,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615228.421, "dur": 0.160, + "args": { + "External id": 130718,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615229.832, "dur": 0.169, + "args": { + "External id": 130719,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615231.321, "dur": 0.151, + "args": { + "External id": 130720,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615232.761, "dur": 0.191, + "args": { + "External id": 130721,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615234.341, "dur": 0.151, + "args": { + "External id": 
130722,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615235.781, "dur": 0.171, + "args": { + "External id": 130723,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615237.401, "dur": 0.151, + "args": { + "External id": 130724,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615238.881, "dur": 0.171, + "args": { + "External id": 130725,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615240.361, "dur": 0.160, + "args": { + "External id": 130726,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615242.272, "dur": 0.169, + "args": { + "External id": 130727,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615244.021, "dur": 0.160, + "args": { + "External id": 
130728,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615245.492, "dur": 0.169, + "args": { + "External id": 130729,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615247.061, "dur": 0.180, + "args": { + "External id": 130730,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615248.592, "dur": 0.180, + "args": { + "External id": 130731,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615250.161, "dur": 0.171, + "args": { + "External id": 130732,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615251.652, "dur": 0.180, + "args": { + "External id": 130733,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615253.132, "dur": 0.160, + "args": { + "External id": 
130734,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615254.641, "dur": 0.180, + "args": { + "External id": 130735,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615256.221, "dur": 0.160, + "args": { + "External id": 130736,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615257.721, "dur": 0.171, + "args": { + "External id": 130737,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615259.212, "dur": 0.160, + "args": { + "External id": 130738,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615260.752, "dur": 0.149, + "args": { + "External id": 130739,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615262.241, "dur": 0.171, + "args": { + "External id": 
130740,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615263.712, "dur": 0.160, + "args": { + "External id": 130741,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615265.221, "dur": 0.171, + "args": { + "External id": 130742,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615266.712, "dur": 0.180, + "args": { + "External id": 130743,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5717, "tid": 5717, + "ts": 6302685615268.552, "dur": 0.180, + "args": { + "External id": 130744,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_fused_adamw_", "pid": 5717, "tid": 5717, + "ts": 6302685615993.160, "dur": 3433.962, + "args": { + "External id": 130745,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "0.00026533485307350115", "0.90000000000000002", "0.94999999999999996", "0.10000000000000001", "1.0000000000000001e-15", "False", "False", "", ""], "Input type": ["TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "Scalar", 
"Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 7780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_fused_adamw_", "pid": 5717, "tid": 5717, + "ts": 6302685619205.252, "dur": 143.230, + "args": { + "External id": 130746,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "0.00026533485307350115", "0.90000000000000002", "0.94999999999999996", "0.10000000000000001", "1.0000000000000001e-15", "False", "False", "", ""], "Input type": ["TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 7781 + } + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302684941904.165, "dur": 254.434, + "args": { + "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241660917, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 241660917, "pid": 3, "tid": 7, "ts": 6302684941904.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302684942159.303, "dur": 254.850, + "args": { + "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241660920, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 241660920, "pid": 3, "tid": 7, "ts": 6302684942159.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302684942414.825, "dur": 253.762, + "args": { + "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241660923, "registers per thread": 64, "shared memory": 0, "blocks per SM": 0.554688, "warps per SM": 8.875000, "grid": [71, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 18 + } + }, + { + "ph": "f", "id": 241660923, "pid": 3, "tid": 7, "ts": 6302684942414.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941825.997, "dur": 1.280, + "args": { + "cbid": 138, "correlation": 241660928 + } + }, + { + "ph": "f", "id": 241660928, "pid": 5717, "tid": 423623104, "ts": 6302684941825.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941831.166, "dur": 0.920, + "args": { + "cbid": 138, "correlation": 241660929 + } + }, + { + "ph": "f", "id": 241660929, "pid": 5717, "tid": 423623104, "ts": 6302684941831.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941836.306, "dur": 1.011, + "args": { + "cbid": 138, "correlation": 241660930 + } + }, + { + "ph": "f", "id": 241660930, "pid": 5717, "tid": 423623104, "ts": 6302684941836.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941837.526, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 241660931 + } + }, + { + "ph": "f", "id": 241660931, "pid": 5717, "tid": 423623104, "ts": 6302684941837.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941840.197, "dur": 0.729, + "args": { + "cbid": 138, "correlation": 241660932 + } + }, + { + "ph": "f", "id": 241660932, "pid": 5717, "tid": 423623104, "ts": 6302684941840.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941842.717, "dur": 0.909, + "args": { + 
"cbid": 138, "correlation": 241660933 + } + }, + { + "ph": "f", "id": 241660933, "pid": 5717, "tid": 423623104, "ts": 6302684941842.717, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941843.877, "dur": 0.569, + "args": { + "cbid": 138, "correlation": 241660934 + } + }, + { + "ph": "f", "id": 241660934, "pid": 5717, "tid": 423623104, "ts": 6302684941843.877, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941845.617, "dur": 0.640, + "args": { + "cbid": 138, "correlation": 241660935 + } + }, + { + "ph": "f", "id": 241660935, "pid": 5717, "tid": 423623104, "ts": 6302684941845.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941847.997, "dur": 0.800, + "args": { + "cbid": 138, "correlation": 241660936 + } + }, + { + "ph": "f", "id": 241660936, "pid": 5717, "tid": 423623104, "ts": 6302684941847.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941848.997, "dur": 0.589, + "args": { + "cbid": 138, "correlation": 241660937 + } + }, + { + "ph": "f", "id": 241660937, "pid": 5717, "tid": 423623104, "ts": 6302684941848.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941850.666, "dur": 0.631, + "args": { + "cbid": 138, "correlation": 241660938 + } + }, + { + "ph": "f", "id": 241660938, "pid": 5717, "tid": 423623104, "ts": 6302684941850.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941852.937, 
"dur": 1.120, + "args": { + "cbid": 138, "correlation": 241660939 + } + }, + { + "ph": "f", "id": 241660939, "pid": 5717, "tid": 423623104, "ts": 6302684941852.937, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941854.306, "dur": 0.591, + "args": { + "cbid": 138, "correlation": 241660940 + } + }, + { + "ph": "f", "id": 241660940, "pid": 5717, "tid": 423623104, "ts": 6302684941854.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302684941856.017, "dur": 0.640, + "args": { + "cbid": 138, "correlation": 241660941 + } + }, + { + "ph": "f", "id": 241660941, "pid": 5717, "tid": 423623104, "ts": 6302684941856.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 3, "tid": 7, + "ts": 6302685181822.260, "dur": 6.496, + "args": { + "External id": 122907, "device": 3, "context": 1, "stream": 7, "correlation": 241660952, "bytes": 131072, "memory bandwidth (GB/s)": 20.177339901477833 + } + }, + { + "ph": "f", "id": 241660952, "pid": 3, "tid": 7, "ts": 6302685181822.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685181769.153, "dur": 54.440, + "args": { + "External id": 122907, "cbid": 41, "correlation": 241660952 + } + }, + { + "ph": "s", "id": 241660952, "pid": 5717, "tid": 5717, "ts": 6302685181769.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685181824.763, "dur": 7.920, + "args": { + "External id": 122907, "cbid": 131, "correlation": 241660953 + } + }, + { + "ph": "s", "id": 241660953, "pid": 5717, "tid": 5717, "ts": 6302685181824.763, + "cat": "ac2g", 
"name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 3, "tid": 7, + "ts": 6302685181901.269, "dur": 11.360, + "args": { + "External id": 122911, "device": 3, "context": 1, "stream": 7, "correlation": 241660966, "bytes": 262144, "memory bandwidth (GB/s)": 23.07605633802817 + } + }, + { + "ph": "f", "id": 241660966, "pid": 3, "tid": 7, "ts": 6302685181901.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685181859.993, "dur": 41.340, + "args": { + "External id": 122911, "cbid": 41, "correlation": 241660966 + } + }, + { + "ph": "s", "id": 241660966, "pid": 5717, "tid": 5717, "ts": 6302685181859.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685181902.053, "dur": 13.070, + "args": { + "External id": 122911, "cbid": 131, "correlation": 241660967 + } + }, + { + "ph": "s", "id": 241660967, "pid": 5717, "tid": 5717, "ts": 6302685181902.053, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 3, "tid": 7, + "ts": 6302685181990.101, "dur": 1.056, + "args": { + "External id": 122914, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241660983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 0.500000, "grid": [32, 1, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241660983, "pid": 3, "tid": 7, "ts": 6302685181990.101, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685181962.783, "dur": 30.000, + "args": { + "External id": 122914, "cbid": 211, "correlation": 241660983 + } + }, + { + "ph": "s", "id": 241660983, "pid": 5717, "tid": 5717, "ts": 6302685181962.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685182092.150, "dur": 1.472, + "args": { + "External id": 122928, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241660996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 241660996, "pid": 3, "tid": 7, "ts": 6302685182092.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685182081.653, "dur": 11.900, + "args": { + "External id": 122928, "cbid": 211, "correlation": 241660996 + } + }, + { + "ph": "s", "id": 241660996, "pid": 5717, "tid": 5717, "ts": 6302685182081.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#3}::operator()() const::{lambda(int)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685182129.110, "dur": 1.632, + "args": { + "External id": 122932, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661010, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241661010, "pid": 3, "tid": 7, "ts": 6302685182129.110, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685182120.562, "dur": 9.330, + "args": { + "External id": 122932, "cbid": 211, "correlation": 241661010 + } + }, + { + "ph": "s", "id": 241661010, "pid": 5717, "tid": 5717, "ts": 6302685182120.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685182309.222, "dur": 3.030, + "args": { + "cbid": 135, "correlation": 241661022 + } + }, + { + "ph": "f", "id": 241661022, "pid": 5717, "tid": 5717, "ts": 6302685182309.222, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685182316.232, "dur": 2.070, + "args": { + "cbid": 147, "correlation": 241661026 + } + }, + { + "ph": "s", "id": 241661026, "pid": 5717, "tid": 5717, "ts": 6302685182316.232, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685182326.002, "dur": 0.670, + "args": { + "cbid": 135, "correlation": 241661038 + } + }, + { + "ph": "f", "id": 241661038, "pid": 5717, "tid": 5717, "ts": 6302685182326.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685182328.322, "dur": 1.100, + "args": { + "cbid": 147, "correlation": 241661042 + } + }, + { + "ph": "s", "id": 241661042, "pid": 5717, "tid": 5717, "ts": 6302685182328.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous 
namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685182791.803, "dur": 67.553, + "args": { + "External id": 122978, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241661062, "registers per thread": 40, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 241661062, "pid": 3, "tid": 17, "ts": 6302685182791.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685182776.951, "dur": 15.680, + "args": { + "External id": 122978, "cbid": 211, "correlation": 241661062 + } + }, + { + "ph": "s", "id": 241661062, "pid": 5717, "tid": 5717, "ts": 6302685182776.951, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685183053.501, "dur": 33.952, + "args": { + "External id": 123023, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241661075, "registers per thread": 36, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 241661075, "pid": 3, "tid": 17, "ts": 6302685183053.501, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685183042.520, "dur": 11.360, + "args": { + "External id": 123023, "cbid": 211, "correlation": 241661075 + } + }, + { + "ph": "s", "id": 241661075, "pid": 5717, "tid": 5717, "ts": 6302685183042.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183089.580, "dur": 1.280, + "args": { + "cbid": 135, "correlation": 241661085 + } + }, + { + "ph": "f", "id": 241661085, "pid": 5717, "tid": 5717, "ts": 6302685183089.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685183093.450, "dur": 1.360, + "args": { + "cbid": 147, "correlation": 241661089 + } + }, + { + "ph": "s", "id": 241661089, "pid": 5717, "tid": 5717, "ts": 6302685183093.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685183179.480, "dur": 1.540, + "args": { + "External id": 123025, "cbid": 317, "correlation": 241661102 + } + }, + { + "ph": "f", "id": 241661102, "pid": 5717, "tid": 5717, "ts": 6302685183179.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183186.120, "dur": 2.480, + "args": { + "External id": 123025, "cbid": 135, "correlation": 241661104 + } + }, + { + "ph": "f", "id": 241661104, "pid": 5717, "tid": 5717, "ts": 6302685183186.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685183190.670, "dur": 2.140, 
+ "args": { + "External id": 123025, "cbid": 147, "correlation": 241661108 + } + }, + { + "ph": "s", "id": 241661108, "pid": 5717, "tid": 5717, "ts": 6302685183190.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685183235.750, "dur": 1.480, + "args": { + "External id": 123025, "cbid": 409, "correlation": 241661111 + } + }, + { + "ph": "f", "id": 241661111, "pid": 5717, "tid": 5717, "ts": 6302685183235.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183247.180, "dur": 1.380, + "args": { + "External id": 123025, "cbid": 135, "correlation": 241661114 + } + }, + { + "ph": "f", "id": 241661114, "pid": 5717, "tid": 5717, "ts": 6302685183247.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685183248.730, "dur": 1.010, + "args": { + "External id": 123025, "cbid": 147, "correlation": 241661115 + } + }, + { + "ph": "s", "id": 241661115, "pid": 5717, "tid": 5717, "ts": 6302685183248.730, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685183267.999, "dur": 24479.480, + "args": { + "External id": 123025, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241661117, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 13223616, "Out msg nelems": 52894464, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241661117, "pid": 3, "tid": 20, "ts": 6302685183267.999, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685183252.080, "dur": 17.530, + "args": { + "External id": 123025, "cbid": 430, "correlation": 241661117 + } + }, + { + "ph": "s", "id": 241661117, "pid": 5717, "tid": 5717, "ts": 6302685183252.080, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183271.550, "dur": 0.370, + "args": { + "External id": 123025, "cbid": 135, "correlation": 241661119 + } + }, + { + "ph": "f", "id": 241661119, "pid": 5717, "tid": 5717, "ts": 6302685183271.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685183272.060, "dur": 0.660, + "args": { + "External id": 123025, "cbid": 147, "correlation": 241661120 + } + }, + { + "ph": "s", "id": 241661120, "pid": 5717, "tid": 5717, "ts": 6302685183272.060, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183274.520, "dur": 0.900, + "args": { + "External id": 123025, "cbid": 135, "correlation": 241661123 + } + }, + { + "ph": "f", "id": 241661123, "pid": 5717, "tid": 5717, "ts": 6302685183274.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183288.010, "dur": 0.420, + "args": { + "External 
id": 123025, "cbid": 135, "correlation": 241661130 + } + }, + { + "ph": "f", "id": 241661130, "pid": 5717, "tid": 5717, "ts": 6302685183288.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685183339.030, "dur": 1.770, + "args": { + "External id": 123027, "cbid": 147, "correlation": 241661135 + } + }, + { + "ph": "s", "id": 241661135, "pid": 5717, "tid": 5717, "ts": 6302685183339.030, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183361.579, "dur": 0.980, + "args": { + "cbid": 135, "correlation": 241661150 + } + }, + { + "ph": "f", "id": 241661150, "pid": 5717, "tid": 5717, "ts": 6302685183361.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685183452.359, "dur": 3.350, + "args": { + "cbid": 147, "correlation": 241661157 + } + }, + { + "ph": "s", "id": 241661157, "pid": 5717, "tid": 5717, "ts": 6302685183452.359, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685183842.938, "dur": 1.211, + "args": { + "External id": 123069, "cbid": 317, "correlation": 241661314 + } + }, + { + "ph": "f", "id": 241661314, "pid": 5717, "tid": 5717, "ts": 6302685183842.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685207752.279, "dur": 3.488, + "args": { + "External id": 123074, "device": 3, "context": 1, "stream": 7, "correlation": 241661326, "bytes": 22000, "memory bandwidth (GB/s)": 6.307339449541284 + } + }, + { + "ph": "f", "id": 241661326, "pid": 3, "tid": 7, "ts": 6302685207752.279, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685183885.488, "dur": 14.460, + "args": { + "External id": 123074, "cbid": 41, "correlation": 241661326 + } + }, + { + "ph": "s", "id": 241661326, "pid": 5717, "tid": 5717, "ts": 6302685183885.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685183905.248, "dur": 1.830, + "args": { + "External id": 123069, "cbid": 135, "correlation": 241661330 + } + }, + { + "ph": "f", "id": 241661330, "pid": 5717, "tid": 5717, "ts": 6302685183905.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685207757.687, "dur": 709.349, + "args": { + "External id": 123069, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661334, "registers per thread": 38, "shared memory": 0, "blocks per SM": 20.289062, "warps per SM": 81.156250, "grid": [2597, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661334, "pid": 3, "tid": 7, "ts": 6302685207757.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685183910.288, "dur": 12.370, + "args": { + "External id": 123069, "cbid": 211, "correlation": 241661334 + } + }, + { + "ph": "s", "id": 241661334, "pid": 5717, "tid": 5717, "ts": 6302685183910.288, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685184116.488, "dur": 1.390, + "args": { + "cbid": 135, "correlation": 241661345 + } + }, + { + "ph": "f", "id": 241661345, "pid": 5717, "tid": 5717, "ts": 6302685184116.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_0", "pid": 3, "tid": 7, + "ts": 6302685208467.644, "dur": 29.697, + "args": { + "External id": 123078, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661358, "registers per thread": 32, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661358, "pid": 3, "tid": 7, "ts": 6302685208467.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685184406.997, "dur": 12.910, + "args": { + "External id": 123078, "cbid": 307, "correlation": 241661358 + } + }, + { + "ph": "s", "id": 241661358, "pid": 5717, "tid": 5717, "ts": 6302685184406.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685184814.378, "dur": 125.185, + "args": { + "External id": 123093, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241661373, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241661373, "pid": 3, "tid": 17, "ts": 6302685184814.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685184796.406, "dur": 18.990, + "args": { + "External id": 123093, "cbid": 211, "correlation": 241661373 + } + }, + { + "ph": "s", "id": 241661373, "pid": 5717, "tid": 5717, "ts": 6302685184796.406, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685185003.308, "dur": 74.816, + "args": { + "External id": 123109, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241661386, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241661386, "pid": 3, "tid": 17, "ts": 6302685185003.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685184986.476, "dur": 10.400, + "args": { + "External id": 123109, "cbid": 211, "correlation": 241661386 + } + }, + { + "ph": "s", "id": 241661386, "pid": 5717, "tid": 5717, "ts": 6302685184986.476, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185022.376, "dur": 1.840, + "args": { + "cbid": 135, "correlation": 241661396 + } + }, + { + "ph": "f", "id": 241661396, "pid": 5717, "tid": 5717, "ts": 6302685185022.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185026.476, "dur": 1.280, + "args": { + "cbid": 147, "correlation": 241661400 + } + }, + { + "ph": "s", "id": 241661400, "pid": 5717, "tid": 5717, "ts": 6302685185026.476, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685185084.426, "dur": 1.070, + "args": { + "External id": 123111, "cbid": 317, "correlation": 241661413 + } + }, + { + "ph": "f", "id": 241661413, "pid": 5717, "tid": 5717, "ts": 6302685185084.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185087.406, "dur": 1.230, + "args": { + "External id": 123111, "cbid": 135, "correlation": 241661415 + } + }, + { + "ph": "f", "id": 241661415, "pid": 5717, "tid": 5717, "ts": 6302685185087.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185089.996, "dur": 1.090, 
+ "args": { + "External id": 123111, "cbid": 147, "correlation": 241661419 + } + }, + { + "ph": "s", "id": 241661419, "pid": 5717, "tid": 5717, "ts": 6302685185089.996, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685185108.836, "dur": 0.699, + "args": { + "External id": 123111, "cbid": 409, "correlation": 241661422 + } + }, + { + "ph": "f", "id": 241661422, "pid": 5717, "tid": 5717, "ts": 6302685185108.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185113.575, "dur": 0.780, + "args": { + "External id": 123111, "cbid": 135, "correlation": 241661425 + } + }, + { + "ph": "f", "id": 241661425, "pid": 5717, "tid": 5717, "ts": 6302685185113.575, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185114.526, "dur": 0.840, + "args": { + "External id": 123111, "cbid": 147, "correlation": 241661426 + } + }, + { + "ph": "s", "id": 241661426, "pid": 5717, "tid": 5717, "ts": 6302685185114.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685207748.983, "dur": 4064.575, + "args": { + "External id": 123111, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241661428, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241661428, "pid": 3, "tid": 20, "ts": 6302685207748.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685185116.415, "dur": 11.280, + "args": { + "External id": 123111, "cbid": 430, "correlation": 241661428 + } + }, + { + "ph": "s", "id": 241661428, "pid": 5717, "tid": 5717, "ts": 6302685185116.415, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185129.215, "dur": 0.820, + "args": { + "External id": 123111, "cbid": 135, "correlation": 241661430 + } + }, + { + "ph": "f", "id": 241661430, "pid": 5717, "tid": 5717, "ts": 6302685185129.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185130.306, "dur": 0.840, + "args": { + "External id": 123111, "cbid": 147, "correlation": 241661431 + } + }, + { + "ph": "s", "id": 241661431, "pid": 5717, "tid": 5717, "ts": 6302685185130.306, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185133.766, "dur": 1.720, + "args": { + "External id": 123111, "cbid": 135, "correlation": 241661434 + } + }, + { + "ph": "f", "id": 241661434, "pid": 5717, "tid": 5717, "ts": 6302685185133.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185147.715, "dur": 0.471, + "args": { + "External 
id": 123111, "cbid": 135, "correlation": 241661441 + } + }, + { + "ph": "f", "id": 241661441, "pid": 5717, "tid": 5717, "ts": 6302685185147.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185178.855, "dur": 1.800, + "args": { + "External id": 123113, "cbid": 147, "correlation": 241661446 + } + }, + { + "ph": "s", "id": 241661446, "pid": 5717, "tid": 5717, "ts": 6302685185178.855, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185198.686, "dur": 0.980, + "args": { + "cbid": 135, "correlation": 241661461 + } + }, + { + "ph": "f", "id": 241661461, "pid": 5717, "tid": 5717, "ts": 6302685185198.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185247.745, "dur": 1.340, + "args": { + "cbid": 147, "correlation": 241661466 + } + }, + { + "ph": "s", "id": 241661466, "pid": 5717, "tid": 5717, "ts": 6302685185247.745, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185251.255, "dur": 0.650, + "args": { + "cbid": 147, "correlation": 241661470 + } + }, + { + "ph": "s", "id": 241661470, "pid": 5717, "tid": 5717, "ts": 6302685185251.255, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685185309.135, "dur": 6.390, + "args": { + "cbid": 147, "correlation": 241661476 + } + }, + { + "ph": "s", "id": 241661476, "pid": 5717, "tid": 5717, "ts": 6302685185309.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685185524.925, "dur": 1.950, + "args": { + "External id": 
123126, "cbid": 317, "correlation": 241661517 + } + }, + { + "ph": "f", "id": 241661517, "pid": 5717, "tid": 5717, "ts": 6302685185524.925, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685185559.245, "dur": 4.469, + "args": { + "External id": 123127, "cbid": 138, "correlation": 241661520 + } + }, + { + "ph": "f", "id": 241661520, "pid": 5717, "tid": 5717, "ts": 6302685185559.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685211818.614, "dur": 2.432, + "args": { + "External id": 123131, "device": 3, "context": 1, "stream": 7, "correlation": 241661531, "bytes": 7224, "memory bandwidth (GB/s)": 2.9703947368421053 + } + }, + { + "ph": "f", "id": 241661531, "pid": 3, "tid": 7, "ts": 6302685211818.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685185609.785, "dur": 20.869, + "args": { + "External id": 123131, "cbid": 41, "correlation": 241661531 + } + }, + { + "ph": "s", "id": 241661531, "pid": 5717, "tid": 5717, "ts": 6302685185609.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185639.465, "dur": 3.909, + "args": { + "External id": 123126, "cbid": 135, "correlation": 241661535 + } + }, + { + "ph": "f", "id": 241661535, "pid": 5717, "tid": 5717, "ts": 6302685185639.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685211822.966, "dur": 334.594, + "args": { + "External id": 123126, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241661539, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661539, "pid": 3, "tid": 7, "ts": 6302685211822.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685185647.774, "dur": 17.300, + "args": { + "External id": 123126, "cbid": 211, "correlation": 241661539 + } + }, + { + "ph": "s", "id": 241661539, "pid": 5717, "tid": 5717, "ts": 6302685185647.774, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685185816.674, "dur": 1.910, + "args": { + "cbid": 135, "correlation": 241661550 + } + }, + { + "ph": "f", "id": 241661550, "pid": 5717, "tid": 5717, "ts": 6302685185816.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685212163.416, "dur": 365.923, + "args": { + "External id": 123138, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661576, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661576, "pid": 3, "tid": 7, "ts": 6302685212163.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685186129.633, "dur": 15.780, + "args": { + "External id": 123138, "cbid": 307, "correlation": 241661576 + } + }, + { + "ph": "s", "id": 241661576, "pid": 5717, "tid": 5717, "ts": 6302685186129.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685212530.043, "dur": 143.713, + "args": { + "External id": 123144, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661599, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661599, "pid": 3, "tid": 7, "ts": 6302685212530.043, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685186388.353, "dur": 14.270, + "args": { + "External id": 123144, "cbid": 211, "correlation": 241661599 + } + }, + { + "ph": "s", "id": 241661599, "pid": 5717, "tid": 5717, "ts": 6302685186388.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685212674.396, "dur": 140.513, + "args": { + "External id": 123145, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661622, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661622, "pid": 3, "tid": 7, "ts": 6302685212674.396, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685186435.852, "dur": 6.900, + "args": { + "External id": 123145, "cbid": 211, "correlation": 241661622 + } + }, + { + "ph": "s", "id": 241661622, "pid": 5717, "tid": 5717, "ts": 6302685186435.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685212815.581, "dur": 151.361, + "args": { + "External id": 123146, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661645, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661645, "pid": 3, "tid": 7, "ts": 6302685212815.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685186468.963, "dur": 6.869, + "args": { + "External id": 123146, "cbid": 211, "correlation": 241661645 + } + }, + { + "ph": "s", "id": 241661645, "pid": 5717, "tid": 5717, "ts": 6302685186468.963, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685212967.646, "dur": 64.769, + "args": { + "External id": 123163, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661665, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661665, "pid": 3, "tid": 7, "ts": 6302685212967.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685186886.422, "dur": 14.369, + "args": { + "External id": 123163, "cbid": 307, "correlation": 241661665 + } + }, + { + "ph": "s", "id": 241661665, "pid": 5717, "tid": 5717, "ts": 6302685186886.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685213033.919, "dur": 103.201, + "args": { + "External id": 123179, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661683, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661683, "pid": 3, "tid": 7, "ts": 6302685213033.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685187179.441, "dur": 13.910, + "args": { + "External id": 123179, "cbid": 307, "correlation": 241661683 + } + }, + { + "ph": "s", "id": 241661683, "pid": 5717, "tid": 5717, "ts": 6302685187179.441, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685187449.570, "dur": 1.650, + "args": { + "External id": 123185, "cbid": 200, "correlation": 241661690 + } + }, + { + "ph": "f", "id": 241661690, "pid": 5717, "tid": 5717, "ts": 6302685187449.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685187451.530, "dur": 0.340, + "args": { + "External id": 123185, "cbid": 200, "correlation": 241661691 + } + }, + { + 
"ph": "f", "id": 241661691, "pid": 5717, "tid": 5717, "ts": 6302685187451.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685187492.190, "dur": 0.570, + "args": { + "External id": 123185, "cbid": 200, "correlation": 241661714 + } + }, + { + "ph": "f", "id": 241661714, "pid": 5717, "tid": 5717, "ts": 6302685187492.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685187502.880, "dur": 4.270, + "args": { + "External id": 123185, "cbid": 273, "correlation": 241661723 + } + }, + { + "ph": "f", "id": 241661723, "pid": 5717, "tid": 5717, "ts": 6302685187502.880, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685213137.728, "dur": 503.395, + "args": { + "External id": 123185, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661724, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661724, "pid": 3, "tid": 7, "ts": 6302685213137.728, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685187508.100, "dur": 16.780, + "args": { + "External id": 123185, "cbid": 211, "correlation": 241661724 + } + }, + { + "ph": "s", "id": 241661724, "pid": 5717, "tid": 5717, "ts": 6302685187508.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685213641.699, "dur": 142.209, + "args": { + "External id": 123191, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661747, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661747, "pid": 3, "tid": 7, "ts": 6302685213641.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685187619.960, "dur": 10.590, + "args": { + "External id": 123191, "cbid": 211, "correlation": 241661747 + } + }, + { + "ph": "s", "id": 241661747, "pid": 5717, "tid": 5717, "ts": 6302685187619.960, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685213784.580, "dur": 136.641, + "args": { + "External id": 123195, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661773, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661773, "pid": 3, "tid": 7, "ts": 6302685213784.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685187887.439, "dur": 13.560, + "args": { + "External id": 123195, "cbid": 307, "correlation": 241661773 + } + }, + { + "ph": "s", "id": 241661773, "pid": 5717, "tid": 5717, "ts": 6302685187887.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685213921.925, "dur": 385.891, + "args": { + "External id": 123196, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661793, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661793, "pid": 3, "tid": 7, "ts": 6302685213921.925, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685187950.249, "dur": 11.740, + "args": { + "External id": 123196, "cbid": 211, "correlation": 241661793 + } + }, + { + "ph": "s", "id": 241661793, "pid": 5717, "tid": 5717, "ts": 6302685187950.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685214308.392, "dur": 432.772, + "args": { + "External id": 123197, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661816, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661816, "pid": 3, "tid": 7, "ts": 6302685214308.392, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685187994.649, "dur": 7.050, + "args": { + "External id": 123197, "cbid": 211, "correlation": 241661816 + } + }, + { + "ph": "s", "id": 241661816, "pid": 5717, "tid": 5717, "ts": 6302685187994.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685214741.868, "dur": 216.225, + "args": { + "External id": 123198, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661828, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661828, "pid": 3, "tid": 7, "ts": 6302685214741.868, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685188055.469, "dur": 11.730, + "args": { + "External id": 123198, "cbid": 307, "correlation": 241661828 + } + }, + { + "ph": "s", "id": 241661828, "pid": 5717, "tid": 5717, "ts": 6302685188055.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685188110.469, "dur": 4.020, + "args": { + "External id": 123199, "cbid": 210, "correlation": 241661848 + } + }, + { + "ph": "f", "id": 241661848, "pid": 5717, "tid": 5717, "ts": 6302685188110.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685214958.797, "dur": 342.723, + "args": { + "External id": 
123199, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661849, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241661849, "pid": 3, "tid": 7, "ts": 6302685214958.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685188118.799, "dur": 11.350, + "args": { + "External id": 123199, "cbid": 211, "correlation": 241661849 + } + }, + { + "ph": "s", "id": 241661849, "pid": 5717, "tid": 5717, "ts": 6302685188118.799, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685215302.224, "dur": 42.528, + "args": { + "External id": 123200, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241661856, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241661856, "pid": 3, "tid": 7, "ts": 6302685215302.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685188188.739, "dur": 11.510, + "args": { + "External id": 123200, "cbid": 307, "correlation": 241661856 + } + }, + { + "ph": "s", "id": 241661856, "pid": 5717, "tid": 5717, "ts": 6302685188188.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685208470.044, "dur": 53.185, + "args": { + "External id": 123216, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241661871, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241661871, "pid": 3, "tid": 17, "ts": 6302685208470.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685188772.747, "dur": 16.800, + "args": { + "External id": 123216, "cbid": 211, "correlation": 241661871 + } + }, + { + "ph": "s", "id": 241661871, "pid": 5717, "tid": 5717, "ts": 6302685188772.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685208526.909, "dur": 18.112, + "args": { + "External id": 123232, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241661884, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241661884, "pid": 3, "tid": 17, "ts": 6302685208526.909, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685188949.697, "dur": 14.400, + "args": { + "External id": 123232, "cbid": 211, "correlation": 241661884 + } + }, + { + "ph": "s", "id": 241661884, "pid": 5717, "tid": 5717, "ts": 6302685188949.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189005.607, "dur": 1.910, + "args": { + "cbid": 135, "correlation": 241661894 + } + }, + { + "ph": "f", "id": 241661894, "pid": 5717, "tid": 5717, "ts": 6302685189005.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189010.197, "dur": 1.830, + "args": { + "cbid": 147, "correlation": 241661898 + } + }, + { + "ph": "s", "id": 241661898, "pid": 5717, "tid": 5717, "ts": 6302685189010.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685189115.117, "dur": 2.449, + "args": { + "External id": 123234, "cbid": 317, "correlation": 241661911 + } + }, + { + "ph": "f", "id": 241661911, "pid": 5717, "tid": 5717, "ts": 6302685189115.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189125.217, "dur": 2.780, + "args": { + "External id": 123234, "cbid": 135, "correlation": 241661913 + } + }, + { + "ph": "f", "id": 241661913, "pid": 5717, "tid": 5717, "ts": 6302685189125.217, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189130.686, "dur": 2.111, 
+ "args": { + "External id": 123234, "cbid": 147, "correlation": 241661917 + } + }, + { + "ph": "s", "id": 241661917, "pid": 5717, "tid": 5717, "ts": 6302685189130.686, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685189169.026, "dur": 1.620, + "args": { + "External id": 123234, "cbid": 409, "correlation": 241661920 + } + }, + { + "ph": "f", "id": 241661920, "pid": 5717, "tid": 5717, "ts": 6302685189169.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189178.166, "dur": 1.471, + "args": { + "External id": 123234, "cbid": 135, "correlation": 241661923 + } + }, + { + "ph": "f", "id": 241661923, "pid": 5717, "tid": 5717, "ts": 6302685189178.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189179.966, "dur": 1.671, + "args": { + "External id": 123234, "cbid": 147, "correlation": 241661924 + } + }, + { + "ph": "s", "id": 241661924, "pid": 5717, "tid": 5717, "ts": 6302685189179.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685211814.806, "dur": 4149.055, + "args": { + "External id": 123234, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241661926, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241661926, "pid": 3, "tid": 20, "ts": 6302685211814.806, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685189183.746, "dur": 28.210, + "args": { + "External id": 123234, "cbid": 430, "correlation": 241661926 + } + }, + { + "ph": "s", "id": 241661926, "pid": 5717, "tid": 5717, "ts": 6302685189183.746, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189213.916, "dur": 0.780, + "args": { + "External id": 123234, "cbid": 135, "correlation": 241661928 + } + }, + { + "ph": "f", "id": 241661928, "pid": 5717, "tid": 5717, "ts": 6302685189213.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189214.926, "dur": 0.940, + "args": { + "External id": 123234, "cbid": 147, "correlation": 241661929 + } + }, + { + "ph": "s", "id": 241661929, "pid": 5717, "tid": 5717, "ts": 6302685189214.926, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189219.116, "dur": 1.450, + "args": { + "External id": 123234, "cbid": 135, "correlation": 241661932 + } + }, + { + "ph": "f", "id": 241661932, "pid": 5717, "tid": 5717, "ts": 6302685189219.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189239.496, "dur": 0.770, + "args": { + "External 
id": 123234, "cbid": 135, "correlation": 241661939 + } + }, + { + "ph": "f", "id": 241661939, "pid": 5717, "tid": 5717, "ts": 6302685189239.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189287.986, "dur": 1.910, + "args": { + "External id": 123236, "cbid": 147, "correlation": 241661944 + } + }, + { + "ph": "s", "id": 241661944, "pid": 5717, "tid": 5717, "ts": 6302685189287.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189330.816, "dur": 1.790, + "args": { + "cbid": 135, "correlation": 241661959 + } + }, + { + "ph": "f", "id": 241661959, "pid": 5717, "tid": 5717, "ts": 6302685189330.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189404.566, "dur": 1.890, + "args": { + "cbid": 147, "correlation": 241661964 + } + }, + { + "ph": "s", "id": 241661964, "pid": 5717, "tid": 5717, "ts": 6302685189404.566, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189409.636, "dur": 1.050, + "args": { + "cbid": 147, "correlation": 241661968 + } + }, + { + "ph": "s", "id": 241661968, "pid": 5717, "tid": 5717, "ts": 6302685189409.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685189478.366, "dur": 4.780, + "args": { + "cbid": 147, "correlation": 241661974 + } + }, + { + "ph": "s", "id": 241661974, "pid": 5717, "tid": 5717, "ts": 6302685189478.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685189707.885, "dur": 1.930, + "args": { + "External id": 
123249, "cbid": 317, "correlation": 241662015 + } + }, + { + "ph": "f", "id": 241662015, "pid": 5717, "tid": 5717, "ts": 6302685189707.885, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685189725.545, "dur": 3.870, + "args": { + "External id": 123250, "cbid": 138, "correlation": 241662018 + } + }, + { + "ph": "f", "id": 241662018, "pid": 5717, "tid": 5717, "ts": 6302685189725.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685215970.965, "dur": 3.424, + "args": { + "External id": 123254, "device": 3, "context": 1, "stream": 7, "correlation": 241662029, "bytes": 7224, "memory bandwidth (GB/s)": 2.1098130841121496 + } + }, + { + "ph": "f", "id": 241662029, "pid": 3, "tid": 7, "ts": 6302685215970.965, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685189766.765, "dur": 20.690, + "args": { + "External id": 123254, "cbid": 41, "correlation": 241662029 + } + }, + { + "ph": "s", "id": 241662029, "pid": 5717, "tid": 5717, "ts": 6302685189766.765, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189795.835, "dur": 2.880, + "args": { + "External id": 123249, "cbid": 135, "correlation": 241662033 + } + }, + { + "ph": "f", "id": 241662033, "pid": 5717, "tid": 5717, "ts": 6302685189795.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685216007.413, "dur": 531.780, + "args": { + "External id": 123249, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241662037, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662037, "pid": 3, "tid": 7, "ts": 6302685216007.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685189802.845, "dur": 17.300, + "args": { + "External id": 123249, "cbid": 211, "correlation": 241662037 + } + }, + { + "ph": "s", "id": 241662037, "pid": 5717, "tid": 5717, "ts": 6302685189802.845, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685189976.184, "dur": 2.340, + "args": { + "cbid": 135, "correlation": 241662048 + } + }, + { + "ph": "f", "id": 241662048, "pid": 5717, "tid": 5717, "ts": 6302685189976.184, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685216546.201, "dur": 137.633, + "args": { + "External id": 123261, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662074, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662074, "pid": 3, "tid": 7, "ts": 6302685216546.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685190482.734, "dur": 32.819, + "args": { + "External id": 123261, "cbid": 307, "correlation": 241662074 + } + }, + { + "ph": "s", "id": 241662074, "pid": 5717, "tid": 5717, "ts": 6302685190482.734, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685216684.442, "dur": 158.370, + "args": { + "External id": 123267, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662097, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662097, "pid": 3, "tid": 7, "ts": 6302685216684.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685190805.643, "dur": 19.280, + "args": { + "External id": 123267, "cbid": 211, "correlation": 241662097 + } + }, + { + "ph": "s", "id": 241662097, "pid": 5717, "tid": 5717, "ts": 6302685190805.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685216843.484, "dur": 148.225, + "args": { + "External id": 123268, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662120, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662120, "pid": 3, "tid": 7, "ts": 6302685216843.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685190872.633, "dur": 9.960, + "args": { + "External id": 123268, "cbid": 211, "correlation": 241662120 + } + }, + { + "ph": "s", "id": 241662120, "pid": 5717, "tid": 5717, "ts": 6302685190872.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685216992.413, "dur": 140.449, + "args": { + "External id": 123269, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662143, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662143, "pid": 3, "tid": 7, "ts": 6302685216992.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685190923.493, "dur": 10.040, + "args": { + "External id": 123269, "cbid": 211, "correlation": 241662143 + } + }, + { + "ph": "s", "id": 241662143, "pid": 5717, "tid": 5717, "ts": 6302685190923.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685217133.470, "dur": 53.696, + "args": { + "External id": 123286, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662163, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662163, "pid": 3, "tid": 7, "ts": 6302685217133.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685191521.221, "dur": 22.070, + "args": { + "External id": 123286, "cbid": 307, "correlation": 241662163 + } + }, + { + "ph": "s", "id": 241662163, "pid": 5717, "tid": 5717, "ts": 6302685191521.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685217187.774, "dur": 73.121, + "args": { + "External id": 123302, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662181, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662181, "pid": 3, "tid": 7, "ts": 6302685217187.774, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685191948.490, "dur": 19.160, + "args": { + "External id": 123302, "cbid": 307, "correlation": 241662181 + } + }, + { + "ph": "s", "id": 241662181, "pid": 5717, "tid": 5717, "ts": 6302685191948.490, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685192337.609, "dur": 1.350, + "args": { + "External id": 123308, "cbid": 200, "correlation": 241662188 + } + }, + { + "ph": "f", "id": 241662188, "pid": 5717, "tid": 5717, "ts": 6302685192337.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685192339.289, "dur": 0.530, + "args": { + "External id": 123308, "cbid": 200, "correlation": 241662189 + } + }, + { + 
"ph": "f", "id": 241662189, "pid": 5717, "tid": 5717, "ts": 6302685192339.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685192409.589, "dur": 0.990, + "args": { + "External id": 123308, "cbid": 200, "correlation": 241662212 + } + }, + { + "ph": "f", "id": 241662212, "pid": 5717, "tid": 5717, "ts": 6302685192409.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685192428.289, "dur": 5.440, + "args": { + "External id": 123308, "cbid": 273, "correlation": 241662221 + } + }, + { + "ph": "f", "id": 241662221, "pid": 5717, "tid": 5717, "ts": 6302685192428.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685217261.503, "dur": 419.811, + "args": { + "External id": 123308, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662222, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662222, "pid": 3, "tid": 7, "ts": 6302685217261.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685192435.349, "dur": 31.090, + "args": { + "External id": 123308, "cbid": 211, "correlation": 241662222 + } + }, + { + "ph": "s", "id": 241662222, "pid": 5717, "tid": 5717, "ts": 6302685192435.349, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685217682.018, "dur": 142.465, + "args": { + "External id": 123314, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662245, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662245, "pid": 3, "tid": 7, "ts": 6302685217682.018, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685192649.958, "dur": 23.040, + "args": { + "External id": 123314, "cbid": 211, "correlation": 241662245 + } + }, + { + "ph": "s", "id": 241662245, "pid": 5717, "tid": 5717, "ts": 6302685192649.958, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685217825.091, "dur": 90.624, + "args": { + "External id": 123318, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662271, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662271, "pid": 3, "tid": 7, "ts": 6302685217825.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685193097.168, "dur": 27.640, + "args": { + "External id": 123318, "cbid": 307, "correlation": 241662271 + } + }, + { + "ph": "s", "id": 241662271, "pid": 5717, "tid": 5717, "ts": 6302685193097.168, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685217916.323, "dur": 358.051, + "args": { + "External id": 123319, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662291, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662291, "pid": 3, "tid": 7, "ts": 6302685217916.323, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685193247.447, "dur": 21.820, + "args": { + "External id": 123319, "cbid": 211, "correlation": 241662291 + } + }, + { + "ph": "s", "id": 241662291, "pid": 5717, "tid": 5717, "ts": 6302685193247.447, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685218274.982, "dur": 331.395, + "args": { + "External id": 123320, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662314, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662314, "pid": 3, "tid": 7, "ts": 6302685218274.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685193393.227, "dur": 22.480, + "args": { + "External id": 123320, "cbid": 211, "correlation": 241662314 + } + }, + { + "ph": "s", "id": 241662314, "pid": 5717, "tid": 5717, "ts": 6302685193393.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685218607.017, "dur": 231.714, + "args": { + "External id": 123321, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662326, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662326, "pid": 3, "tid": 7, "ts": 6302685218607.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685193543.736, "dur": 21.760, + "args": { + "External id": 123321, "cbid": 307, "correlation": 241662326 + } + }, + { + "ph": "s", "id": 241662326, "pid": 5717, "tid": 5717, "ts": 6302685193543.736, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685193665.046, "dur": 4.810, + "args": { + "External id": 123322, "cbid": 210, "correlation": 241662346 + } + }, + { + "ph": "f", "id": 241662346, "pid": 5717, "tid": 5717, "ts": 6302685193665.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685218839.371, "dur": 350.786, + "args": { + "External id": 
123322, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662347, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662347, "pid": 3, "tid": 7, "ts": 6302685218839.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685193676.926, "dur": 19.920, + "args": { + "External id": 123322, "cbid": 211, "correlation": 241662347 + } + }, + { + "ph": "s", "id": 241662347, "pid": 5717, "tid": 5717, "ts": 6302685193676.926, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685219190.797, "dur": 44.672, + "args": { + "External id": 123323, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662354, "pid": 3, "tid": 7, "ts": 6302685219190.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685193808.316, "dur": 22.410, + "args": { + "External id": 123323, "cbid": 307, "correlation": 241662354 + } + }, + { + "ph": "s", "id": 241662354, "pid": 5717, "tid": 5717, "ts": 6302685193808.316, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685212178.712, "dur": 362.435, + "args": { + "External id": 123339, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241662369, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241662369, "pid": 3, "tid": 17, "ts": 6302685212178.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685196046.601, "dur": 108.729, + "args": { + "External id": 123339, "cbid": 211, "correlation": 241662369 + } + }, + { + "ph": "s", "id": 241662369, "pid": 5717, "tid": 5717, "ts": 6302685196046.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685212556.795, "dur": 15.617, + "args": { + "External id": 123355, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241662382, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241662382, "pid": 3, "tid": 17, "ts": 6302685212556.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685197092.859, "dur": 89.269, + "args": { + "External id": 123355, "cbid": 211, "correlation": 241662382 + } + }, + { + "ph": "s", "id": 241662382, "pid": 5717, "tid": 5717, "ts": 6302685197092.859, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685197451.268, "dur": 11.960, + "args": { + "cbid": 135, "correlation": 241662392 + } + }, + { + "ph": "f", "id": 241662392, "pid": 5717, "tid": 5717, "ts": 6302685197451.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685197486.218, "dur": 12.029, + "args": { + "cbid": 147, "correlation": 241662396 + } + }, + { + "ph": "s", "id": 241662396, "pid": 5717, "tid": 5717, "ts": 6302685197486.218, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685198101.206, "dur": 9.250, + "args": { + "External id": 123357, "cbid": 317, "correlation": 241662409 + } + }, + { + "ph": "f", "id": 241662409, "pid": 5717, "tid": 5717, "ts": 6302685198101.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685198139.266, "dur": 13.080, + "args": { + "External id": 123357, "cbid": 135, "correlation": 241662411 + } + }, + { + "ph": "f", "id": 241662411, "pid": 5717, "tid": 5717, "ts": 6302685198139.266, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685198171.596, "dur": 
20.690, + "args": { + "External id": 123357, "cbid": 147, "correlation": 241662415 + } + }, + { + "ph": "s", "id": 241662415, "pid": 5717, "tid": 5717, "ts": 6302685198171.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685198402.336, "dur": 7.380, + "args": { + "External id": 123357, "cbid": 409, "correlation": 241662418 + } + }, + { + "ph": "f", "id": 241662418, "pid": 5717, "tid": 5717, "ts": 6302685198402.336, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685198443.876, "dur": 7.160, + "args": { + "External id": 123357, "cbid": 135, "correlation": 241662421 + } + }, + { + "ph": "f", "id": 241662421, "pid": 5717, "tid": 5717, "ts": 6302685198443.876, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685198452.616, "dur": 8.269, + "args": { + "External id": 123357, "cbid": 147, "correlation": 241662422 + } + }, + { + "ph": "s", "id": 241662422, "pid": 5717, "tid": 5717, "ts": 6302685198452.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685215964.533, "dur": 4140.479, + "args": { + "External id": 123357, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241662424, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241662424, "pid": 3, "tid": 20, "ts": 6302685215964.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685198470.115, "dur": 96.410, + "args": { + "External id": 123357, "cbid": 430, "correlation": 241662424 + } + }, + { + "ph": "s", "id": 241662424, "pid": 5717, "tid": 5717, "ts": 6302685198470.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685198575.585, "dur": 3.910, + "args": { + "External id": 123357, "cbid": 135, "correlation": 241662426 + } + }, + { + "ph": "f", "id": 241662426, "pid": 5717, "tid": 5717, "ts": 6302685198575.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685198580.555, "dur": 4.670, + "args": { + "External id": 123357, "cbid": 147, "correlation": 241662427 + } + }, + { + "ph": "s", "id": 241662427, "pid": 5717, "tid": 5717, "ts": 6302685198580.555, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685198600.355, "dur": 6.960, + "args": { + "External id": 123357, "cbid": 135, "correlation": 241662430 + } + }, + { + "ph": "f", "id": 241662430, "pid": 5717, "tid": 5717, "ts": 6302685198600.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685198684.455, "dur": 3.750, + "args": { + "External 
id": 123357, "cbid": 135, "correlation": 241662437 + } + }, + { + "ph": "f", "id": 241662437, "pid": 5717, "tid": 5717, "ts": 6302685198684.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685198971.244, "dur": 11.470, + "args": { + "External id": 123359, "cbid": 147, "correlation": 241662442 + } + }, + { + "ph": "s", "id": 241662442, "pid": 5717, "tid": 5717, "ts": 6302685198971.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685199157.364, "dur": 16.630, + "args": { + "cbid": 135, "correlation": 241662457 + } + }, + { + "ph": "f", "id": 241662457, "pid": 5717, "tid": 5717, "ts": 6302685199157.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685199883.712, "dur": 18.100, + "args": { + "cbid": 147, "correlation": 241662462 + } + }, + { + "ph": "s", "id": 241662462, "pid": 5717, "tid": 5717, "ts": 6302685199883.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685199926.762, "dur": 5.880, + "args": { + "cbid": 147, "correlation": 241662466 + } + }, + { + "ph": "s", "id": 241662466, "pid": 5717, "tid": 5717, "ts": 6302685199926.762, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685200626.631, "dur": 27.069, + "args": { + "cbid": 147, "correlation": 241662472 + } + }, + { + "ph": "s", "id": 241662472, "pid": 5717, "tid": 5717, "ts": 6302685200626.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685201756.658, "dur": 11.080, + "args": { + "External 
id": 123372, "cbid": 317, "correlation": 241662513 + } + }, + { + "ph": "f", "id": 241662513, "pid": 5717, "tid": 5717, "ts": 6302685201756.658, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685201875.118, "dur": 22.579, + "args": { + "External id": 123373, "cbid": 138, "correlation": 241662516 + } + }, + { + "ph": "f", "id": 241662516, "pid": 5717, "tid": 5717, "ts": 6302685201875.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685220110.036, "dur": 2.720, + "args": { + "External id": 123377, "device": 3, "context": 1, "stream": 7, "correlation": 241662527, "bytes": 7224, "memory bandwidth (GB/s)": 2.6558823529411764 + } + }, + { + "ph": "f", "id": 241662527, "pid": 3, "tid": 7, "ts": 6302685220110.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685202097.247, "dur": 148.230, + "args": { + "External id": 123377, "cbid": 41, "correlation": 241662527 + } + }, + { + "ph": "s", "id": 241662527, "pid": 5717, "tid": 5717, "ts": 6302685202097.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685202296.777, "dur": 89.999, + "args": { + "External id": 123372, "cbid": 135, "correlation": 241662531 + } + }, + { + "ph": "f", "id": 241662531, "pid": 5717, "tid": 5717, "ts": 6302685202296.777, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685220118.228, "dur": 19.136, + "args": { + "External id": 123372, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241662535, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662535, "pid": 3, "tid": 7, "ts": 6302685220118.228, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685202416.667, "dur": 123.969, + "args": { + "External id": 123372, "cbid": 211, "correlation": 241662535 + } + }, + { + "ph": "s", "id": 241662535, "pid": 5717, "tid": 5717, "ts": 6302685202416.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685203629.744, "dur": 13.570, + "args": { + "cbid": 135, "correlation": 241662546 + } + }, + { + "ph": "f", "id": 241662546, "pid": 5717, "tid": 5717, "ts": 6302685203629.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685220138.068, "dur": 20.769, + "args": { + "External id": 123384, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662572, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662572, "pid": 3, "tid": 7, "ts": 6302685220138.068, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685205587.099, "dur": 96.440, + "args": { + "External id": 123384, "cbid": 307, "correlation": 241662572 + } + }, + { + "ph": "s", "id": 241662572, "pid": 5717, "tid": 5717, "ts": 6302685205587.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685220159.637, "dur": 122.336, + "args": { + "External id": 123390, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662595, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662595, "pid": 3, "tid": 7, "ts": 6302685220159.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685206998.036, "dur": 87.200, + "args": { + "External id": 123390, "cbid": 211, "correlation": 241662595 + } + }, + { + "ph": "s", "id": 241662595, "pid": 5717, "tid": 5717, "ts": 6302685206998.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685220282.614, "dur": 121.024, + "args": { + "External id": 123391, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662618, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662618, "pid": 3, "tid": 7, "ts": 6302685220282.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685207285.105, "dur": 87.900, + "args": { + "External id": 123391, "cbid": 211, "correlation": 241662618 + } + }, + { + "ph": "s", "id": 241662618, "pid": 5717, "tid": 5717, "ts": 6302685207285.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685220404.310, "dur": 120.865, + "args": { + "External id": 123392, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662641, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662641, "pid": 3, "tid": 7, "ts": 6302685220404.310, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685207558.145, "dur": 46.510, + "args": { + "External id": 123392, "cbid": 211, "correlation": 241662641 + } + }, + { + "ph": "s", "id": 241662641, "pid": 5717, "tid": 5717, "ts": 6302685207558.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685220525.847, "dur": 53.313, + "args": { + "External id": 123409, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662661, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662661, "pid": 3, "tid": 7, "ts": 6302685220525.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685210006.499, "dur": 78.810, + "args": { + "External id": 123409, "cbid": 307, "correlation": 241662661 + } + }, + { + "ph": "s", "id": 241662661, "pid": 5717, "tid": 5717, "ts": 6302685210006.499, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685220579.864, "dur": 59.680, + "args": { + "External id": 123425, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662679, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662679, "pid": 3, "tid": 7, "ts": 6302685220579.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685211673.106, "dur": 74.899, + "args": { + "External id": 123425, "cbid": 307, "correlation": 241662679 + } + }, + { + "ph": "s", "id": 241662679, "pid": 5717, "tid": 5717, "ts": 6302685211673.106, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685213044.512, "dur": 4.160, + "args": { + "External id": 123431, "cbid": 200, "correlation": 241662686 + } + }, + { + "ph": "f", "id": 241662686, "pid": 5717, "tid": 5717, "ts": 6302685213044.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685213049.792, "dur": 1.750, + "args": { + "External id": 123431, "cbid": 200, "correlation": 241662687 + } + }, + { + 
"ph": "f", "id": 241662687, "pid": 5717, "tid": 5717, "ts": 6302685213049.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685213351.552, "dur": 3.580, + "args": { + "External id": 123431, "cbid": 200, "correlation": 241662710 + } + }, + { + "ph": "f", "id": 241662710, "pid": 5717, "tid": 5717, "ts": 6302685213351.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685213412.231, "dur": 17.671, + "args": { + "External id": 123431, "cbid": 273, "correlation": 241662719 + } + }, + { + "ph": "f", "id": 241662719, "pid": 5717, "tid": 5717, "ts": 6302685213412.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685220640.280, "dur": 399.267, + "args": { + "External id": 123431, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662720, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662720, "pid": 3, "tid": 7, "ts": 6302685220640.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685213435.182, "dur": 92.459, + "args": { + "External id": 123431, "cbid": 211, "correlation": 241662720 + } + }, + { + "ph": "s", "id": 241662720, "pid": 5717, "tid": 5717, "ts": 6302685213435.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685221040.251, "dur": 121.505, + "args": { + "External id": 123437, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662743, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662743, "pid": 3, "tid": 7, "ts": 6302685221040.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685214162.740, "dur": 66.460, + "args": { + "External id": 123437, "cbid": 211, "correlation": 241662743 + } + }, + { + "ph": "s", "id": 241662743, "pid": 5717, "tid": 5717, "ts": 6302685214162.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685221162.460, "dur": 89.953, + "args": { + "External id": 123441, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662769, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662769, "pid": 3, "tid": 7, "ts": 6302685221162.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685215648.557, "dur": 146.929, + "args": { + "External id": 123441, "cbid": 307, "correlation": 241662769 + } + }, + { + "ph": "s", "id": 241662769, "pid": 5717, "tid": 5717, "ts": 6302685215648.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685221253.021, "dur": 324.258, + "args": { + "External id": 123442, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662789, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662789, "pid": 3, "tid": 7, "ts": 6302685221253.021, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685216216.965, "dur": 119.060, + "args": { + "External id": 123442, "cbid": 211, "correlation": 241662789 + } + }, + { + "ph": "s", "id": 241662789, "pid": 5717, "tid": 5717, "ts": 6302685216216.965, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685221577.983, "dur": 317.251, + "args": { + "External id": 123443, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662812, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662812, "pid": 3, "tid": 7, "ts": 6302685221577.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685216565.924, "dur": 48.750, + "args": { + "External id": 123443, "cbid": 211, "correlation": 241662812 + } + }, + { + "ph": "s", "id": 241662812, "pid": 5717, "tid": 5717, "ts": 6302685216565.924, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685221895.938, "dur": 220.289, + "args": { + "External id": 123444, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662824, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662824, "pid": 3, "tid": 7, "ts": 6302685221895.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685216893.084, "dur": 42.210, + "args": { + "External id": 123444, "cbid": 307, "correlation": 241662824 + } + }, + { + "ph": "s", "id": 241662824, "pid": 5717, "tid": 5717, "ts": 6302685216893.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685217135.943, "dur": 11.290, + "args": { + "External id": 123445, "cbid": 210, "correlation": 241662844 + } + }, + { + "ph": "f", "id": 241662844, "pid": 5717, "tid": 5717, "ts": 6302685217135.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685222116.899, "dur": 322.787, + "args": { + "External id": 
123445, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662845, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241662845, "pid": 3, "tid": 7, "ts": 6302685222116.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685217163.053, "dur": 47.900, + "args": { + "External id": 123445, "cbid": 211, "correlation": 241662845 + } + }, + { + "ph": "s", "id": 241662845, "pid": 5717, "tid": 5717, "ts": 6302685217163.053, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685222440.294, "dur": 41.312, + "args": { + "External id": 123446, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241662852, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241662852, "pid": 3, "tid": 7, "ts": 6302685222440.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685217507.002, "dur": 47.830, + "args": { + "External id": 123446, "cbid": 307, "correlation": 241662852 + } + }, + { + "ph": "s", "id": 241662852, "pid": 5717, "tid": 5717, "ts": 6302685217507.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685220730.553, "dur": 26.912, + "args": { + "External id": 123462, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241662867, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241662867, "pid": 3, "tid": 17, "ts": 6302685220730.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685220648.475, "dur": 99.990, + "args": { + "External id": 123462, "cbid": 211, "correlation": 241662867 + } + }, + { + "ph": "s", "id": 241662867, "pid": 5717, "tid": 5717, "ts": 6302685220648.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685222026.179, "dur": 83.936, + "args": { + "External id": 123478, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241662880, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241662880, "pid": 3, "tid": 17, "ts": 6302685222026.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685221920.522, "dur": 126.290, + "args": { + "External id": 123478, "cbid": 211, "correlation": 241662880 + } + }, + { + "ph": "s", "id": 241662880, "pid": 5717, "tid": 5717, "ts": 6302685221920.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685222353.381, "dur": 11.170, + "args": { + "cbid": 135, "correlation": 241662890 + } + }, + { + "ph": "f", "id": 241662890, "pid": 5717, "tid": 5717, "ts": 6302685222353.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685222380.941, "dur": 10.080, + "args": { + "cbid": 147, "correlation": 241662894 + } + }, + { + "ph": "s", "id": 241662894, "pid": 5717, "tid": 5717, "ts": 6302685222380.941, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685222843.690, "dur": 6.730, + "args": { + "External id": 123480, "cbid": 317, "correlation": 241662907 + } + }, + { + "ph": "f", "id": 241662907, "pid": 5717, "tid": 5717, "ts": 6302685222843.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685222864.090, "dur": 9.040, + "args": { + "External id": 123480, "cbid": 135, "correlation": 241662909 + } + }, + { + "ph": "f", "id": 241662909, "pid": 5717, "tid": 5717, "ts": 6302685222864.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685222883.070, "dur": 
8.570, + "args": { + "External id": 123480, "cbid": 147, "correlation": 241662913 + } + }, + { + "ph": "s", "id": 241662913, "pid": 5717, "tid": 5717, "ts": 6302685222883.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685222994.210, "dur": 4.870, + "args": { + "External id": 123480, "cbid": 409, "correlation": 241662916 + } + }, + { + "ph": "f", "id": 241662916, "pid": 5717, "tid": 5717, "ts": 6302685222994.210, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685223041.480, "dur": 7.050, + "args": { + "External id": 123480, "cbid": 135, "correlation": 241662919 + } + }, + { + "ph": "f", "id": 241662919, "pid": 5717, "tid": 5717, "ts": 6302685223041.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685223049.900, "dur": 6.320, + "args": { + "External id": 123480, "cbid": 147, "correlation": 241662920 + } + }, + { + "ph": "s", "id": 241662920, "pid": 5717, "tid": 5717, "ts": 6302685223049.900, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685223121.099, "dur": 2490.035, + "args": { + "External id": 123480, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241662922, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241662922, "pid": 3, "tid": 20, "ts": 6302685223121.099, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685223063.840, "dur": 65.709, + "args": { + "External id": 123480, "cbid": 430, "correlation": 241662922 + } + }, + { + "ph": "s", "id": 241662922, "pid": 5717, "tid": 5717, "ts": 6302685223063.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685223136.900, "dur": 2.900, + "args": { + "External id": 123480, "cbid": 135, "correlation": 241662924 + } + }, + { + "ph": "f", "id": 241662924, "pid": 5717, "tid": 5717, "ts": 6302685223136.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685223140.649, "dur": 3.680, + "args": { + "External id": 123480, "cbid": 147, "correlation": 241662925 + } + }, + { + "ph": "s", "id": 241662925, "pid": 5717, "tid": 5717, "ts": 6302685223140.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685223158.740, "dur": 4.369, + "args": { + "External id": 123480, "cbid": 135, "correlation": 241662928 + } + }, + { + "ph": "f", "id": 241662928, "pid": 5717, "tid": 5717, "ts": 6302685223158.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685223244.709, "dur": 3.040, + "args": { + "External 
id": 123480, "cbid": 135, "correlation": 241662935 + } + }, + { + "ph": "f", "id": 241662935, "pid": 5717, "tid": 5717, "ts": 6302685223244.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685223741.388, "dur": 15.590, + "args": { + "External id": 123482, "cbid": 147, "correlation": 241662940 + } + }, + { + "ph": "s", "id": 241662940, "pid": 5717, "tid": 5717, "ts": 6302685223741.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685223912.658, "dur": 8.220, + "args": { + "cbid": 135, "correlation": 241662955 + } + }, + { + "ph": "f", "id": 241662955, "pid": 5717, "tid": 5717, "ts": 6302685223912.658, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685224259.077, "dur": 9.240, + "args": { + "cbid": 147, "correlation": 241662960 + } + }, + { + "ph": "s", "id": 241662960, "pid": 5717, "tid": 5717, "ts": 6302685224259.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685224283.157, "dur": 4.810, + "args": { + "cbid": 147, "correlation": 241662964 + } + }, + { + "ph": "s", "id": 241662964, "pid": 5717, "tid": 5717, "ts": 6302685224283.157, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685224637.946, "dur": 17.990, + "args": { + "cbid": 147, "correlation": 241662970 + } + }, + { + "ph": "s", "id": 241662970, "pid": 5717, "tid": 5717, "ts": 6302685224637.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685225444.414, "dur": 10.410, + "args": { + "External id": 
123495, "cbid": 317, "correlation": 241663011 + } + }, + { + "ph": "f", "id": 241663011, "pid": 5717, "tid": 5717, "ts": 6302685225444.414, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685225524.794, "dur": 21.800, + "args": { + "External id": 123496, "cbid": 138, "correlation": 241663014 + } + }, + { + "ph": "f", "id": 241663014, "pid": 5717, "tid": 5717, "ts": 6302685225524.794, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685225549.114, "dur": 3.080, + "args": { + "External id": 123496, "cbid": 138, "correlation": 241663015 + } + }, + { + "ph": "f", "id": 241663015, "pid": 5717, "tid": 5717, "ts": 6302685225549.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685225557.334, "dur": 2.250, + "args": { + "External id": 123496, "cbid": 138, "correlation": 241663016 + } + }, + { + "ph": "f", "id": 241663016, "pid": 5717, "tid": 5717, "ts": 6302685225557.334, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685225561.384, "dur": 2.040, + "args": { + "External id": 123496, "cbid": 138, "correlation": 241663017 + } + }, + { + "ph": "f", "id": 241663017, "pid": 5717, "tid": 5717, "ts": 6302685225561.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685225775.807, "dur": 0.544, + "args": { + "External id": 123500, "device": 3, "context": 1, "stream": 7, "correlation": 241663027, "bytes": 7224, "memory bandwidth (GB/s)": 13.279411764705882 + } + }, + { + "ph": "f", "id": 241663027, "pid": 3, "tid": 7, "ts": 6302685225775.807, + "cat": "ac2g", 
"name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685225708.704, "dur": 76.079, + "args": { + "External id": 123500, "cbid": 41, "correlation": 241663027 + } + }, + { + "ph": "s", "id": 241663027, "pid": 5717, "tid": 5717, "ts": 6302685225708.704, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685225814.854, "dur": 13.149, + "args": { + "External id": 123495, "cbid": 135, "correlation": 241663031 + } + }, + { + "ph": "f", "id": 241663031, "pid": 5717, "tid": 5717, "ts": 6302685225814.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685225901.376, "dur": 9.088, + "args": { + "External id": 123495, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663035, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663035, "pid": 3, "tid": 7, "ts": 6302685225901.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685225844.933, "dur": 68.270, + "args": { + "External id": 123495, "cbid": 211, "correlation": 241663035 + } + }, + { + "ph": "s", "id": 241663035, "pid": 5717, "tid": 5717, "ts": 6302685225844.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685226695.772, "dur": 14.709, + "args": { + "cbid": 135, "correlation": 241663046 + } + }, + { + "ph": "f", "id": 241663046, "pid": 5717, "tid": 5717, "ts": 6302685226695.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685228369.810, "dur": 20.320, + "args": { + "External id": 123507, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663072, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663072, "pid": 3, "tid": 7, "ts": 6302685228369.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685228277.278, "dur": 102.820, + "args": { + "External id": 123507, "cbid": 307, "correlation": 241663072 + } + }, + { + "ph": "s", "id": 241663072, "pid": 5717, "tid": 5717, "ts": 6302685228277.278, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685229252.409, "dur": 123.713, + "args": { + "External id": 123513, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663095, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663095, "pid": 3, "tid": 7, "ts": 6302685229252.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685229199.636, "dur": 61.860, + "args": { + "External id": 123513, "cbid": 211, "correlation": 241663095 + } + }, + { + "ph": "s", "id": 241663095, "pid": 5717, "tid": 5717, "ts": 6302685229199.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685229514.811, "dur": 121.665, + "args": { + "External id": 123514, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663118, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663118, "pid": 3, "tid": 7, "ts": 6302685229514.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685229450.715, "dur": 87.420, + "args": { + "External id": 123514, "cbid": 211, "correlation": 241663118 + } + }, + { + "ph": "s", "id": 241663118, "pid": 5717, "tid": 5717, "ts": 6302685229450.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685229710.652, "dur": 120.353, + "args": { + "External id": 123515, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663141, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663141, "pid": 3, "tid": 7, "ts": 6302685229710.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685229684.235, "dur": 31.230, + "args": { + "External id": 123515, "cbid": 211, "correlation": 241663141 + } + }, + { + "ph": "s", "id": 241663141, "pid": 5717, "tid": 5717, "ts": 6302685229684.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685231269.000, "dur": 53.217, + "args": { + "External id": 123532, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663161, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663161, "pid": 3, "tid": 7, "ts": 6302685231269.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685231231.421, "dur": 42.530, + "args": { + "External id": 123532, "cbid": 307, "correlation": 241663161 + } + }, + { + "ph": "s", "id": 241663161, "pid": 5717, "tid": 5717, "ts": 6302685231231.421, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685232110.286, "dur": 60.961, + "args": { + "External id": 123548, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663179, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663179, "pid": 3, "tid": 7, "ts": 6302685232110.286, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685232082.739, "dur": 30.350, + "args": { + "External id": 123548, "cbid": 307, "correlation": 241663179 + } + }, + { + "ph": "s", "id": 241663179, "pid": 5717, "tid": 5717, "ts": 6302685232082.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685232675.678, "dur": 1.370, + "args": { + "External id": 123554, "cbid": 200, "correlation": 241663186 + } + }, + { + "ph": "f", "id": 241663186, "pid": 5717, "tid": 5717, "ts": 6302685232675.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685232677.398, "dur": 0.530, + "args": { + "External id": 123554, "cbid": 200, "correlation": 241663187 + } + }, + { + 
"ph": "f", "id": 241663187, "pid": 5717, "tid": 5717, "ts": 6302685232677.398, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685232746.758, "dur": 0.940, + "args": { + "External id": 123554, "cbid": 200, "correlation": 241663210 + } + }, + { + "ph": "f", "id": 241663210, "pid": 5717, "tid": 5717, "ts": 6302685232746.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685232762.118, "dur": 5.650, + "args": { + "External id": 123554, "cbid": 273, "correlation": 241663219 + } + }, + { + "ph": "f", "id": 241663219, "pid": 5717, "tid": 5717, "ts": 6302685232762.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685232793.524, "dur": 409.539, + "args": { + "External id": 123554, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663220, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663220, "pid": 3, "tid": 7, "ts": 6302685232793.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685232769.358, "dur": 27.480, + "args": { + "External id": 123554, "cbid": 211, "correlation": 241663220 + } + }, + { + "ph": "s", "id": 241663220, "pid": 5717, "tid": 5717, "ts": 6302685232769.358, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685233203.735, "dur": 121.409, + "args": { + "External id": 123560, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663243, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663243, "pid": 3, "tid": 7, "ts": 6302685233203.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685232964.317, "dur": 20.520, + "args": { + "External id": 123560, "cbid": 211, "correlation": 241663243 + } + }, + { + "ph": "s", "id": 241663243, "pid": 5717, "tid": 5717, "ts": 6302685232964.317, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685233422.648, "dur": 89.473, + "args": { + "External id": 123564, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663269, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663269, "pid": 3, "tid": 7, "ts": 6302685233422.648, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685233403.336, "dur": 20.710, + "args": { + "External id": 123564, "cbid": 307, "correlation": 241663269 + } + }, + { + "ph": "s", "id": 241663269, "pid": 5717, "tid": 5717, "ts": 6302685233403.336, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685233512.793, "dur": 322.082, + "args": { + "External id": 123565, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663289, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663289, "pid": 3, "tid": 7, "ts": 6302685233512.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685233493.256, "dur": 13.440, + "args": { + "External id": 123565, "cbid": 211, "correlation": 241663289 + } + }, + { + "ph": "s", "id": 241663289, "pid": 5717, "tid": 5717, "ts": 6302685233493.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685233835.547, "dur": 317.763, + "args": { + "External id": 123566, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663312, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663312, "pid": 3, "tid": 7, "ts": 6302685233835.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685233554.866, "dur": 12.410, + "args": { + "External id": 123566, "cbid": 211, "correlation": 241663312 + } + }, + { + "ph": "s", "id": 241663312, "pid": 5717, "tid": 5717, "ts": 6302685233554.866, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685234153.982, "dur": 214.818, + "args": { + "External id": 123567, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663324, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663324, "pid": 3, "tid": 7, "ts": 6302685234153.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685233634.696, "dur": 11.390, + "args": { + "External id": 123567, "cbid": 307, "correlation": 241663324 + } + }, + { + "ph": "s", "id": 241663324, "pid": 5717, "tid": 5717, "ts": 6302685233634.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685233700.685, "dur": 3.640, + "args": { + "External id": 123568, "cbid": 210, "correlation": 241663344 + } + }, + { + "ph": "f", "id": 241663344, "pid": 5717, "tid": 5717, "ts": 6302685233700.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685234369.472, "dur": 323.394, + "args": { + "External id": 
123568, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663345, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663345, "pid": 3, "tid": 7, "ts": 6302685234369.472, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685233710.085, "dur": 18.671, + "args": { + "External id": 123568, "cbid": 211, "correlation": 241663345 + } + }, + { + "ph": "s", "id": 241663345, "pid": 5717, "tid": 5717, "ts": 6302685233710.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685234693.474, "dur": 50.944, + "args": { + "External id": 123569, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663352, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663352, "pid": 3, "tid": 7, "ts": 6302685234693.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685233815.335, "dur": 14.550, + "args": { + "External id": 123569, "cbid": 307, "correlation": 241663352 + } + }, + { + "ph": "s", "id": 241663352, "pid": 5717, "tid": 5717, "ts": 6302685233815.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685234679.714, "dur": 23.616, + "args": { + "External id": 123585, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241663367, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241663367, "pid": 3, "tid": 17, "ts": 6302685234679.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685234659.863, "dur": 21.650, + "args": { + "External id": 123585, "cbid": 211, "correlation": 241663367 + } + }, + { + "ph": "s", "id": 241663367, "pid": 5717, "tid": 5717, "ts": 6302685234659.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685234881.027, "dur": 6.560, + "args": { + "External id": 123601, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241663380, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241663380, "pid": 3, "tid": 17, "ts": 6302685234881.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685234865.483, "dur": 16.600, + "args": { + "External id": 123601, "cbid": 211, "correlation": 241663380 + } + }, + { + "ph": "s", "id": 241663380, "pid": 5717, "tid": 5717, "ts": 6302685234865.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685234925.643, "dur": 2.280, + "args": { + "cbid": 135, "correlation": 241663390 + } + }, + { + "ph": "f", "id": 241663390, "pid": 5717, "tid": 5717, "ts": 6302685234925.643, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685234931.363, "dur": 2.340, + "args": { + "cbid": 147, "correlation": 241663394 + } + }, + { + "ph": "s", "id": 241663394, "pid": 5717, "tid": 5717, "ts": 6302685234931.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685235026.453, "dur": 1.689, + "args": { + "External id": 123603, "cbid": 317, "correlation": 241663407 + } + }, + { + "ph": "f", "id": 241663407, "pid": 5717, "tid": 5717, "ts": 6302685235026.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235031.482, "dur": 2.280, + "args": { + "External id": 123603, "cbid": 135, "correlation": 241663409 + } + }, + { + "ph": "f", "id": 241663409, "pid": 5717, "tid": 5717, "ts": 6302685235031.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235036.262, "dur": 2.120, 
+ "args": { + "External id": 123603, "cbid": 147, "correlation": 241663413 + } + }, + { + "ph": "s", "id": 241663413, "pid": 5717, "tid": 5717, "ts": 6302685235036.262, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685235066.162, "dur": 1.240, + "args": { + "External id": 123603, "cbid": 409, "correlation": 241663416 + } + }, + { + "ph": "f", "id": 241663416, "pid": 5717, "tid": 5717, "ts": 6302685235066.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235074.453, "dur": 1.349, + "args": { + "External id": 123603, "cbid": 135, "correlation": 241663419 + } + }, + { + "ph": "f", "id": 241663419, "pid": 5717, "tid": 5717, "ts": 6302685235074.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235076.122, "dur": 1.520, + "args": { + "External id": 123603, "cbid": 147, "correlation": 241663420 + } + }, + { + "ph": "s", "id": 241663420, "pid": 5717, "tid": 5717, "ts": 6302685235076.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685235095.685, "dur": 2455.154, + "args": { + "External id": 123603, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241663422, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241663422, "pid": 3, "tid": 20, "ts": 6302685235095.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685235079.542, "dur": 16.611, + "args": { + "External id": 123603, "cbid": 430, "correlation": 241663422 + } + }, + { + "ph": "s", "id": 241663422, "pid": 5717, "tid": 5717, "ts": 6302685235079.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235098.002, "dur": 0.811, + "args": { + "External id": 123603, "cbid": 135, "correlation": 241663424 + } + }, + { + "ph": "f", "id": 241663424, "pid": 5717, "tid": 5717, "ts": 6302685235098.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235099.033, "dur": 0.920, + "args": { + "External id": 123603, "cbid": 147, "correlation": 241663425 + } + }, + { + "ph": "s", "id": 241663425, "pid": 5717, "tid": 5717, "ts": 6302685235099.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235102.813, "dur": 1.320, + "args": { + "External id": 123603, "cbid": 135, "correlation": 241663428 + } + }, + { + "ph": "f", "id": 241663428, "pid": 5717, "tid": 5717, "ts": 6302685235102.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235119.262, "dur": 0.800, + "args": { + "External 
id": 123603, "cbid": 135, "correlation": 241663435 + } + }, + { + "ph": "f", "id": 241663435, "pid": 5717, "tid": 5717, "ts": 6302685235119.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235175.522, "dur": 2.370, + "args": { + "External id": 123605, "cbid": 147, "correlation": 241663440 + } + }, + { + "ph": "s", "id": 241663440, "pid": 5717, "tid": 5717, "ts": 6302685235175.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235212.492, "dur": 2.020, + "args": { + "cbid": 135, "correlation": 241663455 + } + }, + { + "ph": "f", "id": 241663455, "pid": 5717, "tid": 5717, "ts": 6302685235212.492, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235316.132, "dur": 2.100, + "args": { + "cbid": 147, "correlation": 241663460 + } + }, + { + "ph": "s", "id": 241663460, "pid": 5717, "tid": 5717, "ts": 6302685235316.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235322.012, "dur": 0.910, + "args": { + "cbid": 147, "correlation": 241663464 + } + }, + { + "ph": "s", "id": 241663464, "pid": 5717, "tid": 5717, "ts": 6302685235322.012, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685235393.052, "dur": 3.510, + "args": { + "cbid": 147, "correlation": 241663470 + } + }, + { + "ph": "s", "id": 241663470, "pid": 5717, "tid": 5717, "ts": 6302685235393.052, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685235669.181, "dur": 1.700, + "args": { + "External id": 
123618, "cbid": 317, "correlation": 241663511 + } + }, + { + "ph": "f", "id": 241663511, "pid": 5717, "tid": 5717, "ts": 6302685235669.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685235688.181, "dur": 3.660, + "args": { + "External id": 123619, "cbid": 138, "correlation": 241663514 + } + }, + { + "ph": "f", "id": 241663514, "pid": 5717, "tid": 5717, "ts": 6302685235688.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685237557.047, "dur": 1.632, + "args": { + "External id": 123623, "device": 3, "context": 1, "stream": 7, "correlation": 241663524, "bytes": 7224, "memory bandwidth (GB/s)": 4.426470588235294 + } + }, + { + "ph": "f", "id": 241663524, "pid": 3, "tid": 7, "ts": 6302685237557.047, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685235722.341, "dur": 17.620, + "args": { + "External id": 123623, "cbid": 41, "correlation": 241663524 + } + }, + { + "ph": "s", "id": 241663524, "pid": 5717, "tid": 5717, "ts": 6302685235722.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235746.671, "dur": 2.760, + "args": { + "External id": 123618, "cbid": 135, "correlation": 241663528 + } + }, + { + "ph": "f", "id": 241663528, "pid": 5717, "tid": 5717, "ts": 6302685235746.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685237565.495, "dur": 9.281, + "args": { + "External id": 123618, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241663532, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663532, "pid": 3, "tid": 7, "ts": 6302685237565.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685235753.271, "dur": 15.510, + "args": { + "External id": 123618, "cbid": 211, "correlation": 241663532 + } + }, + { + "ph": "s", "id": 241663532, "pid": 5717, "tid": 5717, "ts": 6302685235753.271, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685235899.891, "dur": 2.040, + "args": { + "cbid": 135, "correlation": 241663543 + } + }, + { + "ph": "f", "id": 241663543, "pid": 5717, "tid": 5717, "ts": 6302685235899.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685237575.512, "dur": 25.344, + "args": { + "External id": 123630, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663569, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663569, "pid": 3, "tid": 7, "ts": 6302685237575.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685236241.660, "dur": 19.400, + "args": { + "External id": 123630, "cbid": 307, "correlation": 241663569 + } + }, + { + "ph": "s", "id": 241663569, "pid": 5717, "tid": 5717, "ts": 6302685236241.660, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685237601.592, "dur": 122.785, + "args": { + "External id": 123636, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663592, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663592, "pid": 3, "tid": 7, "ts": 6302685237601.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685236496.899, "dur": 13.810, + "args": { + "External id": 123636, "cbid": 211, "correlation": 241663592 + } + }, + { + "ph": "s", "id": 241663592, "pid": 5717, "tid": 5717, "ts": 6302685236496.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685237725.017, "dur": 120.449, + "args": { + "External id": 123637, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663615, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663615, "pid": 3, "tid": 7, "ts": 6302685237725.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685236542.739, "dur": 8.920, + "args": { + "External id": 123637, "cbid": 211, "correlation": 241663615 + } + }, + { + "ph": "s", "id": 241663615, "pid": 5717, "tid": 5717, "ts": 6302685236542.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685237846.138, "dur": 119.872, + "args": { + "External id": 123638, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663638, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663638, "pid": 3, "tid": 7, "ts": 6302685237846.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685236579.459, "dur": 7.010, + "args": { + "External id": 123638, "cbid": 211, "correlation": 241663638 + } + }, + { + "ph": "s", "id": 241663638, "pid": 5717, "tid": 5717, "ts": 6302685236579.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685237966.714, "dur": 51.809, + "args": { + "External id": 123655, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663658, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663658, "pid": 3, "tid": 7, "ts": 6302685237966.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685236981.298, "dur": 16.620, + "args": { + "External id": 123655, "cbid": 307, "correlation": 241663658 + } + }, + { + "ph": "s", "id": 241663658, "pid": 5717, "tid": 5717, "ts": 6302685236981.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685238019.195, "dur": 60.737, + "args": { + "External id": 123671, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663676, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663676, "pid": 3, "tid": 7, "ts": 6302685238019.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237277.488, "dur": 13.020, + "args": { + "External id": 123671, "cbid": 307, "correlation": 241663676 + } + }, + { + "ph": "s", "id": 241663676, "pid": 5717, "tid": 5717, "ts": 6302685237277.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685237479.777, "dur": 0.720, + "args": { + "External id": 123677, "cbid": 200, "correlation": 241663683 + } + }, + { + "ph": "f", "id": 241663683, "pid": 5717, "tid": 5717, "ts": 6302685237479.777, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685237480.677, "dur": 0.250, + "args": { + "External id": 123677, "cbid": 200, "correlation": 241663684 + } + }, + { + 
"ph": "f", "id": 241663684, "pid": 5717, "tid": 5717, "ts": 6302685237480.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685237516.297, "dur": 0.490, + "args": { + "External id": 123677, "cbid": 200, "correlation": 241663707 + } + }, + { + "ph": "f", "id": 241663707, "pid": 5717, "tid": 5717, "ts": 6302685237516.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685237525.057, "dur": 2.720, + "args": { + "External id": 123677, "cbid": 273, "correlation": 241663716 + } + }, + { + "ph": "f", "id": 241663716, "pid": 5717, "tid": 5717, "ts": 6302685237525.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685238080.572, "dur": 407.363, + "args": { + "External id": 123677, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663717, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663717, "pid": 3, "tid": 7, "ts": 6302685238080.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237528.547, "dur": 13.880, + "args": { + "External id": 123677, "cbid": 211, "correlation": 241663717 + } + }, + { + "ph": "s", "id": 241663717, "pid": 5717, "tid": 5717, "ts": 6302685237528.547, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685238488.575, "dur": 122.304, + "args": { + "External id": 123683, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663740, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663740, "pid": 3, "tid": 7, "ts": 6302685238488.575, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237625.857, "dur": 10.120, + "args": { + "External id": 123683, "cbid": 211, "correlation": 241663740 + } + }, + { + "ph": "s", "id": 241663740, "pid": 5717, "tid": 5717, "ts": 6302685237625.857, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685238611.519, "dur": 93.089, + "args": { + "External id": 123687, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663766, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663766, "pid": 3, "tid": 7, "ts": 6302685238611.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237835.006, "dur": 11.850, + "args": { + "External id": 123687, "cbid": 307, "correlation": 241663766 + } + }, + { + "ph": "s", "id": 241663766, "pid": 5717, "tid": 5717, "ts": 6302685237835.006, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685238705.280, "dur": 324.643, + "args": { + "External id": 123688, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663786, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663786, "pid": 3, "tid": 7, "ts": 6302685238705.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237888.946, "dur": 8.530, + "args": { + "External id": 123688, "cbid": 211, "correlation": 241663786 + } + }, + { + "ph": "s", "id": 241663786, "pid": 5717, "tid": 5717, "ts": 6302685237888.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685239030.531, "dur": 1327.210, + "args": { + "External id": 123689, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663809, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663809, "pid": 3, "tid": 7, "ts": 6302685239030.531, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237925.276, "dur": 8.160, + "args": { + "External id": 123689, "cbid": 211, "correlation": 241663809 + } + }, + { + "ph": "s", "id": 241663809, "pid": 5717, "tid": 5717, "ts": 6302685237925.276, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685240358.445, "dur": 361.986, + "args": { + "External id": 123690, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663821, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663821, "pid": 3, "tid": 7, "ts": 6302685240358.445, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685237976.966, "dur": 7.120, + "args": { + "External id": 123690, "cbid": 307, "correlation": 241663821 + } + }, + { + "ph": "s", "id": 241663821, "pid": 5717, "tid": 5717, "ts": 6302685237976.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685238015.616, "dur": 1.610, + "args": { + "External id": 123691, "cbid": 210, "correlation": 241663841 + } + }, + { + "ph": "f", "id": 241663841, "pid": 5717, "tid": 5717, "ts": 6302685238015.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685240721.007, "dur": 460.420, + "args": { + "External id": 
123691, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663842, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241663842, "pid": 3, "tid": 7, "ts": 6302685240721.007, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685238019.666, "dur": 7.320, + "args": { + "External id": 123691, "cbid": 211, "correlation": 241663842 + } + }, + { + "ph": "s", "id": 241663842, "pid": 5717, "tid": 5717, "ts": 6302685238019.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685241182.163, "dur": 46.112, + "args": { + "External id": 123692, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241663849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241663849, "pid": 3, "tid": 7, "ts": 6302685241182.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685238066.316, "dur": 6.940, + "args": { + "External id": 123692, "cbid": 307, "correlation": 241663849 + } + }, + { + "ph": "s", "id": 241663849, "pid": 5717, "tid": 5717, "ts": 6302685238066.316, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685238595.743, "dur": 44.033, + "args": { + "External id": 123708, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241663864, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241663864, "pid": 3, "tid": 17, "ts": 6302685238595.743, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685238580.365, "dur": 15.869, + "args": { + "External id": 123708, "cbid": 211, "correlation": 241663864 + } + }, + { + "ph": "s", "id": 241663864, "pid": 5717, "tid": 5717, "ts": 6302685238580.365, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685238772.001, "dur": 20.640, + "args": { + "External id": 123724, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241663877, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241663877, "pid": 3, "tid": 17, "ts": 6302685238772.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685238743.434, "dur": 12.490, + "args": { + "External id": 123724, "cbid": 211, "correlation": 241663877 + } + }, + { + "ph": "s", "id": 241663877, "pid": 5717, "tid": 5717, "ts": 6302685238743.434, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685238789.134, "dur": 1.920, + "args": { + "cbid": 135, "correlation": 241663887 + } + }, + { + "ph": "f", "id": 241663887, "pid": 5717, "tid": 5717, "ts": 6302685238789.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685238793.714, "dur": 1.780, + "args": { + "cbid": 147, "correlation": 241663891 + } + }, + { + "ph": "s", "id": 241663891, "pid": 5717, "tid": 5717, "ts": 6302685238793.714, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685238869.284, "dur": 1.200, + "args": { + "External id": 123726, "cbid": 317, "correlation": 241663904 + } + }, + { + "ph": "f", "id": 241663904, "pid": 5717, "tid": 5717, "ts": 6302685238869.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685238872.984, "dur": 1.680, + "args": { + "External id": 123726, "cbid": 135, "correlation": 241663906 + } + }, + { + "ph": "f", "id": 241663906, "pid": 5717, "tid": 5717, "ts": 6302685238872.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685238876.554, "dur": 1.590, 
+ "args": { + "External id": 123726, "cbid": 147, "correlation": 241663910 + } + }, + { + "ph": "s", "id": 241663910, "pid": 5717, "tid": 5717, "ts": 6302685238876.554, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685238898.364, "dur": 0.960, + "args": { + "External id": 123726, "cbid": 409, "correlation": 241663913 + } + }, + { + "ph": "f", "id": 241663913, "pid": 5717, "tid": 5717, "ts": 6302685238898.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685238904.624, "dur": 1.030, + "args": { + "External id": 123726, "cbid": 135, "correlation": 241663916 + } + }, + { + "ph": "f", "id": 241663916, "pid": 5717, "tid": 5717, "ts": 6302685238904.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685238905.904, "dur": 1.190, + "args": { + "External id": 123726, "cbid": 147, "correlation": 241663917 + } + }, + { + "ph": "s", "id": 241663917, "pid": 5717, "tid": 5717, "ts": 6302685238905.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685239020.834, "dur": 2841.078, + "args": { + "External id": 123726, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241663919, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241663919, "pid": 3, "tid": 20, "ts": 6302685239020.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685238908.584, "dur": 12.680, + "args": { + "External id": 123726, "cbid": 430, "correlation": 241663919 + } + }, + { + "ph": "s", "id": 241663919, "pid": 5717, "tid": 5717, "ts": 6302685238908.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685238922.604, "dur": 0.550, + "args": { + "External id": 123726, "cbid": 135, "correlation": 241663921 + } + }, + { + "ph": "f", "id": 241663921, "pid": 5717, "tid": 5717, "ts": 6302685238922.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685238923.364, "dur": 0.720, + "args": { + "External id": 123726, "cbid": 147, "correlation": 241663922 + } + }, + { + "ph": "s", "id": 241663922, "pid": 5717, "tid": 5717, "ts": 6302685238923.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685238926.254, "dur": 1.010, + "args": { + "External id": 123726, "cbid": 135, "correlation": 241663925 + } + }, + { + "ph": "f", "id": 241663925, "pid": 5717, "tid": 5717, "ts": 6302685238926.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685238942.264, "dur": 0.600, + "args": { + "External 
id": 123726, "cbid": 135, "correlation": 241663932 + } + }, + { + "ph": "f", "id": 241663932, "pid": 5717, "tid": 5717, "ts": 6302685238942.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685238980.864, "dur": 1.430, + "args": { + "External id": 123728, "cbid": 147, "correlation": 241663937 + } + }, + { + "ph": "s", "id": 241663937, "pid": 5717, "tid": 5717, "ts": 6302685238980.864, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685239004.993, "dur": 1.280, + "args": { + "cbid": 135, "correlation": 241663952 + } + }, + { + "ph": "f", "id": 241663952, "pid": 5717, "tid": 5717, "ts": 6302685239004.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685239060.564, "dur": 1.520, + "args": { + "cbid": 147, "correlation": 241663957 + } + }, + { + "ph": "s", "id": 241663957, "pid": 5717, "tid": 5717, "ts": 6302685239060.564, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685239064.673, "dur": 0.811, + "args": { + "cbid": 147, "correlation": 241663961 + } + }, + { + "ph": "s", "id": 241663961, "pid": 5717, "tid": 5717, "ts": 6302685239064.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685239117.013, "dur": 3.050, + "args": { + "cbid": 147, "correlation": 241663967 + } + }, + { + "ph": "s", "id": 241663967, "pid": 5717, "tid": 5717, "ts": 6302685239117.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685239262.133, "dur": 1.410, + "args": { + "External id": 
123741, "cbid": 317, "correlation": 241664008 + } + }, + { + "ph": "f", "id": 241664008, "pid": 5717, "tid": 5717, "ts": 6302685239262.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685239275.483, "dur": 3.500, + "args": { + "External id": 123742, "cbid": 138, "correlation": 241664011 + } + }, + { + "ph": "f", "id": 241664011, "pid": 5717, "tid": 5717, "ts": 6302685239275.483, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685241866.072, "dur": 1.632, + "args": { + "External id": 123746, "device": 3, "context": 1, "stream": 7, "correlation": 241664021, "bytes": 7224, "memory bandwidth (GB/s)": 4.426470588235294 + } + }, + { + "ph": "f", "id": 241664021, "pid": 3, "tid": 7, "ts": 6302685241866.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685239315.533, "dur": 15.190, + "args": { + "External id": 123746, "cbid": 41, "correlation": 241664021 + } + }, + { + "ph": "s", "id": 241664021, "pid": 5717, "tid": 5717, "ts": 6302685239315.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685239337.423, "dur": 2.330, + "args": { + "External id": 123741, "cbid": 135, "correlation": 241664025 + } + }, + { + "ph": "f", "id": 241664025, "pid": 5717, "tid": 5717, "ts": 6302685239337.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685241874.296, "dur": 19.296, + "args": { + "External id": 123741, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241664029, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664029, "pid": 3, "tid": 7, "ts": 6302685241874.296, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685239343.123, "dur": 13.080, + "args": { + "External id": 123741, "cbid": 211, "correlation": 241664029 + } + }, + { + "ph": "s", "id": 241664029, "pid": 5717, "tid": 5717, "ts": 6302685239343.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685239471.712, "dur": 1.640, + "args": { + "cbid": 135, "correlation": 241664040 + } + }, + { + "ph": "f", "id": 241664040, "pid": 5717, "tid": 5717, "ts": 6302685239471.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685241894.264, "dur": 19.520, + "args": { + "External id": 123753, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664066, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664066, "pid": 3, "tid": 7, "ts": 6302685241894.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685239749.642, "dur": 13.920, + "args": { + "External id": 123753, "cbid": 307, "correlation": 241664066 + } + }, + { + "ph": "s", "id": 241664066, "pid": 5717, "tid": 5717, "ts": 6302685239749.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685241914.520, "dur": 123.201, + "args": { + "External id": 123759, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664089, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664089, "pid": 3, "tid": 7, "ts": 6302685241914.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685239935.091, "dur": 13.120, + "args": { + "External id": 123759, "cbid": 211, "correlation": 241664089 + } + }, + { + "ph": "s", "id": 241664089, "pid": 5717, "tid": 5717, "ts": 6302685239935.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685242038.457, "dur": 123.937, + "args": { + "External id": 123760, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664112, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664112, "pid": 3, "tid": 7, "ts": 6302685242038.457, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685239979.001, "dur": 8.450, + "args": { + "External id": 123760, "cbid": 211, "correlation": 241664112 + } + }, + { + "ph": "s", "id": 241664112, "pid": 5717, "tid": 5717, "ts": 6302685239979.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685242162.970, "dur": 120.865, + "args": { + "External id": 123761, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664135, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664135, "pid": 3, "tid": 7, "ts": 6302685242162.970, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685240016.071, "dur": 6.780, + "args": { + "External id": 123761, "cbid": 211, "correlation": 241664135 + } + }, + { + "ph": "s", "id": 241664135, "pid": 5717, "tid": 5717, "ts": 6302685240016.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685242284.475, "dur": 53.121, + "args": { + "External id": 123778, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664155, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664155, "pid": 3, "tid": 7, "ts": 6302685242284.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685240376.221, "dur": 12.689, + "args": { + "External id": 123778, "cbid": 307, "correlation": 241664155 + } + }, + { + "ph": "s", "id": 241664155, "pid": 5717, "tid": 5717, "ts": 6302685240376.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685242338.300, "dur": 59.840, + "args": { + "External id": 123794, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664173, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664173, "pid": 3, "tid": 7, "ts": 6302685242338.300, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685240636.740, "dur": 12.730, + "args": { + "External id": 123794, "cbid": 307, "correlation": 241664173 + } + }, + { + "ph": "s", "id": 241664173, "pid": 5717, "tid": 5717, "ts": 6302685240636.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685240835.709, "dur": 0.720, + "args": { + "External id": 123800, "cbid": 200, "correlation": 241664180 + } + }, + { + "ph": "f", "id": 241664180, "pid": 5717, "tid": 5717, "ts": 6302685240835.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685240836.600, "dur": 0.260, + "args": { + "External id": 123800, "cbid": 200, "correlation": 241664181 + } + }, + { + 
"ph": "f", "id": 241664181, "pid": 5717, "tid": 5717, "ts": 6302685240836.600, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685240876.739, "dur": 0.520, + "args": { + "External id": 123800, "cbid": 200, "correlation": 241664204 + } + }, + { + "ph": "f", "id": 241664204, "pid": 5717, "tid": 5717, "ts": 6302685240876.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685240884.899, "dur": 2.650, + "args": { + "External id": 123800, "cbid": 273, "correlation": 241664213 + } + }, + { + "ph": "f", "id": 241664213, "pid": 5717, "tid": 5717, "ts": 6302685240884.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685242398.844, "dur": 1176.585, + "args": { + "External id": 123800, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664214, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664214, "pid": 3, "tid": 7, "ts": 6302685242398.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685240888.349, "dur": 20.200, + "args": { + "External id": 123800, "cbid": 211, "correlation": 241664214 + } + }, + { + "ph": "s", "id": 241664214, "pid": 5717, "tid": 5717, "ts": 6302685240888.349, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685243576.069, "dur": 273.090, + "args": { + "External id": 123806, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664237, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664237, "pid": 3, "tid": 7, "ts": 6302685243576.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685240995.119, "dur": 10.070, + "args": { + "External id": 123806, "cbid": 211, "correlation": 241664237 + } + }, + { + "ph": "s", "id": 241664237, "pid": 5717, "tid": 5717, "ts": 6302685240995.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685243849.799, "dur": 90.945, + "args": { + "External id": 123810, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664263, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664263, "pid": 3, "tid": 7, "ts": 6302685243849.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685241213.308, "dur": 12.660, + "args": { + "External id": 123810, "cbid": 307, "correlation": 241664263 + } + }, + { + "ph": "s", "id": 241664263, "pid": 5717, "tid": 5717, "ts": 6302685241213.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685243941.352, "dur": 334.978, + "args": { + "External id": 123811, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664283, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664283, "pid": 3, "tid": 7, "ts": 6302685243941.352, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685241269.948, "dur": 8.080, + "args": { + "External id": 123811, "cbid": 211, "correlation": 241664283 + } + }, + { + "ph": "s", "id": 241664283, "pid": 5717, "tid": 5717, "ts": 6302685241269.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685244277.002, "dur": 349.123, + "args": { + "External id": 123812, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664306, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664306, "pid": 3, "tid": 7, "ts": 6302685244277.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685241317.698, "dur": 9.480, + "args": { + "External id": 123812, "cbid": 211, "correlation": 241664306 + } + }, + { + "ph": "s", "id": 241664306, "pid": 5717, "tid": 5717, "ts": 6302685241317.698, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685244626.797, "dur": 240.833, + "args": { + "External id": 123813, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664318, "pid": 3, "tid": 7, "ts": 6302685244626.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685241379.058, "dur": 7.670, + "args": { + "External id": 123813, "cbid": 307, "correlation": 241664318 + } + }, + { + "ph": "s", "id": 241664318, "pid": 5717, "tid": 5717, "ts": 6302685241379.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685241421.678, "dur": 1.640, + "args": { + "External id": 123814, "cbid": 210, "correlation": 241664338 + } + }, + { + "ph": "f", "id": 241664338, "pid": 5717, "tid": 5717, "ts": 6302685241421.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685244868.302, "dur": 391.716, + "args": { + "External id": 
123814, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664339, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664339, "pid": 3, "tid": 7, "ts": 6302685244868.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685241425.918, "dur": 7.850, + "args": { + "External id": 123814, "cbid": 211, "correlation": 241664339 + } + }, + { + "ph": "s", "id": 241664339, "pid": 5717, "tid": 5717, "ts": 6302685241425.918, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685245260.690, "dur": 45.952, + "args": { + "External id": 123815, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664346, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664346, "pid": 3, "tid": 7, "ts": 6302685245260.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685241474.098, "dur": 7.200, + "args": { + "External id": 123815, "cbid": 307, "correlation": 241664346 + } + }, + { + "ph": "s", "id": 241664346, "pid": 5717, "tid": 5717, "ts": 6302685241474.098, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685242033.369, "dur": 24.448, + "args": { + "External id": 123831, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241664361, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241664361, "pid": 3, "tid": 17, "ts": 6302685242033.369, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685242017.917, "dur": 16.060, + "args": { + "External id": 123831, "cbid": 211, "correlation": 241664361 + } + }, + { + "ph": "s", "id": 241664361, "pid": 5717, "tid": 5717, "ts": 6302685242017.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242066.277, "dur": 11.369, + "args": { + "cbid": 138, "correlation": 241664365 + } + }, + { + "ph": "f", "id": 241664365, "pid": 5717, "tid": 423623104, "ts": 6302685242066.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242078.197, "dur": 1.040, + "args": { + "cbid": 138, "correlation": 241664366 + } + }, + { + "ph": "f", "id": 241664366, "pid": 5717, "tid": 423623104, "ts": 6302685242078.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242086.597, "dur": 0.729, + "args": { + "cbid": 138, "correlation": 241664367 + } + }, + { + "ph": "f", "id": 241664367, "pid": 5717, "tid": 423623104, "ts": 6302685242086.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242093.277, "dur": 1.429, + "args": { + "cbid": 138, "correlation": 241664368 + } + }, + { + "ph": "f", "id": 241664368, "pid": 5717, "tid": 423623104, "ts": 6302685242093.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242094.966, "dur": 0.580, + 
"args": { + "cbid": 138, "correlation": 241664369 + } + }, + { + "ph": "f", "id": 241664369, "pid": 5717, "tid": 423623104, "ts": 6302685242094.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242096.866, "dur": 0.980, + "args": { + "cbid": 138, "correlation": 241664371 + } + }, + { + "ph": "f", "id": 241664371, "pid": 5717, "tid": 423623104, "ts": 6302685242096.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242100.226, "dur": 1.900, + "args": { + "cbid": 138, "correlation": 241664374 + } + }, + { + "ph": "f", "id": 241664374, "pid": 5717, "tid": 423623104, "ts": 6302685242100.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242102.437, "dur": 0.540, + "args": { + "cbid": 138, "correlation": 241664375 + } + }, + { + "ph": "f", "id": 241664375, "pid": 5717, "tid": 423623104, "ts": 6302685242102.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242104.226, "dur": 0.920, + "args": { + "cbid": 138, "correlation": 241664377 + } + }, + { + "ph": "f", "id": 241664377, "pid": 5717, "tid": 423623104, "ts": 6302685242104.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242107.366, "dur": 1.631, + "args": { + "cbid": 138, "correlation": 241664379 + } + }, + { + "ph": "f", "id": 241664379, "pid": 5717, "tid": 423623104, "ts": 6302685242107.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 
6302685242109.386, "dur": 0.540, + "args": { + "cbid": 138, "correlation": 241664380 + } + }, + { + "ph": "f", "id": 241664380, "pid": 5717, "tid": 423623104, "ts": 6302685242109.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242110.857, "dur": 0.909, + "args": { + "cbid": 138, "correlation": 241664381 + } + }, + { + "ph": "f", "id": 241664381, "pid": 5717, "tid": 423623104, "ts": 6302685242110.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242113.997, "dur": 1.700, + "args": { + "cbid": 138, "correlation": 241664384 + } + }, + { + "ph": "f", "id": 241664384, "pid": 5717, "tid": 423623104, "ts": 6302685242113.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242115.957, "dur": 0.529, + "args": { + "cbid": 138, "correlation": 241664385 + } + }, + { + "ph": "f", "id": 241664385, "pid": 5717, "tid": 423623104, "ts": 6302685242115.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242117.486, "dur": 0.540, + "args": { + "cbid": 138, "correlation": 241664386 + } + }, + { + "ph": "f", "id": 241664386, "pid": 5717, "tid": 423623104, "ts": 6302685242117.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242120.146, "dur": 1.380, + "args": { + "cbid": 138, "correlation": 241664387 + } + }, + { + "ph": "f", "id": 241664387, "pid": 5717, "tid": 423623104, "ts": 6302685242120.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, 
"tid": 423623104, + "ts": 6302685242121.786, "dur": 0.511, + "args": { + "cbid": 138, "correlation": 241664388 + } + }, + { + "ph": "f", "id": 241664388, "pid": 5717, "tid": 423623104, "ts": 6302685242121.786, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242123.177, "dur": 0.580, + "args": { + "cbid": 138, "correlation": 241664389 + } + }, + { + "ph": "f", "id": 241664389, "pid": 5717, "tid": 423623104, "ts": 6302685242123.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242125.926, "dur": 1.851, + "args": { + "cbid": 138, "correlation": 241664390 + } + }, + { + "ph": "f", "id": 241664390, "pid": 5717, "tid": 423623104, "ts": 6302685242125.926, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242128.046, "dur": 0.560, + "args": { + "cbid": 138, "correlation": 241664391 + } + }, + { + "ph": "f", "id": 241664391, "pid": 5717, "tid": 423623104, "ts": 6302685242128.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685242129.346, "dur": 0.571, + "args": { + "cbid": 138, "correlation": 241664392 + } + }, + { + "ph": "f", "id": 241664392, "pid": 5717, "tid": 423623104, "ts": 6302685242129.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685242206.939, "dur": 9.696, + "args": { + 
"External id": 123847, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241664395, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241664395, "pid": 3, "tid": 17, "ts": 6302685242206.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685242193.206, "dur": 13.750, + "args": { + "External id": 123847, "cbid": 211, "correlation": 241664395 + } + }, + { + "ph": "s", "id": 241664395, "pid": 5717, "tid": 5717, "ts": 6302685242193.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242241.726, "dur": 1.840, + "args": { + "cbid": 135, "correlation": 241664405 + } + }, + { + "ph": "f", "id": 241664405, "pid": 5717, "tid": 5717, "ts": 6302685242241.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242246.256, "dur": 1.880, + "args": { + "cbid": 147, "correlation": 241664409 + } + }, + { + "ph": "s", "id": 241664409, "pid": 5717, "tid": 5717, "ts": 6302685242246.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685242330.106, "dur": 1.470, + "args": { + "External id": 123849, "cbid": 317, "correlation": 241664422 + } + }, + { + "ph": "f", "id": 241664422, "pid": 5717, "tid": 5717, "ts": 6302685242330.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242334.206, "dur": 1.750, + "args": { + "External id": 123849, "cbid": 135, "correlation": 241664424 + } + }, + { + "ph": "f", 
"id": 241664424, "pid": 5717, "tid": 5717, "ts": 6302685242334.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242337.956, "dur": 1.480, + "args": { + "External id": 123849, "cbid": 147, "correlation": 241664428 + } + }, + { + "ph": "s", "id": 241664428, "pid": 5717, "tid": 5717, "ts": 6302685242337.956, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685242362.356, "dur": 0.930, + "args": { + "External id": 123849, "cbid": 409, "correlation": 241664431 + } + }, + { + "ph": "f", "id": 241664431, "pid": 5717, "tid": 5717, "ts": 6302685242362.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242368.606, "dur": 1.070, + "args": { + "External id": 123849, "cbid": 135, "correlation": 241664434 + } + }, + { + "ph": "f", "id": 241664434, "pid": 5717, "tid": 5717, "ts": 6302685242368.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242369.916, "dur": 1.170, + "args": { + "External id": 123849, "cbid": 147, "correlation": 241664435 + } + }, + { + "ph": "s", "id": 241664435, "pid": 5717, "tid": 5717, "ts": 6302685242369.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685242396.060, "dur": 3804.317, + "args": { + "External id": 123849, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241664437, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241664437, "pid": 3, "tid": 20, "ts": 6302685242396.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685242372.556, "dur": 12.790, + "args": { + "External id": 123849, "cbid": 430, "correlation": 241664437 + } + }, + { + "ph": "s", "id": 241664437, "pid": 5717, "tid": 5717, "ts": 6302685242372.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242386.716, "dur": 0.530, + "args": { + "External id": 123849, "cbid": 135, "correlation": 241664439 + } + }, + { + "ph": "f", "id": 241664439, "pid": 5717, "tid": 5717, "ts": 6302685242386.716, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242387.406, "dur": 0.730, + "args": { + "External id": 123849, "cbid": 147, "correlation": 241664440 + } + }, + { + "ph": "s", "id": 241664440, "pid": 5717, "tid": 5717, "ts": 6302685242387.406, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242390.176, "dur": 0.980, + "args": { + "External id": 123849, "cbid": 135, "correlation": 241664443 + } + }, + { + "ph": "f", "id": 241664443, "pid": 5717, "tid": 5717, "ts": 6302685242390.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242403.076, "dur": 0.620, + "args": { + "External 
id": 123849, "cbid": 135, "correlation": 241664450 + } + }, + { + "ph": "f", "id": 241664450, "pid": 5717, "tid": 5717, "ts": 6302685242403.076, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242453.596, "dur": 2.050, + "args": { + "External id": 123851, "cbid": 147, "correlation": 241664455 + } + }, + { + "ph": "s", "id": 241664455, "pid": 5717, "tid": 5717, "ts": 6302685242453.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242493.406, "dur": 2.800, + "args": { + "cbid": 135, "correlation": 241664470 + } + }, + { + "ph": "f", "id": 241664470, "pid": 5717, "tid": 5717, "ts": 6302685242493.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242563.976, "dur": 1.660, + "args": { + "cbid": 147, "correlation": 241664475 + } + }, + { + "ph": "s", "id": 241664475, "pid": 5717, "tid": 5717, "ts": 6302685242563.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242569.085, "dur": 0.871, + "args": { + "cbid": 147, "correlation": 241664479 + } + }, + { + "ph": "s", "id": 241664479, "pid": 5717, "tid": 5717, "ts": 6302685242569.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685242622.555, "dur": 3.260, + "args": { + "cbid": 147, "correlation": 241664485 + } + }, + { + "ph": "s", "id": 241664485, "pid": 5717, "tid": 5717, "ts": 6302685242622.555, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685242769.795, "dur": 1.540, + "args": { + "External id": 
123864, "cbid": 317, "correlation": 241664526 + } + }, + { + "ph": "f", "id": 241664526, "pid": 5717, "tid": 5717, "ts": 6302685242769.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685242786.715, "dur": 3.760, + "args": { + "External id": 123865, "cbid": 138, "correlation": 241664529 + } + }, + { + "ph": "f", "id": 241664529, "pid": 5717, "tid": 5717, "ts": 6302685242786.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685246207.865, "dur": 1.792, + "args": { + "External id": 123869, "device": 3, "context": 1, "stream": 7, "correlation": 241664539, "bytes": 7224, "memory bandwidth (GB/s)": 4.03125 + } + }, + { + "ph": "f", "id": 241664539, "pid": 3, "tid": 7, "ts": 6302685246207.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685242818.505, "dur": 15.880, + "args": { + "External id": 123869, "cbid": 41, "correlation": 241664539 + } + }, + { + "ph": "s", "id": 241664539, "pid": 5717, "tid": 5717, "ts": 6302685242818.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242840.515, "dur": 2.410, + "args": { + "External id": 123864, "cbid": 135, "correlation": 241664543 + } + }, + { + "ph": "f", "id": 241664543, "pid": 5717, "tid": 5717, "ts": 6302685242840.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685246216.089, "dur": 22.368, + "args": { + "External id": 123864, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 
241664547, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664547, "pid": 3, "tid": 7, "ts": 6302685246216.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685242846.425, "dur": 13.850, + "args": { + "External id": 123864, "cbid": 211, "correlation": 241664547 + } + }, + { + "ph": "s", "id": 241664547, "pid": 5717, "tid": 5717, "ts": 6302685242846.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685242978.624, "dur": 1.560, + "args": { + "cbid": 135, "correlation": 241664558 + } + }, + { + "ph": "f", "id": 241664558, "pid": 5717, "tid": 5717, "ts": 6302685242978.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685246239.097, "dur": 36.832, + "args": { + "External id": 123876, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664584, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664584, "pid": 3, "tid": 7, "ts": 6302685246239.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685243282.504, "dur": 25.490, + "args": { + "External id": 123876, "cbid": 307, "correlation": 241664584 + } + }, + { + "ph": "s", "id": 241664584, "pid": 5717, "tid": 5717, "ts": 6302685243282.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685246276.569, "dur": 805.638, + "args": { + "External id": 123882, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664607, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664607, "pid": 3, "tid": 7, "ts": 6302685246276.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685243504.053, "dur": 21.320, + "args": { + "External id": 123882, "cbid": 211, "correlation": 241664607 + } + }, + { + "ph": "s", "id": 241664607, "pid": 5717, "tid": 5717, "ts": 6302685243504.053, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685247082.847, "dur": 193.281, + "args": { + "External id": 123883, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664630, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664630, "pid": 3, "tid": 7, "ts": 6302685247082.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685243575.833, "dur": 14.610, + "args": { + "External id": 123883, "cbid": 211, "correlation": 241664630 + } + }, + { + "ph": "s", "id": 241664630, "pid": 5717, "tid": 5717, "ts": 6302685243575.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685247276.768, "dur": 200.930, + "args": { + "External id": 123884, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664653, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664653, "pid": 3, "tid": 7, "ts": 6302685247276.768, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685243640.413, "dur": 9.960, + "args": { + "External id": 123884, "cbid": 211, "correlation": 241664653 + } + }, + { + "ph": "s", "id": 241664653, "pid": 5717, "tid": 5717, "ts": 6302685243640.413, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685247478.370, "dur": 53.089, + "args": { + "External id": 123901, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664673, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664673, "pid": 3, "tid": 7, "ts": 6302685247478.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685244107.112, "dur": 13.270, + "args": { + "External id": 123901, "cbid": 307, "correlation": 241664673 + } + }, + { + "ph": "s", "id": 241664673, "pid": 5717, "tid": 5717, "ts": 6302685244107.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685247532.131, "dur": 63.008, + "args": { + "External id": 123917, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664691, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664691, "pid": 3, "tid": 7, "ts": 6302685247532.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685244415.231, "dur": 13.920, + "args": { + "External id": 123917, "cbid": 307, "correlation": 241664691 + } + }, + { + "ph": "s", "id": 241664691, "pid": 5717, "tid": 5717, "ts": 6302685244415.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685244615.031, "dur": 0.750, + "args": { + "External id": 123923, "cbid": 200, "correlation": 241664698 + } + }, + { + "ph": "f", "id": 241664698, "pid": 5717, "tid": 5717, "ts": 6302685244615.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685244615.961, "dur": 0.290, + "args": { + "External id": 123923, "cbid": 200, "correlation": 241664699 + } + }, + { + 
"ph": "f", "id": 241664699, "pid": 5717, "tid": 5717, "ts": 6302685244615.961, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685244654.121, "dur": 0.490, + "args": { + "External id": 123923, "cbid": 200, "correlation": 241664722 + } + }, + { + "ph": "f", "id": 241664722, "pid": 5717, "tid": 5717, "ts": 6302685244654.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685244662.101, "dur": 2.800, + "args": { + "External id": 123923, "cbid": 273, "correlation": 241664731 + } + }, + { + "ph": "f", "id": 241664731, "pid": 5717, "tid": 5717, "ts": 6302685244662.101, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685247595.843, "dur": 436.483, + "args": { + "External id": 123923, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664732, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664732, "pid": 3, "tid": 7, "ts": 6302685247595.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685244665.711, "dur": 14.120, + "args": { + "External id": 123923, "cbid": 211, "correlation": 241664732 + } + }, + { + "ph": "s", "id": 241664732, "pid": 5717, "tid": 5717, "ts": 6302685244665.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685248032.998, "dur": 141.569, + "args": { + "External id": 123929, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664755, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664755, "pid": 3, "tid": 7, "ts": 6302685248032.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685244767.571, "dur": 10.429, + "args": { + "External id": 123929, "cbid": 211, "correlation": 241664755 + } + }, + { + "ph": "s", "id": 241664755, "pid": 5717, "tid": 5717, "ts": 6302685244767.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685248175.175, "dur": 89.601, + "args": { + "External id": 123933, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664781, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664781, "pid": 3, "tid": 7, "ts": 6302685248175.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685244990.450, "dur": 12.430, + "args": { + "External id": 123933, "cbid": 307, "correlation": 241664781 + } + }, + { + "ph": "s", "id": 241664781, "pid": 5717, "tid": 5717, "ts": 6302685244990.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685248265.384, "dur": 392.707, + "args": { + "External id": 123934, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664801, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664801, "pid": 3, "tid": 7, "ts": 6302685248265.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685245047.520, "dur": 8.420, + "args": { + "External id": 123934, "cbid": 211, "correlation": 241664801 + } + }, + { + "ph": "s", "id": 241664801, "pid": 5717, "tid": 5717, "ts": 6302685245047.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685248658.795, "dur": 377.347, + "args": { + "External id": 123935, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664824, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664824, "pid": 3, "tid": 7, "ts": 6302685248658.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685245086.330, "dur": 8.700, + "args": { + "External id": 123935, "cbid": 211, "correlation": 241664824 + } + }, + { + "ph": "s", "id": 241664824, "pid": 5717, "tid": 5717, "ts": 6302685245086.330, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685249036.782, "dur": 254.370, + "args": { + "External id": 123936, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664836, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664836, "pid": 3, "tid": 7, "ts": 6302685249036.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685245140.930, "dur": 7.730, + "args": { + "External id": 123936, "cbid": 307, "correlation": 241664836 + } + }, + { + "ph": "s", "id": 241664836, "pid": 5717, "tid": 5717, "ts": 6302685245140.930, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685245182.870, "dur": 1.909, + "args": { + "External id": 123937, "cbid": 210, "correlation": 241664856 + } + }, + { + "ph": "f", "id": 241664856, "pid": 5717, "tid": 5717, "ts": 6302685245182.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685249291.856, "dur": 341.346, + "args": { + "External id": 
123937, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664857, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241664857, "pid": 3, "tid": 7, "ts": 6302685249291.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685245187.510, "dur": 8.240, + "args": { + "External id": 123937, "cbid": 211, "correlation": 241664857 + } + }, + { + "ph": "s", "id": 241664857, "pid": 5717, "tid": 5717, "ts": 6302685245187.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685249633.810, "dur": 43.617, + "args": { + "External id": 123938, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241664864, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241664864, "pid": 3, "tid": 7, "ts": 6302685249633.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685245238.850, "dur": 7.700, + "args": { + "External id": 123938, "cbid": 307, "correlation": 241664864 + } + }, + { + "ph": "s", "id": 241664864, "pid": 5717, "tid": 5717, "ts": 6302685245238.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685245868.502, "dur": 14.752, + "args": { + "External id": 123954, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241664879, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241664879, "pid": 3, "tid": 17, "ts": 6302685245868.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685245844.828, "dur": 26.260, + "args": { + "External id": 123954, "cbid": 211, "correlation": 241664879 + } + }, + { + "ph": "s", "id": 241664879, "pid": 5717, "tid": 5717, "ts": 6302685245844.828, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685246057.048, "dur": 8.672, + "args": { + "External id": 123970, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241664892, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241664892, "pid": 3, "tid": 17, "ts": 6302685246057.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685246044.148, "dur": 13.649, + "args": { + "External id": 123970, "cbid": 211, "correlation": 241664892 + } + }, + { + "ph": "s", "id": 241664892, "pid": 5717, "tid": 5717, "ts": 6302685246044.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246099.788, "dur": 2.080, + "args": { + "cbid": 135, "correlation": 241664902 + } + }, + { + "ph": "f", "id": 241664902, "pid": 5717, "tid": 5717, "ts": 6302685246099.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246104.728, "dur": 1.920, + "args": { + "cbid": 147, "correlation": 241664906 + } + }, + { + "ph": "s", "id": 241664906, "pid": 5717, "tid": 5717, "ts": 6302685246104.728, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685246185.607, "dur": 1.430, + "args": { + "External id": 123972, "cbid": 317, "correlation": 241664919 + } + }, + { + "ph": "f", "id": 241664919, "pid": 5717, "tid": 5717, "ts": 6302685246185.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246189.767, "dur": 1.750, + "args": { + "External id": 123972, "cbid": 135, "correlation": 241664921 + } + }, + { + "ph": "f", "id": 241664921, "pid": 5717, "tid": 5717, "ts": 6302685246189.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246193.517, "dur": 1.580, 
+ "args": { + "External id": 123972, "cbid": 147, "correlation": 241664925 + } + }, + { + "ph": "s", "id": 241664925, "pid": 5717, "tid": 5717, "ts": 6302685246193.517, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685246218.827, "dur": 1.000, + "args": { + "External id": 123972, "cbid": 409, "correlation": 241664928 + } + }, + { + "ph": "f", "id": 241664928, "pid": 5717, "tid": 5717, "ts": 6302685246218.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246225.547, "dur": 1.120, + "args": { + "External id": 123972, "cbid": 135, "correlation": 241664931 + } + }, + { + "ph": "f", "id": 241664931, "pid": 5717, "tid": 5717, "ts": 6302685246225.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246226.917, "dur": 1.290, + "args": { + "External id": 123972, "cbid": 147, "correlation": 241664932 + } + }, + { + "ph": "s", "id": 241664932, "pid": 5717, "tid": 5717, "ts": 6302685246226.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685246274.425, "dur": 4127.135, + "args": { + "External id": 123972, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241664934, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241664934, "pid": 3, "tid": 20, "ts": 6302685246274.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685246229.827, "dur": 14.000, + "args": { + "External id": 123972, "cbid": 430, "correlation": 241664934 + } + }, + { + "ph": "s", "id": 241664934, "pid": 5717, "tid": 5717, "ts": 6302685246229.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246245.347, "dur": 0.620, + "args": { + "External id": 123972, "cbid": 135, "correlation": 241664936 + } + }, + { + "ph": "f", "id": 241664936, "pid": 5717, "tid": 5717, "ts": 6302685246245.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246246.137, "dur": 0.750, + "args": { + "External id": 123972, "cbid": 147, "correlation": 241664937 + } + }, + { + "ph": "s", "id": 241664937, "pid": 5717, "tid": 5717, "ts": 6302685246246.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246249.187, "dur": 1.130, + "args": { + "External id": 123972, "cbid": 135, "correlation": 241664940 + } + }, + { + "ph": "f", "id": 241664940, "pid": 5717, "tid": 5717, "ts": 6302685246249.187, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246262.917, "dur": 0.660, + "args": { + "External 
id": 123972, "cbid": 135, "correlation": 241664947 + } + }, + { + "ph": "f", "id": 241664947, "pid": 5717, "tid": 5717, "ts": 6302685246262.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246315.657, "dur": 1.760, + "args": { + "External id": 123974, "cbid": 147, "correlation": 241664952 + } + }, + { + "ph": "s", "id": 241664952, "pid": 5717, "tid": 5717, "ts": 6302685246315.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246355.287, "dur": 1.390, + "args": { + "cbid": 135, "correlation": 241664967 + } + }, + { + "ph": "f", "id": 241664967, "pid": 5717, "tid": 5717, "ts": 6302685246355.287, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246436.487, "dur": 3.060, + "args": { + "cbid": 147, "correlation": 241664972 + } + }, + { + "ph": "s", "id": 241664972, "pid": 5717, "tid": 5717, "ts": 6302685246436.487, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246443.877, "dur": 1.020, + "args": { + "cbid": 147, "correlation": 241664976 + } + }, + { + "ph": "s", "id": 241664976, "pid": 5717, "tid": 5717, "ts": 6302685246443.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685246532.916, "dur": 4.940, + "args": { + "cbid": 147, "correlation": 241664982 + } + }, + { + "ph": "s", "id": 241664982, "pid": 5717, "tid": 5717, "ts": 6302685246532.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685246742.146, "dur": 1.690, + "args": { + "External id": 
123987, "cbid": 317, "correlation": 241665023 + } + }, + { + "ph": "f", "id": 241665023, "pid": 5717, "tid": 5717, "ts": 6302685246742.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685246757.136, "dur": 3.860, + "args": { + "External id": 123988, "cbid": 138, "correlation": 241665026 + } + }, + { + "ph": "f", "id": 241665026, "pid": 5717, "tid": 5717, "ts": 6302685246757.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685250404.248, "dur": 1.664, + "args": { + "External id": 123992, "device": 3, "context": 1, "stream": 7, "correlation": 241665036, "bytes": 7224, "memory bandwidth (GB/s)": 4.341346153846154 + } + }, + { + "ph": "f", "id": 241665036, "pid": 3, "tid": 7, "ts": 6302685250404.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685246790.206, "dur": 16.600, + "args": { + "External id": 123992, "cbid": 41, "correlation": 241665036 + } + }, + { + "ph": "s", "id": 241665036, "pid": 5717, "tid": 5717, "ts": 6302685246790.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246814.236, "dur": 2.570, + "args": { + "External id": 123987, "cbid": 135, "correlation": 241665040 + } + }, + { + "ph": "f", "id": 241665040, "pid": 5717, "tid": 5717, "ts": 6302685246814.236, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685250410.424, "dur": 344.163, + "args": { + "External id": 123987, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241665044, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665044, "pid": 3, "tid": 7, "ts": 6302685250410.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685246820.506, "dur": 14.680, + "args": { + "External id": 123987, "cbid": 211, "correlation": 241665044 + } + }, + { + "ph": "s", "id": 241665044, "pid": 5717, "tid": 5717, "ts": 6302685246820.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685246958.826, "dur": 1.789, + "args": { + "cbid": 135, "correlation": 241665055 + } + }, + { + "ph": "f", "id": 241665055, "pid": 5717, "tid": 5717, "ts": 6302685246958.826, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685250755.195, "dur": 370.147, + "args": { + "External id": 123999, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665081, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665081, "pid": 3, "tid": 7, "ts": 6302685250755.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685247258.565, "dur": 15.990, + "args": { + "External id": 123999, "cbid": 307, "correlation": 241665081 + } + }, + { + "ph": "s", "id": 241665081, "pid": 5717, "tid": 5717, "ts": 6302685247258.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685251125.982, "dur": 142.113, + "args": { + "External id": 124005, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665104, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665104, "pid": 3, "tid": 7, "ts": 6302685251125.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685247495.044, "dur": 15.680, + "args": { + "External id": 124005, "cbid": 211, "correlation": 241665104 + } + }, + { + "ph": "s", "id": 241665104, "pid": 5717, "tid": 5717, "ts": 6302685247495.044, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685251268.703, "dur": 140.833, + "args": { + "External id": 124006, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665127, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665127, "pid": 3, "tid": 7, "ts": 6302685251268.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685247546.754, "dur": 9.610, + "args": { + "External id": 124006, "cbid": 211, "correlation": 241665127 + } + }, + { + "ph": "s", "id": 241665127, "pid": 5717, "tid": 5717, "ts": 6302685247546.754, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685251410.176, "dur": 140.257, + "args": { + "External id": 124007, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665150, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665150, "pid": 3, "tid": 7, "ts": 6302685251410.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685247586.954, "dur": 7.390, + "args": { + "External id": 124007, "cbid": 211, "correlation": 241665150 + } + }, + { + "ph": "s", "id": 241665150, "pid": 5717, "tid": 5717, "ts": 6302685247586.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685251551.105, "dur": 53.376, + "args": { + "External id": 124024, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665170, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665170, "pid": 3, "tid": 7, "ts": 6302685251551.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685247982.493, "dur": 14.520, + "args": { + "External id": 124024, "cbid": 307, "correlation": 241665170 + } + }, + { + "ph": "s", "id": 241665170, "pid": 5717, "tid": 5717, "ts": 6302685247982.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685251605.121, "dur": 61.889, + "args": { + "External id": 124040, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665188, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665188, "pid": 3, "tid": 7, "ts": 6302685251605.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685248274.543, "dur": 13.820, + "args": { + "External id": 124040, "cbid": 307, "correlation": 241665188 + } + }, + { + "ph": "s", "id": 241665188, "pid": 5717, "tid": 5717, "ts": 6302685248274.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685248502.622, "dur": 0.790, + "args": { + "External id": 124046, "cbid": 200, "correlation": 241665195 + } + }, + { + "ph": "f", "id": 241665195, "pid": 5717, "tid": 5717, "ts": 6302685248502.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685248503.602, "dur": 0.300, + "args": { + "External id": 124046, "cbid": 200, "correlation": 241665196 + } + }, + { + 
"ph": "f", "id": 241665196, "pid": 5717, "tid": 5717, "ts": 6302685248503.602, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685248546.162, "dur": 0.570, + "args": { + "External id": 124046, "cbid": 200, "correlation": 241665219 + } + }, + { + "ph": "f", "id": 241665219, "pid": 5717, "tid": 5717, "ts": 6302685248546.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685248556.422, "dur": 3.080, + "args": { + "External id": 124046, "cbid": 273, "correlation": 241665228 + } + }, + { + "ph": "f", "id": 241665228, "pid": 5717, "tid": 5717, "ts": 6302685248556.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685251667.682, "dur": 412.419, + "args": { + "External id": 124046, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665229, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665229, "pid": 3, "tid": 7, "ts": 6302685251667.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685248560.422, "dur": 15.710, + "args": { + "External id": 124046, "cbid": 211, "correlation": 241665229 + } + }, + { + "ph": "s", "id": 241665229, "pid": 5717, "tid": 5717, "ts": 6302685248560.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685252080.709, "dur": 141.953, + "args": { + "External id": 124052, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665252, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665252, "pid": 3, "tid": 7, "ts": 6302685252080.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685248672.202, "dur": 11.950, + "args": { + "External id": 124052, "cbid": 211, "correlation": 241665252 + } + }, + { + "ph": "s", "id": 241665252, "pid": 5717, "tid": 5717, "ts": 6302685248672.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685252223.366, "dur": 103.201, + "args": { + "External id": 124056, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665278, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665278, "pid": 3, "tid": 7, "ts": 6302685252223.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685248918.361, "dur": 13.960, + "args": { + "External id": 124056, "cbid": 307, "correlation": 241665278 + } + }, + { + "ph": "s", "id": 241665278, "pid": 5717, "tid": 5717, "ts": 6302685248918.361, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685252327.271, "dur": 338.914, + "args": { + "External id": 124057, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665298, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665298, "pid": 3, "tid": 7, "ts": 6302685252327.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685248979.631, "dur": 9.000, + "args": { + "External id": 124057, "cbid": 211, "correlation": 241665298 + } + }, + { + "ph": "s", "id": 241665298, "pid": 5717, "tid": 5717, "ts": 6302685248979.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685252666.793, "dur": 339.619, + "args": { + "External id": 124058, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665321, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665321, "pid": 3, "tid": 7, "ts": 6302685252666.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685249021.141, "dur": 8.990, + "args": { + "External id": 124058, "cbid": 211, "correlation": 241665321 + } + }, + { + "ph": "s", "id": 241665321, "pid": 5717, "tid": 5717, "ts": 6302685249021.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685253007.052, "dur": 216.033, + "args": { + "External id": 124059, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665333, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665333, "pid": 3, "tid": 7, "ts": 6302685253007.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685249081.031, "dur": 8.150, + "args": { + "External id": 124059, "cbid": 307, "correlation": 241665333 + } + }, + { + "ph": "s", "id": 241665333, "pid": 5717, "tid": 5717, "ts": 6302685249081.031, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685249125.711, "dur": 1.870, + "args": { + "External id": 124060, "cbid": 210, "correlation": 241665353 + } + }, + { + "ph": "f", "id": 241665353, "pid": 5717, "tid": 5717, "ts": 6302685249125.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685253223.757, "dur": 355.299, + "args": { + "External id": 
124060, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665354, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665354, "pid": 3, "tid": 7, "ts": 6302685253223.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685249130.351, "dur": 8.320, + "args": { + "External id": 124060, "cbid": 211, "correlation": 241665354 + } + }, + { + "ph": "s", "id": 241665354, "pid": 5717, "tid": 5717, "ts": 6302685249130.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685253579.696, "dur": 120.577, + "args": { + "External id": 124061, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665361, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665361, "pid": 3, "tid": 7, "ts": 6302685253579.696, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685249183.010, "dur": 7.971, + "args": { + "External id": 124061, "cbid": 307, "correlation": 241665361 + } + }, + { + "ph": "s", "id": 241665361, "pid": 5717, "tid": 5717, "ts": 6302685249183.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685249810.900, "dur": 13.504, + "args": { + "External id": 124077, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241665376, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241665376, "pid": 3, "tid": 17, "ts": 6302685249810.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685249793.889, "dur": 18.290, + "args": { + "External id": 124077, "cbid": 211, "correlation": 241665376 + } + }, + { + "ph": "s", "id": 241665376, "pid": 5717, "tid": 5717, "ts": 6302685249793.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685249981.173, "dur": 8.480, + "args": { + "External id": 124093, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241665389, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241665389, "pid": 3, "tid": 17, "ts": 6302685249981.173, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685249968.029, "dur": 14.000, + "args": { + "External id": 124093, "cbid": 211, "correlation": 241665389 + } + }, + { + "ph": "s", "id": 241665389, "pid": 5717, "tid": 5717, "ts": 6302685249968.029, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250020.759, "dur": 1.970, + "args": { + "cbid": 135, "correlation": 241665399 + } + }, + { + "ph": "f", "id": 241665399, "pid": 5717, "tid": 5717, "ts": 6302685250020.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250025.608, "dur": 2.031, + "args": { + "cbid": 147, "correlation": 241665403 + } + }, + { + "ph": "s", "id": 241665403, "pid": 5717, "tid": 5717, "ts": 6302685250025.608, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685250111.619, "dur": 1.509, + "args": { + "External id": 124095, "cbid": 317, "correlation": 241665416 + } + }, + { + "ph": "f", "id": 241665416, "pid": 5717, "tid": 5717, "ts": 6302685250111.619, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250117.068, "dur": 2.100, + "args": { + "External id": 124095, "cbid": 135, "correlation": 241665418 + } + }, + { + "ph": "f", "id": 241665418, "pid": 5717, "tid": 5717, "ts": 6302685250117.068, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250121.359, "dur": 1.639, 
+ "args": { + "External id": 124095, "cbid": 147, "correlation": 241665422 + } + }, + { + "ph": "s", "id": 241665422, "pid": 5717, "tid": 5717, "ts": 6302685250121.359, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685250149.638, "dur": 1.080, + "args": { + "External id": 124095, "cbid": 409, "correlation": 241665425 + } + }, + { + "ph": "f", "id": 241665425, "pid": 5717, "tid": 5717, "ts": 6302685250149.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250156.948, "dur": 1.180, + "args": { + "External id": 124095, "cbid": 135, "correlation": 241665428 + } + }, + { + "ph": "f", "id": 241665428, "pid": 5717, "tid": 5717, "ts": 6302685250156.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250158.388, "dur": 1.370, + "args": { + "External id": 124095, "cbid": 147, "correlation": 241665429 + } + }, + { + "ph": "s", "id": 241665429, "pid": 5717, "tid": 5717, "ts": 6302685250158.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685250402.264, "dur": 5151.751, + "args": { + "External id": 124095, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241665431, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241665431, "pid": 3, "tid": 20, "ts": 6302685250402.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685250161.458, "dur": 16.530, + "args": { + "External id": 124095, "cbid": 430, "correlation": 241665431 + } + }, + { + "ph": "s", "id": 241665431, "pid": 5717, "tid": 5717, "ts": 6302685250161.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250179.688, "dur": 0.620, + "args": { + "External id": 124095, "cbid": 135, "correlation": 241665433 + } + }, + { + "ph": "f", "id": 241665433, "pid": 5717, "tid": 5717, "ts": 6302685250179.688, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250180.488, "dur": 0.830, + "args": { + "External id": 124095, "cbid": 147, "correlation": 241665434 + } + }, + { + "ph": "s", "id": 241665434, "pid": 5717, "tid": 5717, "ts": 6302685250180.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250183.818, "dur": 1.140, + "args": { + "External id": 124095, "cbid": 135, "correlation": 241665437 + } + }, + { + "ph": "f", "id": 241665437, "pid": 5717, "tid": 5717, "ts": 6302685250183.818, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250197.448, "dur": 0.700, + "args": { + "External 
id": 124095, "cbid": 135, "correlation": 241665444 + } + }, + { + "ph": "f", "id": 241665444, "pid": 5717, "tid": 5717, "ts": 6302685250197.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250242.468, "dur": 1.730, + "args": { + "External id": 124097, "cbid": 147, "correlation": 241665449 + } + }, + { + "ph": "s", "id": 241665449, "pid": 5717, "tid": 5717, "ts": 6302685250242.468, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250270.508, "dur": 1.470, + "args": { + "cbid": 135, "correlation": 241665464 + } + }, + { + "ph": "f", "id": 241665464, "pid": 5717, "tid": 5717, "ts": 6302685250270.508, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250348.958, "dur": 2.060, + "args": { + "cbid": 147, "correlation": 241665469 + } + }, + { + "ph": "s", "id": 241665469, "pid": 5717, "tid": 5717, "ts": 6302685250348.958, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250354.158, "dur": 1.050, + "args": { + "cbid": 147, "correlation": 241665473 + } + }, + { + "ph": "s", "id": 241665473, "pid": 5717, "tid": 5717, "ts": 6302685250354.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685250416.768, "dur": 3.850, + "args": { + "cbid": 147, "correlation": 241665479 + } + }, + { + "ph": "s", "id": 241665479, "pid": 5717, "tid": 5717, "ts": 6302685250416.768, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685250582.767, "dur": 1.780, + "args": { + "External id": 
124110, "cbid": 317, "correlation": 241665520 + } + }, + { + "ph": "f", "id": 241665520, "pid": 5717, "tid": 5717, "ts": 6302685250582.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685250598.847, "dur": 4.160, + "args": { + "External id": 124111, "cbid": 138, "correlation": 241665523 + } + }, + { + "ph": "f", "id": 241665523, "pid": 5717, "tid": 5717, "ts": 6302685250598.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685255558.079, "dur": 2.336, + "args": { + "External id": 124115, "device": 3, "context": 1, "stream": 7, "correlation": 241665533, "bytes": 7224, "memory bandwidth (GB/s)": 3.0924657534246576 + } + }, + { + "ph": "f", "id": 241665533, "pid": 3, "tid": 7, "ts": 6302685255558.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685250636.347, "dur": 18.300, + "args": { + "External id": 124115, "cbid": 41, "correlation": 241665533 + } + }, + { + "ph": "s", "id": 241665533, "pid": 5717, "tid": 5717, "ts": 6302685250636.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250661.297, "dur": 2.830, + "args": { + "External id": 124110, "cbid": 135, "correlation": 241665537 + } + }, + { + "ph": "f", "id": 241665537, "pid": 5717, "tid": 5717, "ts": 6302685250661.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685255563.199, "dur": 354.979, + "args": { + "External id": 124110, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241665541, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665541, "pid": 3, "tid": 7, "ts": 6302685255563.199, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685250668.307, "dur": 17.010, + "args": { + "External id": 124110, "cbid": 211, "correlation": 241665541 + } + }, + { + "ph": "s", "id": 241665541, "pid": 5717, "tid": 5717, "ts": 6302685250668.307, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685250821.377, "dur": 2.240, + "args": { + "cbid": 135, "correlation": 241665552 + } + }, + { + "ph": "f", "id": 241665552, "pid": 5717, "tid": 5717, "ts": 6302685250821.377, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685255921.922, "dur": 360.771, + "args": { + "External id": 124122, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665578, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665578, "pid": 3, "tid": 7, "ts": 6302685255921.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685251149.816, "dur": 16.770, + "args": { + "External id": 124122, "cbid": 307, "correlation": 241665578 + } + }, + { + "ph": "s", "id": 241665578, "pid": 5717, "tid": 5717, "ts": 6302685251149.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685256283.333, "dur": 142.017, + "args": { + "External id": 124128, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665601, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665601, "pid": 3, "tid": 7, "ts": 6302685256283.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685251399.485, "dur": 17.871, + "args": { + "External id": 124128, "cbid": 211, "correlation": 241665601 + } + }, + { + "ph": "s", "id": 241665601, "pid": 5717, "tid": 5717, "ts": 6302685251399.485, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685256425.958, "dur": 139.905, + "args": { + "External id": 124129, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665624, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665624, "pid": 3, "tid": 7, "ts": 6302685256425.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685251455.995, "dur": 8.730, + "args": { + "External id": 124129, "cbid": 211, "correlation": 241665624 + } + }, + { + "ph": "s", "id": 241665624, "pid": 5717, "tid": 5717, "ts": 6302685251455.995, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685256566.503, "dur": 140.065, + "args": { + "External id": 124130, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665647, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665647, "pid": 3, "tid": 7, "ts": 6302685256566.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685251496.705, "dur": 8.170, + "args": { + "External id": 124130, "cbid": 211, "correlation": 241665647 + } + }, + { + "ph": "s", "id": 241665647, "pid": 5717, "tid": 5717, "ts": 6302685251496.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685256707.208, "dur": 52.512, + "args": { + "External id": 124147, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665667, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665667, "pid": 3, "tid": 7, "ts": 6302685256707.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685251912.304, "dur": 15.250, + "args": { + "External id": 124147, "cbid": 307, "correlation": 241665667 + } + }, + { + "ph": "s", "id": 241665667, "pid": 5717, "tid": 5717, "ts": 6302685251912.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685256760.360, "dur": 62.369, + "args": { + "External id": 124163, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665685, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665685, "pid": 3, "tid": 7, "ts": 6302685256760.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685252223.464, "dur": 13.370, + "args": { + "External id": 124163, "cbid": 307, "correlation": 241665685 + } + }, + { + "ph": "s", "id": 241665685, "pid": 5717, "tid": 5717, "ts": 6302685252223.464, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685252456.873, "dur": 0.900, + "args": { + "External id": 124169, "cbid": 200, "correlation": 241665692 + } + }, + { + "ph": "f", "id": 241665692, "pid": 5717, "tid": 5717, "ts": 6302685252456.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685252457.973, "dur": 0.330, + "args": { + "External id": 124169, "cbid": 200, "correlation": 241665693 + } + }, + { + 
"ph": "f", "id": 241665693, "pid": 5717, "tid": 5717, "ts": 6302685252457.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685252501.573, "dur": 0.580, + "args": { + "External id": 124169, "cbid": 200, "correlation": 241665716 + } + }, + { + "ph": "f", "id": 241665716, "pid": 5717, "tid": 5717, "ts": 6302685252501.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685252511.783, "dur": 3.100, + "args": { + "External id": 124169, "cbid": 273, "correlation": 241665725 + } + }, + { + "ph": "f", "id": 241665725, "pid": 5717, "tid": 5717, "ts": 6302685252511.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685256823.433, "dur": 415.491, + "args": { + "External id": 124169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665726, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665726, "pid": 3, "tid": 7, "ts": 6302685256823.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685252515.823, "dur": 16.560, + "args": { + "External id": 124169, "cbid": 211, "correlation": 241665726 + } + }, + { + "ph": "s", "id": 241665726, "pid": 5717, "tid": 5717, "ts": 6302685252515.823, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685257239.500, "dur": 141.697, + "args": { + "External id": 124175, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665749, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665749, "pid": 3, "tid": 7, "ts": 6302685257239.500, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685252648.053, "dur": 12.270, + "args": { + "External id": 124175, "cbid": 211, "correlation": 241665749 + } + }, + { + "ph": "s", "id": 241665749, "pid": 5717, "tid": 5717, "ts": 6302685252648.053, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685257381.901, "dur": 91.616, + "args": { + "External id": 124179, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665775, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665775, "pid": 3, "tid": 7, "ts": 6302685257381.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685252903.342, "dur": 14.510, + "args": { + "External id": 124179, "cbid": 307, "correlation": 241665775 + } + }, + { + "ph": "s", "id": 241665775, "pid": 5717, "tid": 5717, "ts": 6302685252903.342, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685257474.125, "dur": 365.603, + "args": { + "External id": 124180, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665795, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665795, "pid": 3, "tid": 7, "ts": 6302685257474.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685252966.902, "dur": 11.090, + "args": { + "External id": 124180, "cbid": 211, "correlation": 241665795 + } + }, + { + "ph": "s", "id": 241665795, "pid": 5717, "tid": 5717, "ts": 6302685252966.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685257840.368, "dur": 454.212, + "args": { + "External id": 124181, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665818, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665818, "pid": 3, "tid": 7, "ts": 6302685257840.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685253012.272, "dur": 7.930, + "args": { + "External id": 124181, "cbid": 211, "correlation": 241665818 + } + }, + { + "ph": "s", "id": 241665818, "pid": 5717, "tid": 5717, "ts": 6302685253012.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685258295.284, "dur": 216.449, + "args": { + "External id": 124182, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665830, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665830, "pid": 3, "tid": 7, "ts": 6302685258295.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685253072.422, "dur": 8.800, + "args": { + "External id": 124182, "cbid": 307, "correlation": 241665830 + } + }, + { + "ph": "s", "id": 241665830, "pid": 5717, "tid": 5717, "ts": 6302685253072.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685253118.372, "dur": 1.900, + "args": { + "External id": 124183, "cbid": 210, "correlation": 241665850 + } + }, + { + "ph": "f", "id": 241665850, "pid": 5717, "tid": 5717, "ts": 6302685253118.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685258512.437, "dur": 360.451, + "args": { + "External id": 
124183, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665851, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241665851, "pid": 3, "tid": 7, "ts": 6302685258512.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685253123.101, "dur": 8.520, + "args": { + "External id": 124183, "cbid": 211, "correlation": 241665851 + } + }, + { + "ph": "s", "id": 241665851, "pid": 5717, "tid": 5717, "ts": 6302685253123.101, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685258873.592, "dur": 114.145, + "args": { + "External id": 124184, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241665858, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241665858, "pid": 3, "tid": 7, "ts": 6302685258873.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685253175.641, "dur": 7.740, + "args": { + "External id": 124184, "cbid": 307, "correlation": 241665858 + } + }, + { + "ph": "s", "id": 241665858, "pid": 5717, "tid": 5717, "ts": 6302685253175.641, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685253769.521, "dur": 61.633, + "args": { + "External id": 124200, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241665873, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241665873, "pid": 3, "tid": 17, "ts": 6302685253769.521, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685253743.110, "dur": 17.880, + "args": { + "External id": 124200, "cbid": 211, "correlation": 241665873 + } + }, + { + "ph": "s", "id": 241665873, "pid": 5717, "tid": 5717, "ts": 6302685253743.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685253929.715, "dur": 8.000, + "args": { + "External id": 124216, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241665886, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241665886, "pid": 3, "tid": 17, "ts": 6302685253929.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685253916.590, "dur": 13.910, + "args": { + "External id": 124216, "cbid": 211, "correlation": 241665886 + } + }, + { + "ph": "s", "id": 241665886, "pid": 5717, "tid": 5717, "ts": 6302685253916.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685253966.660, "dur": 1.980, + "args": { + "cbid": 135, "correlation": 241665896 + } + }, + { + "ph": "f", "id": 241665896, "pid": 5717, "tid": 5717, "ts": 6302685253966.660, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685253971.450, "dur": 2.010, + "args": { + "cbid": 147, "correlation": 241665900 + } + }, + { + "ph": "s", "id": 241665900, "pid": 5717, "tid": 5717, "ts": 6302685253971.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685254052.550, "dur": 1.380, + "args": { + "External id": 124218, "cbid": 317, "correlation": 241665913 + } + }, + { + "ph": "f", "id": 241665913, "pid": 5717, "tid": 5717, "ts": 6302685254052.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254056.719, "dur": 1.940, + "args": { + "External id": 124218, "cbid": 135, "correlation": 241665915 + } + }, + { + "ph": "f", "id": 241665915, "pid": 5717, "tid": 5717, "ts": 6302685254056.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254060.659, "dur": 1.571, 
+ "args": { + "External id": 124218, "cbid": 147, "correlation": 241665919 + } + }, + { + "ph": "s", "id": 241665919, "pid": 5717, "tid": 5717, "ts": 6302685254060.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685254088.070, "dur": 1.080, + "args": { + "External id": 124218, "cbid": 409, "correlation": 241665922 + } + }, + { + "ph": "f", "id": 241665922, "pid": 5717, "tid": 5717, "ts": 6302685254088.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254095.010, "dur": 1.149, + "args": { + "External id": 124218, "cbid": 135, "correlation": 241665925 + } + }, + { + "ph": "f", "id": 241665925, "pid": 5717, "tid": 5717, "ts": 6302685254095.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254096.419, "dur": 1.190, + "args": { + "External id": 124218, "cbid": 147, "correlation": 241665926 + } + }, + { + "ph": "s", "id": 241665926, "pid": 5717, "tid": 5717, "ts": 6302685254096.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685255554.719, "dur": 5208.871, + "args": { + "External id": 124218, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241665928, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241665928, "pid": 3, "tid": 20, "ts": 6302685255554.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685254099.229, "dur": 14.250, + "args": { + "External id": 124218, "cbid": 430, "correlation": 241665928 + } + }, + { + "ph": "s", "id": 241665928, "pid": 5717, "tid": 5717, "ts": 6302685254099.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254115.069, "dur": 0.590, + "args": { + "External id": 124218, "cbid": 135, "correlation": 241665930 + } + }, + { + "ph": "f", "id": 241665930, "pid": 5717, "tid": 5717, "ts": 6302685254115.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254115.829, "dur": 0.720, + "args": { + "External id": 124218, "cbid": 147, "correlation": 241665931 + } + }, + { + "ph": "s", "id": 241665931, "pid": 5717, "tid": 5717, "ts": 6302685254115.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254118.949, "dur": 1.060, + "args": { + "External id": 124218, "cbid": 135, "correlation": 241665934 + } + }, + { + "ph": "f", "id": 241665934, "pid": 5717, "tid": 5717, "ts": 6302685254118.949, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254132.279, "dur": 0.690, + "args": { + "External 
id": 124218, "cbid": 135, "correlation": 241665941 + } + }, + { + "ph": "f", "id": 241665941, "pid": 5717, "tid": 5717, "ts": 6302685254132.279, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254174.449, "dur": 1.610, + "args": { + "External id": 124220, "cbid": 147, "correlation": 241665946 + } + }, + { + "ph": "s", "id": 241665946, "pid": 5717, "tid": 5717, "ts": 6302685254174.449, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254201.179, "dur": 1.430, + "args": { + "cbid": 135, "correlation": 241665961 + } + }, + { + "ph": "f", "id": 241665961, "pid": 5717, "tid": 5717, "ts": 6302685254201.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254263.149, "dur": 1.610, + "args": { + "cbid": 147, "correlation": 241665966 + } + }, + { + "ph": "s", "id": 241665966, "pid": 5717, "tid": 5717, "ts": 6302685254263.149, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254267.729, "dur": 0.940, + "args": { + "cbid": 147, "correlation": 241665970 + } + }, + { + "ph": "s", "id": 241665970, "pid": 5717, "tid": 5717, "ts": 6302685254267.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685254337.469, "dur": 3.680, + "args": { + "cbid": 147, "correlation": 241665976 + } + }, + { + "ph": "s", "id": 241665976, "pid": 5717, "tid": 5717, "ts": 6302685254337.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685254509.409, "dur": 7.049, + "args": { + "External id": 
124233, "cbid": 317, "correlation": 241666017 + } + }, + { + "ph": "f", "id": 241666017, "pid": 5717, "tid": 5717, "ts": 6302685254509.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685254530.338, "dur": 3.591, + "args": { + "External id": 124234, "cbid": 138, "correlation": 241666020 + } + }, + { + "ph": "f", "id": 241666020, "pid": 5717, "tid": 5717, "ts": 6302685254530.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685260766.182, "dur": 1.664, + "args": { + "External id": 124238, "device": 3, "context": 1, "stream": 7, "correlation": 241666031, "bytes": 7224, "memory bandwidth (GB/s)": 4.341346153846154 + } + }, + { + "ph": "f", "id": 241666031, "pid": 3, "tid": 7, "ts": 6302685260766.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685254566.808, "dur": 17.270, + "args": { + "External id": 124238, "cbid": 41, "correlation": 241666031 + } + }, + { + "ph": "s", "id": 241666031, "pid": 5717, "tid": 5717, "ts": 6302685254566.808, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254591.998, "dur": 2.660, + "args": { + "External id": 124233, "cbid": 135, "correlation": 241666035 + } + }, + { + "ph": "f", "id": 241666035, "pid": 5717, "tid": 5717, "ts": 6302685254591.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685260770.342, "dur": 16.800, + "args": { + "External id": 124233, "queued": 0, "device": 3, "context": 1, "stream": 7, 
"correlation": 241666039, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666039, "pid": 3, "tid": 7, "ts": 6302685260770.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685254598.328, "dur": 15.750, + "args": { + "External id": 124233, "cbid": 211, "correlation": 241666039 + } + }, + { + "ph": "s", "id": 241666039, "pid": 5717, "tid": 5717, "ts": 6302685254598.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685254743.988, "dur": 1.930, + "args": { + "cbid": 135, "correlation": 241666050 + } + }, + { + "ph": "f", "id": 241666050, "pid": 5717, "tid": 5717, "ts": 6302685254743.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685260787.750, "dur": 19.521, + "args": { + "External id": 124245, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666076, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666076, "pid": 3, "tid": 7, "ts": 6302685260787.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685255062.267, "dur": 24.280, + "args": { + "External id": 124245, "cbid": 307, "correlation": 241666076 + } + }, + { + "ph": "s", "id": 241666076, "pid": 5717, "tid": 5717, "ts": 6302685255062.267, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685260807.879, "dur": 122.880, + "args": { + "External id": 124251, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666099, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666099, "pid": 3, "tid": 7, "ts": 6302685260807.879, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685255331.236, "dur": 17.251, + "args": { + "External id": 124251, "cbid": 211, "correlation": 241666099 + } + }, + { + "ph": "s", "id": 241666099, "pid": 5717, "tid": 5717, "ts": 6302685255331.236, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685260931.463, "dur": 120.385, + "args": { + "External id": 124252, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666122, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666122, "pid": 3, "tid": 7, "ts": 6302685260931.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685255402.867, "dur": 8.840, + "args": { + "External id": 124252, "cbid": 211, "correlation": 241666122 + } + }, + { + "ph": "s", "id": 241666122, "pid": 5717, "tid": 5717, "ts": 6302685255402.867, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685261052.456, "dur": 119.937, + "args": { + "External id": 124253, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666145, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666145, "pid": 3, "tid": 7, "ts": 6302685261052.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685255444.556, "dur": 7.770, + "args": { + "External id": 124253, "cbid": 211, "correlation": 241666145 + } + }, + { + "ph": "s", "id": 241666145, "pid": 5717, "tid": 5717, "ts": 6302685255444.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685261173.065, "dur": 52.161, + "args": { + "External id": 124270, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666165, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666165, "pid": 3, "tid": 7, "ts": 6302685261173.065, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685255857.366, "dur": 15.069, + "args": { + "External id": 124270, "cbid": 307, "correlation": 241666165 + } + }, + { + "ph": "s", "id": 241666165, "pid": 5717, "tid": 5717, "ts": 6302685255857.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685261225.898, "dur": 61.120, + "args": { + "External id": 124286, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666183, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666183, "pid": 3, "tid": 7, "ts": 6302685261225.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685256173.235, "dur": 13.770, + "args": { + "External id": 124286, "cbid": 307, "correlation": 241666183 + } + }, + { + "ph": "s", "id": 241666183, "pid": 5717, "tid": 5717, "ts": 6302685256173.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685256416.554, "dur": 0.920, + "args": { + "External id": 124292, "cbid": 200, "correlation": 241666190 + } + }, + { + "ph": "f", "id": 241666190, "pid": 5717, "tid": 5717, "ts": 6302685256416.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685256417.684, "dur": 0.330, + "args": { + "External id": 124292, "cbid": 200, "correlation": 241666191 + } + }, + { + 
"ph": "f", "id": 241666191, "pid": 5717, "tid": 5717, "ts": 6302685256417.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685256462.104, "dur": 0.610, + "args": { + "External id": 124292, "cbid": 200, "correlation": 241666214 + } + }, + { + "ph": "f", "id": 241666214, "pid": 5717, "tid": 5717, "ts": 6302685256462.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685256471.834, "dur": 3.380, + "args": { + "External id": 124292, "cbid": 273, "correlation": 241666223 + } + }, + { + "ph": "f", "id": 241666223, "pid": 5717, "tid": 5717, "ts": 6302685256471.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685261287.690, "dur": 404.131, + "args": { + "External id": 124292, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666224, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666224, "pid": 3, "tid": 7, "ts": 6302685261287.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685256476.204, "dur": 17.140, + "args": { + "External id": 124292, "cbid": 211, "correlation": 241666224 + } + }, + { + "ph": "s", "id": 241666224, "pid": 5717, "tid": 5717, "ts": 6302685256476.204, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685261692.493, "dur": 122.049, + "args": { + "External id": 124298, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666247, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666247, "pid": 3, "tid": 7, "ts": 6302685261692.493, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685256598.134, "dur": 12.480, + "args": { + "External id": 124298, "cbid": 211, "correlation": 241666247 + } + }, + { + "ph": "s", "id": 241666247, "pid": 5717, "tid": 5717, "ts": 6302685256598.134, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685261815.118, "dur": 88.705, + "args": { + "External id": 124302, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666273, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666273, "pid": 3, "tid": 7, "ts": 6302685261815.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685256861.193, "dur": 14.470, + "args": { + "External id": 124302, "cbid": 307, "correlation": 241666273 + } + }, + { + "ph": "s", "id": 241666273, "pid": 5717, "tid": 5717, "ts": 6302685256861.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685261904.527, "dur": 325.186, + "args": { + "External id": 124303, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666293, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666293, "pid": 3, "tid": 7, "ts": 6302685261904.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685256926.563, "dur": 11.280, + "args": { + "External id": 124303, "cbid": 211, "correlation": 241666293 + } + }, + { + "ph": "s", "id": 241666293, "pid": 5717, "tid": 5717, "ts": 6302685256926.563, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685262230.353, "dur": 320.547, + "args": { + "External id": 124304, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666316, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666316, "pid": 3, "tid": 7, "ts": 6302685262230.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685256973.233, "dur": 8.440, + "args": { + "External id": 124304, "cbid": 211, "correlation": 241666316 + } + }, + { + "ph": "s", "id": 241666316, "pid": 5717, "tid": 5717, "ts": 6302685256973.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 3, "tid": 7, + "ts": 6302685262551.604, "dur": 213.057, + "args": { + "External id": 124305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666328, "pid": 3, "tid": 7, "ts": 6302685262551.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257034.253, "dur": 8.960, + "args": { + "External id": 124305, "cbid": 307, "correlation": 241666328 + } + }, + { + "ph": "s", "id": 241666328, "pid": 5717, "tid": 5717, "ts": 6302685257034.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685257082.833, "dur": 1.990, + "args": { + "External id": 124306, "cbid": 210, "correlation": 241666348 + } + }, + { + "ph": "f", "id": 241666348, "pid": 5717, "tid": 5717, "ts": 6302685257082.833, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685262765.365, "dur": 324.995, + "args": { + "External id": 
124306, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666349, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666349, "pid": 3, "tid": 7, "ts": 6302685262765.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257087.763, "dur": 9.340, + "args": { + "External id": 124306, "cbid": 211, "correlation": 241666349 + } + }, + { + "ph": "s", "id": 241666349, "pid": 5717, "tid": 5717, "ts": 6302685257087.763, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 3, "tid": 7, + "ts": 6302685263091.032, "dur": 41.920, + "args": { + "External id": 124307, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666356, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666356, "pid": 3, "tid": 7, "ts": 6302685263091.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257144.703, "dur": 8.489, + "args": { + "External id": 124307, "cbid": 307, "correlation": 241666356 + } + }, + { + "ph": "s", "id": 241666356, "pid": 5717, "tid": 5717, "ts": 6302685257144.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685263133.592, "dur": 32.097, + "args": { + "External id": 124313, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666367, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666367, "pid": 3, "tid": 7, "ts": 6302685263133.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257514.742, "dur": 25.480, + "args": { + "External id": 124313, "cbid": 211, "correlation": 241666367 + } + }, + { + "ph": "s", "id": 241666367, "pid": 5717, "tid": 5717, "ts": 6302685257514.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685263166.393, "dur": 75.264, + "args": { + "External id": 124314, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666378, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666378, "pid": 3, "tid": 7, "ts": 6302685263166.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257599.991, "dur": 17.871, + "args": { + "External id": 124314, "cbid": 211, "correlation": 241666378 + } + }, + { + "ph": "s", "id": 241666378, "pid": 5717, "tid": 5717, "ts": 6302685257599.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685263242.361, "dur": 15.264, + "args": { + "External id": 124317, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666392, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666392, "pid": 3, "tid": 7, "ts": 6302685263242.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257669.601, "dur": 13.820, + "args": { + "External id": 124317, "cbid": 211, "correlation": 241666392 + } + }, + { + "ph": "s", "id": 241666392, "pid": 5717, "tid": 5717, "ts": 6302685257669.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685263258.233, "dur": 1.792, + "args": { + "External id": 124319, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666398, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241666398, "pid": 3, "tid": 7, "ts": 6302685263258.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257715.811, "dur": 9.600, + "args": { + "External id": 124319, "cbid": 211, "correlation": 241666398 + } + }, + { + "ph": "s", "id": 241666398, "pid": 5717, "tid": 5717, "ts": 6302685257715.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685263260.633, "dur": 1.024, + "args": { + "External id": 124320, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666408, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241666408, "pid": 3, "tid": 7, "ts": 6302685263260.633, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257753.301, "dur": 11.690, + "args": { + "External id": 124320, "cbid": 211, "correlation": 241666408 + } + }, + { + "ph": "s", "id": 241666408, "pid": 5717, "tid": 5717, "ts": 6302685257753.301, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685263262.233, "dur": 88.609, + "args": { + "External id": 124321, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666418, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666418, "pid": 3, "tid": 7, "ts": 6302685263262.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257795.191, "dur": 9.590, + "args": { + "External id": 124321, "cbid": 211, "correlation": 241666418 + } + }, + { + "ph": "s", "id": 241666418, "pid": 5717, "tid": 5717, "ts": 6302685257795.191, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685263351.546, "dur": 48.000, + "args": { + "External id": 124326, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666431, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666431, "pid": 3, "tid": 7, "ts": 6302685263351.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257836.961, "dur": 10.240, + "args": { + "External id": 124326, "cbid": 211, "correlation": 241666431 + } + }, + { + "ph": "s", "id": 241666431, "pid": 5717, "tid": 5717, "ts": 6302685257836.961, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685263400.250, "dur": 22.528, + "args": { + "External id": 124327, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666442, "pid": 3, "tid": 7, "ts": 6302685263400.250, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685257869.061, "dur": 8.170, + "args": { + "External id": 124327, "cbid": 211, "correlation": 241666442 + } + }, + { + "ph": "s", "id": 241666442, "pid": 5717, "tid": 5717, "ts": 6302685257869.061, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685263423.482, "dur": 121.345, + "args": { + "External id": 124335, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666465, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666465, "pid": 3, "tid": 7, "ts": 6302685263423.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685258004.901, "dur": 13.169, + "args": { + "External id": 124335, "cbid": 211, "correlation": 241666465 + } + }, + { + "ph": "s", "id": 241666465, "pid": 5717, "tid": 5717, "ts": 6302685258004.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685263545.435, "dur": 120.129, + "args": { + "External id": 124344, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666488, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666488, "pid": 3, "tid": 7, "ts": 6302685263545.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685258106.490, "dur": 10.690, + "args": { + "External id": 124344, "cbid": 211, "correlation": 241666488 + } + }, + { + "ph": "s", "id": 241666488, "pid": 5717, "tid": 5717, "ts": 6302685258106.490, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685263666.204, "dur": 119.841, + "args": { + "External id": 124353, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666511, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666511, "pid": 3, "tid": 7, "ts": 6302685263666.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685258192.290, "dur": 10.180, + "args": { + "External id": 124353, "cbid": 211, "correlation": 241666511 + } + }, + { + "ph": "s", "id": 241666511, "pid": 5717, "tid": 5717, "ts": 6302685258192.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685263786.781, "dur": 51.840, + "args": { + "External id": 124361, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666530, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666530, "pid": 3, "tid": 7, "ts": 6302685263786.781, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685258563.239, "dur": 14.420, + "args": { + "External id": 124361, "cbid": 307, "correlation": 241666530 + } + }, + { + "ph": "s", "id": 241666530, "pid": 5717, "tid": 5717, "ts": 6302685258563.239, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685263839.261, "dur": 60.705, + "args": { + "External id": 124364, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666547, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666547, "pid": 3, "tid": 7, "ts": 6302685263839.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685258804.129, "dur": 13.220, + "args": { + "External id": 124364, "cbid": 307, "correlation": 241666547 + } + }, + { + "ph": "s", "id": 241666547, "pid": 5717, "tid": 5717, "ts": 6302685258804.129, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685259000.788, "dur": 0.750, + "args": { + "External id": 124368, "cbid": 200, "correlation": 241666551 + } + }, + { + "ph": "f", "id": 241666551, "pid": 5717, "tid": 5717, "ts": 6302685259000.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685259001.708, "dur": 0.310, + "args": { + "External id": 124368, "cbid": 200, "correlation": 241666552 + } + }, + { + 
"ph": "f", "id": 241666552, "pid": 5717, "tid": 5717, "ts": 6302685259001.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685259051.138, "dur": 1.650, + "args": { + "External id": 124368, "cbid": 200, "correlation": 241666575 + } + }, + { + "ph": "f", "id": 241666575, "pid": 5717, "tid": 5717, "ts": 6302685259051.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685259068.078, "dur": 3.030, + "args": { + "External id": 124368, "cbid": 273, "correlation": 241666584 + } + }, + { + "ph": "f", "id": 241666584, "pid": 5717, "tid": 5717, "ts": 6302685259068.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685263900.606, "dur": 409.347, + "args": { + "External id": 124368, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666585, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666585, "pid": 3, "tid": 7, "ts": 6302685263900.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259071.958, "dur": 15.330, + "args": { + "External id": 124368, "cbid": 211, "correlation": 241666585 + } + }, + { + "ph": "s", "id": 241666585, "pid": 5717, "tid": 5717, "ts": 6302685259071.958, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685264310.657, "dur": 122.369, + "args": { + "External id": 124384, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666611, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666611, "pid": 3, "tid": 7, "ts": 6302685264310.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259272.788, "dur": 14.740, + "args": { + "External id": 124384, "cbid": 211, "correlation": 241666611 + } + }, + { + "ph": "s", "id": 241666611, "pid": 5717, "tid": 5717, "ts": 6302685259272.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685264433.666, "dur": 60.928, + "args": { + "External id": 124386, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666621, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666621, "pid": 3, "tid": 7, "ts": 6302685264433.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259355.978, "dur": 12.809, + "args": { + "External id": 124386, "cbid": 211, "correlation": 241666621 + } + }, + { + "ph": "s", "id": 241666621, "pid": 5717, "tid": 5717, "ts": 6302685259355.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685264495.266, "dur": 49.697, + "args": { + "External id": 124391, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666634, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666634, "pid": 3, "tid": 7, "ts": 6302685264495.266, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259431.727, "dur": 11.510, + "args": { + "External id": 124391, "cbid": 211, "correlation": 241666634 + } + }, + { + "ph": "s", "id": 241666634, "pid": 5717, "tid": 5717, "ts": 6302685259431.727, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685264545.571, "dur": 68.928, + "args": { + "External id": 124392, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666645, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666645, "pid": 3, "tid": 7, "ts": 6302685264545.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259469.537, "dur": 9.360, + "args": { + "External id": 124392, "cbid": 211, "correlation": 241666645 + } + }, + { + "ph": "s", "id": 241666645, "pid": 5717, "tid": 5717, "ts": 6302685259469.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685264615.203, "dur": 15.200, + "args": { + "External id": 124395, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666659, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666659, "pid": 3, "tid": 7, "ts": 6302685264615.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259502.357, "dur": 7.640, + "args": { + "External id": 124395, "cbid": 211, "correlation": 241666659 + } + }, + { + "ph": "s", "id": 241666659, "pid": 5717, "tid": 5717, "ts": 6302685259502.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685264631.107, "dur": 1.504, + "args": { + "External id": 124397, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666665, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241666665, "pid": 3, "tid": 7, "ts": 6302685264631.107, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259530.927, "dur": 6.320, + "args": { + "External id": 124397, "cbid": 211, "correlation": 241666665 + } + }, + { + "ph": "s", "id": 241666665, "pid": 5717, "tid": 5717, "ts": 6302685259530.927, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685264633.251, "dur": 0.992, + "args": { + "External id": 124398, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666675, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241666675, "pid": 3, "tid": 7, "ts": 6302685264633.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259554.897, "dur": 6.410, + "args": { + "External id": 124398, "cbid": 211, "correlation": 241666675 + } + }, + { + "ph": "s", "id": 241666675, "pid": 5717, "tid": 5717, "ts": 6302685259554.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685264634.851, "dur": 90.657, + "args": { + "External id": 124399, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666685, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666685, "pid": 3, "tid": 7, "ts": 6302685264634.851, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259582.357, "dur": 7.150, + "args": { + "External id": 124399, "cbid": 211, "correlation": 241666685 + } + }, + { + "ph": "s", "id": 241666685, "pid": 5717, "tid": 5717, "ts": 6302685259582.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685264726.212, "dur": 47.712, + "args": { + "External id": 124404, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666698, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666698, "pid": 3, "tid": 7, "ts": 6302685264726.212, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259620.047, "dur": 6.980, + "args": { + "External id": 124404, "cbid": 211, "correlation": 241666698 + } + }, + { + "ph": "s", "id": 241666698, "pid": 5717, "tid": 5717, "ts": 6302685259620.047, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685264774.564, "dur": 22.081, + "args": { + "External id": 124405, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666709, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666709, "pid": 3, "tid": 7, "ts": 6302685264774.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259647.087, "dur": 6.420, + "args": { + "External id": 124405, "cbid": 211, "correlation": 241666709 + } + }, + { + "ph": "s", "id": 241666709, "pid": 5717, "tid": 5717, "ts": 6302685259647.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685264797.349, "dur": 318.018, + "args": { + "External id": 124413, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666732, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666732, "pid": 3, "tid": 7, "ts": 6302685264797.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259785.766, "dur": 12.400, + "args": { + "External id": 124413, "cbid": 211, "correlation": 241666732 + } + }, + { + "ph": "s", "id": 241666732, "pid": 5717, "tid": 5717, "ts": 6302685259785.766, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685265116.071, "dur": 321.219, + "args": { + "External id": 124422, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666755, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666755, "pid": 3, "tid": 7, "ts": 6302685265116.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685259894.656, "dur": 10.730, + "args": { + "External id": 124422, "cbid": 211, "correlation": 241666755 + } + }, + { + "ph": "s", "id": 241666755, "pid": 5717, "tid": 5717, "ts": 6302685259894.656, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685265437.898, "dur": 213.985, + "args": { + "External id": 124424, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666769, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666769, "pid": 3, "tid": 7, "ts": 6302685265437.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260035.006, "dur": 13.930, + "args": { + "External id": 124424, "cbid": 307, "correlation": 241666769 + } + }, + { + "ph": "s", "id": 241666769, "pid": 5717, "tid": 5717, "ts": 6302685260035.006, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685260115.646, "dur": 2.160, + "args": { + "External id": 124433, "cbid": 210, "correlation": 241666791 + } + }, + { + "ph": "f", "id": 241666791, "pid": 5717, "tid": 5717, "ts": 6302685260115.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685265652.555, "dur": 323.811, + "args": { + "External id": 124433, 
"queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666792, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666792, "pid": 3, "tid": 7, "ts": 6302685265652.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260120.766, "dur": 10.630, + "args": { + "External id": 124433, "cbid": 211, "correlation": 241666792 + } + }, + { + "ph": "s", "id": 241666792, "pid": 5717, "tid": 5717, "ts": 6302685260120.766, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685265977.038, "dur": 52.384, + "args": { + "External id": 124435, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666802, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666802, "pid": 3, "tid": 7, "ts": 6302685265977.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260190.016, "dur": 10.320, + "args": { + "External id": 124435, "cbid": 211, "correlation": 241666802 + } + }, + { + "ph": "s", "id": 241666802, "pid": 5717, "tid": 5717, "ts": 6302685260190.016, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685266030.030, "dur": 63.008, + "args": { + "External id": 124440, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666815, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666815, "pid": 3, "tid": 7, "ts": 6302685266030.030, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260274.556, "dur": 12.619, + "args": { + "External id": 124440, "cbid": 211, "correlation": 241666815 + } + }, + { + "ph": "s", "id": 241666815, "pid": 5717, "tid": 5717, "ts": 6302685260274.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685266093.646, "dur": 65.569, + "args": { + "External id": 124441, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666826, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666826, "pid": 3, "tid": 7, "ts": 6302685266093.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260323.565, "dur": 8.700, + "args": { + "External id": 124441, "cbid": 211, "correlation": 241666826 + } + }, + { + "ph": "s", "id": 241666826, "pid": 5717, "tid": 5717, "ts": 6302685260323.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685266159.855, "dur": 16.224, + "args": { + "External id": 124444, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666840, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666840, "pid": 3, "tid": 7, "ts": 6302685266159.855, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260354.445, "dur": 7.260, + "args": { + "External id": 124444, "cbid": 211, "correlation": 241666840 + } + }, + { + "ph": "s", "id": 241666840, "pid": 5717, "tid": 5717, "ts": 6302685260354.445, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685266176.751, "dur": 1.568, + "args": { + "External id": 124446, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666846, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241666846, "pid": 3, "tid": 7, "ts": 6302685266176.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260379.875, "dur": 6.450, + "args": { + "External id": 124446, "cbid": 211, "correlation": 241666846 + } + }, + { + "ph": "s", "id": 241666846, "pid": 5717, "tid": 5717, "ts": 6302685260379.875, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685266179.023, "dur": 0.992, + "args": { + "External id": 124447, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666856, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241666856, "pid": 3, "tid": 7, "ts": 6302685266179.023, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260406.505, "dur": 6.480, + "args": { + "External id": 124447, "cbid": 211, "correlation": 241666856 + } + }, + { + "ph": "s", "id": 241666856, "pid": 5717, "tid": 5717, "ts": 6302685260406.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685266180.623, "dur": 88.769, + "args": { + "External id": 124448, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666866, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666866, "pid": 3, "tid": 7, "ts": 6302685266180.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260436.065, "dur": 8.630, + "args": { + "External id": 124448, "cbid": 211, "correlation": 241666866 + } + }, + { + "ph": "s", "id": 241666866, "pid": 5717, "tid": 5717, "ts": 6302685260436.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685266270.064, "dur": 48.064, + "args": { + "External id": 124453, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666879, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666879, "pid": 3, "tid": 7, "ts": 6302685266270.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260472.315, "dur": 6.590, + "args": { + "External id": 124453, "cbid": 211, "correlation": 241666879 + } + }, + { + "ph": "s", "id": 241666879, "pid": 5717, "tid": 5717, "ts": 6302685260472.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685266318.832, "dur": 22.432, + "args": { + "External id": 124454, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666890, "pid": 3, "tid": 7, "ts": 6302685266318.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260498.225, "dur": 8.160, + "args": { + "External id": 124454, "cbid": 211, "correlation": 241666890 + } + }, + { + "ph": "s", "id": 241666890, "pid": 5717, "tid": 5717, "ts": 6302685260498.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685266342.000, "dur": 121.025, + "args": { + "External id": 124462, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666913, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666913, "pid": 3, "tid": 7, "ts": 6302685266342.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260604.185, "dur": 11.590, + "args": { + "External id": 124462, "cbid": 211, "correlation": 241666913 + } + }, + { + "ph": "s", "id": 241666913, "pid": 5717, "tid": 5717, "ts": 6302685260604.185, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685266463.729, "dur": 120.353, + "args": { + "External id": 124471, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666936, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666936, "pid": 3, "tid": 7, "ts": 6302685266463.729, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260701.575, "dur": 11.589, + "args": { + "External id": 124471, "cbid": 211, "correlation": 241666936 + } + }, + { + "ph": "s", "id": 241666936, "pid": 5717, "tid": 5717, "ts": 6302685260701.575, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685266584.690, "dur": 120.289, + "args": { + "External id": 124480, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666959, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241666959, "pid": 3, "tid": 7, "ts": 6302685266584.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685260810.504, "dur": 11.270, + "args": { + "External id": 124480, "cbid": 211, "correlation": 241666959 + } + }, + { + "ph": "s", "id": 241666959, "pid": 5717, "tid": 5717, "ts": 6302685260810.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685266705.587, "dur": 52.288, + "args": { + "External id": 124488, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666978, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666978, "pid": 3, "tid": 7, "ts": 6302685266705.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261116.754, "dur": 14.309, + "args": { + "External id": 124488, "cbid": 307, "correlation": 241666978 + } + }, + { + "ph": "s", "id": 241666978, "pid": 5717, "tid": 5717, "ts": 6302685261116.754, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685266758.515, "dur": 59.969, + "args": { + "External id": 124491, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241666995, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241666995, "pid": 3, "tid": 7, "ts": 6302685266758.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261340.953, "dur": 13.860, + "args": { + "External id": 124491, "cbid": 307, "correlation": 241666995 + } + }, + { + "ph": "s", "id": 241666995, "pid": 5717, "tid": 5717, "ts": 6302685261340.953, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685261501.373, "dur": 0.780, + "args": { + "External id": 124495, "cbid": 200, "correlation": 241666999 + } + }, + { + "ph": "f", "id": 241666999, "pid": 5717, "tid": 5717, "ts": 6302685261501.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685261502.363, "dur": 0.350, + "args": { + "External id": 124495, "cbid": 200, "correlation": 241667000 + } + }, + { + 
"ph": "f", "id": 241667000, "pid": 5717, "tid": 5717, "ts": 6302685261502.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685261546.133, "dur": 0.580, + "args": { + "External id": 124495, "cbid": 200, "correlation": 241667023 + } + }, + { + "ph": "f", "id": 241667023, "pid": 5717, "tid": 5717, "ts": 6302685261546.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685261556.642, "dur": 3.311, + "args": { + "External id": 124495, "cbid": 273, "correlation": 241667032 + } + }, + { + "ph": "f", "id": 241667032, "pid": 5717, "tid": 5717, "ts": 6302685261556.642, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685266819.124, "dur": 406.115, + "args": { + "External id": 124495, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667033, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667033, "pid": 3, "tid": 7, "ts": 6302685266819.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261560.862, "dur": 15.760, + "args": { + "External id": 124495, "cbid": 211, "correlation": 241667033 + } + }, + { + "ph": "s", "id": 241667033, "pid": 5717, "tid": 5717, "ts": 6302685261560.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685267225.943, "dur": 122.017, + "args": { + "External id": 124511, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667059, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667059, "pid": 3, "tid": 7, "ts": 6302685267225.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261765.132, "dur": 17.610, + "args": { + "External id": 124511, "cbid": 211, "correlation": 241667059 + } + }, + { + "ph": "s", "id": 241667059, "pid": 5717, "tid": 5717, "ts": 6302685261765.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685267348.664, "dur": 60.672, + "args": { + "External id": 124513, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667069, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667069, "pid": 3, "tid": 7, "ts": 6302685267348.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261835.652, "dur": 14.890, + "args": { + "External id": 124513, "cbid": 211, "correlation": 241667069 + } + }, + { + "ph": "s", "id": 241667069, "pid": 5717, "tid": 5717, "ts": 6302685261835.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685267410.008, "dur": 50.753, + "args": { + "External id": 124518, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667082, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667082, "pid": 3, "tid": 7, "ts": 6302685267410.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261927.362, "dur": 12.020, + "args": { + "External id": 124518, "cbid": 211, "correlation": 241667082 + } + }, + { + "ph": "s", "id": 241667082, "pid": 5717, "tid": 5717, "ts": 6302685261927.362, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685267461.369, "dur": 68.736, + "args": { + "External id": 124519, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667093, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667093, "pid": 3, "tid": 7, "ts": 6302685267461.369, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261965.621, "dur": 8.151, + "args": { + "External id": 124519, "cbid": 211, "correlation": 241667093 + } + }, + { + "ph": "s", "id": 241667093, "pid": 5717, "tid": 5717, "ts": 6302685261965.621, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685267530.809, "dur": 14.944, + "args": { + "External id": 124522, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667107, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667107, "pid": 3, "tid": 7, "ts": 6302685267530.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685261998.892, "dur": 8.749, + "args": { + "External id": 124522, "cbid": 211, "correlation": 241667107 + } + }, + { + "ph": "s", "id": 241667107, "pid": 5717, "tid": 5717, "ts": 6302685261998.892, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685267546.457, "dur": 1.568, + "args": { + "External id": 124524, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667113, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667113, "pid": 3, "tid": 7, "ts": 6302685267546.457, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262027.712, "dur": 6.589, + "args": { + "External id": 124524, "cbid": 211, "correlation": 241667113 + } + }, + { + "ph": "s", "id": 241667113, "pid": 5717, "tid": 5717, "ts": 6302685262027.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685267548.729, "dur": 1.024, + "args": { + "External id": 124525, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667123, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667123, "pid": 3, "tid": 7, "ts": 6302685267548.729, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262051.081, "dur": 6.600, + "args": { + "External id": 124525, "cbid": 211, "correlation": 241667123 + } + }, + { + "ph": "s", "id": 241667123, "pid": 5717, "tid": 5717, "ts": 6302685262051.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685267550.329, "dur": 89.281, + "args": { + "External id": 124526, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667133, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667133, "pid": 3, "tid": 7, "ts": 6302685267550.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262080.141, "dur": 7.270, + "args": { + "External id": 124526, "cbid": 211, "correlation": 241667133 + } + }, + { + "ph": "s", "id": 241667133, "pid": 5717, "tid": 5717, "ts": 6302685262080.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685267640.282, "dur": 46.848, + "args": { + "External id": 124531, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667146, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667146, "pid": 3, "tid": 7, "ts": 6302685267640.282, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262118.171, "dur": 6.980, + "args": { + "External id": 124531, "cbid": 211, "correlation": 241667146 + } + }, + { + "ph": "s", "id": 241667146, "pid": 5717, "tid": 5717, "ts": 6302685262118.171, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685267687.802, "dur": 22.433, + "args": { + "External id": 124532, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667157, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667157, "pid": 3, "tid": 7, "ts": 6302685267687.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262146.231, "dur": 7.000, + "args": { + "External id": 124532, "cbid": 211, "correlation": 241667157 + } + }, + { + "ph": "s", "id": 241667157, "pid": 5717, "tid": 5717, "ts": 6302685262146.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685267710.971, "dur": 318.786, + "args": { + "External id": 124540, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667180, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667180, "pid": 3, "tid": 7, "ts": 6302685267710.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262249.671, "dur": 11.480, + "args": { + "External id": 124540, "cbid": 211, "correlation": 241667180 + } + }, + { + "ph": "s", "id": 241667180, "pid": 5717, "tid": 5717, "ts": 6302685262249.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685268030.365, "dur": 319.490, + "args": { + "External id": 124549, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667203, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667203, "pid": 3, "tid": 7, "ts": 6302685268030.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262348.611, "dur": 11.260, + "args": { + "External id": 124549, "cbid": 211, "correlation": 241667203 + } + }, + { + "ph": "s", "id": 241667203, "pid": 5717, "tid": 5717, "ts": 6302685262348.611, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685268350.527, "dur": 212.898, + "args": { + "External id": 124551, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667217, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667217, "pid": 3, "tid": 7, "ts": 6302685268350.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262473.100, "dur": 11.631, + "args": { + "External id": 124551, "cbid": 307, "correlation": 241667217 + } + }, + { + "ph": "s", "id": 241667217, "pid": 5717, "tid": 5717, "ts": 6302685262473.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685262551.360, "dur": 2.190, + "args": { + "External id": 124560, "cbid": 210, "correlation": 241667239 + } + }, + { + "ph": "f", "id": 241667239, "pid": 5717, "tid": 5717, "ts": 6302685262551.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685268564.033, "dur": 323.011, + "args": { + "External id": 124560, 
"queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667240, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667240, "pid": 3, "tid": 7, "ts": 6302685268564.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262556.580, "dur": 10.320, + "args": { + "External id": 124560, "cbid": 211, "correlation": 241667240 + } + }, + { + "ph": "s", "id": 241667240, "pid": 5717, "tid": 5717, "ts": 6302685262556.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685268887.652, "dur": 50.720, + "args": { + "External id": 124562, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667250, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667250, "pid": 3, "tid": 7, "ts": 6302685268887.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262621.790, "dur": 10.260, + "args": { + "External id": 124562, "cbid": 211, "correlation": 241667250 + } + }, + { + "ph": "s", "id": 241667250, "pid": 5717, "tid": 5717, "ts": 6302685262621.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685268939.076, "dur": 63.584, + "args": { + "External id": 124567, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667263, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667263, "pid": 3, "tid": 7, "ts": 6302685268939.076, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262704.220, "dur": 11.710, + "args": { + "External id": 124567, "cbid": 211, "correlation": 241667263 + } + }, + { + "ph": "s", "id": 241667263, "pid": 5717, "tid": 5717, "ts": 6302685262704.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685269003.300, "dur": 69.505, + "args": { + "External id": 124568, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667274, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667274, "pid": 3, "tid": 7, "ts": 6302685269003.300, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262739.290, "dur": 8.850, + "args": { + "External id": 124568, "cbid": 211, "correlation": 241667274 + } + }, + { + "ph": "s", "id": 241667274, "pid": 5717, "tid": 5717, "ts": 6302685262739.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685269073.509, "dur": 14.496, + "args": { + "External id": 124571, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667288, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667288, "pid": 3, "tid": 7, "ts": 6302685269073.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262770.120, "dur": 7.260, + "args": { + "External id": 124571, "cbid": 211, "correlation": 241667288 + } + }, + { + "ph": "s", "id": 241667288, "pid": 5717, "tid": 5717, "ts": 6302685262770.120, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685269088.645, "dur": 1.920, + "args": { + "External id": 124573, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667294, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667294, "pid": 3, "tid": 7, "ts": 6302685269088.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262796.790, "dur": 6.110, + "args": { + "External id": 124573, "cbid": 211, "correlation": 241667294 + } + }, + { + "ph": "s", "id": 241667294, "pid": 5717, "tid": 5717, "ts": 6302685262796.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685269091.173, "dur": 1.024, + "args": { + "External id": 124574, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667304, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667304, "pid": 3, "tid": 7, "ts": 6302685269091.173, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262820.660, "dur": 6.440, + "args": { + "External id": 124574, "cbid": 211, "correlation": 241667304 + } + }, + { + "ph": "s", "id": 241667304, "pid": 5717, "tid": 5717, "ts": 6302685262820.660, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685269092.901, "dur": 88.833, + "args": { + "External id": 124575, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667314, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667314, "pid": 3, "tid": 7, "ts": 6302685269092.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262849.730, "dur": 6.840, + "args": { + "External id": 124575, "cbid": 211, "correlation": 241667314 + } + }, + { + "ph": "s", "id": 241667314, "pid": 5717, "tid": 5717, "ts": 6302685262849.730, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685269182.342, "dur": 47.616, + "args": { + "External id": 124580, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667327, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667327, "pid": 3, "tid": 7, "ts": 6302685269182.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262885.399, "dur": 6.900, + "args": { + "External id": 124580, "cbid": 211, "correlation": 241667327 + } + }, + { + "ph": "s", "id": 241667327, "pid": 5717, "tid": 5717, "ts": 6302685262885.399, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685269230.598, "dur": 22.880, + "args": { + "External id": 124581, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667338, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667338, "pid": 3, "tid": 7, "ts": 6302685269230.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685262911.219, "dur": 6.511, + "args": { + "External id": 124581, "cbid": 211, "correlation": 241667338 + } + }, + { + "ph": "s", "id": 241667338, "pid": 5717, "tid": 5717, "ts": 6302685262911.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685269254.118, "dur": 121.185, + "args": { + "External id": 124589, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667361, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667361, "pid": 3, "tid": 7, "ts": 6302685269254.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685263018.319, "dur": 11.740, + "args": { + "External id": 124589, "cbid": 211, "correlation": 241667361 + } + }, + { + "ph": "s", "id": 241667361, "pid": 5717, "tid": 5717, "ts": 6302685263018.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685269375.943, "dur": 120.225, + "args": { + "External id": 124598, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667384, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667384, "pid": 3, "tid": 7, "ts": 6302685269375.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685263103.149, "dur": 9.950, + "args": { + "External id": 124598, "cbid": 211, "correlation": 241667384 + } + }, + { + "ph": "s", "id": 241667384, "pid": 5717, "tid": 5717, "ts": 6302685263103.149, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685269496.808, "dur": 120.065, + "args": { + "External id": 124607, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667407, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667407, "pid": 3, "tid": 7, "ts": 6302685269496.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685263194.449, "dur": 14.000, + "args": { + "External id": 124607, "cbid": 211, "correlation": 241667407 + } + }, + { + "ph": "s", "id": 241667407, "pid": 5717, "tid": 5717, "ts": 6302685263194.449, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685269617.577, "dur": 51.840, + "args": { + "External id": 124615, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667426, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667426, "pid": 3, "tid": 7, "ts": 6302685269617.577, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685263600.178, "dur": 14.570, + "args": { + "External id": 124615, "cbid": 307, "correlation": 241667426 + } + }, + { + "ph": "s", "id": 241667426, "pid": 5717, "tid": 5717, "ts": 6302685263600.178, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685269670.537, "dur": 60.289, + "args": { + "External id": 124618, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667443, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667443, "pid": 3, "tid": 7, "ts": 6302685269670.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685263785.168, "dur": 11.749, + "args": { + "External id": 124618, "cbid": 307, "correlation": 241667443 + } + }, + { + "ph": "s", "id": 241667443, "pid": 5717, "tid": 5717, "ts": 6302685263785.168, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685263922.347, "dur": 0.670, + "args": { + "External id": 124622, "cbid": 200, "correlation": 241667447 + } + }, + { + "ph": "f", "id": 241667447, "pid": 5717, "tid": 5717, "ts": 6302685263922.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685263923.187, "dur": 0.290, + "args": { + "External id": 124622, "cbid": 200, "correlation": 241667448 + } + }, + { + 
"ph": "f", "id": 241667448, "pid": 5717, "tid": 5717, "ts": 6302685263923.187, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685263959.997, "dur": 0.510, + "args": { + "External id": 124622, "cbid": 200, "correlation": 241667471 + } + }, + { + "ph": "f", "id": 241667471, "pid": 5717, "tid": 5717, "ts": 6302685263959.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685263971.697, "dur": 2.920, + "args": { + "External id": 124622, "cbid": 273, "correlation": 241667480 + } + }, + { + "ph": "f", "id": 241667480, "pid": 5717, "tid": 5717, "ts": 6302685263971.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685269731.402, "dur": 415.939, + "args": { + "External id": 124622, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667481, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667481, "pid": 3, "tid": 7, "ts": 6302685269731.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685263975.427, "dur": 14.110, + "args": { + "External id": 124622, "cbid": 211, "correlation": 241667481 + } + }, + { + "ph": "s", "id": 241667481, "pid": 5717, "tid": 5717, "ts": 6302685263975.427, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685270148.013, "dur": 122.145, + "args": { + "External id": 124638, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667507, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667507, "pid": 3, "tid": 7, "ts": 6302685270148.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264163.787, "dur": 22.720, + "args": { + "External id": 124638, "cbid": 211, "correlation": 241667507 + } + }, + { + "ph": "s", "id": 241667507, "pid": 5717, "tid": 5717, "ts": 6302685264163.787, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685270270.766, "dur": 61.793, + "args": { + "External id": 124640, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667517, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667517, "pid": 3, "tid": 7, "ts": 6302685270270.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264251.556, "dur": 10.730, + "args": { + "External id": 124640, "cbid": 211, "correlation": 241667517 + } + }, + { + "ph": "s", "id": 241667517, "pid": 5717, "tid": 5717, "ts": 6302685264251.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685270333.199, "dur": 50.432, + "args": { + "External id": 124645, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667530, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667530, "pid": 3, "tid": 7, "ts": 6302685270333.199, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264366.286, "dur": 12.890, + "args": { + "External id": 124645, "cbid": 211, "correlation": 241667530 + } + }, + { + "ph": "s", "id": 241667530, "pid": 5717, "tid": 5717, "ts": 6302685264366.286, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685270384.335, "dur": 67.968, + "args": { + "External id": 124646, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667541, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667541, "pid": 3, "tid": 7, "ts": 6302685270384.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264403.606, "dur": 11.470, + "args": { + "External id": 124646, "cbid": 211, "correlation": 241667541 + } + }, + { + "ph": "s", "id": 241667541, "pid": 5717, "tid": 5717, "ts": 6302685264403.606, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685270452.943, "dur": 15.361, + "args": { + "External id": 124649, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667555, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667555, "pid": 3, "tid": 7, "ts": 6302685270452.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264445.216, "dur": 11.310, + "args": { + "External id": 124649, "cbid": 211, "correlation": 241667555 + } + }, + { + "ph": "s", "id": 241667555, "pid": 5717, "tid": 5717, "ts": 6302685264445.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685270469.008, "dur": 1.664, + "args": { + "External id": 124651, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667561, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667561, "pid": 3, "tid": 7, "ts": 6302685270469.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264478.776, "dur": 5.360, + "args": { + "External id": 124651, "cbid": 211, "correlation": 241667561 + } + }, + { + "ph": "s", "id": 241667561, "pid": 5717, "tid": 5717, "ts": 6302685264478.776, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685270471.248, "dur": 1.024, + "args": { + "External id": 124652, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667571, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667571, "pid": 3, "tid": 7, "ts": 6302685270471.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264498.486, "dur": 5.720, + "args": { + "External id": 124652, "cbid": 211, "correlation": 241667571 + } + }, + { + "ph": "s", "id": 241667571, "pid": 5717, "tid": 5717, "ts": 6302685264498.486, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685270473.008, "dur": 88.960, + "args": { + "External id": 124653, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667581, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667581, "pid": 3, "tid": 7, "ts": 6302685270473.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264535.056, "dur": 9.420, + "args": { + "External id": 124653, "cbid": 211, "correlation": 241667581 + } + }, + { + "ph": "s", "id": 241667581, "pid": 5717, "tid": 5717, "ts": 6302685264535.056, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685270562.608, "dur": 48.065, + "args": { + "External id": 124658, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667594, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667594, "pid": 3, "tid": 7, "ts": 6302685270562.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264577.906, "dur": 6.190, + "args": { + "External id": 124658, "cbid": 211, "correlation": 241667594 + } + }, + { + "ph": "s", "id": 241667594, "pid": 5717, "tid": 5717, "ts": 6302685264577.906, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685270611.313, "dur": 21.951, + "args": { + "External id": 124659, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667605, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667605, "pid": 3, "tid": 7, "ts": 6302685270611.313, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264601.535, "dur": 5.591, + "args": { + "External id": 124659, "cbid": 211, "correlation": 241667605 + } + }, + { + "ph": "s", "id": 241667605, "pid": 5717, "tid": 5717, "ts": 6302685264601.535, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685270633.840, "dur": 318.467, + "args": { + "External id": 124667, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667628, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667628, "pid": 3, "tid": 7, "ts": 6302685270633.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264687.635, "dur": 10.820, + "args": { + "External id": 124667, "cbid": 211, "correlation": 241667628 + } + }, + { + "ph": "s", "id": 241667628, "pid": 5717, "tid": 5717, "ts": 6302685264687.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685270952.915, "dur": 319.042, + "args": { + "External id": 124676, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667651, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667651, "pid": 3, "tid": 7, "ts": 6302685270952.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264763.725, "dur": 8.580, + "args": { + "External id": 124676, "cbid": 211, "correlation": 241667651 + } + }, + { + "ph": "s", "id": 241667651, "pid": 5717, "tid": 5717, "ts": 6302685264763.725, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685271272.629, "dur": 212.418, + "args": { + "External id": 124678, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667665, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667665, "pid": 3, "tid": 7, "ts": 6302685271272.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264863.315, "dur": 16.260, + "args": { + "External id": 124678, "cbid": 307, "correlation": 241667665 + } + }, + { + "ph": "s", "id": 241667665, "pid": 5717, "tid": 5717, "ts": 6302685264863.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685264949.365, "dur": 1.780, + "args": { + "External id": 124687, "cbid": 210, "correlation": 241667687 + } + }, + { + "ph": "f", "id": 241667687, "pid": 5717, "tid": 5717, "ts": 6302685264949.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685271485.655, "dur": 324.738, + "args": { + "External id": 124687, 
"queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667688, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667688, "pid": 3, "tid": 7, "ts": 6302685271485.655, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685264953.565, "dur": 8.740, + "args": { + "External id": 124687, "cbid": 211, "correlation": 241667688 + } + }, + { + "ph": "s", "id": 241667688, "pid": 5717, "tid": 5717, "ts": 6302685264953.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685271811.097, "dur": 51.297, + "args": { + "External id": 124689, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667698, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667698, "pid": 3, "tid": 7, "ts": 6302685271811.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265009.125, "dur": 8.820, + "args": { + "External id": 124689, "cbid": 211, "correlation": 241667698 + } + }, + { + "ph": "s", "id": 241667698, "pid": 5717, "tid": 5717, "ts": 6302685265009.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685271863.066, "dur": 63.552, + "args": { + "External id": 124694, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667711, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667711, "pid": 3, "tid": 7, "ts": 6302685271863.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265077.285, "dur": 9.960, + "args": { + "External id": 124694, "cbid": 211, "correlation": 241667711 + } + }, + { + "ph": "s", "id": 241667711, "pid": 5717, "tid": 5717, "ts": 6302685265077.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685271927.258, "dur": 64.001, + "args": { + "External id": 124695, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667722, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667722, "pid": 3, "tid": 7, "ts": 6302685271927.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265108.014, "dur": 7.091, + "args": { + "External id": 124695, "cbid": 211, "correlation": 241667722 + } + }, + { + "ph": "s", "id": 241667722, "pid": 5717, "tid": 5717, "ts": 6302685265108.014, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685271991.899, "dur": 15.744, + "args": { + "External id": 124698, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667736, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667736, "pid": 3, "tid": 7, "ts": 6302685271991.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265137.074, "dur": 7.070, + "args": { + "External id": 124698, "cbid": 211, "correlation": 241667736 + } + }, + { + "ph": "s", "id": 241667736, "pid": 5717, "tid": 5717, "ts": 6302685265137.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685272008.315, "dur": 1.536, + "args": { + "External id": 124700, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667742, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667742, "pid": 3, "tid": 7, "ts": 6302685272008.315, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265161.274, "dur": 6.180, + "args": { + "External id": 124700, "cbid": 211, "correlation": 241667742 + } + }, + { + "ph": "s", "id": 241667742, "pid": 5717, "tid": 5717, "ts": 6302685265161.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685272010.459, "dur": 0.992, + "args": { + "External id": 124701, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667752, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241667752, "pid": 3, "tid": 7, "ts": 6302685272010.459, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265186.054, "dur": 5.910, + "args": { + "External id": 124701, "cbid": 211, "correlation": 241667752 + } + }, + { + "ph": "s", "id": 241667752, "pid": 5717, "tid": 5717, "ts": 6302685265186.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685272012.155, "dur": 88.576, + "args": { + "External id": 124702, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667762, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667762, "pid": 3, "tid": 7, "ts": 6302685272012.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265214.094, "dur": 7.450, + "args": { + "External id": 124702, "cbid": 211, "correlation": 241667762 + } + }, + { + "ph": "s", "id": 241667762, "pid": 5717, "tid": 5717, "ts": 6302685265214.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685272101.339, "dur": 47.521, + "args": { + "External id": 124707, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667775, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667775, "pid": 3, "tid": 7, "ts": 6302685272101.339, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265249.844, "dur": 7.390, + "args": { + "External id": 124707, "cbid": 211, "correlation": 241667775 + } + }, + { + "ph": "s", "id": 241667775, "pid": 5717, "tid": 5717, "ts": 6302685265249.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685272149.564, "dur": 22.400, + "args": { + "External id": 124708, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667786, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667786, "pid": 3, "tid": 7, "ts": 6302685272149.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265274.804, "dur": 6.200, + "args": { + "External id": 124708, "cbid": 211, "correlation": 241667786 + } + }, + { + "ph": "s", "id": 241667786, "pid": 5717, "tid": 5717, "ts": 6302685265274.804, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685272172.700, "dur": 120.961, + "args": { + "External id": 124716, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667809, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667809, "pid": 3, "tid": 7, "ts": 6302685272172.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265372.954, "dur": 9.750, + "args": { + "External id": 124716, "cbid": 211, "correlation": 241667809 + } + }, + { + "ph": "s", "id": 241667809, "pid": 5717, "tid": 5717, "ts": 6302685265372.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685272294.301, "dur": 119.617, + "args": { + "External id": 124725, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667832, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667832, "pid": 3, "tid": 7, "ts": 6302685272294.301, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265440.954, "dur": 8.010, + "args": { + "External id": 124725, "cbid": 211, "correlation": 241667832 + } + }, + { + "ph": "s", "id": 241667832, "pid": 5717, "tid": 5717, "ts": 6302685265440.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685272414.558, "dur": 120.257, + "args": { + "External id": 124734, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667855, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667855, "pid": 3, "tid": 7, "ts": 6302685272414.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265616.983, "dur": 7.710, + "args": { + "External id": 124734, "cbid": 211, "correlation": 241667855 + } + }, + { + "ph": "s", "id": 241667855, "pid": 5717, "tid": 5717, "ts": 6302685265616.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685272535.423, "dur": 52.096, + "args": { + "External id": 124742, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667874, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667874, "pid": 3, "tid": 7, "ts": 6302685272535.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265828.623, "dur": 10.840, + "args": { + "External id": 124742, "cbid": 307, "correlation": 241667874 + } + }, + { + "ph": "s", "id": 241667874, "pid": 5717, "tid": 5717, "ts": 6302685265828.623, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685272588.191, "dur": 60.577, + "args": { + "External id": 124745, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667891, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667891, "pid": 3, "tid": 7, "ts": 6302685272588.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685265982.232, "dur": 9.540, + "args": { + "External id": 124745, "cbid": 307, "correlation": 241667891 + } + }, + { + "ph": "s", "id": 241667891, "pid": 5717, "tid": 5717, "ts": 6302685265982.232, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685266095.552, "dur": 0.560, + "args": { + "External id": 124749, "cbid": 200, "correlation": 241667895 + } + }, + { + "ph": "f", "id": 241667895, "pid": 5717, "tid": 5717, "ts": 6302685266095.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685266096.262, "dur": 0.230, + "args": { + "External id": 124749, "cbid": 200, "correlation": 241667896 + } + }, + { + "ph": 
"f", "id": 241667896, "pid": 5717, "tid": 5717, "ts": 6302685266096.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685266129.582, "dur": 0.500, + "args": { + "External id": 124749, "cbid": 200, "correlation": 241667919 + } + }, + { + "ph": "f", "id": 241667919, "pid": 5717, "tid": 5717, "ts": 6302685266129.582, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685266138.752, "dur": 2.790, + "args": { + "External id": 124749, "cbid": 273, "correlation": 241667928 + } + }, + { + "ph": "f", "id": 241667928, "pid": 5717, "tid": 5717, "ts": 6302685266138.752, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 3, "tid": 7, + "ts": 6302685272649.440, "dur": 401.411, + "args": { + "External id": 124749, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667929, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667929, "pid": 3, "tid": 7, "ts": 6302685272649.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266142.352, "dur": 13.560, + "args": { + "External id": 124749, "cbid": 211, "correlation": 241667929 + } + }, + { + "ph": "s", "id": 241667929, "pid": 5717, "tid": 5717, "ts": 6302685266142.352, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 3, "tid": 7, + "ts": 6302685273051.555, "dur": 121.921, + "args": { + "External id": 124765, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667955, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241667955, "pid": 3, "tid": 7, "ts": 6302685273051.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266321.192, "dur": 12.370, + "args": { + "External id": 124765, "cbid": 211, "correlation": 241667955 + } + }, + { + "ph": "s", "id": 241667955, "pid": 5717, "tid": 5717, "ts": 6302685266321.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685273174.084, "dur": 62.144, + "args": { + "External id": 124767, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667965, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667965, "pid": 3, "tid": 7, "ts": 6302685273174.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266371.122, "dur": 7.140, + "args": { + "External id": 124767, "cbid": 211, "correlation": 241667965 + } + }, + { + "ph": "s", "id": 241667965, "pid": 5717, "tid": 5717, "ts": 6302685266371.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685273236.836, "dur": 48.609, + "args": { + "External id": 124772, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667978, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667978, "pid": 3, "tid": 7, "ts": 6302685273236.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266421.591, "dur": 7.900, + "args": { + "External id": 124772, "cbid": 211, "correlation": 241667978 + } + }, + { + "ph": "s", "id": 241667978, "pid": 5717, "tid": 5717, "ts": 6302685266421.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685273286.117, "dur": 70.656, + "args": { + "External id": 124773, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241667989, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241667989, "pid": 3, "tid": 7, "ts": 6302685273286.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266447.511, "dur": 5.720, + "args": { + "External id": 124773, "cbid": 211, "correlation": 241667989 + } + }, + { + "ph": "s", "id": 241667989, "pid": 5717, "tid": 5717, "ts": 6302685266447.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685273357.413, "dur": 15.072, + "args": { + "External id": 124776, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668003, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668003, "pid": 3, "tid": 7, "ts": 6302685273357.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266472.231, "dur": 5.470, + "args": { + "External id": 124776, "cbid": 211, "correlation": 241668003 + } + }, + { + "ph": "s", "id": 241668003, "pid": 5717, "tid": 5717, "ts": 6302685266472.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685273373.157, "dur": 1.696, + "args": { + "External id": 124778, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668009, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241668009, "pid": 3, "tid": 7, "ts": 6302685273373.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266491.591, "dur": 10.550, + "args": { + "External id": 124778, "cbid": 211, "correlation": 241668009 + } + }, + { + "ph": "s", "id": 241668009, "pid": 5717, "tid": 5717, "ts": 6302685266491.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685273375.557, "dur": 0.992, + "args": { + "External id": 124779, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668019, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241668019, "pid": 3, "tid": 7, "ts": 6302685273375.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266519.881, "dur": 4.500, + "args": { + "External id": 124779, "cbid": 211, "correlation": 241668019 + } + }, + { + "ph": "s", "id": 241668019, "pid": 5717, "tid": 5717, "ts": 6302685266519.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685273377.253, "dur": 91.137, + "args": { + "External id": 124780, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668029, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668029, "pid": 3, "tid": 7, "ts": 6302685273377.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266540.651, "dur": 5.270, + "args": { + "External id": 124780, "cbid": 211, "correlation": 241668029 + } + }, + { + "ph": "s", "id": 241668029, "pid": 5717, "tid": 5717, "ts": 6302685266540.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685273469.094, "dur": 50.176, + "args": { + "External id": 124785, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668042, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668042, "pid": 3, "tid": 7, "ts": 6302685273469.094, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266565.761, "dur": 5.190, + "args": { + "External id": 124785, "cbid": 211, "correlation": 241668042 + } + }, + { + "ph": "s", "id": 241668042, "pid": 5717, "tid": 5717, "ts": 6302685266565.761, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685273519.846, "dur": 22.656, + "args": { + "External id": 124786, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668053, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668053, "pid": 3, "tid": 7, "ts": 6302685273519.846, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266585.231, "dur": 4.950, + "args": { + "External id": 124786, "cbid": 211, "correlation": 241668053 + } + }, + { + "ph": "s", "id": 241668053, "pid": 5717, "tid": 5717, "ts": 6302685266585.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685273543.142, "dur": 318.627, + "args": { + "External id": 124794, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668076, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668076, "pid": 3, "tid": 7, "ts": 6302685273543.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266655.591, "dur": 8.150, + "args": { + "External id": 124794, "cbid": 211, "correlation": 241668076 + } + }, + { + "ph": "s", "id": 241668076, "pid": 5717, "tid": 5717, "ts": 6302685266655.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685273862.441, "dur": 319.042, + "args": { + "External id": 124803, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668099, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668099, "pid": 3, "tid": 7, "ts": 6302685273862.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266731.351, "dur": 7.380, + "args": { + "External id": 124803, "cbid": 211, "correlation": 241668099 + } + }, + { + "ph": "s", "id": 241668099, "pid": 5717, "tid": 5717, "ts": 6302685266731.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685274182.219, "dur": 213.634, + "args": { + "External id": 124805, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668113, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668113, "pid": 3, "tid": 7, "ts": 6302685274182.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266818.701, "dur": 8.480, + "args": { + "External id": 124805, "cbid": 307, "correlation": 241668113 + } + }, + { + "ph": "s", "id": 241668113, "pid": 5717, "tid": 5717, "ts": 6302685266818.701, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685266888.761, "dur": 1.509, + "args": { + "External id": 124814, "cbid": 210, "correlation": 241668135 + } + }, + { + "ph": "f", "id": 241668135, "pid": 5717, "tid": 5717, "ts": 6302685266888.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685274396.525, "dur": 321.218, + "args": { + "External id": 124814, 
"queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668136, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668136, "pid": 3, "tid": 7, "ts": 6302685274396.525, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266892.230, "dur": 7.160, + "args": { + "External id": 124814, "cbid": 211, "correlation": 241668136 + } + }, + { + "ph": "s", "id": 241668136, "pid": 5717, "tid": 5717, "ts": 6302685266892.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685274718.415, "dur": 53.377, + "args": { + "External id": 124816, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668146, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668146, "pid": 3, "tid": 7, "ts": 6302685274718.415, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685266942.790, "dur": 8.000, + "args": { + "External id": 124816, "cbid": 211, "correlation": 241668146 + } + }, + { + "ph": "s", "id": 241668146, "pid": 5717, "tid": 5717, "ts": 6302685266942.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy_contig, unsigned int, 3, 128, 1>(at::native::(anonymous namespace)::OpaqueType<2u>*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, unsigned int, 128, 1>, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 3, "tid": 7, + "ts": 6302685274772.496, "dur": 224.801, + "args": { + "External id": 124818, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668157, "registers per thread": 20, "shared memory": 0, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [256, 4, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668157, "pid": 3, "tid": 7, "ts": 6302685274772.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267008.020, "dur": 16.940, + "args": { + "External id": 124818, "cbid": 211, "correlation": 241668157 + } + }, + { + "ph": "s", "id": 241668157, "pid": 5717, "tid": 5717, "ts": 6302685267008.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 3, "tid": 7, + "ts": 6302685274998.481, "dur": 195.842, + "args": { + "External id": 124823, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668176, "registers per thread": 22, "shared memory": 32, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668176, "pid": 3, "tid": 7, "ts": 6302685274998.481, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267251.989, "dur": 10.771, + "args": { + "External id": 124823, "cbid": 307, "correlation": 241668176 + } + }, + { + "ph": "s", "id": 241668176, "pid": 5717, "tid": 5717, "ts": 6302685267251.989, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275195.027, "dur": 1.248, + "args": { + "External id": 124827, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668188, "pid": 3, "tid": 7, "ts": 6302685275195.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267420.869, "dur": 12.300, + "args": { + "External id": 124827, "cbid": 211, "correlation": 241668188 + } + }, + { + "ph": "s", "id": 241668188, "pid": 5717, "tid": 5717, "ts": 6302685267420.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275196.915, "dur": 1.056, + "args": { + "External id": 124831, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668204, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668204, "pid": 3, "tid": 7, "ts": 6302685275196.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267454.599, "dur": 5.530, + "args": { + "External id": 124831, "cbid": 211, "correlation": 241668204 + } + }, + { + "ph": "s", "id": 241668204, "pid": 5717, "tid": 5717, "ts": 6302685267454.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275198.611, "dur": 0.832, + "args": { + "External id": 124835, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668220, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668220, "pid": 3, "tid": 7, "ts": 6302685275198.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267476.939, "dur": 5.030, + "args": { + "External id": 124835, "cbid": 211, "correlation": 241668220 + } + }, + { + "ph": "s", "id": 241668220, "pid": 5717, "tid": 5717, "ts": 6302685267476.939, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685275200.083, "dur": 2.144, + "args": { + "External id": 124871, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 241668248, "pid": 3, "tid": 7, "ts": 6302685275200.083, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267637.479, "dur": 10.980, + "args": { + "External id": 124871, "cbid": 211, "correlation": 241668248 + } + }, + { + "ph": "s", "id": 241668248, "pid": 5717, "tid": 5717, "ts": 6302685267637.479, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685275202.931, "dur": 49.440, + "args": { + "External id": 124879, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668266, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668266, "pid": 3, "tid": 7, "ts": 6302685275202.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267751.899, "dur": 11.320, + "args": { + "External id": 124879, "cbid": 211, "correlation": 241668266 + } + }, + { + "ph": "s", "id": 241668266, "pid": 5717, "tid": 5717, "ts": 6302685267751.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275252.979, "dur": 18.176, + "args": { + "External id": 124884, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668283, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668283, "pid": 3, "tid": 7, "ts": 6302685275252.979, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267806.078, "dur": 7.610, + "args": { + "External id": 124884, "cbid": 211, "correlation": 241668283 + } + }, + { + "ph": "s", "id": 241668283, "pid": 5717, "tid": 5717, "ts": 6302685267806.078, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275271.859, "dur": 101.313, + "args": { + "External id": 124889, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668299, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668299, "pid": 3, "tid": 7, "ts": 6302685275271.859, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267833.758, "dur": 5.390, + "args": { + "External id": 124889, "cbid": 211, "correlation": 241668299 + } + }, + { + "ph": "s", "id": 241668299, "pid": 5717, "tid": 5717, "ts": 6302685267833.758, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275374.260, "dur": 1.920, + "args": { + "External id": 124893, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241668315, "pid": 3, "tid": 7, "ts": 6302685275374.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267858.708, "dur": 5.230, + "args": { + "External id": 124893, "cbid": 211, "correlation": 241668315 + } + }, + { + "ph": "s", "id": 241668315, "pid": 5717, "tid": 5717, "ts": 6302685267858.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685275376.820, "dur": 1.792, + "args": { + "External id": 124894, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668327, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241668327, "pid": 3, "tid": 7, "ts": 6302685275376.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267896.548, "dur": 7.710, + "args": { + "External id": 124894, "cbid": 211, "correlation": 241668327 + } + }, + { + "ph": "s", "id": 241668327, "pid": 5717, "tid": 5717, "ts": 6302685267896.548, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685275379.252, "dur": 2.016, + "args": { + "External id": 124901, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668345, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241668345, "pid": 3, "tid": 7, "ts": 6302685275379.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267938.018, "dur": 7.950, + "args": { + "External id": 124901, "cbid": 211, "correlation": 241668345 + } + }, + { + "ph": "s", "id": 241668345, "pid": 5717, "tid": 5717, "ts": 6302685267938.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 3, "tid": 7, + "ts": 6302685275381.844, "dur": 3.712, + "args": { + "External id": 124896, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668354, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668354, "pid": 3, "tid": 7, "ts": 6302685275381.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685267955.368, "dur": 7.220, + "args": { + "External id": 124896, "cbid": 211, "correlation": 241668354 + } + }, + { + "ph": "s", "id": 241668354, "pid": 5717, "tid": 5717, "ts": 6302685267955.368, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685267980.558, "dur": 10.880, + "args": { + "External id": 124903, "cbid": 138, "correlation": 241668359 + } + }, + { + "ph": "f", "id": 241668359, "pid": 5717, "tid": 5717, "ts": 6302685267980.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685267992.498, "dur": 0.760, + "args": { + "External id": 124903, "cbid": 138, "correlation": 241668360 + } + }, + { + "ph": "f", "id": 241668360, "pid": 5717, "tid": 5717, "ts": 6302685267992.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685275390.036, "dur": 0.992, + "args": { + "External id": 124903, "device": 3, "context": 1, "stream": 7, "correlation": 241668362, "bytes": 8, "memory bandwidth (GB/s)": 0.008064516129032258 + } + }, + { + "ph": "f", "id": 241668362, "pid": 3, "tid": 7, "ts": 6302685275390.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685267997.768, "dur": 17.330, + "args": { + "External id": 124903, "cbid": 41, "correlation": 241668362 + } + }, + { + "ph": "s", "id": 241668362, "pid": 5717, "tid": 5717, "ts": 6302685267997.768, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685268015.558, "dur": 7380.113, + "args": { + "External id": 124903, "cbid": 131, "correlation": 241668363 + } + }, + { + "ph": "s", "id": 241668363, "pid": 5717, "tid": 5717, "ts": 6302685268015.558, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685275457.931, "dur": 1.700, + "args": { + "External id": 124911, "cbid": 210, "correlation": 241668388 + } + }, + { + "ph": "f", "id": 241668388, "pid": 5717, "tid": 5717, "ts": 6302685275457.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685275474.293, "dur": 633.125, + "args": { + "External id": 124911, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668389, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668389, "pid": 3, "tid": 7, "ts": 6302685275474.293, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275463.481, "dur": 10.680, + "args": { + "External id": 124911, "cbid": 211, "correlation": 241668389 + } + }, + { + "ph": "s", "id": 241668389, "pid": 5717, "tid": 5717, "ts": 6302685275463.481, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685276108.058, "dur": 171.457, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668408, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668408, "pid": 3, "tid": 7, "ts": 6302685276108.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275642.430, "dur": 17.280, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241668408 + } + }, + { + "ph": "s", "id": 241668408, "pid": 5717, "tid": 5717, "ts": 6302685275642.430, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685276280.251, "dur": 3.936, + "args": { + "External id": 124921, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668425, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668425, "pid": 3, "tid": 7, "ts": 6302685276280.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275712.490, "dur": 10.131, + "args": { + "External id": 124921, "cbid": 211, "correlation": 241668425 + } + }, + { + "ph": "s", "id": 241668425, "pid": 5717, "tid": 5717, "ts": 6302685275712.490, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276284.923, "dur": 1.152, + "args": { + "External id": 124926, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668442, "pid": 3, "tid": 7, "ts": 6302685276284.923, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275761.810, "dur": 8.370, + "args": { + "External id": 124926, "cbid": 211, "correlation": 241668442 + } + }, + { + "ph": "s", "id": 241668442, "pid": 5717, "tid": 5717, "ts": 6302685275761.810, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276286.747, "dur": 0.992, + "args": { + "External id": 124928, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668452, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 
1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668452, "pid": 3, "tid": 7, "ts": 6302685276286.747, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275789.250, "dur": 6.500, + "args": { + "External id": 124928, "cbid": 211, "correlation": 241668452 + } + }, + { + "ph": "s", "id": 241668452, "pid": 5717, "tid": 5717, "ts": 6302685275789.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276288.347, "dur": 0.992, + "args": { + "External id": 124929, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668458, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668458, "pid": 3, "tid": 7, "ts": 6302685276288.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275819.250, "dur": 7.460, + "args": { + "External id": 124929, "cbid": 211, "correlation": 241668458 + } + }, + { + "ph": "s", "id": 241668458, "pid": 5717, "tid": 5717, "ts": 6302685275819.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276290.043, "dur": 0.992, + "args": { + "External id": 124930, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668468, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668468, "pid": 3, "tid": 7, "ts": 6302685276290.043, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275841.820, "dur": 6.100, + "args": { + "External id": 124930, "cbid": 211, "correlation": 241668468 + } + }, + { + "ph": "s", "id": 241668468, "pid": 5717, "tid": 5717, "ts": 6302685275841.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276291.739, "dur": 1.024, + "args": { + "External id": 124931, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668474, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668474, "pid": 3, "tid": 7, "ts": 6302685276291.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275857.740, "dur": 5.680, + "args": { + "External id": 124931, "cbid": 211, "correlation": 241668474 + } + }, + { + "ph": "s", "id": 241668474, "pid": 5717, "tid": 5717, "ts": 6302685275857.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685276293.371, "dur": 3.264, + "args": { + "External id": 124932, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668487, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668487, "pid": 3, "tid": 7, "ts": 6302685276293.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275887.300, "dur": 7.640, + "args": { + "External id": 124932, "cbid": 211, "correlation": 241668487 + } + }, + { + "ph": "s", "id": 241668487, "pid": 5717, "tid": 5717, "ts": 6302685275887.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276297.307, "dur": 1.248, + "args": { + "External id": 124935, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668493, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668493, "pid": 3, "tid": 7, "ts": 6302685276297.307, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275905.560, "dur": 5.840, + "args": { + "External id": 124935, "cbid": 211, "correlation": 241668493 + } + }, + { + "ph": "s", "id": 241668493, "pid": 5717, "tid": 5717, "ts": 6302685275905.560, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685276299.163, "dur": 0.992, + "args": { + "External id": 124936, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668499, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668499, "pid": 3, "tid": 7, "ts": 6302685276299.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685275916.850, "dur": 4.000, + "args": { + "External id": 124936, "cbid": 211, "correlation": 241668499 + } + }, + { + "ph": "s", "id": 241668499, "pid": 5717, "tid": 5717, "ts": 6302685275916.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685276300.731, "dur": 233.538, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668513, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241668513, "pid": 3, "tid": 7, "ts": 6302685276300.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276019.650, "dur": 9.540, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241668513 + } + }, + { + "ph": "s", "id": 241668513, "pid": 5717, "tid": 5717, "ts": 6302685276019.650, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685276072.560, "dur": 0.710, + "args": { + "External id": 124940, "cbid": 200, "correlation": 241668536 + } + }, + { + "ph": "f", "id": 241668536, "pid": 5717, "tid": 5717, "ts": 6302685276072.560, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685276535.069, "dur": 0.384, + "args": { + "External id": 124940, "device": 3, "context": 1, "stream": 7, "correlation": 241668539, "bytes": 1536, "memory bandwidth (GB/s)": 4 + } + }, + { + "ph": "f", "id": 241668539, "pid": 3, "tid": 7, "ts": 6302685276535.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685276076.300, "dur": 11.000, + "args": { + "External id": 124940, "cbid": 51, "correlation": 241668539 + } + }, + { + "ph": "s", "id": 241668539, "pid": 5717, "tid": 5717, "ts": 6302685276076.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685276536.221, "dur": 685.669, + "args": { + "External id": 124940, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668540, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668540, "pid": 3, "tid": 7, "ts": 6302685276536.221, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276087.649, "dur": 7.411, + "args": { + "External id": 124940, "cbid": 307, "correlation": 241668540 + } + }, + { + "ph": "s", "id": 241668540, "pid": 5717, "tid": 5717, "ts": 6302685276087.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685277222.594, "dur": 2.912, + "args": { + "External id": 124943, "device": 3, "context": 1, "stream": 7, "correlation": 241668545, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241668545, "pid": 3, "tid": 7, "ts": 6302685277222.594, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685276122.049, "dur": 27.940, + "args": { + "External id": 124943, "cbid": 41, "correlation": 241668545 + } + }, + { + "ph": "s", "id": 241668545, "pid": 5717, "tid": 5717, "ts": 6302685276122.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685276337.559, "dur": 0.520, + "args": { + "External id": 124948, "cbid": 200, "correlation": 241668573 + } + }, + { + "ph": "f", "id": 241668573, "pid": 5717, "tid": 5717, "ts": 6302685276337.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685277226.114, "dur": 687.014, + "args": { + "External id": 124948, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241668576, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668576, "pid": 3, "tid": 7, "ts": 6302685277226.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276340.049, "dur": 9.700, + "args": { + "External id": 124948, "cbid": 307, "correlation": 241668576 + } + }, + { + "ph": "s", "id": 241668576, "pid": 5717, "tid": 5717, "ts": 6302685276340.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685277913.736, "dur": 220.641, + "args": { + "External id": 124949, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668581, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668581, "pid": 3, "tid": 7, "ts": 6302685277913.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276375.999, "dur": 8.200, + "args": { + "External id": 124949, "cbid": 211, "correlation": 241668581 + } + }, + { + "ph": "s", "id": 241668581, "pid": 5717, "tid": 5717, "ts": 6302685276375.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685276434.869, "dur": 1.460, + "args": { + "External id": 124957, "cbid": 210, "correlation": 241668607 + } + }, + { + "ph": "f", "id": 241668607, "pid": 5717, "tid": 5717, "ts": 6302685276434.869, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685278135.049, "dur": 628.901, + "args": { + "External id": 124957, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668608, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668608, "pid": 3, "tid": 7, "ts": 6302685278135.049, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276439.959, "dur": 7.990, + "args": { + "External id": 124957, "cbid": 211, "correlation": 241668608 + } + }, + { + "ph": "s", "id": 241668608, "pid": 5717, "tid": 5717, "ts": 6302685276439.959, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685278764.686, "dur": 170.785, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668627, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668627, "pid": 3, "tid": 7, "ts": 6302685278764.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276560.259, "dur": 9.620, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241668627 + } + }, + { + "ph": "s", "id": 241668627, "pid": 5717, "tid": 5717, "ts": 6302685276560.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685278936.079, "dur": 3.936, + "args": { + "External id": 124967, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668644, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668644, "pid": 3, "tid": 7, "ts": 6302685278936.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276606.708, "dur": 8.060, + "args": { + "External id": 124967, "cbid": 211, "correlation": 241668644 + } + }, + { + "ph": "s", "id": 241668644, "pid": 5717, "tid": 5717, "ts": 6302685276606.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278940.719, "dur": 1.152, + "args": { + "External id": 124972, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668661, "pid": 3, "tid": 7, "ts": 6302685278940.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276638.828, "dur": 5.780, + "args": { + "External id": 124972, "cbid": 211, "correlation": 241668661 + } + }, + { + "ph": "s", "id": 241668661, "pid": 5717, "tid": 5717, "ts": 6302685276638.828, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278942.543, "dur": 0.992, + "args": { + "External id": 124974, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668671, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668671, "pid": 3, "tid": 7, "ts": 6302685278942.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276662.298, "dur": 5.480, + "args": { + "External id": 124974, "cbid": 211, "correlation": 241668671 + } + }, + { + "ph": "s", "id": 241668671, "pid": 5717, "tid": 5717, "ts": 6302685276662.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278944.143, "dur": 0.992, + "args": { + "External id": 124975, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668677, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668677, "pid": 3, "tid": 7, "ts": 6302685278944.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276675.928, "dur": 5.580, + "args": { + "External id": 124975, "cbid": 211, "correlation": 241668677 + } + }, + { + "ph": "s", "id": 241668677, "pid": 5717, "tid": 5717, "ts": 6302685276675.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278945.839, "dur": 0.992, + "args": { + "External id": 124976, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668687, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668687, "pid": 3, "tid": 7, "ts": 6302685278945.839, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276692.638, "dur": 4.590, + "args": { + "External id": 124976, "cbid": 211, "correlation": 241668687 + } + }, + { + "ph": "s", "id": 241668687, "pid": 5717, "tid": 5717, "ts": 6302685276692.638, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278947.535, "dur": 1.024, + "args": { + "External id": 124977, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668693, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668693, "pid": 3, "tid": 7, "ts": 6302685278947.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276703.308, "dur": 4.370, + "args": { + "External id": 124977, "cbid": 211, "correlation": 241668693 + } + }, + { + "ph": "s", "id": 241668693, "pid": 5717, "tid": 5717, "ts": 6302685276703.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685278949.167, "dur": 3.232, + "args": { + "External id": 124978, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668706, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668706, "pid": 3, "tid": 7, "ts": 6302685278949.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276724.688, "dur": 6.020, + "args": { + "External id": 124978, "cbid": 211, "correlation": 241668706 + } + }, + { + "ph": "s", "id": 241668706, "pid": 5717, "tid": 5717, "ts": 6302685276724.688, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278953.103, "dur": 1.248, + "args": { + "External id": 124981, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668712, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668712, "pid": 3, "tid": 7, "ts": 6302685278953.103, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276738.018, "dur": 4.410, + "args": { + "External id": 124981, "cbid": 211, "correlation": 241668712 + } + }, + { + "ph": "s", "id": 241668712, "pid": 5717, "tid": 5717, "ts": 6302685276738.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685278954.927, "dur": 0.992, + "args": { + "External id": 124982, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668718, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668718, "pid": 3, "tid": 7, "ts": 6302685278954.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276747.698, "dur": 4.000, + "args": { + "External id": 124982, "cbid": 211, "correlation": 241668718 + } + }, + { + "ph": "s", "id": 241668718, "pid": 5717, "tid": 5717, "ts": 6302685276747.698, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685278956.527, "dur": 233.250, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668732, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241668732, "pid": 3, "tid": 7, "ts": 6302685278956.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276841.188, "dur": 8.370, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241668732 + } + }, + { + "ph": "s", "id": 241668732, "pid": 5717, "tid": 5717, "ts": 6302685276841.188, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685276883.348, "dur": 0.600, + "args": { + "External id": 124986, "cbid": 200, "correlation": 241668755 + } + }, + { + "ph": "f", "id": 241668755, "pid": 5717, "tid": 5717, "ts": 6302685276883.348, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685279190.609, "dur": 0.800, + "args": { + "External id": 124986, "device": 3, "context": 1, "stream": 7, "correlation": 241668758, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241668758, "pid": 3, "tid": 7, "ts": 6302685279190.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685276885.758, "dur": 7.590, + "args": { + "External id": 124986, "cbid": 51, "correlation": 241668758 + } + }, + { + "ph": "s", "id": 241668758, "pid": 5717, "tid": 5717, "ts": 6302685276885.758, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685279192.561, "dur": 682.981, + "args": { + "External id": 124986, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668759, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668759, "pid": 3, "tid": 7, "ts": 6302685279192.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276893.598, "dur": 6.190, + "args": { + "External id": 124986, "cbid": 307, "correlation": 241668759 + } + }, + { + "ph": "s", "id": 241668759, "pid": 5717, "tid": 5717, "ts": 6302685276893.598, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685279876.214, "dur": 2.976, + "args": { + "External id": 124989, "device": 3, "context": 1, "stream": 7, "correlation": 241668764, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 241668764, "pid": 3, "tid": 7, "ts": 6302685279876.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685276925.298, "dur": 13.200, + "args": { + "External id": 124989, "cbid": 41, "correlation": 241668764 + } + }, + { + "ph": "s", "id": 241668764, "pid": 5717, "tid": 5717, "ts": 6302685276925.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685276979.638, "dur": 0.380, + "args": { + "External id": 124994, "cbid": 200, "correlation": 241668792 + } + }, + { + "ph": "f", "id": 241668792, "pid": 5717, "tid": 5717, "ts": 6302685276979.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685279879.830, "dur": 682.149, + "args": { + "External id": 124994, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241668795, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668795, "pid": 3, "tid": 7, "ts": 6302685279879.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685276981.587, "dur": 7.891, + "args": { + "External id": 124994, "cbid": 307, "correlation": 241668795 + } + }, + { + "ph": "s", "id": 241668795, "pid": 5717, "tid": 5717, "ts": 6302685276981.587, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685280562.715, "dur": 221.346, + "args": { + "External id": 124995, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668800, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241668800, "pid": 3, "tid": 7, "ts": 6302685280562.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277004.027, "dur": 6.300, + "args": { + "External id": 124995, "cbid": 211, "correlation": 241668800 + } + }, + { + "ph": "s", "id": 241668800, "pid": 5717, "tid": 5717, "ts": 6302685277004.027, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685277055.667, "dur": 1.351, + "args": { + "External id": 125003, "cbid": 210, "correlation": 241668826 + } + }, + { + "ph": "f", "id": 241668826, "pid": 5717, "tid": 5717, "ts": 6302685277055.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685280784.733, "dur": 626.949, + "args": { + "External id": 125003, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668827, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668827, "pid": 3, "tid": 7, "ts": 6302685280784.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277060.627, "dur": 7.830, + "args": { + "External id": 125003, "cbid": 211, "correlation": 241668827 + } + }, + { + "ph": "s", "id": 241668827, "pid": 5717, "tid": 5717, "ts": 6302685277060.627, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685281412.290, "dur": 171.425, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668846, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668846, "pid": 3, "tid": 7, "ts": 6302685281412.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277175.787, "dur": 9.070, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241668846 + } + }, + { + "ph": "s", "id": 241668846, "pid": 5717, "tid": 5717, "ts": 6302685277175.787, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685281584.419, "dur": 4.032, + "args": { + "External id": 125013, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668863, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668863, "pid": 3, "tid": 7, "ts": 6302685281584.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277335.797, "dur": 7.810, + "args": { + "External id": 125013, "cbid": 211, "correlation": 241668863 + } + }, + { + "ph": "s", "id": 241668863, "pid": 5717, "tid": 5717, "ts": 6302685277335.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281589.155, "dur": 1.184, + "args": { + "External id": 125018, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668880, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668880, "pid": 3, "tid": 7, "ts": 6302685281589.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277369.407, "dur": 5.530, + "args": { + "External id": 125018, "cbid": 211, "correlation": 241668880 + } + }, + { + "ph": "s", "id": 241668880, "pid": 5717, "tid": 5717, "ts": 6302685277369.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281591.011, "dur": 0.992, + "args": { + "External id": 125020, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668890, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668890, "pid": 3, "tid": 7, "ts": 6302685281591.011, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277389.227, "dur": 4.920, + "args": { + "External id": 125020, "cbid": 211, "correlation": 241668890 + } + }, + { + "ph": "s", "id": 241668890, "pid": 5717, "tid": 5717, "ts": 6302685277389.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281592.739, "dur": 0.992, + "args": { + "External id": 125021, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668896, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668896, "pid": 3, "tid": 7, "ts": 6302685281592.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277401.467, "dur": 4.619, + "args": { + "External id": 125021, "cbid": 211, "correlation": 241668896 + } + }, + { + "ph": "s", "id": 241668896, "pid": 5717, "tid": 5717, "ts": 6302685277401.467, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281594.435, "dur": 1.024, + "args": { + "External id": 125022, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668906, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668906, "pid": 3, "tid": 7, "ts": 6302685281594.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277416.597, "dur": 4.589, + "args": { + "External id": 125022, "cbid": 211, "correlation": 241668906 + } + }, + { + "ph": "s", "id": 241668906, "pid": 5717, "tid": 5717, "ts": 6302685277416.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281596.163, "dur": 0.992, + "args": { + "External id": 125023, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668912, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668912, "pid": 3, "tid": 7, "ts": 6302685281596.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277428.226, "dur": 4.280, + "args": { + "External id": 125023, "cbid": 211, "correlation": 241668912 + } + }, + { + "ph": "s", "id": 241668912, "pid": 5717, "tid": 5717, "ts": 6302685277428.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685281597.763, "dur": 3.264, + "args": { + "External id": 125024, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668925, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668925, "pid": 3, "tid": 7, "ts": 6302685281597.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277448.026, "dur": 5.171, + "args": { + "External id": 125024, "cbid": 211, "correlation": 241668925 + } + }, + { + "ph": "s", "id": 241668925, "pid": 5717, "tid": 5717, "ts": 6302685277448.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281601.699, "dur": 1.216, + "args": { + "External id": 125027, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668931, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668931, "pid": 3, "tid": 7, "ts": 6302685281601.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277459.557, "dur": 4.140, + "args": { + "External id": 125027, "cbid": 211, "correlation": 241668931 + } + }, + { + "ph": "s", "id": 241668931, "pid": 5717, "tid": 5717, "ts": 6302685277459.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685281603.555, "dur": 0.992, + "args": { + "External id": 125028, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668937, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241668937, "pid": 3, "tid": 7, "ts": 6302685281603.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277468.986, "dur": 4.091, + "args": { + "External id": 125028, "cbid": 211, "correlation": 241668937 + } + }, + { + "ph": "s", "id": 241668937, "pid": 5717, "tid": 5717, "ts": 6302685277468.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685281605.123, "dur": 232.930, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668951, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241668951, "pid": 3, "tid": 7, "ts": 6302685281605.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277562.366, "dur": 8.470, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241668951 + } + }, + { + "ph": "s", "id": 241668951, "pid": 5717, "tid": 5717, "ts": 6302685277562.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685277604.086, "dur": 0.590, + "args": { + "External id": 125032, "cbid": 200, "correlation": 241668974 + } + }, + { + "ph": "f", "id": 241668974, "pid": 5717, "tid": 5717, "ts": 6302685277604.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685281838.949, "dur": 0.800, + "args": { + "External id": 125032, "device": 3, "context": 1, "stream": 7, "correlation": 241668977, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241668977, "pid": 3, "tid": 7, "ts": 6302685281838.949, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685277606.506, "dur": 7.220, + "args": { + "External id": 125032, "cbid": 51, "correlation": 241668977 + } + }, + { + "ph": "s", "id": 241668977, "pid": 5717, "tid": 5717, "ts": 6302685277606.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685281840.901, "dur": 681.157, + "args": { + "External id": 125032, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241668978, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241668978, "pid": 3, "tid": 7, "ts": 6302685281840.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277613.986, "dur": 5.990, + "args": { + "External id": 125032, "cbid": 307, "correlation": 241668978 + } + }, + { + "ph": "s", "id": 241668978, "pid": 5717, "tid": 5717, "ts": 6302685277613.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685282522.794, "dur": 3.040, + "args": { + "External id": 125035, "device": 3, "context": 1, "stream": 7, "correlation": 241668983, "bytes": 3145728, "memory bandwidth (GB/s)": 1034.778947368421 + } + }, + { + "ph": "f", "id": 241668983, "pid": 3, "tid": 7, "ts": 6302685282522.794, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685277643.276, "dur": 12.810, + "args": { + "External id": 125035, "cbid": 41, "correlation": 241668983 + } + }, + { + "ph": "s", "id": 241668983, "pid": 5717, "tid": 5717, "ts": 6302685277643.276, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685277698.196, "dur": 0.420, + "args": { + "External id": 125040, "cbid": 200, "correlation": 241669011 + } + }, + { + "ph": "f", "id": 241669011, "pid": 5717, "tid": 5717, "ts": 6302685277698.196, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685282526.506, "dur": 683.365, + "args": { + "External id": 125040, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241669014, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669014, "pid": 3, "tid": 7, "ts": 6302685282526.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277700.206, "dur": 7.410, + "args": { + "External id": 125040, "cbid": 307, "correlation": 241669014 + } + }, + { + "ph": "s", "id": 241669014, "pid": 5717, "tid": 5717, "ts": 6302685277700.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685283210.543, "dur": 221.218, + "args": { + "External id": 125041, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669019, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241669019, "pid": 3, "tid": 7, "ts": 6302685283210.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277721.886, "dur": 6.180, + "args": { + "External id": 125041, "cbid": 211, "correlation": 241669019 + } + }, + { + "ph": "s", "id": 241669019, "pid": 5717, "tid": 5717, "ts": 6302685277721.886, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685277772.726, "dur": 1.320, + "args": { + "External id": 125049, "cbid": 210, "correlation": 241669045 + } + }, + { + "ph": "f", "id": 241669045, "pid": 5717, "tid": 5717, "ts": 6302685277772.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685283432.401, "dur": 629.669, + "args": { + "External id": 125049, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669046, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669046, "pid": 3, "tid": 7, "ts": 6302685283432.401, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277777.576, "dur": 7.470, + "args": { + "External id": 125049, "cbid": 211, "correlation": 241669046 + } + }, + { + "ph": "s", "id": 241669046, "pid": 5717, "tid": 5717, "ts": 6302685277777.576, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685284062.742, "dur": 170.497, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669065, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669065, "pid": 3, "tid": 7, "ts": 6302685284062.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277918.745, "dur": 9.980, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669065 + } + }, + { + "ph": "s", "id": 241669065, "pid": 5717, "tid": 5717, "ts": 6302685277918.745, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685284233.943, "dur": 4.032, + "args": { + "External id": 125059, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669082, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669082, "pid": 3, "tid": 7, "ts": 6302685284233.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277964.735, "dur": 7.060, + "args": { + "External id": 125059, "cbid": 211, "correlation": 241669082 + } + }, + { + "ph": "s", "id": 241669082, "pid": 5717, "tid": 5717, "ts": 6302685277964.735, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284238.615, "dur": 1.152, + "args": { + "External id": 125064, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669099, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669099, "pid": 3, "tid": 7, "ts": 6302685284238.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685277997.705, "dur": 5.490, + "args": { + "External id": 125064, "cbid": 211, "correlation": 241669099 + } + }, + { + "ph": "s", "id": 241669099, "pid": 5717, "tid": 5717, "ts": 6302685277997.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284240.471, "dur": 0.992, + "args": { + "External id": 125066, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669109, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669109, "pid": 3, "tid": 7, "ts": 6302685284240.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278018.505, "dur": 4.800, + "args": { + "External id": 125066, "cbid": 211, "correlation": 241669109 + } + }, + { + "ph": "s", "id": 241669109, "pid": 5717, "tid": 5717, "ts": 6302685278018.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284242.167, "dur": 1.024, + "args": { + "External id": 125067, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669115, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669115, "pid": 3, "tid": 7, "ts": 6302685284242.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278031.065, "dur": 5.050, + "args": { + "External id": 125067, "cbid": 211, "correlation": 241669115 + } + }, + { + "ph": "s", "id": 241669115, "pid": 5717, "tid": 5717, "ts": 6302685278031.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284243.863, "dur": 1.024, + "args": { + "External id": 125068, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669125, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669125, "pid": 3, "tid": 7, "ts": 6302685284243.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278046.855, "dur": 4.420, + "args": { + "External id": 125068, "cbid": 211, "correlation": 241669125 + } + }, + { + "ph": "s", "id": 241669125, "pid": 5717, "tid": 5717, "ts": 6302685278046.855, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284245.591, "dur": 0.992, + "args": { + "External id": 125069, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669131, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669131, "pid": 3, "tid": 7, "ts": 6302685284245.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278057.275, "dur": 4.140, + "args": { + "External id": 125069, "cbid": 211, "correlation": 241669131 + } + }, + { + "ph": "s", "id": 241669131, "pid": 5717, "tid": 5717, "ts": 6302685278057.275, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685284247.191, "dur": 3.264, + "args": { + "External id": 125070, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669144, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669144, "pid": 3, "tid": 7, "ts": 6302685284247.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278078.015, "dur": 5.350, + "args": { + "External id": 125070, "cbid": 211, "correlation": 241669144 + } + }, + { + "ph": "s", "id": 241669144, "pid": 5717, "tid": 5717, "ts": 6302685278078.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284251.127, "dur": 1.248, + "args": { + "External id": 125073, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669150, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669150, "pid": 3, "tid": 7, "ts": 6302685284251.127, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278090.055, "dur": 4.210, + "args": { + "External id": 125073, "cbid": 211, "correlation": 241669150 + } + }, + { + "ph": "s", "id": 241669150, "pid": 5717, "tid": 5717, "ts": 6302685278090.055, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685284252.983, "dur": 0.960, + "args": { + "External id": 125074, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669156, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669156, "pid": 3, "tid": 7, "ts": 6302685284252.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278100.585, "dur": 4.120, + "args": { + "External id": 125074, "cbid": 211, "correlation": 241669156 + } + }, + { + "ph": "s", "id": 241669156, "pid": 5717, "tid": 5717, "ts": 6302685278100.585, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685284254.551, "dur": 233.506, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669170, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241669170, "pid": 3, "tid": 7, "ts": 6302685284254.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278191.065, "dur": 8.480, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669170 + } + }, + { + "ph": "s", "id": 241669170, "pid": 5717, "tid": 5717, "ts": 6302685278191.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685278232.845, "dur": 0.570, + "args": { + "External id": 125078, "cbid": 200, "correlation": 241669193 + } + }, + { + "ph": "f", "id": 241669193, "pid": 5717, "tid": 5717, "ts": 6302685278232.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685284488.889, "dur": 0.768, + "args": { + "External id": 125078, "device": 3, "context": 1, "stream": 7, "correlation": 241669196, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241669196, "pid": 3, "tid": 7, "ts": 6302685284488.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685278235.215, "dur": 6.970, + "args": { + "External id": 125078, "cbid": 51, "correlation": 241669196 + } + }, + { + "ph": "s", "id": 241669196, "pid": 5717, "tid": 5717, "ts": 6302685278235.215, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685284490.425, "dur": 681.253, + "args": { + "External id": 125078, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669197, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669197, "pid": 3, "tid": 7, "ts": 6302685284490.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278242.425, "dur": 6.140, + "args": { + "External id": 125078, "cbid": 307, "correlation": 241669197 + } + }, + { + "ph": "s", "id": 241669197, "pid": 5717, "tid": 5717, "ts": 6302685278242.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685285172.382, "dur": 2.912, + "args": { + "External id": 125081, "device": 3, "context": 1, "stream": 7, "correlation": 241669202, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241669202, "pid": 3, "tid": 7, "ts": 6302685285172.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685278272.795, "dur": 13.369, + "args": { + "External id": 125081, "cbid": 41, "correlation": 241669202 + } + }, + { + "ph": "s", "id": 241669202, "pid": 5717, "tid": 5717, "ts": 6302685278272.795, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685278337.895, "dur": 0.440, + "args": { + "External id": 125086, "cbid": 200, "correlation": 241669230 + } + }, + { + "ph": "f", "id": 241669230, "pid": 5717, "tid": 5717, "ts": 6302685278337.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685285175.966, "dur": 681.989, + "args": { + "External id": 125086, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241669233, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669233, "pid": 3, "tid": 7, "ts": 6302685285175.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278339.895, "dur": 8.169, + "args": { + "External id": 125086, "cbid": 307, "correlation": 241669233 + } + }, + { + "ph": "s", "id": 241669233, "pid": 5717, "tid": 5717, "ts": 6302685278339.895, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685285858.659, "dur": 221.730, + "args": { + "External id": 125087, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669238, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241669238, "pid": 3, "tid": 7, "ts": 6302685285858.659, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278362.804, "dur": 6.260, + "args": { + "External id": 125087, "cbid": 211, "correlation": 241669238 + } + }, + { + "ph": "s", "id": 241669238, "pid": 5717, "tid": 5717, "ts": 6302685278362.804, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685278415.124, "dur": 1.410, + "args": { + "External id": 125095, "cbid": 210, "correlation": 241669264 + } + }, + { + "ph": "f", "id": 241669264, "pid": 5717, "tid": 5717, "ts": 6302685278415.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685286081.093, "dur": 628.196, + "args": { + "External id": 125095, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669265, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669265, "pid": 3, "tid": 7, "ts": 6302685286081.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278420.054, "dur": 7.780, + "args": { + "External id": 125095, "cbid": 211, "correlation": 241669265 + } + }, + { + "ph": "s", "id": 241669265, "pid": 5717, "tid": 5717, "ts": 6302685278420.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685286709.897, "dur": 171.618, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669284, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669284, "pid": 3, "tid": 7, "ts": 6302685286709.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278533.584, "dur": 9.390, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669284 + } + }, + { + "ph": "s", "id": 241669284, "pid": 5717, "tid": 5717, "ts": 6302685278533.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685286882.155, "dur": 4.000, + "args": { + "External id": 125105, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669301, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669301, "pid": 3, "tid": 7, "ts": 6302685286882.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278577.444, "dur": 7.040, + "args": { + "External id": 125105, "cbid": 211, "correlation": 241669301 + } + }, + { + "ph": "s", "id": 241669301, "pid": 5717, "tid": 5717, "ts": 6302685278577.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286886.795, "dur": 1.184, + "args": { + "External id": 125110, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669318, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669318, "pid": 3, "tid": 7, "ts": 6302685286886.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278608.904, "dur": 5.270, + "args": { + "External id": 125110, "cbid": 211, "correlation": 241669318 + } + }, + { + "ph": "s", "id": 241669318, "pid": 5717, "tid": 5717, "ts": 6302685278608.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286888.651, "dur": 0.992, + "args": { + "External id": 125112, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669328, "pid": 3, "tid": 7, "ts": 6302685286888.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278628.104, "dur": 4.820, + "args": { + "External id": 125112, "cbid": 211, "correlation": 241669328 + } + }, + { + "ph": "s", "id": 241669328, "pid": 5717, "tid": 5717, "ts": 6302685278628.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286890.347, "dur": 1.024, + "args": { + "External id": 125113, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669334, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669334, "pid": 3, "tid": 7, "ts": 6302685286890.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278642.314, "dur": 4.760, + "args": { + "External id": 125113, "cbid": 211, "correlation": 241669334 + } + }, + { + "ph": "s", "id": 241669334, "pid": 5717, "tid": 5717, "ts": 6302685278642.314, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286892.075, "dur": 1.024, + "args": { + "External id": 125114, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669344, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669344, "pid": 3, "tid": 7, "ts": 6302685286892.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278657.814, "dur": 4.440, + "args": { + "External id": 125114, "cbid": 211, "correlation": 241669344 + } + }, + { + "ph": "s", "id": 241669344, "pid": 5717, "tid": 5717, "ts": 6302685278657.814, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286893.771, "dur": 0.992, + "args": { + "External id": 125115, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669350, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669350, "pid": 3, "tid": 7, "ts": 6302685286893.771, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278668.064, "dur": 4.290, + "args": { + "External id": 125115, "cbid": 211, "correlation": 241669350 + } + }, + { + "ph": "s", "id": 241669350, "pid": 5717, "tid": 5717, "ts": 6302685278668.064, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685286895.371, "dur": 3.296, + "args": { + "External id": 125116, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669363, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669363, "pid": 3, "tid": 7, "ts": 6302685286895.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278688.314, "dur": 5.130, + "args": { + "External id": 125116, "cbid": 211, "correlation": 241669363 + } + }, + { + "ph": "s", "id": 241669363, "pid": 5717, "tid": 5717, "ts": 6302685278688.314, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286899.307, "dur": 1.248, + "args": { + "External id": 125119, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669369, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669369, "pid": 3, "tid": 7, "ts": 6302685286899.307, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278699.804, "dur": 4.380, + "args": { + "External id": 125119, "cbid": 211, "correlation": 241669369 + } + }, + { + "ph": "s", "id": 241669369, "pid": 5717, "tid": 5717, "ts": 6302685278699.804, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685286901.163, "dur": 0.992, + "args": { + "External id": 125120, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669375, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669375, "pid": 3, "tid": 7, "ts": 6302685286901.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278712.404, "dur": 4.930, + "args": { + "External id": 125120, "cbid": 211, "correlation": 241669375 + } + }, + { + "ph": "s", "id": 241669375, "pid": 5717, "tid": 5717, "ts": 6302685278712.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685286902.763, "dur": 233.602, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669389, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241669389, "pid": 3, "tid": 7, "ts": 6302685286902.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278812.974, "dur": 8.969, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669389 + } + }, + { + "ph": "s", "id": 241669389, "pid": 5717, "tid": 5717, "ts": 6302685278812.974, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685278856.303, "dur": 0.570, + "args": { + "External id": 125124, "cbid": 200, "correlation": 241669412 + } + }, + { + "ph": "f", "id": 241669412, "pid": 5717, "tid": 5717, "ts": 6302685278856.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685287137.165, "dur": 0.800, + "args": { + "External id": 125124, "device": 3, "context": 1, "stream": 7, "correlation": 241669415, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241669415, "pid": 3, "tid": 7, "ts": 6302685287137.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685278858.753, "dur": 7.010, + "args": { + "External id": 125124, "cbid": 51, "correlation": 241669415 + } + }, + { + "ph": "s", "id": 241669415, "pid": 5717, "tid": 5717, "ts": 6302685278858.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685287138.733, "dur": 683.525, + "args": { + "External id": 125124, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669416, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669416, "pid": 3, "tid": 7, "ts": 6302685287138.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278866.013, "dur": 6.190, + "args": { + "External id": 125124, "cbid": 307, "correlation": 241669416 + } + }, + { + "ph": "s", "id": 241669416, "pid": 5717, "tid": 5717, "ts": 6302685278866.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685287822.994, "dur": 2.944, + "args": { + "External id": 125127, "device": 3, "context": 1, "stream": 7, "correlation": 241669421, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 241669421, "pid": 3, "tid": 7, "ts": 6302685287822.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685278901.643, "dur": 18.920, + "args": { + "External id": 125127, "cbid": 41, "correlation": 241669421 + } + }, + { + "ph": "s", "id": 241669421, "pid": 5717, "tid": 5717, "ts": 6302685278901.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685278988.833, "dur": 0.470, + "args": { + "External id": 125132, "cbid": 200, "correlation": 241669449 + } + }, + { + "ph": "f", "id": 241669449, "pid": 5717, "tid": 5717, "ts": 6302685278988.833, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685287826.610, "dur": 687.365, + "args": { + "External id": 125132, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241669452, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669452, "pid": 3, "tid": 7, "ts": 6302685287826.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685278992.183, "dur": 10.380, + "args": { + "External id": 125132, "cbid": 307, "correlation": 241669452 + } + }, + { + "ph": "s", "id": 241669452, "pid": 5717, "tid": 5717, "ts": 6302685278992.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685288514.711, "dur": 220.674, + "args": { + "External id": 125133, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669457, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241669457, "pid": 3, "tid": 7, "ts": 6302685288514.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279018.973, "dur": 6.200, + "args": { + "External id": 125133, "cbid": 211, "correlation": 241669457 + } + }, + { + "ph": "s", "id": 241669457, "pid": 5717, "tid": 5717, "ts": 6302685279018.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685279093.823, "dur": 2.680, + "args": { + "External id": 125141, "cbid": 210, "correlation": 241669483 + } + }, + { + "ph": "f", "id": 241669483, "pid": 5717, "tid": 5717, "ts": 6302685279093.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685288736.089, "dur": 631.237, + "args": { + "External id": 125141, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669484, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669484, "pid": 3, "tid": 7, "ts": 6302685288736.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279101.183, "dur": 13.410, + "args": { + "External id": 125141, "cbid": 211, "correlation": 241669484 + } + }, + { + "ph": "s", "id": 241669484, "pid": 5717, "tid": 5717, "ts": 6302685279101.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685289368.030, "dur": 170.721, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669503, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669503, "pid": 3, "tid": 7, "ts": 6302685289368.030, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279231.322, "dur": 9.220, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669503 + } + }, + { + "ph": "s", "id": 241669503, "pid": 5717, "tid": 5717, "ts": 6302685279231.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685289539.359, "dur": 4.096, + "args": { + "External id": 125151, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669520, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669520, "pid": 3, "tid": 7, "ts": 6302685289539.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279275.752, "dur": 7.300, + "args": { + "External id": 125151, "cbid": 211, "correlation": 241669520 + } + }, + { + "ph": "s", "id": 241669520, "pid": 5717, "tid": 5717, "ts": 6302685279275.752, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289544.095, "dur": 1.184, + "args": { + "External id": 125156, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669537, "pid": 3, "tid": 7, "ts": 6302685289544.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279316.782, "dur": 6.540, + "args": { + "External id": 125156, "cbid": 211, "correlation": 241669537 + } + }, + { + "ph": "s", "id": 241669537, "pid": 5717, "tid": 5717, "ts": 6302685279316.782, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289545.983, "dur": 0.960, + "args": { + "External id": 125158, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669547, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669547, "pid": 3, "tid": 7, "ts": 6302685289545.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279338.042, "dur": 5.050, + "args": { + "External id": 125158, "cbid": 211, "correlation": 241669547 + } + }, + { + "ph": "s", "id": 241669547, "pid": 5717, "tid": 5717, "ts": 6302685279338.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289547.679, "dur": 0.992, + "args": { + "External id": 125159, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669553, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669553, "pid": 3, "tid": 7, "ts": 6302685289547.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279350.662, "dur": 4.730, + "args": { + "External id": 125159, "cbid": 211, "correlation": 241669553 + } + }, + { + "ph": "s", "id": 241669553, "pid": 5717, "tid": 5717, "ts": 6302685279350.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289549.375, "dur": 1.024, + "args": { + "External id": 125160, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669563, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669563, "pid": 3, "tid": 7, "ts": 6302685289549.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279365.592, "dur": 4.400, + "args": { + "External id": 125160, "cbid": 211, "correlation": 241669563 + } + }, + { + "ph": "s", "id": 241669563, "pid": 5717, "tid": 5717, "ts": 6302685279365.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289551.103, "dur": 0.992, + "args": { + "External id": 125161, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669569, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669569, "pid": 3, "tid": 7, "ts": 6302685289551.103, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279375.872, "dur": 4.310, + "args": { + "External id": 125161, "cbid": 211, "correlation": 241669569 + } + }, + { + "ph": "s", "id": 241669569, "pid": 5717, "tid": 5717, "ts": 6302685279375.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685289552.703, "dur": 3.264, + "args": { + "External id": 125162, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669582, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669582, "pid": 3, "tid": 7, "ts": 6302685289552.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279397.772, "dur": 4.940, + "args": { + "External id": 125162, "cbid": 211, "correlation": 241669582 + } + }, + { + "ph": "s", "id": 241669582, "pid": 5717, "tid": 5717, "ts": 6302685279397.772, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289556.639, "dur": 1.248, + "args": { + "External id": 125165, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669588, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669588, "pid": 3, "tid": 7, "ts": 6302685289556.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279409.142, "dur": 4.390, + "args": { + "External id": 125165, "cbid": 211, "correlation": 241669588 + } + }, + { + "ph": "s", "id": 241669588, "pid": 5717, "tid": 5717, "ts": 6302685279409.142, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685289558.495, "dur": 0.960, + "args": { + "External id": 125166, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669594, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669594, "pid": 3, "tid": 7, "ts": 6302685289558.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279418.632, "dur": 3.830, + "args": { + "External id": 125166, "cbid": 211, "correlation": 241669594 + } + }, + { + "ph": "s", "id": 241669594, "pid": 5717, "tid": 5717, "ts": 6302685279418.632, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685289560.063, "dur": 234.498, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669608, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241669608, "pid": 3, "tid": 7, "ts": 6302685289560.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279508.792, "dur": 8.580, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669608 + } + }, + { + "ph": "s", "id": 241669608, "pid": 5717, "tid": 5717, "ts": 6302685279508.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685279551.642, "dur": 0.590, + "args": { + "External id": 125170, "cbid": 200, "correlation": 241669631 + } + }, + { + "ph": "f", "id": 241669631, "pid": 5717, "tid": 5717, "ts": 6302685279551.642, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685289795.361, "dur": 0.800, + "args": { + "External id": 125170, "device": 3, "context": 1, "stream": 7, "correlation": 241669634, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241669634, "pid": 3, "tid": 7, "ts": 6302685289795.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685279554.092, "dur": 6.890, + "args": { + "External id": 125170, "cbid": 51, "correlation": 241669634 + } + }, + { + "ph": "s", "id": 241669634, "pid": 5717, "tid": 5717, "ts": 6302685279554.092, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685289796.929, "dur": 685.157, + "args": { + "External id": 125170, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669635, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669635, "pid": 3, "tid": 7, "ts": 6302685289796.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279561.242, "dur": 6.350, + "args": { + "External id": 125170, "cbid": 307, "correlation": 241669635 + } + }, + { + "ph": "s", "id": 241669635, "pid": 5717, "tid": 5717, "ts": 6302685279561.242, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685290482.694, "dur": 2.944, + "args": { + "External id": 125173, "device": 3, "context": 1, "stream": 7, "correlation": 241669640, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 241669640, "pid": 3, "tid": 7, "ts": 6302685290482.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685279591.412, "dur": 13.060, + "args": { + "External id": 125173, "cbid": 41, "correlation": 241669640 + } + }, + { + "ph": "s", "id": 241669640, "pid": 5717, "tid": 5717, "ts": 6302685279591.412, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685279645.792, "dur": 0.449, + "args": { + "External id": 125178, "cbid": 200, "correlation": 241669668 + } + }, + { + "ph": "f", "id": 241669668, "pid": 5717, "tid": 5717, "ts": 6302685279645.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685290486.278, "dur": 683.973, + "args": { + "External id": 125178, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241669671, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669671, "pid": 3, "tid": 7, "ts": 6302685290486.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279647.881, "dur": 7.491, + "args": { + "External id": 125178, "cbid": 307, "correlation": 241669671 + } + }, + { + "ph": "s", "id": 241669671, "pid": 5717, "tid": 5717, "ts": 6302685279647.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685291170.859, "dur": 221.474, + "args": { + "External id": 125179, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669676, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241669676, "pid": 3, "tid": 7, "ts": 6302685291170.859, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279669.501, "dur": 6.140, + "args": { + "External id": 125179, "cbid": 211, "correlation": 241669676 + } + }, + { + "ph": "s", "id": 241669676, "pid": 5717, "tid": 5717, "ts": 6302685279669.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685279720.651, "dur": 1.270, + "args": { + "External id": 125187, "cbid": 210, "correlation": 241669702 + } + }, + { + "ph": "f", "id": 241669702, "pid": 5717, "tid": 5717, "ts": 6302685279720.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685291392.973, "dur": 630.789, + "args": { + "External id": 125187, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669703, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669703, "pid": 3, "tid": 7, "ts": 6302685291392.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279725.471, "dur": 7.680, + "args": { + "External id": 125187, "cbid": 211, "correlation": 241669703 + } + }, + { + "ph": "s", "id": 241669703, "pid": 5717, "tid": 5717, "ts": 6302685279725.471, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685292024.402, "dur": 170.945, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669722, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669722, "pid": 3, "tid": 7, "ts": 6302685292024.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279835.811, "dur": 9.040, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669722 + } + }, + { + "ph": "s", "id": 241669722, "pid": 5717, "tid": 5717, "ts": 6302685279835.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685292196.051, "dur": 3.936, + "args": { + "External id": 125197, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669739, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669739, "pid": 3, "tid": 7, "ts": 6302685292196.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279883.901, "dur": 7.980, + "args": { + "External id": 125197, "cbid": 211, "correlation": 241669739 + } + }, + { + "ph": "s", "id": 241669739, "pid": 5717, "tid": 5717, "ts": 6302685279883.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292200.691, "dur": 1.184, + "args": { + "External id": 125202, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669756, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669756, "pid": 3, "tid": 7, "ts": 6302685292200.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279931.091, "dur": 8.340, + "args": { + "External id": 125202, "cbid": 211, "correlation": 241669756 + } + }, + { + "ph": "s", "id": 241669756, "pid": 5717, "tid": 5717, "ts": 6302685279931.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292202.547, "dur": 0.992, + "args": { + "External id": 125204, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669766, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669766, "pid": 3, "tid": 7, "ts": 6302685292202.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279957.731, "dur": 5.430, + "args": { + "External id": 125204, "cbid": 211, "correlation": 241669766 + } + }, + { + "ph": "s", "id": 241669766, "pid": 5717, "tid": 5717, "ts": 6302685279957.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292204.243, "dur": 1.024, + "args": { + "External id": 125205, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669772, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669772, "pid": 3, "tid": 7, "ts": 6302685292204.243, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279970.841, "dur": 5.060, + "args": { + "External id": 125205, "cbid": 211, "correlation": 241669772 + } + }, + { + "ph": "s", "id": 241669772, "pid": 5717, "tid": 5717, "ts": 6302685279970.841, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292205.939, "dur": 1.024, + "args": { + "External id": 125206, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669782, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669782, "pid": 3, "tid": 7, "ts": 6302685292205.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279986.271, "dur": 4.660, + "args": { + "External id": 125206, "cbid": 211, "correlation": 241669782 + } + }, + { + "ph": "s", "id": 241669782, "pid": 5717, "tid": 5717, "ts": 6302685279986.271, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292207.667, "dur": 0.992, + "args": { + "External id": 125207, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669788, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669788, "pid": 3, "tid": 7, "ts": 6302685292207.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685279997.761, "dur": 4.360, + "args": { + "External id": 125207, "cbid": 211, "correlation": 241669788 + } + }, + { + "ph": "s", "id": 241669788, "pid": 5717, "tid": 5717, "ts": 6302685279997.761, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685292209.267, "dur": 3.264, + "args": { + "External id": 125208, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669801, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669801, "pid": 3, "tid": 7, "ts": 6302685292209.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280018.791, "dur": 5.180, + "args": { + "External id": 125208, "cbid": 211, "correlation": 241669801 + } + }, + { + "ph": "s", "id": 241669801, "pid": 5717, "tid": 5717, "ts": 6302685280018.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292213.203, "dur": 1.248, + "args": { + "External id": 125211, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669807, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669807, "pid": 3, "tid": 7, "ts": 6302685292213.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280029.931, "dur": 4.270, + "args": { + "External id": 125211, "cbid": 211, "correlation": 241669807 + } + }, + { + "ph": "s", "id": 241669807, "pid": 5717, "tid": 5717, "ts": 6302685280029.931, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685292215.059, "dur": 0.992, + "args": { + "External id": 125212, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669813, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669813, "pid": 3, "tid": 7, "ts": 6302685292215.059, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280039.481, "dur": 4.020, + "args": { + "External id": 125212, "cbid": 211, "correlation": 241669813 + } + }, + { + "ph": "s", "id": 241669813, "pid": 5717, "tid": 5717, "ts": 6302685280039.481, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685292216.627, "dur": 233.378, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669827, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241669827, "pid": 3, "tid": 7, "ts": 6302685292216.627, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280133.471, "dur": 8.529, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669827 + } + }, + { + "ph": "s", "id": 241669827, "pid": 5717, "tid": 5717, "ts": 6302685280133.471, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685280179.020, "dur": 0.580, + "args": { + "External id": 125216, "cbid": 200, "correlation": 241669850 + } + }, + { + "ph": "f", "id": 241669850, "pid": 5717, "tid": 5717, "ts": 6302685280179.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685292450.805, "dur": 0.800, + "args": { + "External id": 125216, "device": 3, "context": 1, "stream": 7, "correlation": 241669853, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241669853, "pid": 3, "tid": 7, "ts": 6302685292450.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685280181.490, "dur": 7.390, + "args": { + "External id": 125216, "cbid": 51, "correlation": 241669853 + } + }, + { + "ph": "s", "id": 241669853, "pid": 5717, "tid": 5717, "ts": 6302685280181.490, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685292452.757, "dur": 681.285, + "args": { + "External id": 125216, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669854, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669854, "pid": 3, "tid": 7, "ts": 6302685292452.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280189.130, "dur": 6.650, + "args": { + "External id": 125216, "cbid": 307, "correlation": 241669854 + } + }, + { + "ph": "s", "id": 241669854, "pid": 5717, "tid": 5717, "ts": 6302685280189.130, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685293134.746, "dur": 2.912, + "args": { + "External id": 125219, "device": 3, "context": 1, "stream": 7, "correlation": 241669859, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241669859, "pid": 3, "tid": 7, "ts": 6302685293134.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685280221.190, "dur": 12.320, + "args": { + "External id": 125219, "cbid": 41, "correlation": 241669859 + } + }, + { + "ph": "s", "id": 241669859, "pid": 5717, "tid": 5717, "ts": 6302685280221.190, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685280274.980, "dur": 0.400, + "args": { + "External id": 125224, "cbid": 200, "correlation": 241669887 + } + }, + { + "ph": "f", "id": 241669887, "pid": 5717, "tid": 5717, "ts": 6302685280274.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685293138.330, "dur": 684.133, + "args": { + "External id": 125224, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241669890, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669890, "pid": 3, "tid": 7, "ts": 6302685293138.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280276.960, "dur": 7.320, + "args": { + "External id": 125224, "cbid": 307, "correlation": 241669890 + } + }, + { + "ph": "s", "id": 241669890, "pid": 5717, "tid": 5717, "ts": 6302685280276.960, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685293823.135, "dur": 220.930, + "args": { + "External id": 125225, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669895, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241669895, "pid": 3, "tid": 7, "ts": 6302685293823.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280319.670, "dur": 7.040, + "args": { + "External id": 125225, "cbid": 211, "correlation": 241669895 + } + }, + { + "ph": "s", "id": 241669895, "pid": 5717, "tid": 5717, "ts": 6302685280319.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685280376.610, "dur": 1.430, + "args": { + "External id": 125233, "cbid": 210, "correlation": 241669921 + } + }, + { + "ph": "f", "id": 241669921, "pid": 5717, "tid": 5717, "ts": 6302685280376.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685294044.705, "dur": 630.149, + "args": { + "External id": 125233, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669922, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669922, "pid": 3, "tid": 7, "ts": 6302685294044.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280381.540, "dur": 8.080, + "args": { + "External id": 125233, "cbid": 211, "correlation": 241669922 + } + }, + { + "ph": "s", "id": 241669922, "pid": 5717, "tid": 5717, "ts": 6302685280381.540, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685294675.526, "dur": 170.721, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669941, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241669941, "pid": 3, "tid": 7, "ts": 6302685294675.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280494.979, "dur": 8.880, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241669941 + } + }, + { + "ph": "s", "id": 241669941, "pid": 5717, "tid": 5717, "ts": 6302685280494.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685294846.887, "dur": 3.968, + "args": { + "External id": 125243, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669958, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669958, "pid": 3, "tid": 7, "ts": 6302685294846.887, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280539.830, "dur": 7.009, + "args": { + "External id": 125243, "cbid": 211, "correlation": 241669958 + } + }, + { + "ph": "s", "id": 241669958, "pid": 5717, "tid": 5717, "ts": 6302685280539.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294851.463, "dur": 1.184, + "args": { + "External id": 125248, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669975, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669975, "pid": 3, "tid": 7, "ts": 6302685294851.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280570.899, "dur": 5.571, + "args": { + "External id": 125248, "cbid": 211, "correlation": 241669975 + } + }, + { + "ph": "s", "id": 241669975, "pid": 5717, "tid": 5717, "ts": 6302685280570.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294853.319, "dur": 0.992, + "args": { + "External id": 125250, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669985, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669985, "pid": 3, "tid": 7, "ts": 6302685294853.319, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280591.699, "dur": 4.850, + "args": { + "External id": 125250, "cbid": 211, "correlation": 241669985 + } + }, + { + "ph": "s", "id": 241669985, "pid": 5717, "tid": 5717, "ts": 6302685280591.699, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294855.047, "dur": 1.024, + "args": { + "External id": 125251, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241669991, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241669991, "pid": 3, "tid": 7, "ts": 6302685294855.047, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280603.689, "dur": 4.650, + "args": { + "External id": 125251, "cbid": 211, "correlation": 241669991 + } + }, + { + "ph": "s", "id": 241669991, "pid": 5717, "tid": 5717, "ts": 6302685280603.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294856.743, "dur": 1.024, + "args": { + "External id": 125252, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670001, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670001, "pid": 3, "tid": 7, "ts": 6302685294856.743, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280618.819, "dur": 4.720, + "args": { + "External id": 125252, "cbid": 211, "correlation": 241670001 + } + }, + { + "ph": "s", "id": 241670001, "pid": 5717, "tid": 5717, "ts": 6302685280618.819, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294858.471, "dur": 0.992, + "args": { + "External id": 125253, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670007, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670007, "pid": 3, "tid": 7, "ts": 6302685294858.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280629.319, "dur": 4.170, + "args": { + "External id": 125253, "cbid": 211, "correlation": 241670007 + } + }, + { + "ph": "s", "id": 241670007, "pid": 5717, "tid": 5717, "ts": 6302685280629.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685294860.071, "dur": 3.232, + "args": { + "External id": 125254, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670020, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670020, "pid": 3, "tid": 7, "ts": 6302685294860.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280648.859, "dur": 4.960, + "args": { + "External id": 125254, "cbid": 211, "correlation": 241670020 + } + }, + { + "ph": "s", "id": 241670020, "pid": 5717, "tid": 5717, "ts": 6302685280648.859, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294863.879, "dur": 1.248, + "args": { + "External id": 125257, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670026, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670026, "pid": 3, "tid": 7, "ts": 6302685294863.879, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280661.399, "dur": 4.250, + "args": { + "External id": 125257, "cbid": 211, "correlation": 241670026 + } + }, + { + "ph": "s", "id": 241670026, "pid": 5717, "tid": 5717, "ts": 6302685280661.399, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685294865.735, "dur": 0.960, + "args": { + "External id": 125258, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670032, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670032, "pid": 3, "tid": 7, "ts": 6302685294865.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280670.569, "dur": 3.880, + "args": { + "External id": 125258, "cbid": 211, "correlation": 241670032 + } + }, + { + "ph": "s", "id": 241670032, "pid": 5717, "tid": 5717, "ts": 6302685280670.569, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685294867.303, "dur": 234.658, + "args": { + "External id": 124874, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670046, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241670046, "pid": 3, "tid": 7, "ts": 6302685294867.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280791.029, "dur": 13.550, + "args": { + "External id": 124874, "cbid": 307, "correlation": 241670046 + } + }, + { + "ph": "s", "id": 241670046, "pid": 5717, "tid": 5717, "ts": 6302685280791.029, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685280857.579, "dur": 0.610, + "args": { + "External id": 125262, "cbid": 200, "correlation": 241670069 + } + }, + { + "ph": "f", "id": 241670069, "pid": 5717, "tid": 5717, "ts": 6302685280857.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685295102.761, "dur": 0.800, + "args": { + "External id": 125262, "device": 3, "context": 1, "stream": 7, "correlation": 241670072, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241670072, "pid": 3, "tid": 7, "ts": 6302685295102.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685280861.159, "dur": 12.150, + "args": { + "External id": 125262, "cbid": 51, "correlation": 241670072 + } + }, + { + "ph": "s", "id": 241670072, "pid": 5717, "tid": 5717, "ts": 6302685280861.159, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685295104.297, "dur": 681.637, + "args": { + "External id": 125262, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670073, "registers per thread": 94, "shared memory": 49152, "blocks per 
SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670073, "pid": 3, "tid": 7, "ts": 6302685295104.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685280873.549, "dur": 10.530, + "args": { + "External id": 125262, "cbid": 307, "correlation": 241670073 + } + }, + { + "ph": "s", "id": 241670073, "pid": 5717, "tid": 5717, "ts": 6302685280873.549, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685295786.606, "dur": 2.880, + "args": { + "External id": 125265, "device": 3, "context": 1, "stream": 7, "correlation": 241670078, "bytes": 3145728, "memory bandwidth (GB/s)": 1092.2666666666667 + } + }, + { + "ph": "f", "id": 241670078, "pid": 3, "tid": 7, "ts": 6302685295786.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685280923.769, "dur": 21.409, + "args": { + "External id": 125265, "cbid": 41, "correlation": 241670078 + } + }, + { + "ph": "s", "id": 241670078, "pid": 5717, "tid": 5717, "ts": 6302685280923.769, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685281008.469, "dur": 0.469, + "args": { + "External id": 125270, "cbid": 200, "correlation": 241670106 + } + }, + { + "ph": "f", "id": 241670106, "pid": 5717, "tid": 5717, "ts": 6302685281008.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685295790.190, "dur": 685.509, + "args": { + "External id": 125270, "queued": 0, 
"device": 3, "context": 1, "stream": 7, "correlation": 241670109, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670109, "pid": 3, "tid": 7, "ts": 6302685295790.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281011.778, "dur": 11.540, + "args": { + "External id": 125270, "cbid": 307, "correlation": 241670109 + } + }, + { + "ph": "s", "id": 241670109, "pid": 5717, "tid": 5717, "ts": 6302685281011.778, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685296476.435, "dur": 221.570, + "args": { + "External id": 125271, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670114, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670114, "pid": 3, "tid": 7, "ts": 6302685296476.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281047.658, "dur": 8.660, + "args": { + "External id": 125271, "cbid": 211, "correlation": 241670114 + } + }, + { + "ph": "s", "id": 241670114, "pid": 5717, "tid": 5717, "ts": 6302685281047.658, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685296698.709, "dur": 4.800, + "args": { + "External id": 125273, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670127, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670127, "pid": 3, "tid": 7, "ts": 6302685296698.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281083.548, "dur": 7.530, + "args": { + "External id": 125273, "cbid": 211, "correlation": 241670127 + } + }, + { + "ph": "s", "id": 241670127, "pid": 5717, "tid": 5717, "ts": 6302685281083.548, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685296704.181, "dur": 158.273, + "args": { + "External id": 125278, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670140, "pid": 3, "tid": 7, "ts": 6302685296704.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281119.988, "dur": 6.300, + "args": { + "External id": 125278, "cbid": 211, "correlation": 241670140 + } + }, + { + "ph": "s", "id": 241670140, "pid": 5717, "tid": 5717, "ts": 6302685281119.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685296863.126, "dur": 1.504, + "args": { + "External id": 125283, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670148, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670148, "pid": 3, "tid": 7, "ts": 6302685296863.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281217.438, "dur": 8.640, + "args": { + "External id": 125283, "cbid": 211, "correlation": 241670148 + } + }, + { + "ph": "s", "id": 241670148, "pid": 5717, "tid": 5717, "ts": 6302685281217.438, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685296865.270, "dur": 2.240, + "args": { + "External id": 125302, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670168, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 241670168, "pid": 3, "tid": 7, "ts": 6302685296865.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281327.328, "dur": 10.100, + "args": { + "External id": 125302, "cbid": 211, "correlation": 241670168 + } + }, + { + "ph": "s", "id": 241670168, "pid": 5717, "tid": 5717, "ts": 6302685281327.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685296868.150, "dur": 58.881, + "args": { + "External id": 125310, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670186, "pid": 3, "tid": 7, "ts": 6302685296868.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281444.477, "dur": 10.531, + "args": { + "External id": 125310, "cbid": 211, "correlation": 241670186 + } + }, + { + "ph": "s", "id": 241670186, "pid": 5717, "tid": 5717, "ts": 6302685281444.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685296927.639, "dur": 14.880, + "args": { + "External id": 125315, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670203, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670203, "pid": 3, "tid": 7, "ts": 6302685296927.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281492.677, "dur": 7.660, + "args": { + "External id": 125315, "cbid": 211, "correlation": 241670203 + } + }, + { + "ph": "s", "id": 241670203, "pid": 5717, "tid": 5717, "ts": 6302685281492.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685296943.127, "dur": 101.568, + "args": { + "External id": 125320, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670219, "pid": 3, "tid": 7, "ts": 6302685296943.127, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281518.437, "dur": 5.530, + "args": { + "External id": 125320, "cbid": 211, "correlation": 241670219 + } + }, + { + "ph": "s", "id": 241670219, "pid": 5717, "tid": 5717, "ts": 6302685281518.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297045.399, "dur": 1.888, + "args": { + "External id": 125324, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670235, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241670235, "pid": 3, "tid": 7, "ts": 6302685297045.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281544.167, "dur": 5.170, + "args": { + "External id": 125324, "cbid": 211, "correlation": 241670235 + } + }, + { + "ph": "s", "id": 241670235, "pid": 5717, "tid": 5717, "ts": 6302685281544.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297047.927, "dur": 1.696, + "args": { + "External id": 125325, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670247, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241670247, "pid": 3, "tid": 7, "ts": 6302685297047.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281575.437, "dur": 7.090, + "args": { + "External id": 125325, "cbid": 211, "correlation": 241670247 + } + }, + { + "ph": "s", "id": 241670247, "pid": 5717, "tid": 5717, "ts": 6302685281575.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685297050.359, "dur": 2.016, + "args": { + "External id": 125332, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670265, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241670265, "pid": 3, "tid": 7, "ts": 6302685297050.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281613.037, "dur": 7.790, + "args": { + "External id": 125332, "cbid": 211, "correlation": 241670265 + } + }, + { + "ph": "s", "id": 241670265, "pid": 5717, "tid": 5717, "ts": 6302685281613.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 3, "tid": 7, + "ts": 6302685297052.951, "dur": 3.808, + "args": { + "External id": 125327, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670274, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670274, "pid": 3, "tid": 7, "ts": 6302685297052.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685281628.467, "dur": 5.260, + "args": { + "External id": 125327, "cbid": 211, "correlation": 241670274 + } + }, + { + "ph": "s", "id": 241670274, "pid": 5717, "tid": 5717, "ts": 6302685281628.467, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685297059.064, "dur": 0.960, + "args": { + "External id": 125334, "device": 3, "context": 1, "stream": 7, "correlation": 241670280, "bytes": 8, "memory bandwidth (GB/s)": 0.008333333333333333 + } + }, + { + "ph": "f", "id": 241670280, "pid": 3, "tid": 7, "ts": 6302685297059.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685281649.387, "dur": 11.380, + "args": { + "External id": 125334, "cbid": 41, "correlation": 241670280 + } + }, + { + "ph": "s", "id": 241670280, "pid": 5717, "tid": 5717, "ts": 6302685281649.387, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685281661.107, "dur": 15402.665, + "args": { + "External id": 125334, "cbid": 131, "correlation": 241670281 + } + }, + { + "ph": "s", "id": 241670281, "pid": 5717, "tid": 5717, "ts": 6302685281661.107, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685297115.522, "dur": 1.510, + "args": { + "External id": 125342, "cbid": 210, "correlation": 241670306 + } + }, + { + "ph": "f", "id": 241670306, "pid": 5717, "tid": 5717, "ts": 6302685297115.522, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685297129.816, "dur": 627.333, + "args": { + "External id": 125342, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670307, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670307, "pid": 3, "tid": 7, "ts": 6302685297129.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297120.172, "dur": 8.900, + "args": { + "External id": 125342, "cbid": 211, "correlation": 241670307 + } + }, + { + "ph": "s", "id": 241670307, "pid": 5717, "tid": 5717, "ts": 6302685297120.172, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685297757.757, "dur": 170.689, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670326, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670326, "pid": 3, "tid": 7, "ts": 6302685297757.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297228.722, "dur": 8.030, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670326 + } + }, + { + "ph": "s", "id": 241670326, "pid": 5717, "tid": 5717, "ts": 6302685297228.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685297929.182, "dur": 3.936, + "args": { + "External id": 125352, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670343, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670343, "pid": 3, "tid": 7, "ts": 6302685297929.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297270.522, "dur": 6.550, + "args": { + "External id": 125352, "cbid": 211, "correlation": 241670343 + } + }, + { + "ph": "s", "id": 241670343, "pid": 5717, "tid": 5717, "ts": 6302685297270.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297933.822, "dur": 1.184, + "args": { + "External id": 125357, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670360, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670360, "pid": 3, "tid": 7, "ts": 6302685297933.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297306.252, "dur": 5.709, + "args": { + "External id": 125357, "cbid": 211, "correlation": 241670360 + } + }, + { + "ph": "s", "id": 241670360, "pid": 5717, "tid": 5717, "ts": 6302685297306.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297935.614, "dur": 0.992, + "args": { + "External id": 125359, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670370, "pid": 3, "tid": 7, "ts": 6302685297935.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297325.581, "dur": 4.811, + "args": { + "External id": 125359, "cbid": 211, "correlation": 241670370 + } + }, + { + "ph": "s", "id": 241670370, "pid": 5717, "tid": 5717, "ts": 6302685297325.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297937.310, "dur": 1.024, + "args": { + "External id": 125360, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670376, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670376, "pid": 3, "tid": 7, "ts": 6302685297937.310, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297337.132, "dur": 4.709, + "args": { + "External id": 125360, "cbid": 211, "correlation": 241670376 + } + }, + { + "ph": "s", "id": 241670376, "pid": 5717, "tid": 5717, "ts": 6302685297337.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297939.102, "dur": 0.992, + "args": { + "External id": 125361, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670386, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670386, "pid": 3, "tid": 7, "ts": 6302685297939.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297351.412, "dur": 4.149, + "args": { + "External id": 125361, "cbid": 211, "correlation": 241670386 + } + }, + { + "ph": "s", "id": 241670386, "pid": 5717, "tid": 5717, "ts": 6302685297351.412, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297940.798, "dur": 1.024, + "args": { + "External id": 125362, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670392, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670392, "pid": 3, "tid": 7, "ts": 6302685297940.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297360.732, "dur": 3.889, + "args": { + "External id": 125362, "cbid": 211, "correlation": 241670392 + } + }, + { + "ph": "s", "id": 241670392, "pid": 5717, "tid": 5717, "ts": 6302685297360.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685297942.430, "dur": 3.232, + "args": { + "External id": 125363, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670405, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670405, "pid": 3, "tid": 7, "ts": 6302685297942.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297381.201, "dur": 4.720, + "args": { + "External id": 125363, "cbid": 211, "correlation": 241670405 + } + }, + { + "ph": "s", "id": 241670405, "pid": 5717, "tid": 5717, "ts": 6302685297381.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297946.238, "dur": 1.216, + "args": { + "External id": 125366, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670411, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670411, "pid": 3, "tid": 7, "ts": 6302685297946.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297391.711, "dur": 4.000, + "args": { + "External id": 125366, "cbid": 211, "correlation": 241670411 + } + }, + { + "ph": "s", "id": 241670411, "pid": 5717, "tid": 5717, "ts": 6302685297391.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685297948.158, "dur": 0.960, + "args": { + "External id": 125367, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670417, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670417, "pid": 3, "tid": 7, "ts": 6302685297948.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297400.441, "dur": 3.740, + "args": { + "External id": 125367, "cbid": 211, "correlation": 241670417 + } + }, + { + "ph": "s", "id": 241670417, "pid": 5717, "tid": 5717, "ts": 6302685297400.441, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685297949.854, "dur": 233.762, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670431, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241670431, "pid": 3, "tid": 7, "ts": 6302685297949.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297488.001, "dur": 7.430, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670431 + } + }, + { + "ph": "s", "id": 241670431, "pid": 5717, "tid": 5717, "ts": 6302685297488.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685297524.791, "dur": 0.510, + "args": { + "External id": 125371, "cbid": 200, "correlation": 241670454 + } + }, + { + "ph": "f", "id": 241670454, "pid": 5717, "tid": 5717, "ts": 6302685297524.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685298184.384, "dur": 0.800, + "args": { + "External id": 125371, "device": 3, "context": 1, "stream": 7, "correlation": 241670457, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241670457, "pid": 3, "tid": 7, "ts": 6302685298184.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685297526.851, "dur": 6.160, + "args": { + "External id": 125371, "cbid": 51, "correlation": 241670457 + } + }, + { + "ph": "s", "id": 241670457, "pid": 5717, "tid": 5717, "ts": 6302685297526.851, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685298186.336, "dur": 682.533, + "args": { + "External id": 125371, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670458, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670458, "pid": 3, "tid": 7, "ts": 6302685298186.336, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297533.231, "dur": 5.580, + "args": { + "External id": 125371, "cbid": 307, "correlation": 241670458 + } + }, + { + "ph": "s", "id": 241670458, "pid": 5717, "tid": 5717, "ts": 6302685297533.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685298869.541, "dur": 3.040, + "args": { + "External id": 125374, "device": 3, "context": 1, "stream": 7, "correlation": 241670463, "bytes": 3145728, "memory bandwidth (GB/s)": 1034.778947368421 + } + }, + { + "ph": "f", "id": 241670463, "pid": 3, "tid": 7, "ts": 6302685298869.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685297562.001, "dur": 12.680, + "args": { + "External id": 125374, "cbid": 41, "correlation": 241670463 + } + }, + { + "ph": "s", "id": 241670463, "pid": 5717, "tid": 5717, "ts": 6302685297562.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685297612.371, "dur": 0.430, + "args": { + "External id": 125379, "cbid": 200, "correlation": 241670491 + } + }, + { + "ph": "f", "id": 241670491, "pid": 5717, "tid": 5717, "ts": 6302685297612.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685298873.157, "dur": 688.230, + "args": { + "External id": 125379, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241670494, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670494, "pid": 3, "tid": 7, "ts": 6302685298873.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297614.171, "dur": 6.810, + "args": { + "External id": 125379, "cbid": 307, "correlation": 241670494 + } + }, + { + "ph": "s", "id": 241670494, "pid": 5717, "tid": 5717, "ts": 6302685297614.171, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685299562.027, "dur": 221.345, + "args": { + "External id": 125380, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670499, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670499, "pid": 3, "tid": 7, "ts": 6302685299562.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297633.611, "dur": 5.520, + "args": { + "External id": 125380, "cbid": 211, "correlation": 241670499 + } + }, + { + "ph": "s", "id": 241670499, "pid": 5717, "tid": 5717, "ts": 6302685297633.611, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685297679.451, "dur": 1.180, + "args": { + "External id": 125388, "cbid": 210, "correlation": 241670525 + } + }, + { + "ph": "f", "id": 241670525, "pid": 5717, "tid": 5717, "ts": 6302685297679.451, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685299784.044, "dur": 626.693, + "args": { + "External id": 125388, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670526, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670526, "pid": 3, "tid": 7, "ts": 6302685299784.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297683.811, "dur": 7.050, + "args": { + "External id": 125388, "cbid": 211, "correlation": 241670526 + } + }, + { + "ph": "s", "id": 241670526, "pid": 5717, "tid": 5717, "ts": 6302685297683.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685300411.441, "dur": 170.945, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670545, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670545, "pid": 3, "tid": 7, "ts": 6302685300411.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297785.651, "dur": 8.340, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670545 + } + }, + { + "ph": "s", "id": 241670545, "pid": 5717, "tid": 5717, "ts": 6302685297785.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685300583.090, "dur": 3.936, + "args": { + "External id": 125398, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670562, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670562, "pid": 3, "tid": 7, "ts": 6302685300583.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297825.570, "dur": 6.470, + "args": { + "External id": 125398, "cbid": 211, "correlation": 241670562 + } + }, + { + "ph": "s", "id": 241670562, "pid": 5717, "tid": 5717, "ts": 6302685297825.570, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300587.730, "dur": 1.216, + "args": { + "External id": 125403, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670579, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670579, "pid": 3, "tid": 7, "ts": 6302685300587.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297852.860, "dur": 4.810, + "args": { + "External id": 125403, "cbid": 211, "correlation": 241670579 + } + }, + { + "ph": "s", "id": 241670579, "pid": 5717, "tid": 5717, "ts": 6302685297852.860, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300589.554, "dur": 0.992, + "args": { + "External id": 125405, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670589, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670589, "pid": 3, "tid": 7, "ts": 6302685300589.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297871.440, "dur": 4.600, + "args": { + "External id": 125405, "cbid": 211, "correlation": 241670589 + } + }, + { + "ph": "s", "id": 241670589, "pid": 5717, "tid": 5717, "ts": 6302685297871.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300591.122, "dur": 0.992, + "args": { + "External id": 125406, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670595, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670595, "pid": 3, "tid": 7, "ts": 6302685300591.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297882.360, "dur": 4.140, + "args": { + "External id": 125406, "cbid": 211, "correlation": 241670595 + } + }, + { + "ph": "s", "id": 241670595, "pid": 5717, "tid": 5717, "ts": 6302685297882.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300592.818, "dur": 1.024, + "args": { + "External id": 125407, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670605, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670605, "pid": 3, "tid": 7, "ts": 6302685300592.818, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297896.350, "dur": 4.160, + "args": { + "External id": 125407, "cbid": 211, "correlation": 241670605 + } + }, + { + "ph": "s", "id": 241670605, "pid": 5717, "tid": 5717, "ts": 6302685297896.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300594.546, "dur": 0.992, + "args": { + "External id": 125408, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670611, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670611, "pid": 3, "tid": 7, "ts": 6302685300594.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297906.800, "dur": 3.700, + "args": { + "External id": 125408, "cbid": 211, "correlation": 241670611 + } + }, + { + "ph": "s", "id": 241670611, "pid": 5717, "tid": 5717, "ts": 6302685297906.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685300596.146, "dur": 3.232, + "args": { + "External id": 125409, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670624, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670624, "pid": 3, "tid": 7, "ts": 6302685300596.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297924.450, "dur": 4.490, + "args": { + "External id": 125409, "cbid": 211, "correlation": 241670624 + } + }, + { + "ph": "s", "id": 241670624, "pid": 5717, "tid": 5717, "ts": 6302685297924.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300599.986, "dur": 1.216, + "args": { + "External id": 125412, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670630, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670630, "pid": 3, "tid": 7, "ts": 6302685300599.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297934.690, "dur": 4.070, + "args": { + "External id": 125412, "cbid": 211, "correlation": 241670630 + } + }, + { + "ph": "s", "id": 241670630, "pid": 5717, "tid": 5717, "ts": 6302685297934.690, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685300601.810, "dur": 0.992, + "args": { + "External id": 125413, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670636, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670636, "pid": 3, "tid": 7, "ts": 6302685300601.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685297943.250, "dur": 3.550, + "args": { + "External id": 125413, "cbid": 211, "correlation": 241670636 + } + }, + { + "ph": "s", "id": 241670636, "pid": 5717, "tid": 5717, "ts": 6302685297943.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685300603.410, "dur": 232.738, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670650, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241670650, "pid": 3, "tid": 7, "ts": 6302685300603.410, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298022.960, "dur": 7.300, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670650 + } + }, + { + "ph": "s", "id": 241670650, "pid": 5717, "tid": 5717, "ts": 6302685298022.960, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685298058.870, "dur": 0.510, + "args": { + "External id": 125417, "cbid": 200, "correlation": 241670673 + } + }, + { + "ph": "f", "id": 241670673, "pid": 5717, "tid": 5717, "ts": 6302685298058.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685300837.044, "dur": 0.800, + "args": { + "External id": 125417, "device": 3, "context": 1, "stream": 7, "correlation": 241670676, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241670676, "pid": 3, "tid": 7, "ts": 6302685300837.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685298061.040, "dur": 7.000, + "args": { + "External id": 125417, "cbid": 51, "correlation": 241670676 + } + }, + { + "ph": "s", "id": 241670676, "pid": 5717, "tid": 5717, "ts": 6302685298061.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685300838.996, "dur": 681.509, + "args": { + "External id": 125417, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670677, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670677, "pid": 3, "tid": 7, "ts": 6302685300838.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298068.250, "dur": 5.370, + "args": { + "External id": 125417, "cbid": 307, "correlation": 241670677 + } + }, + { + "ph": "s", "id": 241670677, "pid": 5717, "tid": 5717, "ts": 6302685298068.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685301521.209, "dur": 2.880, + "args": { + "External id": 125420, "device": 3, "context": 1, "stream": 7, "correlation": 241670682, "bytes": 3145728, "memory bandwidth (GB/s)": 1092.2666666666667 + } + }, + { + "ph": "f", "id": 241670682, "pid": 3, "tid": 7, "ts": 6302685301521.209, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685298094.780, "dur": 11.220, + "args": { + "External id": 125420, "cbid": 41, "correlation": 241670682 + } + }, + { + "ph": "s", "id": 241670682, "pid": 5717, "tid": 5717, "ts": 6302685298094.780, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685298141.630, "dur": 0.380, + "args": { + "External id": 125425, "cbid": 200, "correlation": 241670710 + } + }, + { + "ph": "f", "id": 241670710, "pid": 5717, "tid": 5717, "ts": 6302685298141.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685301524.921, "dur": 687.557, + "args": { + "External id": 125425, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241670713, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670713, "pid": 3, "tid": 7, "ts": 6302685301524.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298143.350, "dur": 6.450, + "args": { + "External id": 125425, "cbid": 307, "correlation": 241670713 + } + }, + { + "ph": "s", "id": 241670713, "pid": 5717, "tid": 5717, "ts": 6302685298143.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685302213.182, "dur": 220.866, + "args": { + "External id": 125426, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670718, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670718, "pid": 3, "tid": 7, "ts": 6302685302213.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298162.290, "dur": 5.289, + "args": { + "External id": 125426, "cbid": 211, "correlation": 241670718 + } + }, + { + "ph": "s", "id": 241670718, "pid": 5717, "tid": 5717, "ts": 6302685298162.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685298207.999, "dur": 1.180, + "args": { + "External id": 125434, "cbid": 210, "correlation": 241670744 + } + }, + { + "ph": "f", "id": 241670744, "pid": 5717, "tid": 5717, "ts": 6302685298207.999, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685302434.720, "dur": 630.213, + "args": { + "External id": 125434, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670745, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670745, "pid": 3, "tid": 7, "ts": 6302685302434.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298212.299, "dur": 7.011, + "args": { + "External id": 125434, "cbid": 211, "correlation": 241670745 + } + }, + { + "ph": "s", "id": 241670745, "pid": 5717, "tid": 5717, "ts": 6302685298212.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685303065.605, "dur": 170.497, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670764, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670764, "pid": 3, "tid": 7, "ts": 6302685303065.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298319.869, "dur": 8.490, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670764 + } + }, + { + "ph": "s", "id": 241670764, "pid": 5717, "tid": 5717, "ts": 6302685298319.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685303236.710, "dur": 4.000, + "args": { + "External id": 125444, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670781, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670781, "pid": 3, "tid": 7, "ts": 6302685303236.710, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298376.389, "dur": 6.690, + "args": { + "External id": 125444, "cbid": 211, "correlation": 241670781 + } + }, + { + "ph": "s", "id": 241670781, "pid": 5717, "tid": 5717, "ts": 6302685298376.389, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303241.350, "dur": 1.184, + "args": { + "External id": 125449, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670798, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670798, "pid": 3, "tid": 7, "ts": 6302685303241.350, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298403.789, "dur": 4.930, + "args": { + "External id": 125449, "cbid": 211, "correlation": 241670798 + } + }, + { + "ph": "s", "id": 241670798, "pid": 5717, "tid": 5717, "ts": 6302685298403.789, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303243.206, "dur": 0.992, + "args": { + "External id": 125451, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670808, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670808, "pid": 3, "tid": 7, "ts": 6302685303243.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298422.069, "dur": 4.500, + "args": { + "External id": 125451, "cbid": 211, "correlation": 241670808 + } + }, + { + "ph": "s", "id": 241670808, "pid": 5717, "tid": 5717, "ts": 6302685298422.069, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303244.902, "dur": 1.024, + "args": { + "External id": 125452, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670814, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670814, "pid": 3, "tid": 7, "ts": 6302685303244.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298432.839, "dur": 4.070, + "args": { + "External id": 125452, "cbid": 211, "correlation": 241670814 + } + }, + { + "ph": "s", "id": 241670814, "pid": 5717, "tid": 5717, "ts": 6302685298432.839, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303246.598, "dur": 1.024, + "args": { + "External id": 125453, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670824, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670824, "pid": 3, "tid": 7, "ts": 6302685303246.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298447.339, "dur": 3.970, + "args": { + "External id": 125453, "cbid": 211, "correlation": 241670824 + } + }, + { + "ph": "s", "id": 241670824, "pid": 5717, "tid": 5717, "ts": 6302685298447.339, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303248.326, "dur": 0.992, + "args": { + "External id": 125454, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670830, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670830, "pid": 3, "tid": 7, "ts": 6302685303248.326, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298456.429, "dur": 3.790, + "args": { + "External id": 125454, "cbid": 211, "correlation": 241670830 + } + }, + { + "ph": "s", "id": 241670830, "pid": 5717, "tid": 5717, "ts": 6302685298456.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685303249.926, "dur": 3.264, + "args": { + "External id": 125455, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670843, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670843, "pid": 3, "tid": 7, "ts": 6302685303249.926, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298473.379, "dur": 4.480, + "args": { + "External id": 125455, "cbid": 211, "correlation": 241670843 + } + }, + { + "ph": "s", "id": 241670843, "pid": 5717, "tid": 5717, "ts": 6302685298473.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303253.894, "dur": 1.216, + "args": { + "External id": 125458, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670849, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670849, "pid": 3, "tid": 7, "ts": 6302685303253.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298483.389, "dur": 3.760, + "args": { + "External id": 125458, "cbid": 211, "correlation": 241670849 + } + }, + { + "ph": "s", "id": 241670849, "pid": 5717, "tid": 5717, "ts": 6302685298483.389, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685303255.750, "dur": 0.960, + "args": { + "External id": 125459, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670855, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241670855, "pid": 3, "tid": 7, "ts": 6302685303255.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298491.789, "dur": 3.530, + "args": { + "External id": 125459, "cbid": 211, "correlation": 241670855 + } + }, + { + "ph": "s", "id": 241670855, "pid": 5717, "tid": 5717, "ts": 6302685298491.789, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685303257.446, "dur": 233.154, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670869, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241670869, "pid": 3, "tid": 7, "ts": 6302685303257.446, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298576.109, "dur": 7.620, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670869 + } + }, + { + "ph": "s", "id": 241670869, "pid": 5717, "tid": 5717, "ts": 6302685298576.109, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685298612.758, "dur": 0.500, + "args": { + "External id": 125463, "cbid": 200, "correlation": 241670892 + } + }, + { + "ph": "f", "id": 241670892, "pid": 5717, "tid": 5717, "ts": 6302685298612.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685303491.432, "dur": 0.768, + "args": { + "External id": 125463, "device": 3, "context": 1, "stream": 7, "correlation": 241670895, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241670895, "pid": 3, "tid": 7, "ts": 6302685303491.432, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685298614.909, "dur": 7.209, + "args": { + "External id": 125463, "cbid": 51, "correlation": 241670895 + } + }, + { + "ph": "s", "id": 241670895, "pid": 5717, "tid": 5717, "ts": 6302685298614.909, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685303493.352, "dur": 682.565, + "args": { + "External id": 125463, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670896, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670896, "pid": 3, "tid": 7, "ts": 6302685303493.352, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298622.329, "dur": 5.749, + "args": { + "External id": 125463, "cbid": 307, "correlation": 241670896 + } + }, + { + "ph": "s", "id": 241670896, "pid": 5717, "tid": 5717, "ts": 6302685298622.329, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685304176.589, "dur": 2.880, + "args": { + "External id": 125466, "device": 3, "context": 1, "stream": 7, "correlation": 241670901, "bytes": 3145728, "memory bandwidth (GB/s)": 1092.2666666666667 + } + }, + { + "ph": "f", "id": 241670901, "pid": 3, "tid": 7, "ts": 6302685304176.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685298649.218, "dur": 10.951, + "args": { + "External id": 125466, "cbid": 41, "correlation": 241670901 + } + }, + { + "ph": "s", "id": 241670901, "pid": 5717, "tid": 5717, "ts": 6302685298649.218, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685298696.069, "dur": 0.340, + "args": { + "External id": 125471, "cbid": 200, "correlation": 241670929 + } + }, + { + "ph": "f", "id": 241670929, "pid": 5717, "tid": 5717, "ts": 6302685298696.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685304180.173, "dur": 685.190, + "args": { + "External id": 125471, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241670932, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670932, "pid": 3, "tid": 7, "ts": 6302685304180.173, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298697.818, "dur": 6.640, + "args": { + "External id": 125471, "cbid": 307, "correlation": 241670932 + } + }, + { + "ph": "s", "id": 241670932, "pid": 5717, "tid": 5717, "ts": 6302685298697.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685304866.035, "dur": 221.345, + "args": { + "External id": 125472, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670937, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241670937, "pid": 3, "tid": 7, "ts": 6302685304866.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298717.208, "dur": 5.660, + "args": { + "External id": 125472, "cbid": 211, "correlation": 241670937 + } + }, + { + "ph": "s", "id": 241670937, "pid": 5717, "tid": 5717, "ts": 6302685298717.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685298764.188, "dur": 1.200, + "args": { + "External id": 125480, "cbid": 210, "correlation": 241670963 + } + }, + { + "ph": "f", "id": 241670963, "pid": 5717, "tid": 5717, "ts": 6302685298764.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685305088.052, "dur": 630.789, + "args": { + "External id": 125480, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670964, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670964, "pid": 3, "tid": 7, "ts": 6302685305088.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298768.488, "dur": 6.830, + "args": { + "External id": 125480, "cbid": 211, "correlation": 241670964 + } + }, + { + "ph": "s", "id": 241670964, "pid": 5717, "tid": 5717, "ts": 6302685298768.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685305719.481, "dur": 171.041, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241670983, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241670983, "pid": 3, "tid": 7, "ts": 6302685305719.481, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298866.108, "dur": 8.030, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241670983 + } + }, + { + "ph": "s", "id": 241670983, "pid": 5717, "tid": 5717, "ts": 6302685298866.108, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685305891.226, "dur": 3.936, + "args": { + "External id": 125490, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671000, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671000, "pid": 3, "tid": 7, "ts": 6302685305891.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298905.058, "dur": 6.440, + "args": { + "External id": 125490, "cbid": 211, "correlation": 241671000 + } + }, + { + "ph": "s", "id": 241671000, "pid": 5717, "tid": 5717, "ts": 6302685298905.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305895.866, "dur": 1.184, + "args": { + "External id": 125495, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671017, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671017, "pid": 3, "tid": 7, "ts": 6302685305895.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298934.888, "dur": 4.780, + "args": { + "External id": 125495, "cbid": 211, "correlation": 241671017 + } + }, + { + "ph": "s", "id": 241671017, "pid": 5717, "tid": 5717, "ts": 6302685298934.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305897.722, "dur": 0.960, + "args": { + "External id": 125497, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671027, "pid": 3, "tid": 7, "ts": 6302685305897.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298951.778, "dur": 4.440, + "args": { + "External id": 125497, "cbid": 211, "correlation": 241671027 + } + }, + { + "ph": "s", "id": 241671027, "pid": 5717, "tid": 5717, "ts": 6302685298951.778, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305899.290, "dur": 0.992, + "args": { + "External id": 125498, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671033, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671033, "pid": 3, "tid": 7, "ts": 6302685305899.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298962.588, "dur": 3.990, + "args": { + "External id": 125498, "cbid": 211, "correlation": 241671033 + } + }, + { + "ph": "s", "id": 241671033, "pid": 5717, "tid": 5717, "ts": 6302685298962.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305900.986, "dur": 0.992, + "args": { + "External id": 125499, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671043, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671043, "pid": 3, "tid": 7, "ts": 6302685305900.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298975.858, "dur": 4.030, + "args": { + "External id": 125499, "cbid": 211, "correlation": 241671043 + } + }, + { + "ph": "s", "id": 241671043, "pid": 5717, "tid": 5717, "ts": 6302685298975.858, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305902.682, "dur": 1.024, + "args": { + "External id": 125500, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671049, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671049, "pid": 3, "tid": 7, "ts": 6302685305902.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685298985.008, "dur": 3.800, + "args": { + "External id": 125500, "cbid": 211, "correlation": 241671049 + } + }, + { + "ph": "s", "id": 241671049, "pid": 5717, "tid": 5717, "ts": 6302685298985.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685305904.314, "dur": 3.232, + "args": { + "External id": 125501, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671062, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671062, "pid": 3, "tid": 7, "ts": 6302685305904.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299003.868, "dur": 4.500, + "args": { + "External id": 125501, "cbid": 211, "correlation": 241671062 + } + }, + { + "ph": "s", "id": 241671062, "pid": 5717, "tid": 5717, "ts": 6302685299003.868, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305908.122, "dur": 1.216, + "args": { + "External id": 125504, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671068, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671068, "pid": 3, "tid": 7, "ts": 6302685305908.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299013.988, "dur": 3.890, + "args": { + "External id": 125504, "cbid": 211, "correlation": 241671068 + } + }, + { + "ph": "s", "id": 241671068, "pid": 5717, "tid": 5717, "ts": 6302685299013.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685305909.978, "dur": 0.992, + "args": { + "External id": 125505, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671074, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671074, "pid": 3, "tid": 7, "ts": 6302685305909.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299022.298, "dur": 3.640, + "args": { + "External id": 125505, "cbid": 211, "correlation": 241671074 + } + }, + { + "ph": "s", "id": 241671074, "pid": 5717, "tid": 5717, "ts": 6302685299022.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685305911.546, "dur": 233.186, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671088, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241671088, "pid": 3, "tid": 7, "ts": 6302685305911.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299100.788, "dur": 7.369, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671088 + } + }, + { + "ph": "s", "id": 241671088, "pid": 5717, "tid": 5717, "ts": 6302685299100.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685299136.757, "dur": 0.520, + "args": { + "External id": 125509, "cbid": 200, "correlation": 241671111 + } + }, + { + "ph": "f", "id": 241671111, "pid": 5717, "tid": 5717, "ts": 6302685299136.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685306145.596, "dur": 0.800, + "args": { + "External id": 125509, "device": 3, "context": 1, "stream": 7, "correlation": 241671114, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241671114, "pid": 3, "tid": 7, "ts": 6302685306145.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685299138.847, "dur": 6.130, + "args": { + "External id": 125509, "cbid": 51, "correlation": 241671114 + } + }, + { + "ph": "s", "id": 241671114, "pid": 5717, "tid": 5717, "ts": 6302685299138.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685306147.868, "dur": 684.261, + "args": { + "External id": 125509, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671115, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671115, "pid": 3, "tid": 7, "ts": 6302685306147.868, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299145.197, "dur": 5.730, + "args": { + "External id": 125509, "cbid": 307, "correlation": 241671115 + } + }, + { + "ph": "s", "id": 241671115, "pid": 5717, "tid": 5717, "ts": 6302685299145.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685306832.801, "dur": 2.912, + "args": { + "External id": 125512, "device": 3, "context": 1, "stream": 7, "correlation": 241671120, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241671120, "pid": 3, "tid": 7, "ts": 6302685306832.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685299173.877, "dur": 10.580, + "args": { + "External id": 125512, "cbid": 41, "correlation": 241671120 + } + }, + { + "ph": "s", "id": 241671120, "pid": 5717, "tid": 5717, "ts": 6302685299173.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685299219.827, "dur": 0.360, + "args": { + "External id": 125517, "cbid": 200, "correlation": 241671148 + } + }, + { + "ph": "f", "id": 241671148, "pid": 5717, "tid": 5717, "ts": 6302685299219.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685306836.353, "dur": 690.469, + "args": { + "External id": 125517, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241671151, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671151, "pid": 3, "tid": 7, "ts": 6302685306836.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299221.567, "dur": 6.600, + "args": { + "External id": 125517, "cbid": 307, "correlation": 241671151 + } + }, + { + "ph": "s", "id": 241671151, "pid": 5717, "tid": 5717, "ts": 6302685299221.567, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685307527.462, "dur": 220.866, + "args": { + "External id": 125518, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671156, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241671156, "pid": 3, "tid": 7, "ts": 6302685307527.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299240.877, "dur": 5.450, + "args": { + "External id": 125518, "cbid": 211, "correlation": 241671156 + } + }, + { + "ph": "s", "id": 241671156, "pid": 5717, "tid": 5717, "ts": 6302685299240.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685299284.807, "dur": 1.190, + "args": { + "External id": 125526, "cbid": 210, "correlation": 241671182 + } + }, + { + "ph": "f", "id": 241671182, "pid": 5717, "tid": 5717, "ts": 6302685299284.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685307749.032, "dur": 630.085, + "args": { + "External id": 125526, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671183, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671183, "pid": 3, "tid": 7, "ts": 6302685307749.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299289.137, "dur": 6.760, + "args": { + "External id": 125526, "cbid": 211, "correlation": 241671183 + } + }, + { + "ph": "s", "id": 241671183, "pid": 5717, "tid": 5717, "ts": 6302685299289.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685308379.725, "dur": 170.977, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671202, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671202, "pid": 3, "tid": 7, "ts": 6302685308379.725, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299397.287, "dur": 8.210, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671202 + } + }, + { + "ph": "s", "id": 241671202, "pid": 5717, "tid": 5717, "ts": 6302685299397.287, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685308551.342, "dur": 4.128, + "args": { + "External id": 125536, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671219, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671219, "pid": 3, "tid": 7, "ts": 6302685308551.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299435.437, "dur": 6.300, + "args": { + "External id": 125536, "cbid": 211, "correlation": 241671219 + } + }, + { + "ph": "s", "id": 241671219, "pid": 5717, "tid": 5717, "ts": 6302685299435.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308556.110, "dur": 1.184, + "args": { + "External id": 125541, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671236, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671236, "pid": 3, "tid": 7, "ts": 6302685308556.110, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299464.727, "dur": 5.040, + "args": { + "External id": 125541, "cbid": 211, "correlation": 241671236 + } + }, + { + "ph": "s", "id": 241671236, "pid": 5717, "tid": 5717, "ts": 6302685299464.727, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308557.966, "dur": 0.992, + "args": { + "External id": 125543, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671246, "pid": 3, "tid": 7, "ts": 6302685308557.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299482.596, "dur": 4.391, + "args": { + "External id": 125543, "cbid": 211, "correlation": 241671246 + } + }, + { + "ph": "s", "id": 241671246, "pid": 5717, "tid": 5717, "ts": 6302685299482.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308559.534, "dur": 1.024, + "args": { + "External id": 125544, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671252, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671252, "pid": 3, "tid": 7, "ts": 6302685308559.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299493.107, "dur": 4.049, + "args": { + "External id": 125544, "cbid": 211, "correlation": 241671252 + } + }, + { + "ph": "s", "id": 241671252, "pid": 5717, "tid": 5717, "ts": 6302685299493.107, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308561.262, "dur": 0.992, + "args": { + "External id": 125545, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671262, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671262, "pid": 3, "tid": 7, "ts": 6302685308561.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299506.636, "dur": 3.980, + "args": { + "External id": 125545, "cbid": 211, "correlation": 241671262 + } + }, + { + "ph": "s", "id": 241671262, "pid": 5717, "tid": 5717, "ts": 6302685299506.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308562.958, "dur": 0.992, + "args": { + "External id": 125546, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671268, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671268, "pid": 3, "tid": 7, "ts": 6302685308562.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299515.716, "dur": 3.771, + "args": { + "External id": 125546, "cbid": 211, "correlation": 241671268 + } + }, + { + "ph": "s", "id": 241671268, "pid": 5717, "tid": 5717, "ts": 6302685299515.716, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685308564.558, "dur": 3.232, + "args": { + "External id": 125547, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671281, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671281, "pid": 3, "tid": 7, "ts": 6302685308564.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299534.227, "dur": 4.580, + "args": { + "External id": 125547, "cbid": 211, "correlation": 241671281 + } + }, + { + "ph": "s", "id": 241671281, "pid": 5717, "tid": 5717, "ts": 6302685299534.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308568.526, "dur": 1.216, + "args": { + "External id": 125550, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671287, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671287, "pid": 3, "tid": 7, "ts": 6302685308568.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299544.516, "dur": 3.971, + "args": { + "External id": 125550, "cbid": 211, "correlation": 241671287 + } + }, + { + "ph": "s", "id": 241671287, "pid": 5717, "tid": 5717, "ts": 6302685299544.516, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685308570.382, "dur": 0.960, + "args": { + "External id": 125551, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671293, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671293, "pid": 3, "tid": 7, "ts": 6302685308570.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299552.876, "dur": 3.551, + "args": { + "External id": 125551, "cbid": 211, "correlation": 241671293 + } + }, + { + "ph": "s", "id": 241671293, "pid": 5717, "tid": 5717, "ts": 6302685299552.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685308571.950, "dur": 232.866, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671307, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241671307, "pid": 3, "tid": 7, "ts": 6302685308571.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299629.856, "dur": 7.330, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671307 + } + }, + { + "ph": "s", "id": 241671307, "pid": 5717, "tid": 5717, "ts": 6302685299629.856, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685299665.886, "dur": 0.500, + "args": { + "External id": 125555, "cbid": 200, "correlation": 241671330 + } + }, + { + "ph": "f", "id": 241671330, "pid": 5717, "tid": 5717, "ts": 6302685299665.886, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685308805.616, "dur": 0.800, + "args": { + "External id": 125555, "device": 3, "context": 1, "stream": 7, "correlation": 241671333, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241671333, "pid": 3, "tid": 7, "ts": 6302685308805.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685299667.996, "dur": 8.540, + "args": { + "External id": 125555, "cbid": 51, "correlation": 241671333 + } + }, + { + "ph": "s", "id": 241671333, "pid": 5717, "tid": 5717, "ts": 6302685299667.996, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685308807.568, "dur": 685.989, + "args": { + "External id": 125555, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671334, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671334, "pid": 3, "tid": 7, "ts": 6302685308807.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299676.946, "dur": 7.420, + "args": { + "External id": 125555, "cbid": 307, "correlation": 241671334 + } + }, + { + "ph": "s", "id": 241671334, "pid": 5717, "tid": 5717, "ts": 6302685299676.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685309494.197, "dur": 2.912, + "args": { + "External id": 125558, "device": 3, "context": 1, "stream": 7, "correlation": 241671339, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241671339, "pid": 3, "tid": 7, "ts": 6302685309494.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685299716.596, "dur": 11.480, + "args": { + "External id": 125558, "cbid": 41, "correlation": 241671339 + } + }, + { + "ph": "s", "id": 241671339, "pid": 5717, "tid": 5717, "ts": 6302685299716.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685299768.046, "dur": 0.420, + "args": { + "External id": 125563, "cbid": 200, "correlation": 241671367 + } + }, + { + "ph": "f", "id": 241671367, "pid": 5717, "tid": 5717, "ts": 6302685299768.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685309497.749, "dur": 691.685, + "args": { + "External id": 125563, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241671370, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671370, "pid": 3, "tid": 7, "ts": 6302685309497.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299769.936, "dur": 6.680, + "args": { + "External id": 125563, "cbid": 307, "correlation": 241671370 + } + }, + { + "ph": "s", "id": 241671370, "pid": 5717, "tid": 5717, "ts": 6302685299769.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685310190.074, "dur": 220.962, + "args": { + "External id": 125564, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671375, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241671375, "pid": 3, "tid": 7, "ts": 6302685310190.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299789.396, "dur": 5.390, + "args": { + "External id": 125564, "cbid": 211, "correlation": 241671375 + } + }, + { + "ph": "s", "id": 241671375, "pid": 5717, "tid": 5717, "ts": 6302685299789.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685299853.376, "dur": 2.410, + "args": { + "External id": 125572, "cbid": 210, "correlation": 241671401 + } + }, + { + "ph": "f", "id": 241671401, "pid": 5717, "tid": 5717, "ts": 6302685299853.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685310411.676, "dur": 632.549, + "args": { + "External id": 125572, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671402, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671402, "pid": 3, "tid": 7, "ts": 6302685310411.676, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299860.026, "dur": 11.020, + "args": { + "External id": 125572, "cbid": 211, "correlation": 241671402 + } + }, + { + "ph": "s", "id": 241671402, "pid": 5717, "tid": 5717, "ts": 6302685299860.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685311044.897, "dur": 170.721, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671421, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671421, "pid": 3, "tid": 7, "ts": 6302685311044.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685299976.746, "dur": 8.849, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671421 + } + }, + { + "ph": "s", "id": 241671421, "pid": 5717, "tid": 5717, "ts": 6302685299976.746, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685311216.258, "dur": 4.000, + "args": { + "External id": 125582, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671438, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671438, "pid": 3, "tid": 7, "ts": 6302685311216.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300029.345, "dur": 11.540, + "args": { + "External id": 125582, "cbid": 211, "correlation": 241671438 + } + }, + { + "ph": "s", "id": 241671438, "pid": 5717, "tid": 5717, "ts": 6302685300029.345, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311220.930, "dur": 1.152, + "args": { + "External id": 125587, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671455, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671455, "pid": 3, "tid": 7, "ts": 6302685311220.930, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300068.245, "dur": 4.610, + "args": { + "External id": 125587, "cbid": 211, "correlation": 241671455 + } + }, + { + "ph": "s", "id": 241671455, "pid": 5717, "tid": 5717, "ts": 6302685300068.245, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311222.754, "dur": 0.992, + "args": { + "External id": 125589, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671465, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 
1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671465, "pid": 3, "tid": 7, "ts": 6302685311222.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300087.605, "dur": 5.050, + "args": { + "External id": 125589, "cbid": 211, "correlation": 241671465 + } + }, + { + "ph": "s", "id": 241671465, "pid": 5717, "tid": 5717, "ts": 6302685300087.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311224.354, "dur": 0.992, + "args": { + "External id": 125590, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671471, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671471, "pid": 3, "tid": 7, "ts": 6302685311224.354, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300099.675, "dur": 3.990, + "args": { + "External id": 125590, "cbid": 211, "correlation": 241671471 + } + }, + { + "ph": "s", "id": 241671471, "pid": 5717, "tid": 5717, "ts": 6302685300099.675, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311226.050, "dur": 0.992, + "args": { + "External id": 125591, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671481, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671481, "pid": 3, "tid": 7, "ts": 6302685311226.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300113.985, "dur": 4.070, + "args": { + "External id": 125591, "cbid": 211, "correlation": 241671481 + } + }, + { + "ph": "s", "id": 241671481, "pid": 5717, "tid": 5717, "ts": 6302685300113.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311227.746, "dur": 1.024, + "args": { + "External id": 125592, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671487, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671487, "pid": 3, "tid": 7, "ts": 6302685311227.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300124.125, "dur": 3.860, + "args": { + "External id": 125592, "cbid": 211, "correlation": 241671487 + } + }, + { + "ph": "s", "id": 241671487, "pid": 5717, "tid": 5717, "ts": 6302685300124.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685311229.378, "dur": 3.232, + "args": { + "External id": 125593, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671500, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671500, "pid": 3, "tid": 7, "ts": 6302685311229.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300151.415, "dur": 8.300, + "args": { + "External id": 125593, "cbid": 211, "correlation": 241671500 + } + }, + { + "ph": "s", "id": 241671500, "pid": 5717, "tid": 5717, "ts": 6302685300151.415, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311233.314, "dur": 1.216, + "args": { + "External id": 125596, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671506, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671506, "pid": 3, "tid": 7, "ts": 6302685311233.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300169.175, "dur": 6.530, + "args": { + "External id": 125596, "cbid": 211, "correlation": 241671506 + } + }, + { + "ph": "s", "id": 241671506, "pid": 5717, "tid": 5717, "ts": 6302685300169.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685311235.170, "dur": 0.992, + "args": { + "External id": 125597, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671512, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671512, "pid": 3, "tid": 7, "ts": 6302685311235.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300182.815, "dur": 6.690, + "args": { + "External id": 125597, "cbid": 211, "correlation": 241671512 + } + }, + { + "ph": "s", "id": 241671512, "pid": 5717, "tid": 5717, "ts": 6302685300182.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685311236.738, "dur": 233.058, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671526, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241671526, "pid": 3, "tid": 7, "ts": 6302685311236.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300279.025, "dur": 10.080, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671526 + } + }, + { + "ph": "s", "id": 241671526, "pid": 5717, "tid": 5717, "ts": 6302685300279.025, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685300340.055, "dur": 1.620, + "args": { + "External id": 125601, "cbid": 200, "correlation": 241671549 + } + }, + { + "ph": "f", "id": 241671549, "pid": 5717, "tid": 5717, "ts": 6302685300340.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685311470.660, "dur": 0.800, + "args": { + "External id": 125601, "device": 3, "context": 1, "stream": 7, "correlation": 241671552, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241671552, "pid": 3, "tid": 7, "ts": 6302685311470.660, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685300343.335, "dur": 11.210, + "args": { + "External id": 125601, "cbid": 51, "correlation": 241671552 + } + }, + { + "ph": "s", "id": 241671552, "pid": 5717, "tid": 5717, "ts": 6302685300343.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685311472.612, "dur": 685.253, + "args": { + "External id": 125601, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671553, "registers per thread": 94, "shared memory": 49152, "blocks per 
SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671553, "pid": 3, "tid": 7, "ts": 6302685311472.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300354.795, "dur": 8.590, + "args": { + "External id": 125601, "cbid": 307, "correlation": 241671553 + } + }, + { + "ph": "s", "id": 241671553, "pid": 5717, "tid": 5717, "ts": 6302685300354.795, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685312158.537, "dur": 2.912, + "args": { + "External id": 125604, "device": 3, "context": 1, "stream": 7, "correlation": 241671558, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241671558, "pid": 3, "tid": 7, "ts": 6302685312158.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685300386.634, "dur": 11.271, + "args": { + "External id": 125604, "cbid": 41, "correlation": 241671558 + } + }, + { + "ph": "s", "id": 241671558, "pid": 5717, "tid": 5717, "ts": 6302685300386.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685300434.665, "dur": 0.380, + "args": { + "External id": 125609, "cbid": 200, "correlation": 241671586 + } + }, + { + "ph": "f", "id": 241671586, "pid": 5717, "tid": 5717, "ts": 6302685300434.665, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685312162.089, "dur": 687.109, + "args": { + "External id": 125609, "queued": 0, 
"device": 3, "context": 1, "stream": 7, "correlation": 241671589, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671589, "pid": 3, "tid": 7, "ts": 6302685312162.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300436.414, "dur": 6.551, + "args": { + "External id": 125609, "cbid": 307, "correlation": 241671589 + } + }, + { + "ph": "s", "id": 241671589, "pid": 5717, "tid": 5717, "ts": 6302685300436.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685312849.903, "dur": 221.153, + "args": { + "External id": 125610, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671594, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241671594, "pid": 3, "tid": 7, "ts": 6302685312849.903, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300455.625, "dur": 5.380, + "args": { + "External id": 125610, "cbid": 211, "correlation": 241671594 + } + }, + { + "ph": "s", "id": 241671594, "pid": 5717, "tid": 5717, "ts": 6302685300455.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685300506.564, "dur": 1.270, + "args": { + "External id": 125618, "cbid": 210, "correlation": 241671620 + } + }, + { + "ph": "f", "id": 241671620, "pid": 5717, "tid": 5717, "ts": 6302685300506.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685313071.632, "dur": 630.661, + "args": { + "External id": 125618, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671621, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671621, "pid": 3, "tid": 7, "ts": 6302685313071.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300510.924, "dur": 7.420, + "args": { + "External id": 125618, "cbid": 211, "correlation": 241671621 + } + }, + { + "ph": "s", "id": 241671621, "pid": 5717, "tid": 5717, "ts": 6302685300510.924, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685313702.933, "dur": 170.625, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671640, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671640, "pid": 3, "tid": 7, "ts": 6302685313702.933, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300612.614, "dur": 7.940, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671640 + } + }, + { + "ph": "s", "id": 241671640, "pid": 5717, "tid": 5717, "ts": 6302685300612.614, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685313874.262, "dur": 4.000, + "args": { + "External id": 125628, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671657, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671657, "pid": 3, "tid": 7, "ts": 6302685313874.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300652.394, "dur": 6.490, + "args": { + "External id": 125628, "cbid": 211, "correlation": 241671657 + } + }, + { + "ph": "s", "id": 241671657, "pid": 5717, "tid": 5717, "ts": 6302685300652.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313878.902, "dur": 1.184, + "args": { + "External id": 125633, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671674, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671674, "pid": 3, "tid": 7, "ts": 6302685313878.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300679.774, "dur": 4.690, + "args": { + "External id": 125633, "cbid": 211, "correlation": 241671674 + } + }, + { + "ph": "s", "id": 241671674, "pid": 5717, "tid": 5717, "ts": 6302685300679.774, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313880.758, "dur": 0.992, + "args": { + "External id": 125635, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671684, "pid": 3, "tid": 7, "ts": 6302685313880.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300698.704, "dur": 7.120, + "args": { + "External id": 125635, "cbid": 211, "correlation": 241671684 + } + }, + { + "ph": "s", "id": 241671684, "pid": 5717, "tid": 5717, "ts": 6302685300698.704, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313882.454, "dur": 1.024, + "args": { + "External id": 125636, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671690, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671690, "pid": 3, "tid": 7, "ts": 6302685313882.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300717.114, "dur": 5.600, + "args": { + "External id": 125636, "cbid": 211, "correlation": 241671690 + } + }, + { + "ph": "s", "id": 241671690, "pid": 5717, "tid": 5717, "ts": 6302685300717.114, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313884.182, "dur": 0.992, + "args": { + "External id": 125637, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671700, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671700, "pid": 3, "tid": 7, "ts": 6302685313884.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300732.574, "dur": 4.190, + "args": { + "External id": 125637, "cbid": 211, "correlation": 241671700 + } + }, + { + "ph": "s", "id": 241671700, "pid": 5717, "tid": 5717, "ts": 6302685300732.574, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313885.878, "dur": 0.992, + "args": { + "External id": 125638, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671706, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671706, "pid": 3, "tid": 7, "ts": 6302685313885.878, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300742.884, "dur": 3.850, + "args": { + "External id": 125638, "cbid": 211, "correlation": 241671706 + } + }, + { + "ph": "s", "id": 241671706, "pid": 5717, "tid": 5717, "ts": 6302685300742.884, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685313887.510, "dur": 3.264, + "args": { + "External id": 125639, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671719, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671719, "pid": 3, "tid": 7, "ts": 6302685313887.510, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300761.094, "dur": 4.730, + "args": { + "External id": 125639, "cbid": 211, "correlation": 241671719 + } + }, + { + "ph": "s", "id": 241671719, "pid": 5717, "tid": 5717, "ts": 6302685300761.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313891.446, "dur": 1.344, + "args": { + "External id": 125642, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671725, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671725, "pid": 3, "tid": 7, "ts": 6302685313891.446, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300775.274, "dur": 6.770, + "args": { + "External id": 125642, "cbid": 211, "correlation": 241671725 + } + }, + { + "ph": "s", "id": 241671725, "pid": 5717, "tid": 5717, "ts": 6302685300775.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685313893.431, "dur": 0.992, + "args": { + "External id": 125643, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671731, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671731, "pid": 3, "tid": 7, "ts": 6302685313893.431, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300788.974, "dur": 4.890, + "args": { + "External id": 125643, "cbid": 211, "correlation": 241671731 + } + }, + { + "ph": "s", "id": 241671731, "pid": 5717, "tid": 5717, "ts": 6302685300788.974, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685313895.031, "dur": 233.185, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671745, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241671745, "pid": 3, "tid": 7, "ts": 6302685313895.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300881.424, "dur": 7.629, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671745 + } + }, + { + "ph": "s", "id": 241671745, "pid": 5717, "tid": 5717, "ts": 6302685300881.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685300918.523, "dur": 0.530, + "args": { + "External id": 125647, "cbid": 200, "correlation": 241671768 + } + }, + { + "ph": "f", "id": 241671768, "pid": 5717, "tid": 5717, "ts": 6302685300918.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685314129.080, "dur": 0.800, + "args": { + "External id": 125647, "device": 3, "context": 1, "stream": 7, "correlation": 241671771, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241671771, "pid": 3, "tid": 7, "ts": 6302685314129.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685300920.673, "dur": 6.080, + "args": { + "External id": 125647, "cbid": 51, "correlation": 241671771 + } + }, + { + "ph": "s", "id": 241671771, "pid": 5717, "tid": 5717, "ts": 6302685300920.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685314131.064, "dur": 682.981, + "args": { + "External id": 125647, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671772, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671772, "pid": 3, "tid": 7, "ts": 6302685314131.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685300926.973, "dur": 5.490, + "args": { + "External id": 125647, "cbid": 307, "correlation": 241671772 + } + }, + { + "ph": "s", "id": 241671772, "pid": 5717, "tid": 5717, "ts": 6302685300926.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685314814.749, "dur": 2.848, + "args": { + "External id": 125650, "device": 3, "context": 1, "stream": 7, "correlation": 241671777, "bytes": 3145728, "memory bandwidth (GB/s)": 1104.5393258426966 + } + }, + { + "ph": "f", "id": 241671777, "pid": 3, "tid": 7, "ts": 6302685314814.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685300954.783, "dur": 10.920, + "args": { + "External id": 125650, "cbid": 41, "correlation": 241671777 + } + }, + { + "ph": "s", "id": 241671777, "pid": 5717, "tid": 5717, "ts": 6302685300954.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685301001.323, "dur": 0.380, + "args": { + "External id": 125655, "cbid": 200, "correlation": 241671805 + } + }, + { + "ph": "f", "id": 241671805, "pid": 5717, "tid": 5717, "ts": 6302685301001.323, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685314818.301, "dur": 688.069, + "args": { + "External id": 125655, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241671808, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671808, "pid": 3, "tid": 7, "ts": 6302685314818.301, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301003.063, "dur": 6.550, + "args": { + "External id": 125655, "cbid": 307, "correlation": 241671808 + } + }, + { + "ph": "s", "id": 241671808, "pid": 5717, "tid": 5717, "ts": 6302685301003.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685315507.042, "dur": 221.986, + "args": { + "External id": 125656, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671813, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241671813, "pid": 3, "tid": 7, "ts": 6302685315507.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301021.993, "dur": 5.230, + "args": { + "External id": 125656, "cbid": 211, "correlation": 241671813 + } + }, + { + "ph": "s", "id": 241671813, "pid": 5717, "tid": 5717, "ts": 6302685301021.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685301066.363, "dur": 1.190, + "args": { + "External id": 125664, "cbid": 210, "correlation": 241671839 + } + }, + { + "ph": "f", "id": 241671839, "pid": 5717, "tid": 5717, "ts": 6302685301066.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685315729.668, "dur": 631.685, + "args": { + "External id": 125664, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671840, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671840, "pid": 3, "tid": 7, "ts": 6302685315729.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301070.563, "dur": 6.870, + "args": { + "External id": 125664, "cbid": 211, "correlation": 241671840 + } + }, + { + "ph": "s", "id": 241671840, "pid": 5717, "tid": 5717, "ts": 6302685301070.563, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685316361.993, "dur": 170.465, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671859, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671859, "pid": 3, "tid": 7, "ts": 6302685316361.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301169.143, "dur": 7.930, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671859 + } + }, + { + "ph": "s", "id": 241671859, "pid": 5717, "tid": 5717, "ts": 6302685301169.143, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685316533.194, "dur": 3.968, + "args": { + "External id": 125674, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671876, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671876, "pid": 3, "tid": 7, "ts": 6302685316533.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301207.753, "dur": 6.430, + "args": { + "External id": 125674, "cbid": 211, "correlation": 241671876 + } + }, + { + "ph": "s", "id": 241671876, "pid": 5717, "tid": 5717, "ts": 6302685301207.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316537.834, "dur": 1.184, + "args": { + "External id": 125679, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671893, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671893, "pid": 3, "tid": 7, "ts": 6302685316537.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301236.343, "dur": 4.550, + "args": { + "External id": 125679, "cbid": 211, "correlation": 241671893 + } + }, + { + "ph": "s", "id": 241671893, "pid": 5717, "tid": 5717, "ts": 6302685301236.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316539.690, "dur": 0.992, + "args": { + "External id": 125681, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671903, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671903, "pid": 3, "tid": 7, "ts": 6302685316539.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301253.323, "dur": 4.469, + "args": { + "External id": 125681, "cbid": 211, "correlation": 241671903 + } + }, + { + "ph": "s", "id": 241671903, "pid": 5717, "tid": 5717, "ts": 6302685301253.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316541.258, "dur": 1.024, + "args": { + "External id": 125682, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671909, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671909, "pid": 3, "tid": 7, "ts": 6302685316541.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301263.972, "dur": 4.031, + "args": { + "External id": 125682, "cbid": 211, "correlation": 241671909 + } + }, + { + "ph": "s", "id": 241671909, "pid": 5717, "tid": 5717, "ts": 6302685301263.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316542.986, "dur": 0.992, + "args": { + "External id": 125683, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671919, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671919, "pid": 3, "tid": 7, "ts": 6302685316542.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301277.943, "dur": 4.400, + "args": { + "External id": 125683, "cbid": 211, "correlation": 241671919 + } + }, + { + "ph": "s", "id": 241671919, "pid": 5717, "tid": 5717, "ts": 6302685301277.943, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316544.682, "dur": 0.992, + "args": { + "External id": 125684, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671925, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671925, "pid": 3, "tid": 7, "ts": 6302685316544.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301287.743, "dur": 4.129, + "args": { + "External id": 125684, "cbid": 211, "correlation": 241671925 + } + }, + { + "ph": "s", "id": 241671925, "pid": 5717, "tid": 5717, "ts": 6302685301287.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685316546.282, "dur": 3.232, + "args": { + "External id": 125685, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671938, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671938, "pid": 3, "tid": 7, "ts": 6302685316546.282, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301314.292, "dur": 5.111, + "args": { + "External id": 125685, "cbid": 211, "correlation": 241671938 + } + }, + { + "ph": "s", "id": 241671938, "pid": 5717, "tid": 5717, "ts": 6302685301314.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316550.122, "dur": 1.216, + "args": { + "External id": 125688, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671944, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671944, "pid": 3, "tid": 7, "ts": 6302685316550.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301326.783, "dur": 3.800, + "args": { + "External id": 125688, "cbid": 211, "correlation": 241671944 + } + }, + { + "ph": "s", "id": 241671944, "pid": 5717, "tid": 5717, "ts": 6302685301326.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685316551.946, "dur": 0.992, + "args": { + "External id": 125689, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671950, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241671950, "pid": 3, "tid": 7, "ts": 6302685316551.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301335.583, "dur": 3.500, + "args": { + "External id": 125689, "cbid": 211, "correlation": 241671950 + } + }, + { + "ph": "s", "id": 241671950, "pid": 5717, "tid": 5717, "ts": 6302685301335.583, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685316553.546, "dur": 233.570, + "args": { + "External id": 125305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671964, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241671964, "pid": 3, "tid": 7, "ts": 6302685316553.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301414.312, "dur": 7.260, + "args": { + "External id": 125305, "cbid": 307, "correlation": 241671964 + } + }, + { + "ph": "s", "id": 241671964, "pid": 5717, "tid": 5717, "ts": 6302685301414.312, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685301450.492, "dur": 0.510, + "args": { + "External id": 125693, "cbid": 200, "correlation": 241671987 + } + }, + { + "ph": "f", "id": 241671987, "pid": 5717, "tid": 5717, "ts": 6302685301450.492, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685316788.108, "dur": 0.800, + "args": { + "External id": 125693, "device": 3, "context": 1, "stream": 7, "correlation": 241671990, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241671990, "pid": 3, "tid": 7, "ts": 6302685316788.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685301452.592, "dur": 6.220, + "args": { + "External id": 125693, "cbid": 51, "correlation": 241671990 + } + }, + { + "ph": "s", "id": 241671990, "pid": 5717, "tid": 5717, "ts": 6302685301452.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685316789.676, "dur": 688.838, + "args": { + "External id": 125693, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241671991, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241671991, "pid": 3, "tid": 7, "ts": 6302685316789.676, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301459.042, "dur": 5.380, + "args": { + "External id": 125693, "cbid": 307, "correlation": 241671991 + } + }, + { + "ph": "s", "id": 241671991, "pid": 5717, "tid": 5717, "ts": 6302685301459.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685317479.154, "dur": 2.912, + "args": { + "External id": 125696, "device": 3, "context": 1, "stream": 7, "correlation": 241671996, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241671996, "pid": 3, "tid": 7, "ts": 6302685317479.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685301486.532, "dur": 13.320, + "args": { + "External id": 125696, "cbid": 41, "correlation": 241671996 + } + }, + { + "ph": "s", "id": 241671996, "pid": 5717, "tid": 5717, "ts": 6302685301486.532, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685301534.482, "dur": 0.370, + "args": { + "External id": 125701, "cbid": 200, "correlation": 241672024 + } + }, + { + "ph": "f", "id": 241672024, "pid": 5717, "tid": 5717, "ts": 6302685301534.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685317482.706, "dur": 688.645, + "args": { + "External id": 125701, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241672027, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672027, "pid": 3, "tid": 7, "ts": 6302685317482.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301536.272, "dur": 6.540, + "args": { + "External id": 125701, "cbid": 307, "correlation": 241672027 + } + }, + { + "ph": "s", "id": 241672027, "pid": 5717, "tid": 5717, "ts": 6302685301536.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685318171.959, "dur": 221.089, + "args": { + "External id": 125702, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672032, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672032, "pid": 3, "tid": 7, "ts": 6302685318171.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301556.482, "dur": 5.410, + "args": { + "External id": 125702, "cbid": 211, "correlation": 241672032 + } + }, + { + "ph": "s", "id": 241672032, "pid": 5717, "tid": 5717, "ts": 6302685301556.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685318393.720, "dur": 5.312, + "args": { + "External id": 125704, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672045, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672045, "pid": 3, "tid": 7, "ts": 6302685318393.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301582.152, "dur": 6.610, + "args": { + "External id": 125704, "cbid": 211, "correlation": 241672045 + } + }, + { + "ph": "s", "id": 241672045, "pid": 5717, "tid": 5717, "ts": 6302685301582.152, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318399.640, "dur": 160.097, + "args": { + "External id": 125709, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672058, "pid": 3, "tid": 7, "ts": 6302685318399.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301609.982, "dur": 5.690, + "args": { + "External id": 125709, "cbid": 211, "correlation": 241672058 + } + }, + { + "ph": "s", "id": 241672058, "pid": 5717, "tid": 5717, "ts": 6302685301609.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318560.313, "dur": 1.536, + "args": { + "External id": 125714, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672066, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672066, "pid": 3, "tid": 7, "ts": 6302685318560.313, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301668.082, "dur": 7.210, + "args": { + "External id": 125714, "cbid": 211, "correlation": 241672066 + } + }, + { + "ph": "s", "id": 241672066, "pid": 5717, "tid": 5717, "ts": 6302685301668.082, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318562.457, "dur": 1.280, + "args": { + "External id": 125715, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672072, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672072, "pid": 3, "tid": 7, "ts": 6302685318562.457, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301685.812, "dur": 4.410, + "args": { + "External id": 125715, "cbid": 211, "correlation": 241672072 + } + }, + { + "ph": "s", "id": 241672072, "pid": 5717, "tid": 5717, "ts": 6302685301685.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685318564.473, "dur": 2.272, + "args": { + "External id": 125734, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672092, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 241672092, "pid": 3, "tid": 7, "ts": 6302685318564.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301763.382, "dur": 7.900, + "args": { + "External id": 125734, "cbid": 211, "correlation": 241672092 + } + }, + { + "ph": "s", "id": 241672092, "pid": 5717, "tid": 5717, "ts": 6302685301763.382, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685318567.450, "dur": 58.400, + "args": { + "External id": 125742, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672110, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672110, "pid": 3, "tid": 7, "ts": 6302685318567.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301852.791, "dur": 8.910, + "args": { + "External id": 125742, "cbid": 211, "correlation": 241672110 + } + }, + { + "ph": "s", "id": 241672110, "pid": 5717, "tid": 5717, "ts": 6302685301852.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318626.426, "dur": 15.808, + "args": { + "External id": 125747, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672127, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672127, "pid": 3, "tid": 7, "ts": 6302685318626.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301893.001, "dur": 6.620, + "args": { + "External id": 125747, "cbid": 211, "correlation": 241672127 + } + }, + { + "ph": "s", "id": 241672127, "pid": 5717, "tid": 5717, "ts": 6302685301893.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318642.842, "dur": 100.577, + "args": { + "External id": 125752, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672143, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672143, "pid": 3, "tid": 7, "ts": 6302685318642.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301915.251, "dur": 4.580, + "args": { + "External id": 125752, "cbid": 211, "correlation": 241672143 + } + }, + { + "ph": "s", "id": 241672143, "pid": 5717, "tid": 5717, "ts": 6302685301915.251, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318744.123, "dur": 2.016, + "args": { + "External id": 125756, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672159, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241672159, "pid": 3, "tid": 7, "ts": 6302685318744.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301937.731, "dur": 4.780, + "args": { + "External id": 125756, "cbid": 211, "correlation": 241672159 + } + }, + { + "ph": "s", "id": 241672159, "pid": 5717, "tid": 5717, "ts": 6302685301937.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685318746.811, "dur": 1.696, + "args": { + "External id": 125757, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672171, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241672171, "pid": 3, "tid": 7, "ts": 6302685318746.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301960.761, "dur": 5.060, + "args": { + "External id": 125757, "cbid": 211, "correlation": 241672171 + } + }, + { + "ph": "s", "id": 241672171, "pid": 5717, "tid": 5717, "ts": 6302685301960.761, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685318749.243, "dur": 1.984, + "args": { + "External id": 125764, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672189, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241672189, "pid": 3, "tid": 7, "ts": 6302685318749.243, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685301991.751, "dur": 6.290, + "args": { + "External id": 125764, "cbid": 211, "correlation": 241672189 + } + }, + { + "ph": "s", "id": 241672189, "pid": 5717, "tid": 5717, "ts": 6302685301991.751, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 3, "tid": 7, + "ts": 6302685318751.835, "dur": 3.776, + "args": { + "External id": 125759, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672198, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672198, "pid": 3, "tid": 7, "ts": 6302685318751.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685302004.721, "dur": 4.180, + "args": { + "External id": 125759, "cbid": 211, "correlation": 241672198 + } + }, + { + "ph": "s", "id": 241672198, "pid": 5717, "tid": 5717, "ts": 6302685302004.721, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685318757.787, "dur": 0.960, + "args": { + "External id": 125766, "device": 3, "context": 1, "stream": 7, "correlation": 241672204, "bytes": 8, "memory bandwidth (GB/s)": 0.008333333333333333 + } + }, + { + "ph": "f", "id": 241672204, "pid": 3, "tid": 7, "ts": 6302685318757.787, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685302021.121, "dur": 9.240, + "args": { + "External id": 125766, "cbid": 41, "correlation": 241672204 + } + }, + { + "ph": "s", "id": 241672204, "pid": 5717, "tid": 5717, "ts": 6302685302021.121, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685302030.671, "dur": 16732.212, + "args": { + "External id": 125766, "cbid": 131, "correlation": 241672205 + } + }, + { + "ph": "s", "id": 241672205, "pid": 5717, "tid": 5717, "ts": 6302685302030.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685318828.263, "dur": 3.050, + "args": { + "External id": 125774, "cbid": 210, "correlation": 241672230 + } + }, + { + "ph": "f", "id": 241672230, "pid": 5717, "tid": 5717, "ts": 6302685318828.263, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685318847.420, "dur": 626.628, + "args": { + "External id": 125774, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672231, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672231, "pid": 3, "tid": 7, "ts": 6302685318847.420, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685318836.203, "dur": 10.730, + "args": { + "External id": 125774, "cbid": 211, "correlation": 241672231 + } + }, + { + "ph": "s", "id": 241672231, "pid": 5717, "tid": 5717, "ts": 6302685318836.203, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685319474.657, "dur": 170.945, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672250, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672250, "pid": 3, "tid": 7, "ts": 6302685319474.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685318959.552, "dur": 9.191, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672250 + } + }, + { + "ph": "s", "id": 241672250, "pid": 5717, "tid": 5717, "ts": 6302685318959.552, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685319646.274, "dur": 4.031, + "args": { + "External id": 125784, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672267, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672267, "pid": 3, "tid": 7, "ts": 6302685319646.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319006.503, "dur": 7.949, + "args": { + "External id": 125784, "cbid": 211, "correlation": 241672267 + } + }, + { + "ph": "s", "id": 241672267, "pid": 5717, "tid": 5717, "ts": 6302685319006.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319650.913, "dur": 1.216, + "args": { + "External id": 125789, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672284, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672284, "pid": 3, "tid": 7, "ts": 6302685319650.913, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319047.662, "dur": 6.280, + "args": { + "External id": 125789, "cbid": 211, "correlation": 241672284 + } + }, + { + "ph": "s", "id": 241672284, "pid": 5717, "tid": 5717, "ts": 6302685319047.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319652.705, "dur": 0.960, + "args": { + "External id": 125791, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672294, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672294, "pid": 3, "tid": 7, "ts": 6302685319652.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319069.732, "dur": 5.290, + "args": { + "External id": 125791, "cbid": 211, "correlation": 241672294 + } + }, + { + "ph": "s", "id": 241672294, "pid": 5717, "tid": 5717, "ts": 6302685319069.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319654.273, "dur": 1.024, + "args": { + "External id": 125792, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672300, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672300, "pid": 3, "tid": 7, "ts": 6302685319654.273, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319082.612, "dur": 7.730, + "args": { + "External id": 125792, "cbid": 211, "correlation": 241672300 + } + }, + { + "ph": "s", "id": 241672300, "pid": 5717, "tid": 5717, "ts": 6302685319082.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319656.002, "dur": 0.992, + "args": { + "External id": 125793, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672310, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672310, "pid": 3, "tid": 7, "ts": 6302685319656.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319108.122, "dur": 7.870, + "args": { + "External id": 125793, "cbid": 211, "correlation": 241672310 + } + }, + { + "ph": "s", "id": 241672310, "pid": 5717, "tid": 5717, "ts": 6302685319108.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319657.698, "dur": 1.024, + "args": { + "External id": 125794, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672316, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672316, "pid": 3, "tid": 7, "ts": 6302685319657.698, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319125.962, "dur": 7.110, + "args": { + "External id": 125794, "cbid": 211, "correlation": 241672316 + } + }, + { + "ph": "s", "id": 241672316, "pid": 5717, "tid": 5717, "ts": 6302685319125.962, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685319659.298, "dur": 3.232, + "args": { + "External id": 125795, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672329, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672329, "pid": 3, "tid": 7, "ts": 6302685319659.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319154.672, "dur": 5.440, + "args": { + "External id": 125795, "cbid": 211, "correlation": 241672329 + } + }, + { + "ph": "s", "id": 241672329, "pid": 5717, "tid": 5717, "ts": 6302685319154.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319663.138, "dur": 1.184, + "args": { + "External id": 125798, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672335, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672335, "pid": 3, "tid": 7, "ts": 6302685319663.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319167.752, "dur": 7.030, + "args": { + "External id": 125798, "cbid": 211, "correlation": 241672335 + } + }, + { + "ph": "s", "id": 241672335, "pid": 5717, "tid": 5717, "ts": 6302685319167.752, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685319664.962, "dur": 0.992, + "args": { + "External id": 125799, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672341, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672341, "pid": 3, "tid": 7, "ts": 6302685319664.962, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319181.502, "dur": 4.970, + "args": { + "External id": 125799, "cbid": 211, "correlation": 241672341 + } + }, + { + "ph": "s", "id": 241672341, "pid": 5717, "tid": 5717, "ts": 6302685319181.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685319666.530, "dur": 233.858, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672355, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241672355, "pid": 3, "tid": 7, "ts": 6302685319666.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319282.422, "dur": 8.710, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672355 + } + }, + { + "ph": "s", "id": 241672355, "pid": 5717, "tid": 5717, "ts": 6302685319282.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685319334.102, "dur": 0.590, + "args": { + "External id": 125803, "cbid": 200, "correlation": 241672378 + } + }, + { + "ph": "f", "id": 241672378, "pid": 5717, "tid": 5717, "ts": 6302685319334.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685319901.220, "dur": 0.800, + "args": { + "External id": 125803, "device": 3, "context": 1, "stream": 7, "correlation": 241672381, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241672381, "pid": 3, "tid": 7, "ts": 6302685319901.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685319336.542, "dur": 7.010, + "args": { + "External id": 125803, "cbid": 51, "correlation": 241672381 + } + }, + { + "ph": "s", "id": 241672381, "pid": 5717, "tid": 5717, "ts": 6302685319336.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685319903.172, "dur": 679.621, + "args": { + "External id": 125803, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672382, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672382, "pid": 3, "tid": 7, "ts": 6302685319903.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319343.802, "dur": 8.970, + "args": { + "External id": 125803, "cbid": 307, "correlation": 241672382 + } + }, + { + "ph": "s", "id": 241672382, "pid": 5717, "tid": 5717, "ts": 6302685319343.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685320583.497, "dur": 2.944, + "args": { + "External id": 125806, "device": 3, "context": 1, "stream": 7, "correlation": 241672387, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 241672387, "pid": 3, "tid": 7, "ts": 6302685320583.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685319383.611, "dur": 14.260, + "args": { + "External id": 125806, "cbid": 41, "correlation": 241672387 + } + }, + { + "ph": "s", "id": 241672387, "pid": 5717, "tid": 5717, "ts": 6302685319383.611, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685319446.931, "dur": 0.480, + "args": { + "External id": 125811, "cbid": 200, "correlation": 241672415 + } + }, + { + "ph": "f", "id": 241672415, "pid": 5717, "tid": 5717, "ts": 6302685319446.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685320587.081, "dur": 686.469, + "args": { + "External id": 125811, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241672418, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672418, "pid": 3, "tid": 7, "ts": 6302685320587.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319450.331, "dur": 8.030, + "args": { + "External id": 125811, "cbid": 307, "correlation": 241672418 + } + }, + { + "ph": "s", "id": 241672418, "pid": 5717, "tid": 5717, "ts": 6302685319450.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685321274.190, "dur": 221.250, + "args": { + "External id": 125812, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672423, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672423, "pid": 3, "tid": 7, "ts": 6302685321274.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319476.511, "dur": 7.060, + "args": { + "External id": 125812, "cbid": 211, "correlation": 241672423 + } + }, + { + "ph": "s", "id": 241672423, "pid": 5717, "tid": 5717, "ts": 6302685319476.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685319545.261, "dur": 1.430, + "args": { + "External id": 125820, "cbid": 210, "correlation": 241672449 + } + }, + { + "ph": "f", "id": 241672449, "pid": 5717, "tid": 5717, "ts": 6302685319545.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685321496.048, "dur": 627.748, + "args": { + "External id": 125820, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672450, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672450, "pid": 3, "tid": 7, "ts": 6302685321496.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319550.231, "dur": 8.210, + "args": { + "External id": 125820, "cbid": 211, "correlation": 241672450 + } + }, + { + "ph": "s", "id": 241672450, "pid": 5717, "tid": 5717, "ts": 6302685319550.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685322124.372, "dur": 171.138, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672469, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672469, "pid": 3, "tid": 7, "ts": 6302685322124.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319669.431, "dur": 9.330, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672469 + } + }, + { + "ph": "s", "id": 241672469, "pid": 5717, "tid": 5717, "ts": 6302685319669.431, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685322296.150, "dur": 3.968, + "args": { + "External id": 125830, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672486, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672486, "pid": 3, "tid": 7, "ts": 6302685322296.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319713.661, "dur": 7.490, + "args": { + "External id": 125830, "cbid": 211, "correlation": 241672486 + } + }, + { + "ph": "s", "id": 241672486, "pid": 5717, "tid": 5717, "ts": 6302685319713.661, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322300.790, "dur": 1.184, + "args": { + "External id": 125835, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672503, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672503, "pid": 3, "tid": 7, "ts": 6302685322300.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319746.641, "dur": 5.380, + "args": { + "External id": 125835, "cbid": 211, "correlation": 241672503 + } + }, + { + "ph": "s", "id": 241672503, "pid": 5717, "tid": 5717, "ts": 6302685319746.641, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322302.614, "dur": 0.992, + "args": { + "External id": 125837, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672513, "pid": 3, "tid": 7, "ts": 6302685322302.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319766.991, "dur": 4.870, + "args": { + "External id": 125837, "cbid": 211, "correlation": 241672513 + } + }, + { + "ph": "s", "id": 241672513, "pid": 5717, "tid": 5717, "ts": 6302685319766.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322304.214, "dur": 0.992, + "args": { + "External id": 125838, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672519, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672519, "pid": 3, "tid": 7, "ts": 6302685322304.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319779.121, "dur": 4.480, + "args": { + "External id": 125838, "cbid": 211, "correlation": 241672519 + } + }, + { + "ph": "s", "id": 241672519, "pid": 5717, "tid": 5717, "ts": 6302685319779.121, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322305.910, "dur": 0.992, + "args": { + "External id": 125839, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672529, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672529, "pid": 3, "tid": 7, "ts": 6302685322305.910, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319794.801, "dur": 4.569, + "args": { + "External id": 125839, "cbid": 211, "correlation": 241672529 + } + }, + { + "ph": "s", "id": 241672529, "pid": 5717, "tid": 5717, "ts": 6302685319794.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322307.606, "dur": 1.024, + "args": { + "External id": 125840, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672535, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672535, "pid": 3, "tid": 7, "ts": 6302685322307.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319805.110, "dur": 4.311, + "args": { + "External id": 125840, "cbid": 211, "correlation": 241672535 + } + }, + { + "ph": "s", "id": 241672535, "pid": 5717, "tid": 5717, "ts": 6302685319805.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685322309.206, "dur": 3.232, + "args": { + "External id": 125841, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672548, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672548, "pid": 3, "tid": 7, "ts": 6302685322309.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319825.910, "dur": 5.111, + "args": { + "External id": 125841, "cbid": 211, "correlation": 241672548 + } + }, + { + "ph": "s", "id": 241672548, "pid": 5717, "tid": 5717, "ts": 6302685319825.910, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322313.014, "dur": 1.216, + "args": { + "External id": 125844, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672554, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672554, "pid": 3, "tid": 7, "ts": 6302685322313.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319837.441, "dur": 4.369, + "args": { + "External id": 125844, "cbid": 211, "correlation": 241672554 + } + }, + { + "ph": "s", "id": 241672554, "pid": 5717, "tid": 5717, "ts": 6302685319837.441, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685322314.870, "dur": 0.992, + "args": { + "External id": 125845, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672560, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672560, "pid": 3, "tid": 7, "ts": 6302685322314.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319847.081, "dur": 3.929, + "args": { + "External id": 125845, "cbid": 211, "correlation": 241672560 + } + }, + { + "ph": "s", "id": 241672560, "pid": 5717, "tid": 5717, "ts": 6302685319847.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685322316.438, "dur": 234.657, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672574, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241672574, "pid": 3, "tid": 7, "ts": 6302685322316.438, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319937.350, "dur": 8.680, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672574 + } + }, + { + "ph": "s", "id": 241672574, "pid": 5717, "tid": 5717, "ts": 6302685319937.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685319979.720, "dur": 0.600, + "args": { + "External id": 125849, "cbid": 200, "correlation": 241672597 + } + }, + { + "ph": "f", "id": 241672597, "pid": 5717, "tid": 5717, "ts": 6302685319979.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685322551.927, "dur": 0.768, + "args": { + "External id": 125849, "device": 3, "context": 1, "stream": 7, "correlation": 241672600, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241672600, "pid": 3, "tid": 7, "ts": 6302685322551.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685319982.100, "dur": 8.170, + "args": { + "External id": 125849, "cbid": 51, "correlation": 241672600 + } + }, + { + "ph": "s", "id": 241672600, "pid": 5717, "tid": 5717, "ts": 6302685319982.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685322553.847, "dur": 684.485, + "args": { + "External id": 125849, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672601, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672601, "pid": 3, "tid": 7, "ts": 6302685322553.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685319990.510, "dur": 5.960, + "args": { + "External id": 125849, "cbid": 307, "correlation": 241672601 + } + }, + { + "ph": "s", "id": 241672601, "pid": 5717, "tid": 5717, "ts": 6302685319990.510, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685323238.972, "dur": 2.913, + "args": { + "External id": 125852, "device": 3, "context": 1, "stream": 7, "correlation": 241672606, "bytes": 3145728, "memory bandwidth (GB/s)": 1079.8928939237899 + } + }, + { + "ph": "f", "id": 241672606, "pid": 3, "tid": 7, "ts": 6302685323238.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685320021.980, "dur": 12.180, + "args": { + "External id": 125852, "cbid": 41, "correlation": 241672606 + } + }, + { + "ph": "s", "id": 241672606, "pid": 5717, "tid": 5717, "ts": 6302685320021.980, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685320075.660, "dur": 0.420, + "args": { + "External id": 125857, "cbid": 200, "correlation": 241672634 + } + }, + { + "ph": "f", "id": 241672634, "pid": 5717, "tid": 5717, "ts": 6302685320075.660, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685323242.557, "dur": 683.077, + "args": { + "External id": 125857, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241672637, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672637, "pid": 3, "tid": 7, "ts": 6302685323242.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320077.680, "dur": 7.430, + "args": { + "External id": 125857, "cbid": 307, "correlation": 241672637 + } + }, + { + "ph": "s", "id": 241672637, "pid": 5717, "tid": 5717, "ts": 6302685320077.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685323926.306, "dur": 220.993, + "args": { + "External id": 125858, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672642, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672642, "pid": 3, "tid": 7, "ts": 6302685323926.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320105.280, "dur": 6.230, + "args": { + "External id": 125858, "cbid": 211, "correlation": 241672642 + } + }, + { + "ph": "s", "id": 241672642, "pid": 5717, "tid": 5717, "ts": 6302685320105.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685320157.940, "dur": 1.360, + "args": { + "External id": 125866, "cbid": 210, "correlation": 241672668 + } + }, + { + "ph": "f", "id": 241672668, "pid": 5717, "tid": 5717, "ts": 6302685320157.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685324147.939, "dur": 628.549, + "args": { + "External id": 125866, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672669, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672669, "pid": 3, "tid": 7, "ts": 6302685324147.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320162.880, "dur": 7.440, + "args": { + "External id": 125866, "cbid": 211, "correlation": 241672669 + } + }, + { + "ph": "s", "id": 241672669, "pid": 5717, "tid": 5717, "ts": 6302685320162.880, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685324777.128, "dur": 170.753, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672688, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672688, "pid": 3, "tid": 7, "ts": 6302685324777.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320274.549, "dur": 8.800, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672688 + } + }, + { + "ph": "s", "id": 241672688, "pid": 5717, "tid": 5717, "ts": 6302685320274.549, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685324948.489, "dur": 3.968, + "args": { + "External id": 125876, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672705, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672705, "pid": 3, "tid": 7, "ts": 6302685324948.489, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320327.040, "dur": 7.879, + "args": { + "External id": 125876, "cbid": 211, "correlation": 241672705 + } + }, + { + "ph": "s", "id": 241672705, "pid": 5717, "tid": 5717, "ts": 6302685320327.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324953.129, "dur": 1.184, + "args": { + "External id": 125881, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672722, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672722, "pid": 3, "tid": 7, "ts": 6302685324953.129, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320360.339, "dur": 5.400, + "args": { + "External id": 125881, "cbid": 211, "correlation": 241672722 + } + }, + { + "ph": "s", "id": 241672722, "pid": 5717, "tid": 5717, "ts": 6302685320360.339, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324954.985, "dur": 0.992, + "args": { + "External id": 125883, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672732, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672732, "pid": 3, "tid": 7, "ts": 6302685324954.985, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320380.299, "dur": 4.930, + "args": { + "External id": 125883, "cbid": 211, "correlation": 241672732 + } + }, + { + "ph": "s", "id": 241672732, "pid": 5717, "tid": 5717, "ts": 6302685320380.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324956.585, "dur": 0.992, + "args": { + "External id": 125884, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672738, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672738, "pid": 3, "tid": 7, "ts": 6302685324956.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320392.249, "dur": 4.380, + "args": { + "External id": 125884, "cbid": 211, "correlation": 241672738 + } + }, + { + "ph": "s", "id": 241672738, "pid": 5717, "tid": 5717, "ts": 6302685320392.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324958.281, "dur": 0.992, + "args": { + "External id": 125885, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672748, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672748, "pid": 3, "tid": 7, "ts": 6302685324958.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320406.909, "dur": 4.450, + "args": { + "External id": 125885, "cbid": 211, "correlation": 241672748 + } + }, + { + "ph": "s", "id": 241672748, "pid": 5717, "tid": 5717, "ts": 6302685320406.909, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324960.009, "dur": 0.992, + "args": { + "External id": 125886, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672754, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672754, "pid": 3, "tid": 7, "ts": 6302685324960.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320417.079, "dur": 4.170, + "args": { + "External id": 125886, "cbid": 211, "correlation": 241672754 + } + }, + { + "ph": "s", "id": 241672754, "pid": 5717, "tid": 5717, "ts": 6302685320417.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685324961.609, "dur": 3.201, + "args": { + "External id": 125887, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672767, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672767, "pid": 3, "tid": 7, "ts": 6302685324961.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320437.539, "dur": 5.240, + "args": { + "External id": 125887, "cbid": 211, "correlation": 241672767 + } + }, + { + "ph": "s", "id": 241672767, "pid": 5717, "tid": 5717, "ts": 6302685320437.539, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324965.418, "dur": 1.216, + "args": { + "External id": 125890, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672773, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672773, "pid": 3, "tid": 7, "ts": 6302685324965.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320449.169, "dur": 4.390, + "args": { + "External id": 125890, "cbid": 211, "correlation": 241672773 + } + }, + { + "ph": "s", "id": 241672773, "pid": 5717, "tid": 5717, "ts": 6302685320449.169, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685324967.242, "dur": 0.992, + "args": { + "External id": 125891, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672779, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672779, "pid": 3, "tid": 7, "ts": 6302685324967.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320458.679, "dur": 3.950, + "args": { + "External id": 125891, "cbid": 211, "correlation": 241672779 + } + }, + { + "ph": "s", "id": 241672779, "pid": 5717, "tid": 5717, "ts": 6302685320458.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685324968.810, "dur": 234.114, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672793, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241672793, "pid": 3, "tid": 7, "ts": 6302685324968.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320549.579, "dur": 8.410, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672793 + } + }, + { + "ph": "s", "id": 241672793, "pid": 5717, "tid": 5717, "ts": 6302685320549.579, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685320591.159, "dur": 0.580, + "args": { + "External id": 125895, "cbid": 200, "correlation": 241672816 + } + }, + { + "ph": "f", "id": 241672816, "pid": 5717, "tid": 5717, "ts": 6302685320591.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685325203.756, "dur": 0.800, + "args": { + "External id": 125895, "device": 3, "context": 1, "stream": 7, "correlation": 241672819, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241672819, "pid": 3, "tid": 7, "ts": 6302685325203.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685320593.579, "dur": 7.160, + "args": { + "External id": 125895, "cbid": 51, "correlation": 241672819 + } + }, + { + "ph": "s", "id": 241672819, "pid": 5717, "tid": 5717, "ts": 6302685320593.579, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685325205.708, "dur": 689.477, + "args": { + "External id": 125895, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672820, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672820, "pid": 3, "tid": 7, "ts": 6302685325205.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320600.989, "dur": 6.090, + "args": { + "External id": 125895, "cbid": 307, "correlation": 241672820 + } + }, + { + "ph": "s", "id": 241672820, "pid": 5717, "tid": 5717, "ts": 6302685320600.989, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685325895.825, "dur": 2.880, + "args": { + "External id": 125898, "device": 3, "context": 1, "stream": 7, "correlation": 241672825, "bytes": 3145728, "memory bandwidth (GB/s)": 1092.2666666666667 + } + }, + { + "ph": "f", "id": 241672825, "pid": 3, "tid": 7, "ts": 6302685325895.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685320632.439, "dur": 13.860, + "args": { + "External id": 125898, "cbid": 41, "correlation": 241672825 + } + }, + { + "ph": "s", "id": 241672825, "pid": 5717, "tid": 5717, "ts": 6302685320632.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685320687.379, "dur": 0.400, + "args": { + "External id": 125903, "cbid": 200, "correlation": 241672853 + } + }, + { + "ph": "f", "id": 241672853, "pid": 5717, "tid": 5717, "ts": 6302685320687.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685325899.377, "dur": 683.877, + "args": { + "External id": 125903, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241672856, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672856, "pid": 3, "tid": 7, "ts": 6302685325899.377, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320689.328, "dur": 7.540, + "args": { + "External id": 125903, "cbid": 307, "correlation": 241672856 + } + }, + { + "ph": "s", "id": 241672856, "pid": 5717, "tid": 5717, "ts": 6302685320689.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685326583.926, "dur": 221.474, + "args": { + "External id": 125904, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672861, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241672861, "pid": 3, "tid": 7, "ts": 6302685326583.926, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320715.439, "dur": 6.940, + "args": { + "External id": 125904, "cbid": 211, "correlation": 241672861 + } + }, + { + "ph": "s", "id": 241672861, "pid": 5717, "tid": 5717, "ts": 6302685320715.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685320770.859, "dur": 1.369, + "args": { + "External id": 125912, "cbid": 210, "correlation": 241672887 + } + }, + { + "ph": "f", "id": 241672887, "pid": 5717, "tid": 5717, "ts": 6302685320770.859, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685326806.072, "dur": 633.060, + "args": { + "External id": 125912, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672888, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672888, "pid": 3, "tid": 7, "ts": 6302685326806.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320775.699, "dur": 7.669, + "args": { + "External id": 125912, "cbid": 211, "correlation": 241672888 + } + }, + { + "ph": "s", "id": 241672888, "pid": 5717, "tid": 5717, "ts": 6302685320775.699, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685327439.836, "dur": 171.234, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672907, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241672907, "pid": 3, "tid": 7, "ts": 6302685327439.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320899.048, "dur": 9.450, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241672907 + } + }, + { + "ph": "s", "id": 241672907, "pid": 5717, "tid": 5717, "ts": 6302685320899.048, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685327611.710, "dur": 4.000, + "args": { + "External id": 125922, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672924, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672924, "pid": 3, "tid": 7, "ts": 6302685327611.710, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320946.328, "dur": 7.190, + "args": { + "External id": 125922, "cbid": 211, "correlation": 241672924 + } + }, + { + "ph": "s", "id": 241672924, "pid": 5717, "tid": 5717, "ts": 6302685320946.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327616.382, "dur": 1.184, + "args": { + "External id": 125927, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672941, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672941, "pid": 3, "tid": 7, "ts": 6302685327616.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320977.428, "dur": 5.370, + "args": { + "External id": 125927, "cbid": 211, "correlation": 241672941 + } + }, + { + "ph": "s", "id": 241672941, "pid": 5717, "tid": 5717, "ts": 6302685320977.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327618.206, "dur": 0.992, + "args": { + "External id": 125929, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672951, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672951, "pid": 3, "tid": 7, "ts": 6302685327618.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685320997.048, "dur": 4.960, + "args": { + "External id": 125929, "cbid": 211, "correlation": 241672951 + } + }, + { + "ph": "s", "id": 241672951, "pid": 5717, "tid": 5717, "ts": 6302685320997.048, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327619.806, "dur": 0.992, + "args": { + "External id": 125930, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672957, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672957, "pid": 3, "tid": 7, "ts": 6302685327619.806, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321009.058, "dur": 4.630, + "args": { + "External id": 125930, "cbid": 211, "correlation": 241672957 + } + }, + { + "ph": "s", "id": 241672957, "pid": 5717, "tid": 5717, "ts": 6302685321009.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327621.502, "dur": 1.024, + "args": { + "External id": 125931, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672967, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672967, "pid": 3, "tid": 7, "ts": 6302685327621.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321025.418, "dur": 4.460, + "args": { + "External id": 125931, "cbid": 211, "correlation": 241672967 + } + }, + { + "ph": "s", "id": 241672967, "pid": 5717, "tid": 5717, "ts": 6302685321025.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327623.198, "dur": 1.024, + "args": { + "External id": 125932, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672973, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672973, "pid": 3, "tid": 7, "ts": 6302685327623.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321035.698, "dur": 4.420, + "args": { + "External id": 125932, "cbid": 211, "correlation": 241672973 + } + }, + { + "ph": "s", "id": 241672973, "pid": 5717, "tid": 5717, "ts": 6302685321035.698, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685327624.830, "dur": 3.232, + "args": { + "External id": 125933, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672986, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672986, "pid": 3, "tid": 7, "ts": 6302685327624.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321056.658, "dur": 5.040, + "args": { + "External id": 125933, "cbid": 211, "correlation": 241672986 + } + }, + { + "ph": "s", "id": 241672986, "pid": 5717, "tid": 5717, "ts": 6302685321056.658, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327628.766, "dur": 1.216, + "args": { + "External id": 125936, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672992, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672992, "pid": 3, "tid": 7, "ts": 6302685327628.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321068.298, "dur": 4.620, + "args": { + "External id": 125936, "cbid": 211, "correlation": 241672992 + } + }, + { + "ph": "s", "id": 241672992, "pid": 5717, "tid": 5717, "ts": 6302685321068.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685327630.622, "dur": 0.960, + "args": { + "External id": 125937, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241672998, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241672998, "pid": 3, "tid": 7, "ts": 6302685327630.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321079.068, "dur": 3.910, + "args": { + "External id": 125937, "cbid": 211, "correlation": 241672998 + } + }, + { + "ph": "s", "id": 241672998, "pid": 5717, "tid": 5717, "ts": 6302685321079.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685327632.190, "dur": 234.081, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673012, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241673012, "pid": 3, "tid": 7, "ts": 6302685327632.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321179.638, "dur": 8.629, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673012 + } + }, + { + "ph": "s", "id": 241673012, "pid": 5717, "tid": 5717, "ts": 6302685321179.638, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685321221.777, "dur": 0.600, + "args": { + "External id": 125941, "cbid": 200, "correlation": 241673035 + } + }, + { + "ph": "f", "id": 241673035, "pid": 5717, "tid": 5717, "ts": 6302685321221.777, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685327867.103, "dur": 0.800, + "args": { + "External id": 125941, "device": 3, "context": 1, "stream": 7, "correlation": 241673038, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241673038, "pid": 3, "tid": 7, "ts": 6302685327867.103, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685321224.207, "dur": 6.890, + "args": { + "External id": 125941, "cbid": 51, "correlation": 241673038 + } + }, + { + "ph": "s", "id": 241673038, "pid": 5717, "tid": 5717, "ts": 6302685321224.207, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685327869.056, "dur": 687.749, + "args": { + "External id": 125941, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673039, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673039, "pid": 3, "tid": 7, "ts": 6302685327869.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321231.347, "dur": 6.000, + "args": { + "External id": 125941, "cbid": 307, "correlation": 241673039 + } + }, + { + "ph": "s", "id": 241673039, "pid": 5717, "tid": 5717, "ts": 6302685321231.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685328557.477, "dur": 2.976, + "args": { + "External id": 125944, "device": 3, "context": 1, "stream": 7, "correlation": 241673044, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 241673044, "pid": 3, "tid": 7, "ts": 6302685328557.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685321261.787, "dur": 12.460, + "args": { + "External id": 125944, "cbid": 41, "correlation": 241673044 + } + }, + { + "ph": "s", "id": 241673044, "pid": 5717, "tid": 5717, "ts": 6302685321261.787, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685321325.117, "dur": 0.410, + "args": { + "External id": 125949, "cbid": 200, "correlation": 241673072 + } + }, + { + "ph": "f", "id": 241673072, "pid": 5717, "tid": 5717, "ts": 6302685321325.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685328561.061, "dur": 685.893, + "args": { + "External id": 125949, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241673075, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673075, "pid": 3, "tid": 7, "ts": 6302685328561.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321327.177, "dur": 8.230, + "args": { + "External id": 125949, "cbid": 307, "correlation": 241673075 + } + }, + { + "ph": "s", "id": 241673075, "pid": 5717, "tid": 5717, "ts": 6302685321327.177, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685329247.658, "dur": 221.506, + "args": { + "External id": 125950, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673080, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241673080, "pid": 3, "tid": 7, "ts": 6302685329247.658, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321351.117, "dur": 6.380, + "args": { + "External id": 125950, "cbid": 211, "correlation": 241673080 + } + }, + { + "ph": "s", "id": 241673080, "pid": 5717, "tid": 5717, "ts": 6302685321351.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685321418.607, "dur": 1.380, + "args": { + "External id": 125958, "cbid": 210, "correlation": 241673106 + } + }, + { + "ph": "f", "id": 241673106, "pid": 5717, "tid": 5717, "ts": 6302685321418.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685329469.772, "dur": 630.148, + "args": { + "External id": 125958, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673107, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673107, "pid": 3, "tid": 7, "ts": 6302685329469.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321423.507, "dur": 7.880, + "args": { + "External id": 125958, "cbid": 211, "correlation": 241673107 + } + }, + { + "ph": "s", "id": 241673107, "pid": 5717, "tid": 5717, "ts": 6302685321423.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685330100.656, "dur": 171.393, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673126, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673126, "pid": 3, "tid": 7, "ts": 6302685330100.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321551.207, "dur": 12.790, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673126 + } + }, + { + "ph": "s", "id": 241673126, "pid": 5717, "tid": 5717, "ts": 6302685321551.207, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685330272.753, "dur": 4.065, + "args": { + "External id": 125968, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673143, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673143, "pid": 3, "tid": 7, "ts": 6302685330272.753, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321611.237, "dur": 8.929, + "args": { + "External id": 125968, "cbid": 211, "correlation": 241673143 + } + }, + { + "ph": "s", "id": 241673143, "pid": 5717, "tid": 5717, "ts": 6302685321611.237, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330277.394, "dur": 1.184, + "args": { + "External id": 125973, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673160, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673160, "pid": 3, "tid": 7, "ts": 6302685330277.394, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321647.666, "dur": 6.011, + "args": { + "External id": 125973, "cbid": 211, "correlation": 241673160 + } + }, + { + "ph": "s", "id": 241673160, "pid": 5717, "tid": 5717, "ts": 6302685321647.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330279.218, "dur": 1.024, + "args": { + "External id": 125975, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673170, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673170, "pid": 3, "tid": 7, "ts": 6302685330279.218, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321676.286, "dur": 5.260, + "args": { + "External id": 125975, "cbid": 211, "correlation": 241673170 + } + }, + { + "ph": "s", "id": 241673170, "pid": 5717, "tid": 5717, "ts": 6302685321676.286, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330280.818, "dur": 1.024, + "args": { + "External id": 125976, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673176, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673176, "pid": 3, "tid": 7, "ts": 6302685330280.818, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321689.096, "dur": 4.680, + "args": { + "External id": 125976, "cbid": 211, "correlation": 241673176 + } + }, + { + "ph": "s", "id": 241673176, "pid": 5717, "tid": 5717, "ts": 6302685321689.096, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330282.514, "dur": 1.024, + "args": { + "External id": 125977, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673186, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673186, "pid": 3, "tid": 7, "ts": 6302685330282.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321704.906, "dur": 4.770, + "args": { + "External id": 125977, "cbid": 211, "correlation": 241673186 + } + }, + { + "ph": "s", "id": 241673186, "pid": 5717, "tid": 5717, "ts": 6302685321704.906, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330284.242, "dur": 0.992, + "args": { + "External id": 125978, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673192, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673192, "pid": 3, "tid": 7, "ts": 6302685330284.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321715.496, "dur": 4.420, + "args": { + "External id": 125978, "cbid": 211, "correlation": 241673192 + } + }, + { + "ph": "s", "id": 241673192, "pid": 5717, "tid": 5717, "ts": 6302685321715.496, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685330285.842, "dur": 3.232, + "args": { + "External id": 125979, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673205, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673205, "pid": 3, "tid": 7, "ts": 6302685330285.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321741.946, "dur": 5.350, + "args": { + "External id": 125979, "cbid": 211, "correlation": 241673205 + } + }, + { + "ph": "s", "id": 241673205, "pid": 5717, "tid": 5717, "ts": 6302685321741.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330289.810, "dur": 1.216, + "args": { + "External id": 125982, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673211, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673211, "pid": 3, "tid": 7, "ts": 6302685330289.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321754.186, "dur": 4.390, + "args": { + "External id": 125982, "cbid": 211, "correlation": 241673211 + } + }, + { + "ph": "s", "id": 241673211, "pid": 5717, "tid": 5717, "ts": 6302685321754.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685330291.634, "dur": 0.992, + "args": { + "External id": 125983, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673217, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673217, "pid": 3, "tid": 7, "ts": 6302685330291.634, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321765.176, "dur": 4.160, + "args": { + "External id": 125983, "cbid": 211, "correlation": 241673217 + } + }, + { + "ph": "s", "id": 241673217, "pid": 5717, "tid": 5717, "ts": 6302685321765.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685330293.202, "dur": 234.242, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673231, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241673231, "pid": 3, "tid": 7, "ts": 6302685330293.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321858.256, "dur": 8.480, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673231 + } + }, + { + "ph": "s", "id": 241673231, "pid": 5717, "tid": 5717, "ts": 6302685321858.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685321900.616, "dur": 0.570, + "args": { + "External id": 125987, "cbid": 200, "correlation": 241673254 + } + }, + { + "ph": "f", "id": 241673254, "pid": 5717, "tid": 5717, "ts": 6302685321900.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685330528.308, "dur": 0.800, + "args": { + "External id": 125987, "device": 3, "context": 1, "stream": 7, "correlation": 241673257, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241673257, "pid": 3, "tid": 7, "ts": 6302685330528.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685321902.996, "dur": 6.870, + "args": { + "External id": 125987, "cbid": 51, "correlation": 241673257 + } + }, + { + "ph": "s", "id": 241673257, "pid": 5717, "tid": 5717, "ts": 6302685321902.996, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685330530.260, "dur": 687.173, + "args": { + "External id": 125987, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673258, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673258, "pid": 3, "tid": 7, "ts": 6302685330530.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321910.126, "dur": 6.610, + "args": { + "External id": 125987, "cbid": 307, "correlation": 241673258 + } + }, + { + "ph": "s", "id": 241673258, "pid": 5717, "tid": 5717, "ts": 6302685321910.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685331218.169, "dur": 2.912, + "args": { + "External id": 125990, "device": 3, "context": 1, "stream": 7, "correlation": 241673263, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 241673263, "pid": 3, "tid": 7, "ts": 6302685331218.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685321941.896, "dur": 12.710, + "args": { + "External id": 125990, "cbid": 41, "correlation": 241673263 + } + }, + { + "ph": "s", "id": 241673263, "pid": 5717, "tid": 5717, "ts": 6302685321941.896, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685321995.846, "dur": 0.450, + "args": { + "External id": 125995, "cbid": 200, "correlation": 241673291 + } + }, + { + "ph": "f", "id": 241673291, "pid": 5717, "tid": 5717, "ts": 6302685321995.846, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685331221.753, "dur": 689.893, + "args": { + "External id": 125995, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241673294, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673294, "pid": 3, "tid": 7, "ts": 6302685331221.753, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685321997.966, "dur": 7.470, + "args": { + "External id": 125995, "cbid": 307, "correlation": 241673294 + } + }, + { + "ph": "s", "id": 241673294, "pid": 5717, "tid": 5717, "ts": 6302685321997.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685331912.382, "dur": 220.609, + "args": { + "External id": 125996, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673299, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241673299, "pid": 3, "tid": 7, "ts": 6302685331912.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322019.856, "dur": 5.949, + "args": { + "External id": 125996, "cbid": 211, "correlation": 241673299 + } + }, + { + "ph": "s", "id": 241673299, "pid": 5717, "tid": 5717, "ts": 6302685322019.856, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685322071.225, "dur": 1.371, + "args": { + "External id": 126004, "cbid": 210, "correlation": 241673325 + } + }, + { + "ph": "f", "id": 241673325, "pid": 5717, "tid": 5717, "ts": 6302685322071.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685332133.599, "dur": 633.125, + "args": { + "External id": 126004, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673326, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673326, "pid": 3, "tid": 7, "ts": 6302685332133.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322076.196, "dur": 7.729, + "args": { + "External id": 126004, "cbid": 211, "correlation": 241673326 + } + }, + { + "ph": "s", "id": 241673326, "pid": 5717, "tid": 5717, "ts": 6302685322076.196, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685332767.396, "dur": 170.754, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673345, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673345, "pid": 3, "tid": 7, "ts": 6302685332767.396, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322234.785, "dur": 11.160, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673345 + } + }, + { + "ph": "s", "id": 241673345, "pid": 5717, "tid": 5717, "ts": 6302685322234.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685332938.822, "dur": 4.032, + "args": { + "External id": 126014, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673362, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673362, "pid": 3, "tid": 7, "ts": 6302685332938.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322282.715, "dur": 7.160, + "args": { + "External id": 126014, "cbid": 211, "correlation": 241673362 + } + }, + { + "ph": "s", "id": 241673362, "pid": 5717, "tid": 5717, "ts": 6302685322282.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332943.462, "dur": 1.184, + "args": { + "External id": 126019, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673379, "pid": 3, "tid": 7, "ts": 6302685332943.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322324.105, "dur": 6.470, + "args": { + "External id": 126019, "cbid": 211, "correlation": 241673379 + } + }, + { + "ph": "s", "id": 241673379, "pid": 5717, "tid": 5717, "ts": 6302685322324.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332945.350, "dur": 0.960, + "args": { + "External id": 126021, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673389, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673389, "pid": 3, "tid": 7, "ts": 6302685332945.350, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322344.785, "dur": 5.000, + "args": { + "External id": 126021, "cbid": 211, "correlation": 241673389 + } + }, + { + "ph": "s", "id": 241673389, "pid": 5717, "tid": 5717, "ts": 6302685322344.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332947.046, "dur": 1.024, + "args": { + "External id": 126022, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673395, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673395, "pid": 3, "tid": 7, "ts": 6302685332947.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322357.255, "dur": 4.540, + "args": { + "External id": 126022, "cbid": 211, "correlation": 241673395 + } + }, + { + "ph": "s", "id": 241673395, "pid": 5717, "tid": 5717, "ts": 6302685322357.255, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332948.774, "dur": 0.992, + "args": { + "External id": 126023, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673405, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673405, "pid": 3, "tid": 7, "ts": 6302685332948.774, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322372.425, "dur": 4.780, + "args": { + "External id": 126023, "cbid": 211, "correlation": 241673405 + } + }, + { + "ph": "s", "id": 241673405, "pid": 5717, "tid": 5717, "ts": 6302685322372.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332950.470, "dur": 0.992, + "args": { + "External id": 126024, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673411, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673411, "pid": 3, "tid": 7, "ts": 6302685332950.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322383.035, "dur": 4.190, + "args": { + "External id": 126024, "cbid": 211, "correlation": 241673411 + } + }, + { + "ph": "s", "id": 241673411, "pid": 5717, "tid": 5717, "ts": 6302685322383.035, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685332952.070, "dur": 3.264, + "args": { + "External id": 126025, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673424, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673424, "pid": 3, "tid": 7, "ts": 6302685332952.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322403.805, "dur": 5.050, + "args": { + "External id": 126025, "cbid": 211, "correlation": 241673424 + } + }, + { + "ph": "s", "id": 241673424, "pid": 5717, "tid": 5717, "ts": 6302685322403.805, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332956.006, "dur": 1.216, + "args": { + "External id": 126028, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673430, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673430, "pid": 3, "tid": 7, "ts": 6302685332956.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322416.635, "dur": 4.170, + "args": { + "External id": 126028, "cbid": 211, "correlation": 241673430 + } + }, + { + "ph": "s", "id": 241673430, "pid": 5717, "tid": 5717, "ts": 6302685322416.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685332957.862, "dur": 0.960, + "args": { + "External id": 126029, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673436, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673436, "pid": 3, "tid": 7, "ts": 6302685332957.862, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322426.025, "dur": 3.880, + "args": { + "External id": 126029, "cbid": 211, "correlation": 241673436 + } + }, + { + "ph": "s", "id": 241673436, "pid": 5717, "tid": 5717, "ts": 6302685322426.025, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685332959.430, "dur": 234.466, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673450, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241673450, "pid": 3, "tid": 7, "ts": 6302685332959.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322517.444, "dur": 8.600, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673450 + } + }, + { + "ph": "s", "id": 241673450, "pid": 5717, "tid": 5717, "ts": 6302685322517.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685322559.744, "dur": 0.580, + "args": { + "External id": 126033, "cbid": 200, "correlation": 241673473 + } + }, + { + "ph": "f", "id": 241673473, "pid": 5717, "tid": 5717, "ts": 6302685322559.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685333194.760, "dur": 0.800, + "args": { + "External id": 126033, "device": 3, "context": 1, "stream": 7, "correlation": 241673476, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241673476, "pid": 3, "tid": 7, "ts": 6302685333194.760, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685322562.204, "dur": 6.900, + "args": { + "External id": 126033, "cbid": 51, "correlation": 241673476 + } + }, + { + "ph": "s", "id": 241673476, "pid": 5717, "tid": 5717, "ts": 6302685322562.204, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685333196.712, "dur": 688.709, + "args": { + "External id": 126033, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673477, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673477, "pid": 3, "tid": 7, "ts": 6302685333196.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322569.354, "dur": 5.990, + "args": { + "External id": 126033, "cbid": 307, "correlation": 241673477 + } + }, + { + "ph": "s", "id": 241673477, "pid": 5717, "tid": 5717, "ts": 6302685322569.354, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685333886.061, "dur": 2.944, + "args": { + "External id": 126036, "device": 3, "context": 1, "stream": 7, "correlation": 241673482, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 241673482, "pid": 3, "tid": 7, "ts": 6302685333886.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685322600.694, "dur": 12.190, + "args": { + "External id": 126036, "cbid": 41, "correlation": 241673482 + } + }, + { + "ph": "s", "id": 241673482, "pid": 5717, "tid": 5717, "ts": 6302685322600.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685322653.204, "dur": 0.440, + "args": { + "External id": 126041, "cbid": 200, "correlation": 241673510 + } + }, + { + "ph": "f", "id": 241673510, "pid": 5717, "tid": 5717, "ts": 6302685322653.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685333889.677, "dur": 685.509, + "args": { + "External id": 126041, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241673513, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673513, "pid": 3, "tid": 7, "ts": 6302685333889.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322655.184, "dur": 7.420, + "args": { + "External id": 126041, "cbid": 307, "correlation": 241673513 + } + }, + { + "ph": "s", "id": 241673513, "pid": 5717, "tid": 5717, "ts": 6302685322655.184, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685334575.922, "dur": 221.250, + "args": { + "External id": 126042, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673518, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241673518, "pid": 3, "tid": 7, "ts": 6302685334575.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322678.294, "dur": 6.060, + "args": { + "External id": 126042, "cbid": 211, "correlation": 241673518 + } + }, + { + "ph": "s", "id": 241673518, "pid": 5717, "tid": 5717, "ts": 6302685322678.294, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685322730.034, "dur": 1.370, + "args": { + "External id": 126050, "cbid": 210, "correlation": 241673544 + } + }, + { + "ph": "f", "id": 241673544, "pid": 5717, "tid": 5717, "ts": 6302685322730.034, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685334797.908, "dur": 628.356, + "args": { + "External id": 126050, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673545, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673545, "pid": 3, "tid": 7, "ts": 6302685334797.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322734.884, "dur": 7.670, + "args": { + "External id": 126050, "cbid": 211, "correlation": 241673545 + } + }, + { + "ph": "s", "id": 241673545, "pid": 5717, "tid": 5717, "ts": 6302685322734.884, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685335427.000, "dur": 171.202, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673564, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673564, "pid": 3, "tid": 7, "ts": 6302685335427.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322845.384, "dur": 8.990, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673564 + } + }, + { + "ph": "s", "id": 241673564, "pid": 5717, "tid": 5717, "ts": 6302685322845.384, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685335598.810, "dur": 3.936, + "args": { + "External id": 126060, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673581, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673581, "pid": 3, "tid": 7, "ts": 6302685335598.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322889.174, "dur": 7.040, + "args": { + "External id": 126060, "cbid": 211, "correlation": 241673581 + } + }, + { + "ph": "s", "id": 241673581, "pid": 5717, "tid": 5717, "ts": 6302685322889.174, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335603.450, "dur": 1.184, + "args": { + "External id": 126065, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673598, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673598, "pid": 3, "tid": 7, "ts": 6302685335603.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322923.814, "dur": 5.509, + "args": { + "External id": 126065, "cbid": 211, "correlation": 241673598 + } + }, + { + "ph": "s", "id": 241673598, "pid": 5717, "tid": 5717, "ts": 6302685322923.814, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335605.306, "dur": 0.992, + "args": { + "External id": 126067, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673608, "pid": 3, "tid": 7, "ts": 6302685335605.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322943.083, "dur": 4.880, + "args": { + "External id": 126067, "cbid": 211, "correlation": 241673608 + } + }, + { + "ph": "s", "id": 241673608, "pid": 5717, "tid": 5717, "ts": 6302685322943.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335606.874, "dur": 1.024, + "args": { + "External id": 126068, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673614, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673614, "pid": 3, "tid": 7, "ts": 6302685335606.874, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322955.403, "dur": 4.431, + "args": { + "External id": 126068, "cbid": 211, "correlation": 241673614 + } + }, + { + "ph": "s", "id": 241673614, "pid": 5717, "tid": 5717, "ts": 6302685322955.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335608.602, "dur": 0.992, + "args": { + "External id": 126069, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673624, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673624, "pid": 3, "tid": 7, "ts": 6302685335608.602, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322970.463, "dur": 4.531, + "args": { + "External id": 126069, "cbid": 211, "correlation": 241673624 + } + }, + { + "ph": "s", "id": 241673624, "pid": 5717, "tid": 5717, "ts": 6302685322970.463, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335610.298, "dur": 1.024, + "args": { + "External id": 126070, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673630, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673630, "pid": 3, "tid": 7, "ts": 6302685335610.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685322980.803, "dur": 4.410, + "args": { + "External id": 126070, "cbid": 211, "correlation": 241673630 + } + }, + { + "ph": "s", "id": 241673630, "pid": 5717, "tid": 5717, "ts": 6302685322980.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685335611.898, "dur": 3.232, + "args": { + "External id": 126071, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673643, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673643, "pid": 3, "tid": 7, "ts": 6302685335611.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323003.693, "dur": 5.180, + "args": { + "External id": 126071, "cbid": 211, "correlation": 241673643 + } + }, + { + "ph": "s", "id": 241673643, "pid": 5717, "tid": 5717, "ts": 6302685323003.693, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335615.866, "dur": 1.216, + "args": { + "External id": 126074, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673649, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673649, "pid": 3, "tid": 7, "ts": 6302685335615.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323015.243, "dur": 4.400, + "args": { + "External id": 126074, "cbid": 211, "correlation": 241673649 + } + }, + { + "ph": "s", "id": 241673649, "pid": 5717, "tid": 5717, "ts": 6302685323015.243, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685335617.690, "dur": 0.992, + "args": { + "External id": 126075, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673655, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673655, "pid": 3, "tid": 7, "ts": 6302685335617.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323024.933, "dur": 3.960, + "args": { + "External id": 126075, "cbid": 211, "correlation": 241673655 + } + }, + { + "ph": "s", "id": 241673655, "pid": 5717, "tid": 5717, "ts": 6302685323024.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685335619.290, "dur": 234.273, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673669, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241673669, "pid": 3, "tid": 7, "ts": 6302685335619.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323114.753, "dur": 8.560, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673669 + } + }, + { + "ph": "s", "id": 241673669, "pid": 5717, "tid": 5717, "ts": 6302685323114.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685323157.333, "dur": 0.570, + "args": { + "External id": 126079, "cbid": 200, "correlation": 241673692 + } + }, + { + "ph": "f", "id": 241673692, "pid": 5717, "tid": 5717, "ts": 6302685323157.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685335854.523, "dur": 0.801, + "args": { + "External id": 126079, "device": 3, "context": 1, "stream": 7, "correlation": 241673695, "bytes": 1536, "memory bandwidth (GB/s)": 1.9176029962546817 + } + }, + { + "ph": "f", "id": 241673695, "pid": 3, "tid": 7, "ts": 6302685335854.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685323159.663, "dur": 6.760, + "args": { + "External id": 126079, "cbid": 51, "correlation": 241673695 + } + }, + { + "ph": "s", "id": 241673695, "pid": 5717, "tid": 5717, "ts": 6302685323159.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685335856.060, "dur": 687.973, + "args": { + "External id": 126079, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673696, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673696, "pid": 3, "tid": 7, "ts": 6302685335856.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323166.673, "dur": 5.900, + "args": { + "External id": 126079, "cbid": 307, "correlation": 241673696 + } + }, + { + "ph": "s", "id": 241673696, "pid": 5717, "tid": 5717, "ts": 6302685323166.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685336544.673, "dur": 2.944, + "args": { + "External id": 126082, "device": 3, "context": 1, "stream": 7, "correlation": 241673701, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 241673701, "pid": 3, "tid": 7, "ts": 6302685336544.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685323196.703, "dur": 12.220, + "args": { + "External id": 126082, "cbid": 41, "correlation": 241673701 + } + }, + { + "ph": "s", "id": 241673701, "pid": 5717, "tid": 5717, "ts": 6302685323196.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685323250.633, "dur": 0.400, + "args": { + "External id": 126087, "cbid": 200, "correlation": 241673729 + } + }, + { + "ph": "f", "id": 241673729, "pid": 5717, "tid": 5717, "ts": 6302685323250.633, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685336548.257, "dur": 683.973, + "args": { + "External id": 126087, "queued": 
0, "device": 3, "context": 1, "stream": 7, "correlation": 241673732, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673732, "pid": 3, "tid": 7, "ts": 6302685336548.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323252.623, "dur": 7.590, + "args": { + "External id": 126087, "cbid": 307, "correlation": 241673732 + } + }, + { + "ph": "s", "id": 241673732, "pid": 5717, "tid": 5717, "ts": 6302685323252.623, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685337232.870, "dur": 221.634, + "args": { + "External id": 126088, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673737, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241673737, "pid": 3, "tid": 7, "ts": 6302685337232.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323274.263, "dur": 6.250, + "args": { + "External id": 126088, "cbid": 211, "correlation": 241673737 + } + }, + { + "ph": "s", "id": 241673737, "pid": 5717, "tid": 5717, "ts": 6302685323274.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685323335.033, "dur": 1.440, + "args": { + "External id": 126096, "cbid": 210, "correlation": 241673763 + } + }, + { + "ph": "f", "id": 241673763, "pid": 5717, "tid": 5717, "ts": 6302685323335.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685337455.080, "dur": 626.373, + "args": { + "External id": 126096, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673764, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673764, "pid": 3, "tid": 7, "ts": 6302685337455.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323339.953, "dur": 8.540, + "args": { + "External id": 126096, "cbid": 211, "correlation": 241673764 + } + }, + { + "ph": "s", "id": 241673764, "pid": 5717, "tid": 5717, "ts": 6302685323339.953, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685338082.061, "dur": 171.073, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673783, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673783, "pid": 3, "tid": 7, "ts": 6302685338082.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323458.062, "dur": 9.000, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673783 + } + }, + { + "ph": "s", "id": 241673783, "pid": 5717, "tid": 5717, "ts": 6302685323458.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685338253.742, "dur": 3.968, + "args": { + "External id": 126106, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673800, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673800, "pid": 3, "tid": 7, "ts": 6302685338253.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323502.602, "dur": 7.460, + "args": { + "External id": 126106, "cbid": 211, "correlation": 241673800 + } + }, + { + "ph": "s", "id": 241673800, "pid": 5717, "tid": 5717, "ts": 6302685323502.602, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338258.382, "dur": 1.184, + "args": { + "External id": 126111, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673817, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673817, "pid": 3, "tid": 7, "ts": 6302685338258.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323534.112, "dur": 5.310, + "args": { + "External id": 126111, "cbid": 211, "correlation": 241673817 + } + }, + { + "ph": "s", "id": 241673817, "pid": 5717, "tid": 5717, "ts": 6302685323534.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338260.238, "dur": 0.992, + "args": { + "External id": 126113, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673827, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673827, "pid": 3, "tid": 7, "ts": 6302685338260.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323553.692, "dur": 4.960, + "args": { + "External id": 126113, "cbid": 211, "correlation": 241673827 + } + }, + { + "ph": "s", "id": 241673827, "pid": 5717, "tid": 5717, "ts": 6302685323553.692, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338261.966, "dur": 0.992, + "args": { + "External id": 126114, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673833, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673833, "pid": 3, "tid": 7, "ts": 6302685338261.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323566.012, "dur": 4.620, + "args": { + "External id": 126114, "cbid": 211, "correlation": 241673833 + } + }, + { + "ph": "s", "id": 241673833, "pid": 5717, "tid": 5717, "ts": 6302685323566.012, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338263.662, "dur": 1.024, + "args": { + "External id": 126115, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673843, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673843, "pid": 3, "tid": 7, "ts": 6302685338263.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323582.322, "dur": 4.720, + "args": { + "External id": 126115, "cbid": 211, "correlation": 241673843 + } + }, + { + "ph": "s", "id": 241673843, "pid": 5717, "tid": 5717, "ts": 6302685323582.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338265.358, "dur": 1.024, + "args": { + "External id": 126116, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673849, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673849, "pid": 3, "tid": 7, "ts": 6302685338265.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323592.732, "dur": 4.340, + "args": { + "External id": 126116, "cbid": 211, "correlation": 241673849 + } + }, + { + "ph": "s", "id": 241673849, "pid": 5717, "tid": 5717, "ts": 6302685323592.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685338266.990, "dur": 3.200, + "args": { + "External id": 126117, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673862, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673862, "pid": 3, "tid": 7, "ts": 6302685338266.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323612.372, "dur": 5.150, + "args": { + "External id": 126117, "cbid": 211, "correlation": 241673862 + } + }, + { + "ph": "s", "id": 241673862, "pid": 5717, "tid": 5717, "ts": 6302685323612.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338270.926, "dur": 1.216, + "args": { + "External id": 126120, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673868, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673868, "pid": 3, "tid": 7, "ts": 6302685338270.926, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323623.862, "dur": 4.250, + "args": { + "External id": 126120, "cbid": 211, "correlation": 241673868 + } + }, + { + "ph": "s", "id": 241673868, "pid": 5717, "tid": 5717, "ts": 6302685323623.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685338272.750, "dur": 0.992, + "args": { + "External id": 126121, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673874, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673874, "pid": 3, "tid": 7, "ts": 6302685338272.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323633.322, "dur": 4.000, + "args": { + "External id": 126121, "cbid": 211, "correlation": 241673874 + } + }, + { + "ph": "s", "id": 241673874, "pid": 5717, "tid": 5717, "ts": 6302685323633.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685338274.318, "dur": 235.234, + "args": { + "External id": 125737, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673888, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241673888, "pid": 3, "tid": 7, "ts": 6302685338274.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323722.822, "dur": 8.640, + "args": { + "External id": 125737, "cbid": 307, "correlation": 241673888 + } + }, + { + "ph": "s", "id": 241673888, "pid": 5717, "tid": 5717, "ts": 6302685323722.822, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685323764.322, "dur": 0.610, + "args": { + "External id": 126125, "cbid": 200, "correlation": 241673911 + } + }, + { + "ph": "f", "id": 241673911, "pid": 5717, "tid": 5717, "ts": 6302685323764.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685338510.448, "dur": 0.800, + "args": { + "External id": 126125, "device": 3, "context": 1, "stream": 7, "correlation": 241673914, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241673914, "pid": 3, "tid": 7, "ts": 6302685338510.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685323766.782, "dur": 6.710, + "args": { + "External id": 126125, "cbid": 51, "correlation": 241673914 + } + }, + { + "ph": "s", "id": 241673914, "pid": 5717, "tid": 5717, "ts": 6302685323766.782, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685338511.984, "dur": 688.773, + "args": { + "External id": 126125, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673915, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673915, "pid": 3, "tid": 7, "ts": 6302685338511.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323773.761, "dur": 5.951, + "args": { + "External id": 126125, "cbid": 307, "correlation": 241673915 + } + }, + { + "ph": "s", "id": 241673915, "pid": 5717, "tid": 5717, "ts": 6302685323773.761, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685339201.461, "dur": 3.008, + "args": { + "External id": 126128, "device": 3, "context": 1, "stream": 7, "correlation": 241673920, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 241673920, "pid": 3, "tid": 7, "ts": 6302685339201.461, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685323804.021, "dur": 12.051, + "args": { + "External id": 126128, "cbid": 41, "correlation": 241673920 + } + }, + { + "ph": "s", "id": 241673920, "pid": 5717, "tid": 5717, "ts": 6302685323804.021, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685323855.881, "dur": 0.411, + "args": { + "External id": 126133, "cbid": 200, "correlation": 241673948 + } + }, + { + "ph": "f", "id": 241673948, "pid": 5717, "tid": 5717, "ts": 6302685323855.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685339205.205, "dur": 683.141, + "args": { + "External id": 126133, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241673951, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241673951, "pid": 3, "tid": 7, "ts": 6302685339205.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323857.921, "dur": 7.171, + "args": { + "External id": 126133, "cbid": 307, "correlation": 241673951 + } + }, + { + "ph": "s", "id": 241673951, "pid": 5717, "tid": 5717, "ts": 6302685323857.921, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685339889.050, "dur": 221.378, + "args": { + "External id": 126134, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673956, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241673956, "pid": 3, "tid": 7, "ts": 6302685339889.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323878.981, "dur": 6.250, + "args": { + "External id": 126134, "cbid": 211, "correlation": 241673956 + } + }, + { + "ph": "s", "id": 241673956, "pid": 5717, "tid": 5717, "ts": 6302685323878.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685340111.004, "dur": 5.056, + "args": { + "External id": 126136, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673969, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673969, "pid": 3, "tid": 7, "ts": 6302685340111.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323910.731, "dur": 6.730, + "args": { + "External id": 126136, "cbid": 211, "correlation": 241673969 + } + }, + { + "ph": "s", "id": 241673969, "pid": 5717, "tid": 5717, "ts": 6302685323910.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340116.764, "dur": 157.537, + "args": { + "External id": 126141, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673982, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241673982, "pid": 3, "tid": 7, "ts": 6302685340116.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685323941.051, "dur": 6.110, + "args": { + "External id": 126141, "cbid": 211, "correlation": 241673982 + } + }, + { + "ph": "s", "id": 241673982, "pid": 5717, "tid": 5717, "ts": 6302685323941.051, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340274.973, "dur": 1.312, + "args": { + "External id": 126146, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673990, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673990, "pid": 3, "tid": 7, "ts": 6302685340274.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324005.581, "dur": 8.620, + "args": { + "External id": 126146, "cbid": 211, "correlation": 241673990 + } + }, + { + "ph": "s", "id": 241673990, "pid": 5717, "tid": 5717, "ts": 6302685324005.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340276.925, "dur": 1.312, + "args": { + "External id": 126147, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241673996, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241673996, "pid": 3, "tid": 7, "ts": 6302685340276.925, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324025.751, "dur": 4.710, + "args": { + "External id": 126147, "cbid": 211, "correlation": 241673996 + } + }, + { + "ph": "s", "id": 241673996, "pid": 5717, "tid": 5717, "ts": 6302685324025.751, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685340278.941, "dur": 2.176, + "args": { + "External id": 126166, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674016, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 241674016, "pid": 3, "tid": 7, "ts": 6302685340278.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324112.681, "dur": 9.170, + "args": { + "External id": 126166, "cbid": 211, "correlation": 241674016 + } + }, + { + "ph": "s", "id": 241674016, "pid": 5717, "tid": 5717, "ts": 6302685324112.681, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685340281.821, "dur": 59.232, + "args": { + "External id": 126174, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674034, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241674034, "pid": 3, "tid": 7, "ts": 6302685340281.821, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324216.911, "dur": 9.929, + "args": { + "External id": 126174, "cbid": 211, "correlation": 241674034 + } + }, + { + "ph": "s", "id": 241674034, "pid": 5717, "tid": 5717, "ts": 6302685324216.911, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340341.661, "dur": 15.713, + "args": { + "External id": 126179, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674051, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241674051, "pid": 3, "tid": 7, "ts": 6302685340341.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324262.111, "dur": 7.089, + "args": { + "External id": 126179, "cbid": 211, "correlation": 241674051 + } + }, + { + "ph": "s", "id": 241674051, "pid": 5717, "tid": 5717, "ts": 6302685324262.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340358.078, "dur": 100.512, + "args": { + "External id": 126184, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241674067, "pid": 3, "tid": 7, "ts": 6302685340358.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324288.631, "dur": 5.520, + "args": { + "External id": 126184, "cbid": 211, "correlation": 241674067 + } + }, + { + "ph": "s", "id": 241674067, "pid": 5717, "tid": 5717, "ts": 6302685324288.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340459.230, "dur": 1.664, + "args": { + "External id": 126188, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674083, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241674083, "pid": 3, "tid": 7, "ts": 6302685340459.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324323.620, "dur": 6.100, + "args": { + "External id": 126188, "cbid": 211, "correlation": 241674083 + } + }, + { + "ph": "s", "id": 241674083, "pid": 5717, "tid": 5717, "ts": 6302685324323.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685340461.502, "dur": 1.664, + "args": { + "External id": 126189, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674095, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241674095, "pid": 3, "tid": 7, "ts": 6302685340461.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324349.710, "dur": 5.540, + "args": { + "External id": 126189, "cbid": 211, "correlation": 241674095 + } + }, + { + "ph": "s", "id": 241674095, "pid": 5717, "tid": 5717, "ts": 6302685324349.710, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685340463.774, "dur": 2.016, + "args": { + "External id": 126196, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674113, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241674113, "pid": 3, "tid": 7, "ts": 6302685340463.774, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324385.370, "dur": 6.650, + "args": { + "External id": 126196, "cbid": 211, "correlation": 241674113 + } + }, + { + "ph": "s", "id": 241674113, "pid": 5717, "tid": 5717, "ts": 6302685324385.370, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 3, "tid": 7, + "ts": 6302685340466.526, "dur": 3.712, + "args": { + "External id": 126191, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674122, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674122, "pid": 3, "tid": 7, "ts": 6302685340466.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685324399.740, "dur": 4.680, + "args": { + "External id": 126191, "cbid": 211, "correlation": 241674122 + } + }, + { + "ph": "s", "id": 241674122, "pid": 5717, "tid": 5717, "ts": 6302685324399.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685340474.942, "dur": 0.960, + "args": { + "External id": 126198, "device": 3, "context": 1, "stream": 7, "correlation": 241674128, "bytes": 8, "memory bandwidth (GB/s)": 0.008333333333333333 + } + }, + { + "ph": "f", "id": 241674128, "pid": 3, "tid": 7, "ts": 6302685340474.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685324418.000, "dur": 10.740, + "args": { + "External id": 126198, "cbid": 41, "correlation": 241674128 + } + }, + { + "ph": "s", "id": 241674128, "pid": 5717, "tid": 5717, "ts": 6302685324418.000, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685324429.080, "dur": 16051.894, + "args": { + "External id": 126198, "cbid": 131, "correlation": 241674129 + } + }, + { + "ph": "s", "id": 241674129, "pid": 5717, "tid": 5717, "ts": 6302685324429.080, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685340556.374, "dur": 2.020, + "args": { + "External id": 126206, "cbid": 210, "correlation": 241674154 + } + }, + { + "ph": "f", "id": 241674154, "pid": 5717, "tid": 5717, "ts": 6302685340556.374, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685340575.487, "dur": 631.397, + "args": { + "External id": 126206, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674155, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674155, "pid": 3, "tid": 7, "ts": 6302685340575.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340562.303, "dur": 13.660, + "args": { + "External id": 126206, "cbid": 211, "correlation": 241674155 + } + }, + { + "ph": "s", "id": 241674155, "pid": 5717, "tid": 5717, "ts": 6302685340562.303, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685341207.588, "dur": 170.913, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674174, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674174, "pid": 3, "tid": 7, "ts": 6302685341207.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340708.693, "dur": 9.760, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674174 + } + }, + { + "ph": "s", "id": 241674174, "pid": 5717, "tid": 5717, "ts": 6302685340708.693, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685341379.205, "dur": 3.936, + "args": { + "External id": 126216, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674191, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674191, "pid": 3, "tid": 7, "ts": 6302685341379.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340756.793, "dur": 8.160, + "args": { + "External id": 126216, "cbid": 211, "correlation": 241674191 + } + }, + { + "ph": "s", "id": 241674191, "pid": 5717, "tid": 5717, "ts": 6302685340756.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341383.845, "dur": 1.152, + "args": { + "External id": 126221, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674208, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674208, "pid": 3, "tid": 7, "ts": 6302685341383.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340792.193, "dur": 5.910, + "args": { + "External id": 126221, "cbid": 211, "correlation": 241674208 + } + }, + { + "ph": "s", "id": 241674208, "pid": 5717, "tid": 5717, "ts": 6302685340792.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341385.637, "dur": 0.992, + "args": { + "External id": 126223, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674218, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674218, "pid": 3, "tid": 7, "ts": 6302685341385.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340813.273, "dur": 5.500, + "args": { + "External id": 126223, "cbid": 211, "correlation": 241674218 + } + }, + { + "ph": "s", "id": 241674218, "pid": 5717, "tid": 5717, "ts": 6302685340813.273, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341387.365, "dur": 1.024, + "args": { + "External id": 126224, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674224, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674224, "pid": 3, "tid": 7, "ts": 6302685341387.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340826.533, "dur": 4.850, + "args": { + "External id": 126224, "cbid": 211, "correlation": 241674224 + } + }, + { + "ph": "s", "id": 241674224, "pid": 5717, "tid": 5717, "ts": 6302685340826.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341389.061, "dur": 0.992, + "args": { + "External id": 126225, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674234, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674234, "pid": 3, "tid": 7, "ts": 6302685341389.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340842.723, "dur": 5.130, + "args": { + "External id": 126225, "cbid": 211, "correlation": 241674234 + } + }, + { + "ph": "s", "id": 241674234, "pid": 5717, "tid": 5717, "ts": 6302685340842.723, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341390.757, "dur": 1.025, + "args": { + "External id": 126226, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674240, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674240, "pid": 3, "tid": 7, "ts": 6302685341390.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340854.253, "dur": 4.730, + "args": { + "External id": 126226, "cbid": 211, "correlation": 241674240 + } + }, + { + "ph": "s", "id": 241674240, "pid": 5717, "tid": 5717, "ts": 6302685340854.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685341392.390, "dur": 3.232, + "args": { + "External id": 126227, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674253, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674253, "pid": 3, "tid": 7, "ts": 6302685341392.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340876.023, "dur": 5.450, + "args": { + "External id": 126227, "cbid": 211, "correlation": 241674253 + } + }, + { + "ph": "s", "id": 241674253, "pid": 5717, "tid": 5717, "ts": 6302685340876.023, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341396.358, "dur": 1.216, + "args": { + "External id": 126230, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674259, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674259, "pid": 3, "tid": 7, "ts": 6302685341396.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340889.293, "dur": 4.420, + "args": { + "External id": 126230, "cbid": 211, "correlation": 241674259 + } + }, + { + "ph": "s", "id": 241674259, "pid": 5717, "tid": 5717, "ts": 6302685340889.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685341398.182, "dur": 0.992, + "args": { + "External id": 126231, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674265, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674265, "pid": 3, "tid": 7, "ts": 6302685341398.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340899.263, "dur": 4.050, + "args": { + "External id": 126231, "cbid": 211, "correlation": 241674265 + } + }, + { + "ph": "s", "id": 241674265, "pid": 5717, "tid": 5717, "ts": 6302685340899.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685341399.910, "dur": 234.337, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674279, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241674279, "pid": 3, "tid": 7, "ts": 6302685341399.910, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685340991.593, "dur": 8.949, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674279 + } + }, + { + "ph": "s", "id": 241674279, "pid": 5717, "tid": 5717, "ts": 6302685340991.593, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685341034.522, "dur": 0.571, + "args": { + "External id": 126235, "cbid": 200, "correlation": 241674302 + } + }, + { + "ph": "f", "id": 241674302, "pid": 5717, "tid": 5717, "ts": 6302685341034.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685341635.111, "dur": 0.800, + "args": { + "External id": 126235, "device": 3, "context": 1, "stream": 7, "correlation": 241674305, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241674305, "pid": 3, "tid": 7, "ts": 6302685341635.111, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685341037.033, "dur": 8.340, + "args": { + "External id": 126235, "cbid": 51, "correlation": 241674305 + } + }, + { + "ph": "s", "id": 241674305, "pid": 5717, "tid": 5717, "ts": 6302685341037.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685341637.063, "dur": 685.893, + "args": { + "External id": 126235, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674306, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674306, "pid": 3, "tid": 7, "ts": 6302685341637.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341045.633, "dur": 6.300, + "args": { + "External id": 126235, "cbid": 307, "correlation": 241674306 + } + }, + { + "ph": "s", "id": 241674306, "pid": 5717, "tid": 5717, "ts": 6302685341045.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685342323.596, "dur": 2.944, + "args": { + "External id": 126238, "device": 3, "context": 1, "stream": 7, "correlation": 241674311, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 241674311, "pid": 3, "tid": 7, "ts": 6302685342323.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685341079.202, "dur": 14.940, + "args": { + "External id": 126238, "cbid": 41, "correlation": 241674311 + } + }, + { + "ph": "s", "id": 241674311, "pid": 5717, "tid": 5717, "ts": 6302685341079.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685341137.982, "dur": 0.470, + "args": { + "External id": 126243, "cbid": 200, "correlation": 241674339 + } + }, + { + "ph": "f", "id": 241674339, "pid": 5717, "tid": 5717, "ts": 6302685341137.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685342327.180, "dur": 695.173, + "args": { + "External id": 126243, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241674342, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674342, "pid": 3, "tid": 7, "ts": 6302685342327.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341140.052, "dur": 7.890, + "args": { + "External id": 126243, "cbid": 307, "correlation": 241674342 + } + }, + { + "ph": "s", "id": 241674342, "pid": 5717, "tid": 5717, "ts": 6302685341140.052, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685343023.025, "dur": 220.738, + "args": { + "External id": 126244, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674347, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241674347, "pid": 3, "tid": 7, "ts": 6302685343023.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341164.092, "dur": 6.180, + "args": { + "External id": 126244, "cbid": 211, "correlation": 241674347 + } + }, + { + "ph": "s", "id": 241674347, "pid": 5717, "tid": 5717, "ts": 6302685341164.092, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685341217.832, "dur": 1.350, + "args": { + "External id": 126252, "cbid": 210, "correlation": 241674373 + } + }, + { + "ph": "f", "id": 241674373, "pid": 5717, "tid": 5717, "ts": 6302685341217.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685343244.435, "dur": 627.493, + "args": { + "External id": 126252, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674374, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674374, "pid": 3, "tid": 7, "ts": 6302685343244.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341222.792, "dur": 8.200, + "args": { + "External id": 126252, "cbid": 211, "correlation": 241674374 + } + }, + { + "ph": "s", "id": 241674374, "pid": 5717, "tid": 5717, "ts": 6302685341222.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685343872.536, "dur": 171.265, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674393, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674393, "pid": 3, "tid": 7, "ts": 6302685343872.536, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341358.302, "dur": 9.870, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674393 + } + }, + { + "ph": "s", "id": 241674393, "pid": 5717, "tid": 5717, "ts": 6302685341358.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685344044.505, "dur": 4.032, + "args": { + "External id": 126262, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674410, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674410, "pid": 3, "tid": 7, "ts": 6302685344044.505, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341409.372, "dur": 7.570, + "args": { + "External id": 126262, "cbid": 211, "correlation": 241674410 + } + }, + { + "ph": "s", "id": 241674410, "pid": 5717, "tid": 5717, "ts": 6302685341409.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344049.177, "dur": 1.184, + "args": { + "External id": 126267, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674427, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674427, "pid": 3, "tid": 7, "ts": 6302685344049.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341442.292, "dur": 5.309, + "args": { + "External id": 126267, "cbid": 211, "correlation": 241674427 + } + }, + { + "ph": "s", "id": 241674427, "pid": 5717, "tid": 5717, "ts": 6302685341442.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344050.969, "dur": 0.992, + "args": { + "External id": 126269, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674437, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674437, "pid": 3, "tid": 7, "ts": 6302685344050.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341462.981, "dur": 5.000, + "args": { + "External id": 126269, "cbid": 211, "correlation": 241674437 + } + }, + { + "ph": "s", "id": 241674437, "pid": 5717, "tid": 5717, "ts": 6302685341462.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344052.697, "dur": 0.992, + "args": { + "External id": 126270, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674443, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674443, "pid": 3, "tid": 7, "ts": 6302685344052.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341475.581, "dur": 4.460, + "args": { + "External id": 126270, "cbid": 211, "correlation": 241674443 + } + }, + { + "ph": "s", "id": 241674443, "pid": 5717, "tid": 5717, "ts": 6302685341475.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344054.393, "dur": 0.992, + "args": { + "External id": 126271, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674453, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674453, "pid": 3, "tid": 7, "ts": 6302685344054.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341491.292, "dur": 4.600, + "args": { + "External id": 126271, "cbid": 211, "correlation": 241674453 + } + }, + { + "ph": "s", "id": 241674453, "pid": 5717, "tid": 5717, "ts": 6302685341491.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344056.089, "dur": 1.024, + "args": { + "External id": 126272, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674459, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674459, "pid": 3, "tid": 7, "ts": 6302685344056.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341501.812, "dur": 4.469, + "args": { + "External id": 126272, "cbid": 211, "correlation": 241674459 + } + }, + { + "ph": "s", "id": 241674459, "pid": 5717, "tid": 5717, "ts": 6302685341501.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685344057.721, "dur": 3.232, + "args": { + "External id": 126273, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674472, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674472, "pid": 3, "tid": 7, "ts": 6302685344057.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341524.612, "dur": 5.269, + "args": { + "External id": 126273, "cbid": 211, "correlation": 241674472 + } + }, + { + "ph": "s", "id": 241674472, "pid": 5717, "tid": 5717, "ts": 6302685341524.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344061.689, "dur": 1.248, + "args": { + "External id": 126276, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674478, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674478, "pid": 3, "tid": 7, "ts": 6302685344061.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341536.971, "dur": 4.640, + "args": { + "External id": 126276, "cbid": 211, "correlation": 241674478 + } + }, + { + "ph": "s", "id": 241674478, "pid": 5717, "tid": 5717, "ts": 6302685341536.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685344063.673, "dur": 0.960, + "args": { + "External id": 126277, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674484, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674484, "pid": 3, "tid": 7, "ts": 6302685344063.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341547.141, "dur": 3.940, + "args": { + "External id": 126277, "cbid": 211, "correlation": 241674484 + } + }, + { + "ph": "s", "id": 241674484, "pid": 5717, "tid": 5717, "ts": 6302685341547.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685344065.241, "dur": 233.122, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674498, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241674498, "pid": 3, "tid": 7, "ts": 6302685344065.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341638.581, "dur": 8.420, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674498 + } + }, + { + "ph": "s", "id": 241674498, "pid": 5717, "tid": 5717, "ts": 6302685341638.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685341683.791, "dur": 0.600, + "args": { + "External id": 126281, "cbid": 200, "correlation": 241674521 + } + }, + { + "ph": "f", "id": 241674521, "pid": 5717, "tid": 5717, "ts": 6302685341683.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685344299.259, "dur": 0.800, + "args": { + "External id": 126281, "device": 3, "context": 1, "stream": 7, "correlation": 241674524, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241674524, "pid": 3, "tid": 7, "ts": 6302685344299.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685341686.401, "dur": 7.630, + "args": { + "External id": 126281, "cbid": 51, "correlation": 241674524 + } + }, + { + "ph": "s", "id": 241674524, "pid": 5717, "tid": 5717, "ts": 6302685341686.401, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685344301.211, "dur": 691.078, + "args": { + "External id": 126281, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674525, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674525, "pid": 3, "tid": 7, "ts": 6302685344301.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341694.291, "dur": 8.070, + "args": { + "External id": 126281, "cbid": 307, "correlation": 241674525 + } + }, + { + "ph": "s", "id": 241674525, "pid": 5717, "tid": 5717, "ts": 6302685341694.291, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685344993.025, "dur": 2.976, + "args": { + "External id": 126284, "device": 3, "context": 1, "stream": 7, "correlation": 241674530, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 241674530, "pid": 3, "tid": 7, "ts": 6302685344993.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685341731.091, "dur": 13.100, + "args": { + "External id": 126284, "cbid": 41, "correlation": 241674530 + } + }, + { + "ph": "s", "id": 241674530, "pid": 5717, "tid": 5717, "ts": 6302685341731.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685341789.001, "dur": 0.430, + "args": { + "External id": 126289, "cbid": 200, "correlation": 241674558 + } + }, + { + "ph": "f", "id": 241674558, "pid": 5717, "tid": 5717, "ts": 6302685341789.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685344996.577, "dur": 681.541, + "args": { + "External id": 126289, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241674561, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674561, "pid": 3, "tid": 7, "ts": 6302685344996.577, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341790.991, "dur": 7.450, + "args": { + "External id": 126289, "cbid": 307, "correlation": 241674561 + } + }, + { + "ph": "s", "id": 241674561, "pid": 5717, "tid": 5717, "ts": 6302685341790.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685345678.726, "dur": 221.313, + "args": { + "External id": 126290, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674566, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241674566, "pid": 3, "tid": 7, "ts": 6302685345678.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341813.321, "dur": 6.290, + "args": { + "External id": 126290, "cbid": 211, "correlation": 241674566 + } + }, + { + "ph": "s", "id": 241674566, "pid": 5717, "tid": 5717, "ts": 6302685341813.321, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685341865.511, "dur": 1.410, + "args": { + "External id": 126298, "cbid": 210, "correlation": 241674592 + } + }, + { + "ph": "f", "id": 241674592, "pid": 5717, "tid": 5717, "ts": 6302685341865.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685345900.711, "dur": 624.933, + "args": { + "External id": 126298, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674593, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674593, "pid": 3, "tid": 7, "ts": 6302685345900.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341870.521, "dur": 7.790, + "args": { + "External id": 126298, "cbid": 211, "correlation": 241674593 + } + }, + { + "ph": "s", "id": 241674593, "pid": 5717, "tid": 5717, "ts": 6302685341870.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685346526.316, "dur": 170.401, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674612, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674612, "pid": 3, "tid": 7, "ts": 6302685346526.316, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685341987.380, "dur": 9.490, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674612 + } + }, + { + "ph": "s", "id": 241674612, "pid": 5717, "tid": 5717, "ts": 6302685341987.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685346697.421, "dur": 3.969, + "args": { + "External id": 126308, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674629, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674629, "pid": 3, "tid": 7, "ts": 6302685346697.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342035.480, "dur": 7.480, + "args": { + "External id": 126308, "cbid": 211, "correlation": 241674629 + } + }, + { + "ph": "s", "id": 241674629, "pid": 5717, "tid": 5717, "ts": 6302685342035.480, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346702.062, "dur": 1.184, + "args": { + "External id": 126313, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674646, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674646, "pid": 3, "tid": 7, "ts": 6302685346702.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342066.460, "dur": 5.610, + "args": { + "External id": 126313, "cbid": 211, "correlation": 241674646 + } + }, + { + "ph": "s", "id": 241674646, "pid": 5717, "tid": 5717, "ts": 6302685342066.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346703.886, "dur": 0.992, + "args": { + "External id": 126315, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674656, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674656, "pid": 3, "tid": 7, "ts": 6302685346703.886, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342087.680, "dur": 5.250, + "args": { + "External id": 126315, "cbid": 211, "correlation": 241674656 + } + }, + { + "ph": "s", "id": 241674656, "pid": 5717, "tid": 5717, "ts": 6302685342087.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346705.454, "dur": 1.024, + "args": { + "External id": 126316, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674662, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674662, "pid": 3, "tid": 7, "ts": 6302685346705.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342100.250, "dur": 4.830, + "args": { + "External id": 126316, "cbid": 211, "correlation": 241674662 + } + }, + { + "ph": "s", "id": 241674662, "pid": 5717, "tid": 5717, "ts": 6302685342100.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346707.182, "dur": 0.992, + "args": { + "External id": 126317, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674672, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674672, "pid": 3, "tid": 7, "ts": 6302685346707.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342117.570, "dur": 4.640, + "args": { + "External id": 126317, "cbid": 211, "correlation": 241674672 + } + }, + { + "ph": "s", "id": 241674672, "pid": 5717, "tid": 5717, "ts": 6302685342117.570, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346708.878, "dur": 0.992, + "args": { + "External id": 126318, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674678, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674678, "pid": 3, "tid": 7, "ts": 6302685346708.878, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342127.890, "dur": 4.430, + "args": { + "External id": 126318, "cbid": 211, "correlation": 241674678 + } + }, + { + "ph": "s", "id": 241674678, "pid": 5717, "tid": 5717, "ts": 6302685342127.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685346710.478, "dur": 3.232, + "args": { + "External id": 126319, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674691, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674691, "pid": 3, "tid": 7, "ts": 6302685346710.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342150.490, "dur": 5.280, + "args": { + "External id": 126319, "cbid": 211, "correlation": 241674691 + } + }, + { + "ph": "s", "id": 241674691, "pid": 5717, "tid": 5717, "ts": 6302685342150.490, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346714.446, "dur": 1.216, + "args": { + "External id": 126322, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674697, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674697, "pid": 3, "tid": 7, "ts": 6302685346714.446, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342162.130, "dur": 4.480, + "args": { + "External id": 126322, "cbid": 211, "correlation": 241674697 + } + }, + { + "ph": "s", "id": 241674697, "pid": 5717, "tid": 5717, "ts": 6302685342162.130, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685346716.302, "dur": 0.960, + "args": { + "External id": 126323, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674703, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674703, "pid": 3, "tid": 7, "ts": 6302685346716.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342171.900, "dur": 4.320, + "args": { + "External id": 126323, "cbid": 211, "correlation": 241674703 + } + }, + { + "ph": "s", "id": 241674703, "pid": 5717, "tid": 5717, "ts": 6302685342171.900, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342190.870, "dur": 7.090, + "args": { + "cbid": 138, "correlation": 241674707 + } + }, + { + "ph": "f", "id": 241674707, "pid": 5717, "tid": 423623104, "ts": 6302685342190.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342198.430, "dur": 1.220, + "args": { + "cbid": 138, "correlation": 241674708 + } + }, + { + "ph": "f", "id": 241674708, "pid": 5717, "tid": 423623104, "ts": 6302685342198.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342203.840, "dur": 0.900, + "args": { + "cbid": 138, "correlation": 241674710 + } + }, + { + "ph": "f", "id": 241674710, "pid": 5717, "tid": 423623104, "ts": 6302685342203.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342209.180, "dur": 1.350, + "args": { + "cbid": 138, "correlation": 241674712 + } + }, + { + "ph": "f", "id": 241674712, "pid": 5717, "tid": 423623104, "ts": 6302685342209.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342210.920, "dur": 0.460, + "args": { 
+ "cbid": 138, "correlation": 241674714 + } + }, + { + "ph": "f", "id": 241674714, "pid": 5717, "tid": 423623104, "ts": 6302685342210.920, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342212.610, "dur": 0.530, + "args": { + "cbid": 138, "correlation": 241674715 + } + }, + { + "ph": "f", "id": 241674715, "pid": 5717, "tid": 423623104, "ts": 6302685342212.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342214.990, "dur": 1.740, + "args": { + "cbid": 138, "correlation": 241674716 + } + }, + { + "ph": "f", "id": 241674716, "pid": 5717, "tid": 423623104, "ts": 6302685342214.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342216.870, "dur": 0.430, + "args": { + "cbid": 138, "correlation": 241674717 + } + }, + { + "ph": "f", "id": 241674717, "pid": 5717, "tid": 423623104, "ts": 6302685342216.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342218.220, "dur": 0.490, + "args": { + "cbid": 138, "correlation": 241674718 + } + }, + { + "ph": "f", "id": 241674718, "pid": 5717, "tid": 423623104, "ts": 6302685342218.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342220.680, "dur": 1.410, + "args": { + "cbid": 138, "correlation": 241674719 + } + }, + { + "ph": "f", "id": 241674719, "pid": 5717, "tid": 423623104, "ts": 6302685342220.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342222.240, 
"dur": 0.440, + "args": { + "cbid": 138, "correlation": 241674720 + } + }, + { + "ph": "f", "id": 241674720, "pid": 5717, "tid": 423623104, "ts": 6302685342222.240, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685342223.680, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 241674721 + } + }, + { + "ph": "f", "id": 241674721, "pid": 5717, "tid": 423623104, "ts": 6302685342223.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685346717.998, "dur": 233.089, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674729, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241674729, "pid": 3, "tid": 7, "ts": 6302685346717.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342265.200, "dur": 10.770, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674729 + } + }, + { + "ph": "s", "id": 241674729, "pid": 5717, "tid": 5717, "ts": 6302685342265.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685342318.819, "dur": 0.611, + "args": { + "External id": 126327, "cbid": 200, "correlation": 241674752 + } + }, + { + "ph": "f", "id": 241674752, "pid": 5717, "tid": 5717, "ts": 6302685342318.819, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685346951.919, "dur": 0.800, + "args": { + "External id": 126327, "device": 3, "context": 1, 
"stream": 7, "correlation": 241674755, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241674755, "pid": 3, "tid": 7, "ts": 6302685346951.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685342321.259, "dur": 7.180, + "args": { + "External id": 126327, "cbid": 51, "correlation": 241674755 + } + }, + { + "ph": "s", "id": 241674755, "pid": 5717, "tid": 5717, "ts": 6302685342321.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685346953.871, "dur": 689.541, + "args": { + "External id": 126327, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674756, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674756, "pid": 3, "tid": 7, "ts": 6302685346953.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342328.730, "dur": 7.000, + "args": { + "External id": 126327, "cbid": 307, "correlation": 241674756 + } + }, + { + "ph": "s", "id": 241674756, "pid": 5717, "tid": 5717, "ts": 6302685342328.730, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685347644.052, "dur": 2.976, + "args": { + "External id": 126330, "device": 3, "context": 1, "stream": 7, "correlation": 241674761, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 241674761, "pid": 3, "tid": 7, "ts": 6302685347644.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685342362.759, "dur": 12.651, + "args": { + "External id": 126330, "cbid": 41, "correlation": 241674761 + } + }, + { + "ph": "s", "id": 241674761, "pid": 5717, "tid": 5717, "ts": 6302685342362.759, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685342417.779, "dur": 0.440, + "args": { + "External id": 126335, "cbid": 200, "correlation": 241674789 + } + }, + { + "ph": "f", "id": 241674789, "pid": 5717, "tid": 5717, "ts": 6302685342417.779, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685347647.668, "dur": 683.173, + "args": { + "External id": 126335, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674792, "registers per thread": 92, "shared 
memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674792, "pid": 3, "tid": 7, "ts": 6302685347647.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342419.849, "dur": 7.660, + "args": { + "External id": 126335, "cbid": 307, "correlation": 241674792 + } + }, + { + "ph": "s", "id": 241674792, "pid": 5717, "tid": 5717, "ts": 6302685342419.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685348331.481, "dur": 221.634, + "args": { + "External id": 126336, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674797, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241674797, "pid": 3, "tid": 7, "ts": 6302685348331.481, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342442.319, "dur": 6.000, + "args": { + "External id": 126336, "cbid": 211, "correlation": 241674797 + } + }, + { + "ph": "s", "id": 241674797, "pid": 5717, "tid": 5717, "ts": 6302685342442.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685342494.889, "dur": 1.360, + "args": { + "External id": 126344, "cbid": 210, "correlation": 241674823 + } + }, + { + "ph": "f", "id": 241674823, "pid": 5717, "tid": 5717, "ts": 6302685342494.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685348553.819, "dur": 624.357, + "args": { + "External id": 126344, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674824, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674824, "pid": 3, "tid": 7, "ts": 6302685348553.819, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342499.849, "dur": 7.530, + "args": { + "External id": 126344, "cbid": 211, "correlation": 241674824 + } + }, + { + "ph": "s", "id": 241674824, "pid": 5717, "tid": 5717, "ts": 6302685342499.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685349178.816, "dur": 171.329, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674843, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674843, "pid": 3, "tid": 7, "ts": 6302685349178.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342612.489, "dur": 9.310, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674843 + } + }, + { + "ph": "s", "id": 241674843, "pid": 5717, "tid": 5717, "ts": 6302685342612.489, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685349350.849, "dur": 4.064, + "args": { + "External id": 126354, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674860, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674860, "pid": 3, "tid": 7, "ts": 6302685349350.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342657.839, "dur": 7.350, + "args": { + "External id": 126354, "cbid": 211, "correlation": 241674860 + } + }, + { + "ph": "s", "id": 241674860, "pid": 5717, "tid": 5717, "ts": 6302685342657.839, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349355.649, "dur": 1.184, + "args": { + "External id": 126359, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674877, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674877, "pid": 3, "tid": 7, "ts": 6302685349355.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342689.929, "dur": 5.320, + "args": { + "External id": 126359, "cbid": 211, "correlation": 241674877 + } + }, + { + "ph": "s", "id": 241674877, "pid": 5717, "tid": 5717, "ts": 6302685342689.929, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349357.473, "dur": 0.960, + "args": { + "External id": 126361, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674887, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674887, "pid": 3, "tid": 7, "ts": 6302685349357.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342709.249, "dur": 4.890, + "args": { + "External id": 126361, "cbid": 211, "correlation": 241674887 + } + }, + { + "ph": "s", "id": 241674887, "pid": 5717, "tid": 5717, "ts": 6302685342709.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349359.169, "dur": 1.024, + "args": { + "External id": 126362, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674893, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674893, "pid": 3, "tid": 7, "ts": 6302685349359.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342721.439, "dur": 4.480, + "args": { + "External id": 126362, "cbid": 211, "correlation": 241674893 + } + }, + { + "ph": "s", "id": 241674893, "pid": 5717, "tid": 5717, "ts": 6302685342721.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349360.865, "dur": 1.024, + "args": { + "External id": 126363, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674903, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674903, "pid": 3, "tid": 7, "ts": 6302685349360.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342736.109, "dur": 4.460, + "args": { + "External id": 126363, "cbid": 211, "correlation": 241674903 + } + }, + { + "ph": "s", "id": 241674903, "pid": 5717, "tid": 5717, "ts": 6302685342736.109, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349362.593, "dur": 0.992, + "args": { + "External id": 126364, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674909, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674909, "pid": 3, "tid": 7, "ts": 6302685349362.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342746.359, "dur": 4.320, + "args": { + "External id": 126364, "cbid": 211, "correlation": 241674909 + } + }, + { + "ph": "s", "id": 241674909, "pid": 5717, "tid": 5717, "ts": 6302685342746.359, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685349364.193, "dur": 3.232, + "args": { + "External id": 126365, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674922, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674922, "pid": 3, "tid": 7, "ts": 6302685349364.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342769.058, "dur": 5.300, + "args": { + "External id": 126365, "cbid": 211, "correlation": 241674922 + } + }, + { + "ph": "s", "id": 241674922, "pid": 5717, "tid": 5717, "ts": 6302685342769.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349368.001, "dur": 1.248, + "args": { + "External id": 126368, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674928, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674928, "pid": 3, "tid": 7, "ts": 6302685349368.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342780.629, "dur": 4.680, + "args": { + "External id": 126368, "cbid": 211, "correlation": 241674928 + } + }, + { + "ph": "s", "id": 241674928, "pid": 5717, "tid": 5717, "ts": 6302685342780.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685349369.857, "dur": 0.992, + "args": { + "External id": 126369, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674934, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241674934, "pid": 3, "tid": 7, "ts": 6302685349369.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342790.738, "dur": 3.940, + "args": { + "External id": 126369, "cbid": 211, "correlation": 241674934 + } + }, + { + "ph": "s", "id": 241674934, "pid": 5717, "tid": 5717, "ts": 6302685342790.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685349371.425, "dur": 233.218, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674948, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241674948, "pid": 3, "tid": 7, "ts": 6302685349371.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342879.788, "dur": 8.370, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241674948 + } + }, + { + "ph": "s", "id": 241674948, "pid": 5717, "tid": 5717, "ts": 6302685342879.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685342922.518, "dur": 0.580, + "args": { + "External id": 126373, "cbid": 200, "correlation": 241674971 + } + }, + { + "ph": "f", "id": 241674971, "pid": 5717, "tid": 5717, "ts": 6302685342922.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685349605.571, "dur": 0.768, + "args": { + "External id": 126373, "device": 3, "context": 1, "stream": 7, "correlation": 241674974, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241674974, "pid": 3, "tid": 7, "ts": 6302685349605.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685342924.958, "dur": 6.700, + "args": { + "External id": 126373, "cbid": 51, "correlation": 241674974 + } + }, + { + "ph": "s", "id": 241674974, "pid": 5717, "tid": 5717, "ts": 6302685342924.958, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685349607.491, "dur": 685.990, + "args": { + "External id": 126373, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241674975, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241674975, "pid": 3, "tid": 7, "ts": 6302685349607.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685342931.908, "dur": 5.980, + "args": { + "External id": 126373, "cbid": 307, "correlation": 241674975 + } + }, + { + "ph": "s", "id": 241674975, "pid": 5717, "tid": 5717, "ts": 6302685342931.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685350294.185, "dur": 2.976, + "args": { + "External id": 126376, "device": 3, "context": 1, "stream": 7, "correlation": 241674980, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 241674980, "pid": 3, "tid": 7, "ts": 6302685350294.185, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685342962.328, "dur": 12.380, + "args": { + "External id": 126376, "cbid": 41, "correlation": 241674980 + } + }, + { + "ph": "s", "id": 241674980, "pid": 5717, "tid": 5717, "ts": 6302685342962.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685343016.918, "dur": 0.450, + "args": { + "External id": 126381, "cbid": 200, "correlation": 241675008 + } + }, + { + "ph": "f", "id": 241675008, "pid": 5717, "tid": 5717, "ts": 6302685343016.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685350297.801, "dur": 685.189, + "args": { + "External id": 126381, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241675011, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675011, "pid": 3, "tid": 7, "ts": 6302685350297.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343019.028, "dur": 7.450, + "args": { + "External id": 126381, "cbid": 307, "correlation": 241675011 + } + }, + { + "ph": "s", "id": 241675011, "pid": 5717, "tid": 5717, "ts": 6302685343019.028, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685350983.662, "dur": 220.513, + "args": { + "External id": 126382, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675016, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241675016, "pid": 3, "tid": 7, "ts": 6302685350983.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343040.928, "dur": 6.200, + "args": { + "External id": 126382, "cbid": 211, "correlation": 241675016 + } + }, + { + "ph": "s", "id": 241675016, "pid": 5717, "tid": 5717, "ts": 6302685343040.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685343092.838, "dur": 1.310, + "args": { + "External id": 126390, "cbid": 210, "correlation": 241675042 + } + }, + { + "ph": "f", "id": 241675042, "pid": 5717, "tid": 5717, "ts": 6302685343092.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685351204.911, "dur": 631.077, + "args": { + "External id": 126390, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675043, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675043, "pid": 3, "tid": 7, "ts": 6302685351204.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343097.678, "dur": 7.950, + "args": { + "External id": 126390, "cbid": 211, "correlation": 241675043 + } + }, + { + "ph": "s", "id": 241675043, "pid": 5717, "tid": 5717, "ts": 6302685343097.678, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685351836.692, "dur": 170.817, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675062, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675062, "pid": 3, "tid": 7, "ts": 6302685351836.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343209.937, "dur": 9.151, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675062 + } + }, + { + "ph": "s", "id": 241675062, "pid": 5717, "tid": 5717, "ts": 6302685343209.937, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685352008.149, "dur": 4.257, + "args": { + "External id": 126400, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675079, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675079, "pid": 3, "tid": 7, "ts": 6302685352008.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343255.237, "dur": 7.720, + "args": { + "External id": 126400, "cbid": 211, "correlation": 241675079 + } + }, + { + "ph": "s", "id": 241675079, "pid": 5717, "tid": 5717, "ts": 6302685343255.237, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352013.142, "dur": 1.184, + "args": { + "External id": 126405, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675096, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675096, "pid": 3, "tid": 7, "ts": 6302685352013.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343285.477, "dur": 5.191, + "args": { + "External id": 126405, "cbid": 211, "correlation": 241675096 + } + }, + { + "ph": "s", "id": 241675096, "pid": 5717, "tid": 5717, "ts": 6302685343285.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352014.998, "dur": 1.024, + "args": { + "External id": 126407, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675106, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675106, "pid": 3, "tid": 7, "ts": 6302685352014.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343330.837, "dur": 5.750, + "args": { + "External id": 126407, "cbid": 211, "correlation": 241675106 + } + }, + { + "ph": "s", "id": 241675106, "pid": 5717, "tid": 5717, "ts": 6302685343330.837, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352016.758, "dur": 1.056, + "args": { + "External id": 126408, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675112, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675112, "pid": 3, "tid": 7, "ts": 6302685352016.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343344.637, "dur": 4.570, + "args": { + "External id": 126408, "cbid": 211, "correlation": 241675112 + } + }, + { + "ph": "s", "id": 241675112, "pid": 5717, "tid": 5717, "ts": 6302685343344.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352018.550, "dur": 1.024, + "args": { + "External id": 126409, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675122, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675122, "pid": 3, "tid": 7, "ts": 6302685352018.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343359.877, "dur": 4.540, + "args": { + "External id": 126409, "cbid": 211, "correlation": 241675122 + } + }, + { + "ph": "s", "id": 241675122, "pid": 5717, "tid": 5717, "ts": 6302685343359.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352020.310, "dur": 1.024, + "args": { + "External id": 126410, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675128, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675128, "pid": 3, "tid": 7, "ts": 6302685352020.310, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343370.077, "dur": 4.250, + "args": { + "External id": 126410, "cbid": 211, "correlation": 241675128 + } + }, + { + "ph": "s", "id": 241675128, "pid": 5717, "tid": 5717, "ts": 6302685343370.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685352021.974, "dur": 3.456, + "args": { + "External id": 126411, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675141, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675141, "pid": 3, "tid": 7, "ts": 6302685352021.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343391.897, "dur": 5.290, + "args": { + "External id": 126411, "cbid": 211, "correlation": 241675141 + } + }, + { + "ph": "s", "id": 241675141, "pid": 5717, "tid": 5717, "ts": 6302685343391.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352026.070, "dur": 1.248, + "args": { + "External id": 126414, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675147, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675147, "pid": 3, "tid": 7, "ts": 6302685352026.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343403.907, "dur": 4.290, + "args": { + "External id": 126414, "cbid": 211, "correlation": 241675147 + } + }, + { + "ph": "s", "id": 241675147, "pid": 5717, "tid": 5717, "ts": 6302685343403.907, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685352027.990, "dur": 0.992, + "args": { + "External id": 126415, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675153, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675153, "pid": 3, "tid": 7, "ts": 6302685352027.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343413.467, "dur": 3.980, + "args": { + "External id": 126415, "cbid": 211, "correlation": 241675153 + } + }, + { + "ph": "s", "id": 241675153, "pid": 5717, "tid": 5717, "ts": 6302685343413.467, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685352029.622, "dur": 233.121, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675167, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241675167, "pid": 3, "tid": 7, "ts": 6302685352029.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343505.377, "dur": 8.590, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675167 + } + }, + { + "ph": "s", "id": 241675167, "pid": 5717, "tid": 5717, "ts": 6302685343505.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685343546.727, "dur": 0.560, + "args": { + "External id": 126419, "cbid": 200, "correlation": 241675190 + } + }, + { + "ph": "f", "id": 241675190, "pid": 5717, "tid": 5717, "ts": 6302685343546.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685352263.607, "dur": 0.832, + "args": { + "External id": 126419, "device": 3, "context": 1, "stream": 7, "correlation": 241675193, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 241675193, "pid": 3, "tid": 7, "ts": 6302685352263.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685343549.057, "dur": 8.060, + "args": { + "External id": 126419, "cbid": 51, "correlation": 241675193 + } + }, + { + "ph": "s", "id": 241675193, "pid": 5717, "tid": 5717, "ts": 6302685343549.057, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685352265.623, "dur": 690.597, + "args": { + "External id": 126419, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675194, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675194, "pid": 3, "tid": 7, "ts": 6302685352265.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343557.347, "dur": 6.170, + "args": { + "External id": 126419, "cbid": 307, "correlation": 241675194 + } + }, + { + "ph": "s", "id": 241675194, "pid": 5717, "tid": 5717, "ts": 6302685343557.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685352956.860, "dur": 3.008, + "args": { + "External id": 126422, "device": 3, "context": 1, "stream": 7, "correlation": 241675199, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 241675199, "pid": 3, "tid": 7, "ts": 6302685352956.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685343588.847, "dur": 12.530, + "args": { + "External id": 126422, "cbid": 41, "correlation": 241675199 + } + }, + { + "ph": "s", "id": 241675199, "pid": 5717, "tid": 5717, "ts": 6302685343588.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685343641.196, "dur": 0.440, + "args": { + "External id": 126427, "cbid": 200, "correlation": 241675227 + } + }, + { + "ph": "f", "id": 241675227, "pid": 5717, "tid": 5717, "ts": 6302685343641.196, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685352960.540, "dur": 690.502, + "args": { + "External id": 126427, "queued": 
0, "device": 3, "context": 1, "stream": 7, "correlation": 241675230, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675230, "pid": 3, "tid": 7, "ts": 6302685352960.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343643.247, "dur": 7.660, + "args": { + "External id": 126427, "cbid": 307, "correlation": 241675230 + } + }, + { + "ph": "s", "id": 241675230, "pid": 5717, "tid": 5717, "ts": 6302685343643.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685353651.650, "dur": 221.186, + "args": { + "External id": 126428, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675235, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241675235, "pid": 3, "tid": 7, "ts": 6302685353651.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343664.996, "dur": 6.260, + "args": { + "External id": 126428, "cbid": 211, "correlation": 241675235 + } + }, + { + "ph": "s", "id": 241675235, "pid": 5717, "tid": 5717, "ts": 6302685343664.996, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685343717.836, "dur": 1.331, + "args": { + "External id": 126436, "cbid": 210, "correlation": 241675261 + } + }, + { + "ph": "f", "id": 241675261, "pid": 5717, "tid": 5717, "ts": 6302685343717.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685353873.476, "dur": 641.636, + "args": { + "External id": 126436, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675262, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675262, "pid": 3, "tid": 7, "ts": 6302685353873.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343722.716, "dur": 8.011, + "args": { + "External id": 126436, "cbid": 211, "correlation": 241675262 + } + }, + { + "ph": "s", "id": 241675262, "pid": 5717, "tid": 5717, "ts": 6302685343722.716, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685354515.784, "dur": 171.394, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675281, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675281, "pid": 3, "tid": 7, "ts": 6302685354515.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343833.746, "dur": 9.060, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675281 + } + }, + { + "ph": "s", "id": 241675281, "pid": 5717, "tid": 5717, "ts": 6302685343833.746, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685354687.850, "dur": 4.096, + "args": { + "External id": 126446, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675298, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675298, "pid": 3, "tid": 7, "ts": 6302685354687.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343879.656, "dur": 7.430, + "args": { + "External id": 126446, "cbid": 211, "correlation": 241675298 + } + }, + { + "ph": "s", "id": 241675298, "pid": 5717, "tid": 5717, "ts": 6302685343879.656, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354692.682, "dur": 1.216, + "args": { + "External id": 126451, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675315, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675315, "pid": 3, "tid": 7, "ts": 6302685354692.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343911.056, "dur": 5.110, + "args": { + "External id": 126451, "cbid": 211, "correlation": 241675315 + } + }, + { + "ph": "s", "id": 241675315, "pid": 5717, "tid": 5717, "ts": 6302685343911.056, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354694.538, "dur": 1.024, + "args": { + "External id": 126453, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675325, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675325, "pid": 3, "tid": 7, "ts": 6302685354694.538, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343930.536, "dur": 4.870, + "args": { + "External id": 126453, "cbid": 211, "correlation": 241675325 + } + }, + { + "ph": "s", "id": 241675325, "pid": 5717, "tid": 5717, "ts": 6302685343930.536, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354696.170, "dur": 1.056, + "args": { + "External id": 126454, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675331, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675331, "pid": 3, "tid": 7, "ts": 6302685354696.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343942.886, "dur": 4.590, + "args": { + "External id": 126454, "cbid": 211, "correlation": 241675331 + } + }, + { + "ph": "s", "id": 241675331, "pid": 5717, "tid": 5717, "ts": 6302685343942.886, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354697.930, "dur": 1.056, + "args": { + "External id": 126455, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675341, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675341, "pid": 3, "tid": 7, "ts": 6302685354697.930, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343959.486, "dur": 4.420, + "args": { + "External id": 126455, "cbid": 211, "correlation": 241675341 + } + }, + { + "ph": "s", "id": 241675341, "pid": 5717, "tid": 5717, "ts": 6302685343959.486, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354699.722, "dur": 1.024, + "args": { + "External id": 126456, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675347, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675347, "pid": 3, "tid": 7, "ts": 6302685354699.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343969.946, "dur": 4.300, + "args": { + "External id": 126456, "cbid": 211, "correlation": 241675347 + } + }, + { + "ph": "s", "id": 241675347, "pid": 5717, "tid": 5717, "ts": 6302685343969.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685354701.386, "dur": 3.328, + "args": { + "External id": 126457, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675360, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675360, "pid": 3, "tid": 7, "ts": 6302685354701.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685343990.256, "dur": 4.900, + "args": { + "External id": 126457, "cbid": 211, "correlation": 241675360 + } + }, + { + "ph": "s", "id": 241675360, "pid": 5717, "tid": 5717, "ts": 6302685343990.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354705.354, "dur": 1.248, + "args": { + "External id": 126460, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675366, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675366, "pid": 3, "tid": 7, "ts": 6302685354705.354, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344001.546, "dur": 4.230, + "args": { + "External id": 126460, "cbid": 211, "correlation": 241675366 + } + }, + { + "ph": "s", "id": 241675366, "pid": 5717, "tid": 5717, "ts": 6302685344001.546, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685354707.242, "dur": 1.024, + "args": { + "External id": 126461, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675372, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675372, "pid": 3, "tid": 7, "ts": 6302685354707.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344011.966, "dur": 4.070, + "args": { + "External id": 126461, "cbid": 211, "correlation": 241675372 + } + }, + { + "ph": "s", "id": 241675372, "pid": 5717, "tid": 5717, "ts": 6302685344011.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685354708.874, "dur": 233.601, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675386, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241675386, "pid": 3, "tid": 7, "ts": 6302685354708.874, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344103.335, "dur": 8.791, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675386 + } + }, + { + "ph": "s", "id": 241675386, "pid": 5717, "tid": 5717, "ts": 6302685344103.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685344159.006, "dur": 0.560, + "args": { + "External id": 126465, "cbid": 200, "correlation": 241675409 + } + }, + { + "ph": "f", "id": 241675409, "pid": 5717, "tid": 5717, "ts": 6302685344159.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685354943.275, "dur": 0.800, + "args": { + "External id": 126465, "device": 3, "context": 1, "stream": 7, "correlation": 241675412, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241675412, "pid": 3, "tid": 7, "ts": 6302685354943.275, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685344161.386, "dur": 7.929, + "args": { + "External id": 126465, "cbid": 51, "correlation": 241675412 + } + }, + { + "ph": "s", "id": 241675412, "pid": 5717, "tid": 5717, "ts": 6302685344161.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685354944.843, "dur": 688.229, + "args": { + "External id": 126465, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675413, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675413, "pid": 3, "tid": 7, "ts": 6302685354944.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344169.535, "dur": 6.420, + "args": { + "External id": 126465, "cbid": 307, "correlation": 241675413 + } + }, + { + "ph": "s", "id": 241675413, "pid": 5717, "tid": 5717, "ts": 6302685344169.535, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685355633.776, "dur": 3.008, + "args": { + "External id": 126468, "device": 3, "context": 1, "stream": 7, "correlation": 241675418, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 241675418, "pid": 3, "tid": 7, "ts": 6302685355633.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685344201.435, "dur": 12.300, + "args": { + "External id": 126468, "cbid": 41, "correlation": 241675418 + } + }, + { + "ph": "s", "id": 241675418, "pid": 5717, "tid": 5717, "ts": 6302685344201.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685344252.805, "dur": 0.440, + "args": { + "External id": 126473, "cbid": 200, "correlation": 241675446 + } + }, + { + "ph": "f", "id": 241675446, "pid": 5717, "tid": 5717, "ts": 6302685344252.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685355637.488, "dur": 684.998, + "args": { + "External id": 126473, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241675449, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675449, "pid": 3, "tid": 7, "ts": 6302685355637.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344254.825, "dur": 7.340, + "args": { + "External id": 126473, "cbid": 307, "correlation": 241675449 + } + }, + { + "ph": "s", "id": 241675449, "pid": 5717, "tid": 5717, "ts": 6302685344254.825, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685356323.222, "dur": 221.281, + "args": { + "External id": 126474, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675454, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241675454, "pid": 3, "tid": 7, "ts": 6302685356323.222, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344276.585, "dur": 6.110, + "args": { + "External id": 126474, "cbid": 211, "correlation": 241675454 + } + }, + { + "ph": "s", "id": 241675454, "pid": 5717, "tid": 5717, "ts": 6302685344276.585, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685344338.675, "dur": 1.400, + "args": { + "External id": 126482, "cbid": 210, "correlation": 241675480 + } + }, + { + "ph": "f", "id": 241675480, "pid": 5717, "tid": 5717, "ts": 6302685344338.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685356545.239, "dur": 645.029, + "args": { + "External id": 126482, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675481, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675481, "pid": 3, "tid": 7, "ts": 6302685356545.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344343.605, "dur": 8.400, + "args": { + "External id": 126482, "cbid": 211, "correlation": 241675481 + } + }, + { + "ph": "s", "id": 241675481, "pid": 5717, "tid": 5717, "ts": 6302685344343.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685357190.972, "dur": 170.689, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675500, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675500, "pid": 3, "tid": 7, "ts": 6302685357190.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344457.475, "dur": 9.180, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675500 + } + }, + { + "ph": "s", "id": 241675500, "pid": 5717, "tid": 5717, "ts": 6302685344457.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685357362.301, "dur": 4.129, + "args": { + "External id": 126492, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675517, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675517, "pid": 3, "tid": 7, "ts": 6302685357362.301, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344501.795, "dur": 7.260, + "args": { + "External id": 126492, "cbid": 211, "correlation": 241675517 + } + }, + { + "ph": "s", "id": 241675517, "pid": 5717, "tid": 5717, "ts": 6302685344501.795, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357367.134, "dur": 1.280, + "args": { + "External id": 126497, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675534, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675534, "pid": 3, "tid": 7, "ts": 6302685357367.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344532.414, "dur": 5.291, + "args": { + "External id": 126497, "cbid": 211, "correlation": 241675534 + } + }, + { + "ph": "s", "id": 241675534, "pid": 5717, "tid": 5717, "ts": 6302685344532.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357369.150, "dur": 1.024, + "args": { + "External id": 126499, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675544, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675544, "pid": 3, "tid": 7, "ts": 6302685357369.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344553.034, "dur": 4.951, + "args": { + "External id": 126499, "cbid": 211, "correlation": 241675544 + } + }, + { + "ph": "s", "id": 241675544, "pid": 5717, "tid": 5717, "ts": 6302685344553.034, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357370.910, "dur": 1.056, + "args": { + "External id": 126500, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675550, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675550, "pid": 3, "tid": 7, "ts": 6302685357370.910, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344565.225, "dur": 4.909, + "args": { + "External id": 126500, "cbid": 211, "correlation": 241675550 + } + }, + { + "ph": "s", "id": 241675550, "pid": 5717, "tid": 5717, "ts": 6302685344565.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357372.670, "dur": 1.056, + "args": { + "External id": 126501, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675560, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675560, "pid": 3, "tid": 7, "ts": 6302685357372.670, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344580.485, "dur": 4.549, + "args": { + "External id": 126501, "cbid": 211, "correlation": 241675560 + } + }, + { + "ph": "s", "id": 241675560, "pid": 5717, "tid": 5717, "ts": 6302685344580.485, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357374.430, "dur": 1.056, + "args": { + "External id": 126502, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675566, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675566, "pid": 3, "tid": 7, "ts": 6302685357374.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344590.925, "dur": 4.420, + "args": { + "External id": 126502, "cbid": 211, "correlation": 241675566 + } + }, + { + "ph": "s", "id": 241675566, "pid": 5717, "tid": 5717, "ts": 6302685344590.925, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685357376.126, "dur": 3.328, + "args": { + "External id": 126503, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675579, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675579, "pid": 3, "tid": 7, "ts": 6302685357376.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344610.685, "dur": 5.220, + "args": { + "External id": 126503, "cbid": 211, "correlation": 241675579 + } + }, + { + "ph": "s", "id": 241675579, "pid": 5717, "tid": 5717, "ts": 6302685344610.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357380.062, "dur": 1.280, + "args": { + "External id": 126506, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675585, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675585, "pid": 3, "tid": 7, "ts": 6302685357380.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344623.154, "dur": 4.490, + "args": { + "External id": 126506, "cbid": 211, "correlation": 241675585 + } + }, + { + "ph": "s", "id": 241675585, "pid": 5717, "tid": 5717, "ts": 6302685344623.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685357381.982, "dur": 1.024, + "args": { + "External id": 126507, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675591, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675591, "pid": 3, "tid": 7, "ts": 6302685357381.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344632.784, "dur": 3.860, + "args": { + "External id": 126507, "cbid": 211, "correlation": 241675591 + } + }, + { + "ph": "s", "id": 241675591, "pid": 5717, "tid": 5717, "ts": 6302685344632.784, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685357383.614, "dur": 233.026, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675605, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241675605, "pid": 3, "tid": 7, "ts": 6302685357383.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344721.534, "dur": 8.720, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675605 + } + }, + { + "ph": "s", "id": 241675605, "pid": 5717, "tid": 5717, "ts": 6302685344721.534, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685344763.314, "dur": 0.580, + "args": { + "External id": 126511, "cbid": 200, "correlation": 241675628 + } + }, + { + "ph": "f", "id": 241675628, "pid": 5717, "tid": 5717, "ts": 6302685344763.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685357617.536, "dur": 0.800, + "args": { + "External id": 126511, "device": 3, "context": 1, "stream": 7, "correlation": 241675631, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241675631, "pid": 3, "tid": 7, "ts": 6302685357617.536, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685344765.664, "dur": 6.980, + "args": { + "External id": 126511, "cbid": 51, "correlation": 241675631 + } + }, + { + "ph": "s", "id": 241675631, "pid": 5717, "tid": 5717, "ts": 6302685344765.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685357619.520, "dur": 690.757, + "args": { + "External id": 126511, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675632, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675632, "pid": 3, "tid": 7, "ts": 6302685357619.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344772.894, "dur": 6.290, + "args": { + "External id": 126511, "cbid": 307, "correlation": 241675632 + } + }, + { + "ph": "s", "id": 241675632, "pid": 5717, "tid": 5717, "ts": 6302685344772.894, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685358310.981, "dur": 2.976, + "args": { + "External id": 126514, "device": 3, "context": 1, "stream": 7, "correlation": 241675637, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 241675637, "pid": 3, "tid": 7, "ts": 6302685358310.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685344813.994, "dur": 11.850, + "args": { + "External id": 126514, "cbid": 41, "correlation": 241675637 + } + }, + { + "ph": "s", "id": 241675637, "pid": 5717, "tid": 5717, "ts": 6302685344813.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685344864.504, "dur": 0.420, + "args": { + "External id": 126519, "cbid": 200, "correlation": 241675665 + } + }, + { + "ph": "f", "id": 241675665, "pid": 5717, "tid": 5717, "ts": 6302685344864.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685358314.661, "dur": 685.509, + "args": { + "External id": 126519, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241675668, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675668, "pid": 3, "tid": 7, "ts": 6302685358314.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344866.494, "dur": 7.500, + "args": { + "External id": 126519, "cbid": 307, "correlation": 241675668 + } + }, + { + "ph": "s", "id": 241675668, "pid": 5717, "tid": 5717, "ts": 6302685344866.494, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685359000.906, "dur": 221.281, + "args": { + "External id": 126520, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675673, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241675673, "pid": 3, "tid": 7, "ts": 6302685359000.906, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344888.344, "dur": 5.960, + "args": { + "External id": 126520, "cbid": 211, "correlation": 241675673 + } + }, + { + "ph": "s", "id": 241675673, "pid": 5717, "tid": 5717, "ts": 6302685344888.344, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5717, "tid": 5717, + "ts": 6302685344939.434, "dur": 1.280, + "args": { + "External id": 126528, "cbid": 210, "correlation": 241675699 + } + }, + { + "ph": "f", "id": 241675699, "pid": 5717, "tid": 5717, "ts": 6302685344939.434, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 3, "tid": 7, + "ts": 6302685359222.891, "dur": 644.486, + "args": { + "External id": 126528, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675700, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675700, "pid": 3, "tid": 7, "ts": 6302685359222.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685344944.274, "dur": 7.670, + "args": { + "External id": 126528, "cbid": 211, "correlation": 241675700 + } + }, + { + "ph": "s", "id": 241675700, "pid": 5717, "tid": 5717, "ts": 6302685344944.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 3, "tid": 7, + "ts": 6302685359868.049, "dur": 171.393, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675719, "registers per thread": 141, "shared memory": 32, "blocks per SM": 16.000000, "warps per SM": 128.000000, "grid": [2048, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675719, "pid": 3, "tid": 7, "ts": 6302685359868.049, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345056.504, "dur": 9.389, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675719 + } + }, + { + "ph": "s", "id": 241675719, "pid": 5717, "tid": 5717, "ts": 6302685345056.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685360040.114, "dur": 4.128, + "args": { + "External id": 126538, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675736, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675736, "pid": 3, "tid": 7, "ts": 6302685360040.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345099.333, "dur": 7.170, + "args": { + "External id": 126538, "cbid": 211, "correlation": 241675736 + } + }, + { + "ph": "s", "id": 241675736, "pid": 5717, "tid": 5717, "ts": 6302685345099.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360044.946, "dur": 1.216, + "args": { + "External id": 126543, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675753, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675753, "pid": 3, "tid": 7, "ts": 6302685360044.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345131.793, "dur": 5.070, + "args": { + "External id": 126543, "cbid": 211, "correlation": 241675753 + } + }, + { + "ph": "s", "id": 241675753, "pid": 5717, "tid": 5717, "ts": 6302685345131.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360046.802, "dur": 1.024, + "args": { + "External id": 126545, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675763, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675763, "pid": 3, "tid": 7, "ts": 6302685360046.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345150.853, "dur": 5.220, + "args": { + "External id": 126545, "cbid": 211, "correlation": 241675763 + } + }, + { + "ph": "s", "id": 241675763, "pid": 5717, "tid": 5717, "ts": 6302685345150.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360048.434, "dur": 1.024, + "args": { + "External id": 126546, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675769, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675769, "pid": 3, "tid": 7, "ts": 6302685360048.434, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345167.063, "dur": 5.190, + "args": { + "External id": 126546, "cbid": 211, "correlation": 241675769 + } + }, + { + "ph": "s", "id": 241675769, "pid": 5717, "tid": 5717, "ts": 6302685345167.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360050.194, "dur": 1.024, + "args": { + "External id": 126547, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675779, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675779, "pid": 3, "tid": 7, "ts": 6302685360050.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345184.393, "dur": 8.020, + "args": { + "External id": 126547, "cbid": 211, "correlation": 241675779 + } + }, + { + "ph": "s", "id": 241675779, "pid": 5717, "tid": 5717, "ts": 6302685345184.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360051.954, "dur": 1.056, + "args": { + "External id": 126548, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675785, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675785, "pid": 3, "tid": 7, "ts": 6302685360051.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345200.913, "dur": 6.990, + "args": { + "External id": 126548, "cbid": 211, "correlation": 241675785 + } + }, + { + "ph": "s", "id": 241675785, "pid": 5717, "tid": 5717, "ts": 6302685345200.913, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685360053.650, "dur": 3.328, + "args": { + "External id": 126549, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675798, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675798, "pid": 3, "tid": 7, "ts": 6302685360053.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345238.373, "dur": 8.240, + "args": { + "External id": 126549, "cbid": 211, "correlation": 241675798 + } + }, + { + "ph": "s", "id": 241675798, "pid": 5717, "tid": 5717, "ts": 6302685345238.373, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360057.618, "dur": 1.248, + "args": { + "External id": 126552, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675804, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675804, "pid": 3, "tid": 7, "ts": 6302685360057.618, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345257.313, "dur": 7.090, + "args": { + "External id": 126552, "cbid": 211, "correlation": 241675804 + } + }, + { + "ph": "s", "id": 241675804, "pid": 5717, "tid": 5717, "ts": 6302685345257.313, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685360059.506, "dur": 1.024, + "args": { + "External id": 126553, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675810, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675810, "pid": 3, "tid": 7, "ts": 6302685360059.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345269.993, "dur": 4.080, + "args": { + "External id": 126553, "cbid": 211, "correlation": 241675810 + } + }, + { + "ph": "s", "id": 241675810, "pid": 5717, "tid": 5717, "ts": 6302685345269.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 3, "tid": 7, + "ts": 6302685360061.138, "dur": 233.186, + "args": { + "External id": 126169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675824, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 241675824, "pid": 3, "tid": 7, "ts": 6302685360061.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345378.783, "dur": 9.540, + "args": { + "External id": 126169, "cbid": 307, "correlation": 241675824 + } + }, + { + "ph": "s", "id": 241675824, "pid": 5717, "tid": 5717, "ts": 6302685345378.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685345437.372, "dur": 0.611, + "args": { + "External id": 126557, "cbid": 200, "correlation": 241675847 + } + }, + { + "ph": "f", "id": 241675847, "pid": 5717, "tid": 5717, "ts": 6302685345437.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685360295.188, "dur": 0.832, + "args": { + "External id": 126557, "device": 3, "context": 1, "stream": 7, "correlation": 241675850, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 241675850, "pid": 3, "tid": 7, "ts": 6302685360295.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 5717, + "ts": 6302685345441.032, "dur": 12.900, + "args": { + "External id": 126557, "cbid": 51, "correlation": 241675850 + } + }, + { + "ph": "s", "id": 241675850, "pid": 5717, "tid": 5717, "ts": 6302685345441.032, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685360297.204, "dur": 689.669, + "args": { + "External id": 126557, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675851, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675851, "pid": 3, "tid": 7, "ts": 6302685360297.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345454.192, "dur": 10.411, + "args": { + "External id": 126557, "cbid": 307, "correlation": 241675851 + } + }, + { + "ph": "s", "id": 241675851, "pid": 5717, "tid": 5717, "ts": 6302685345454.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685360987.513, "dur": 3.008, + "args": { + "External id": 126560, "device": 3, "context": 1, "stream": 7, "correlation": 241675856, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 241675856, "pid": 3, "tid": 7, "ts": 6302685360987.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685345504.023, "dur": 16.099, + "args": { + "External id": 126560, "cbid": 41, "correlation": 241675856 + } + }, + { + "ph": "s", "id": 241675856, "pid": 5717, "tid": 5717, "ts": 6302685345504.023, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 5717, + "ts": 6302685345563.602, "dur": 0.450, + "args": { + "External id": 126565, "cbid": 200, "correlation": 241675884 + } + }, + { + "ph": "f", "id": 241675884, "pid": 5717, "tid": 5717, "ts": 6302685345563.602, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685360991.193, "dur": 686.021, + "args": { + "External id": 126565, "queued": 
0, "device": 3, "context": 1, "stream": 7, "correlation": 241675887, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241675887, "pid": 3, "tid": 7, "ts": 6302685360991.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345565.652, "dur": 7.340, + "args": { + "External id": 126565, "cbid": 307, "correlation": 241675887 + } + }, + { + "ph": "s", "id": 241675887, "pid": 5717, "tid": 5717, "ts": 6302685345565.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685361677.886, "dur": 220.706, + "args": { + "External id": 126566, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675892, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241675892, "pid": 3, "tid": 7, "ts": 6302685361677.886, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345587.522, "dur": 6.220, + "args": { + "External id": 126566, "cbid": 211, "correlation": 241675892 + } + }, + { + "ph": "s", "id": 241675892, "pid": 5717, "tid": 5717, "ts": 6302685345587.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685361899.232, "dur": 5.568, + "args": { + "External id": 126568, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675905, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675905, "pid": 3, "tid": 7, "ts": 6302685361899.232, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345616.942, "dur": 6.730, + "args": { + "External id": 126568, "cbid": 211, "correlation": 241675905 + } + }, + { + "ph": "s", "id": 241675905, "pid": 5717, "tid": 5717, "ts": 6302685345616.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685361905.568, "dur": 159.585, + "args": { + "External id": 126573, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241675918, "pid": 3, "tid": 7, "ts": 6302685361905.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345648.822, "dur": 6.270, + "args": { + "External id": 126573, "cbid": 211, "correlation": 241675918 + } + }, + { + "ph": "s", "id": 241675918, "pid": 5717, "tid": 5717, "ts": 6302685345648.822, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362065.825, "dur": 1.376, + "args": { + "External id": 126578, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675926, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675926, "pid": 3, "tid": 7, "ts": 6302685362065.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345716.702, "dur": 8.490, + "args": { + "External id": 126578, "cbid": 211, "correlation": 241675926 + } + }, + { + "ph": "s", "id": 241675926, "pid": 5717, "tid": 5717, "ts": 6302685345716.702, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362067.873, "dur": 1.344, + "args": { + "External id": 126579, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675932, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675932, "pid": 3, "tid": 7, "ts": 6302685362067.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685345737.132, "dur": 5.050, + "args": { + "External id": 126579, "cbid": 211, "correlation": 241675932 + } + }, + { + "ph": "s", "id": 241675932, "pid": 5717, "tid": 5717, "ts": 6302685345737.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685345927.702, "dur": 3.049, + "args": { + "cbid": 147, "correlation": 241675937 + } + }, + { + "ph": "s", "id": 241675937, "pid": 5717, "tid": 5717, "ts": 6302685345927.702, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685345933.922, "dur": 1.269, + "args": { + "cbid": 147, "correlation": 241675941 + } + }, + { + "ph": "s", "id": 241675941, "pid": 5717, "tid": 5717, "ts": 6302685345933.922, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362069.921, "dur": 0.992, + "args": { + "External id": 126581, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675958, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675958, "pid": 3, "tid": 7, "ts": 6302685362069.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685346005.201, "dur": 13.330, + "args": { + "External id": 126581, "cbid": 211, "correlation": 241675958 + } + }, + { + "ph": "s", "id": 241675958, "pid": 5717, "tid": 5717, "ts": 6302685346005.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362071.521, "dur": 0.896, + "args": { + "External id": 126585, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675971, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675971, "pid": 3, "tid": 7, "ts": 6302685362071.521, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685346090.691, "dur": 8.030, + "args": { + "External id": 126585, "cbid": 211, "correlation": 241675971 + } + }, + { + "ph": "s", "id": 241675971, "pid": 5717, "tid": 5717, "ts": 6302685346090.691, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362073.153, "dur": 1.088, + "args": { + "External id": 126979, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241675987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241675987, "pid": 3, "tid": 7, "ts": 6302685362073.153, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685346666.300, "dur": 33.780, + "args": { + "External id": 126979, "cbid": 211, "correlation": 241675987 + } + }, + { + "ph": "s", "id": 241675987, "pid": 5717, "tid": 6759, "ts": 6302685346666.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685362074.977, "dur": 1.792, + "args": { + "External id": 126985, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676005, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676005, "pid": 3, "tid": 7, "ts": 6302685362074.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685346985.799, "dur": 19.260, + "args": { + "External id": 126985, "cbid": 211, "correlation": 241676005 + } + }, + { + "ph": "s", "id": 241676005, "pid": 5717, "tid": 6759, "ts": 6302685346985.799, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 3, "tid": 7, + "ts": 6302685362077.633, "dur": 0.352, + "args": { + "External id": 126993, "device": 3, "context": 1, "stream": 7, "correlation": 241676023, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 241676023, "pid": 3, "tid": 7, "ts": 6302685362077.633, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685347136.379, "dur": 25.430, + "args": { + "External id": 126993, "cbid": 41, "correlation": 241676023 + } + }, + { + "ph": "s", "id": 241676023, "pid": 5717, "tid": 6759, "ts": 6302685347136.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685347162.489, "dur": 14920.406, + "args": { + "External id": 126993, "cbid": 131, "correlation": 241676024 + } + }, + { + "ph": "s", "id": 241676024, "pid": 5717, "tid": 6759, "ts": 6302685347162.489, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362141.922, "dur": 1.120, + "args": { + "External id": 126997, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676033, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676033, "pid": 3, "tid": 7, "ts": 6302685362141.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362127.635, "dur": 16.130, + "args": { + "External id": 126997, "cbid": 211, "correlation": 241676033 + } + }, + { + "ph": "s", "id": 241676033, "pid": 5717, "tid": 6759, "ts": 6302685362127.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685362186.306, "dur": 0.992, + "args": { + "External id": 127000, "device": 3, "context": 1, "stream": 7, "correlation": 241676039, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 241676039, "pid": 3, "tid": 7, "ts": 6302685362186.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685362170.075, "dur": 15.650, + "args": { + "External id": 127000, "cbid": 41, "correlation": 241676039 + } + }, + { + "ph": "s", "id": 241676039, "pid": 5717, "tid": 6759, "ts": 6302685362170.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685362185.985, "dur": 3.740, + "args": { + "External id": 127000, "cbid": 131, "correlation": 241676040 + } + }, + { + "ph": "s", "id": 241676040, "pid": 5717, "tid": 6759, "ts": 6302685362185.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362332.867, 
"dur": 16.128, + "args": { + "External id": 127011, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676062, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676062, "pid": 3, "tid": 7, "ts": 6302685362332.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362320.994, "dur": 12.480, + "args": { + "External id": 127011, "cbid": 211, "correlation": 241676062 + } + }, + { + "ph": "s", "id": 241676062, "pid": 5717, "tid": 6759, "ts": 6302685362320.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685362396.035, "dur": 44.033, + "args": { + "External id": 127014, "device": 3, "context": 1, "stream": 7, "correlation": 241676069, "bytes": 25165824, "memory bandwidth (GB/s)": 571.5219040265256 + } + }, + { + "ph": "f", "id": 241676069, "pid": 3, "tid": 7, "ts": 6302685362396.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685362358.154, "dur": 38.480, + "args": { + "External id": 127014, "cbid": 41, "correlation": 241676069 + } + }, + { + "ph": "s", "id": 241676069, "pid": 5717, "tid": 6759, "ts": 6302685362358.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362466.020, "dur": 85.281, + "args": { + "External id": 127021, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 
384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676087, "pid": 3, "tid": 7, "ts": 6302685362466.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362457.404, "dur": 7.670, + "args": { + "External id": 127021, "cbid": 211, "correlation": 241676087 + } + }, + { + "ph": "s", "id": 241676087, "pid": 5717, "tid": 6759, "ts": 6302685362457.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685362551.941, "dur": 68.448, + "args": { + "External id": 127024, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676095, "pid": 3, "tid": 7, "ts": 6302685362551.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362489.474, "dur": 9.810, + "args": { + "External id": 127024, "cbid": 211, "correlation": 241676095 + } + }, + { + "ph": "s", "id": 241676095, "pid": 5717, "tid": 6759, "ts": 6302685362489.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362621.125, "dur": 89.121, + "args": { + "External id": 127031, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676114, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676114, "pid": 3, "tid": 7, "ts": 6302685362621.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362533.644, "dur": 5.980, + "args": { + "External id": 127031, "cbid": 211, "correlation": 241676114 + } + }, + { + "ph": "s", "id": 241676114, "pid": 5717, "tid": 6759, "ts": 6302685362533.644, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685362710.950, "dur": 222.369, + "args": { + "External id": 127034, "device": 3, "context": 1, "stream": 7, "correlation": 241676121, "bytes": 100663296, "memory bandwidth (GB/s)": 452.6858330072987 + } + }, + { + "ph": "f", "id": 241676121, "pid": 3, "tid": 7, "ts": 6302685362710.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685362550.424, "dur": 25.210, + "args": { + "External id": 127034, "cbid": 41, "correlation": 241676121 + } + }, + { + "ph": "s", "id": 241676121, "pid": 5717, "tid": 6759, "ts": 6302685362550.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685362933.927, "dur": 99.681, + "args": { + "External id": 127041, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676139, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676139, "pid": 3, "tid": 7, "ts": 6302685362933.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362607.274, "dur": 6.450, + "args": { + "External id": 127041, "cbid": 211, "correlation": 241676139 + } + }, + { + "ph": "s", "id": 241676139, "pid": 5717, "tid": 6759, "ts": 6302685362607.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685363034.312, "dur": 225.346, + "args": { + "External id": 127044, "device": 3, "context": 1, "stream": 7, "correlation": 241676146, "bytes": 100663296, "memory bandwidth (GB/s)": 446.70549288649454 + } + }, + { + "ph": "f", "id": 241676146, "pid": 3, "tid": 7, "ts": 6302685363034.312, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685362626.724, "dur": 9.369, + "args": { + "External id": 127044, "cbid": 41, "correlation": 241676146 + } + }, + { + "ph": "s", "id": 241676146, "pid": 5717, "tid": 6759, "ts": 6302685362626.724, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685363260.362, "dur": 2.528, + "args": { + "External id": 127048, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676164, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676164, "pid": 3, "tid": 7, "ts": 6302685363260.362, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685362676.404, "dur": 11.020, + "args": { + "External id": 127048, "cbid": 211, "correlation": 241676164 + } + }, + { + "ph": "s", "id": 241676164, "pid": 5717, "tid": 6759, "ts": 6302685362676.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 3, "tid": 7, + "ts": 6302685363263.754, "dur": 0.352, + "args": { + "External id": 127056, "device": 3, "context": 1, "stream": 7, "correlation": 241676182, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 241676182, "pid": 3, "tid": 7, "ts": 6302685363263.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685362768.223, "dur": 10.720, + "args": { + "External id": 127056, "cbid": 41, "correlation": 241676182 + } + }, + { + "ph": "s", "id": 241676182, "pid": 5717, "tid": 6759, "ts": 6302685362768.223, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685362779.253, "dur": 488.739, + "args": { + "External id": 127056, "cbid": 131, "correlation": 241676183 + } + }, + { + "ph": "s", "id": 241676183, "pid": 5717, "tid": 6759, "ts": 6302685362779.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685363317.643, "dur": 1.344, + "args": { + "External id": 127060, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676192, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676192, "pid": 3, "tid": 7, "ts": 6302685363317.643, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363307.512, "dur": 9.870, + "args": { + "External id": 127060, "cbid": 211, "correlation": 241676192 + } + }, + { + "ph": "s", "id": 241676192, "pid": 5717, "tid": 6759, "ts": 6302685363307.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685363343.403, "dur": 0.960, + "args": { + "External id": 127063, "device": 3, "context": 1, "stream": 7, "correlation": 241676198, "bytes": 1, "memory bandwidth (GB/s)": 0.0010416666666666667 + } + }, + { + "ph": "f", "id": 241676198, "pid": 3, "tid": 7, "ts": 6302685363343.403, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685363333.002, "dur": 9.610, + "args": { + "External id": 127063, "cbid": 41, "correlation": 241676198 + } + }, + { + "ph": "s", "id": 241676198, "pid": 5717, "tid": 6759, "ts": 6302685363333.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685363342.902, "dur": 3.790, + "args": { + "External id": 127063, "cbid": 131, "correlation": 241676199 + } + }, + { + "ph": "s", "id": 241676199, "pid": 5717, "tid": 6759, "ts": 6302685363342.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 
6302685363399.083, "dur": 160.097, + "args": { + "External id": 127064, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676206, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676206, "pid": 3, "tid": 7, "ts": 6302685363399.083, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363389.302, "dur": 10.440, + "args": { + "External id": 127064, "cbid": 211, "correlation": 241676206 + } + }, + { + "ph": "s", "id": 241676206, "pid": 5717, "tid": 6759, "ts": 6302685363389.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685363559.884, "dur": 17.504, + "args": { + "External id": 127075, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676230, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676230, "pid": 3, "tid": 7, "ts": 6302685363559.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363453.282, "dur": 7.640, + "args": { + "External id": 127075, "cbid": 211, "correlation": 241676230 + } + }, + { + "ph": "s", "id": 241676230, "pid": 5717, "tid": 6759, "ts": 6302685363453.282, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685363578.124, "dur": 44.161, + "args": { + "External id": 127078, "device": 3, "context": 1, "stream": 7, "correlation": 241676237, "bytes": 25165824, "memory bandwidth (GB/s)": 569.8653563098661 + } + }, + { + "ph": "f", "id": 241676237, "pid": 3, "tid": 7, "ts": 6302685363578.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685363473.682, "dur": 13.100, + "args": { + "External id": 127078, "cbid": 41, "correlation": 241676237 + } + }, + { + "ph": "s", "id": 241676237, "pid": 5717, "tid": 6759, "ts": 6302685363473.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685363622.957, "dur": 86.752, + "args": { + "External id": 127085, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676255, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676255, "pid": 3, "tid": 7, "ts": 6302685363622.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363522.191, "dur": 6.840, + "args": { + "External id": 127085, "cbid": 211, "correlation": 241676255 + } + }, + { + "ph": "s", "id": 241676255, "pid": 5717, "tid": 6759, "ts": 6302685363522.191, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685363710.349, "dur": 68.833, + "args": { + "External id": 127088, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676263, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676263, "pid": 3, "tid": 7, "ts": 6302685363710.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363544.391, "dur": 5.051, + "args": { + "External id": 127088, "cbid": 211, "correlation": 241676263 + } + }, + { + "ph": "s", "id": 241676263, "pid": 5717, "tid": 6759, "ts": 6302685363544.391, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685363779.854, "dur": 89.473, + "args": { + "External id": 127095, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676282, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676282, "pid": 3, "tid": 7, "ts": 6302685363779.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363580.711, "dur": 5.751, + "args": { + "External id": 127095, "cbid": 211, "correlation": 241676282 + } + }, + { + "ph": "s", "id": 241676282, "pid": 5717, "tid": 6759, "ts": 6302685363580.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685363870.031, "dur": 225.025, + "args": { + "External id": 127098, "device": 3, "context": 1, "stream": 7, "correlation": 241676289, "bytes": 100663296, "memory bandwidth (GB/s)": 447.3427219197867 + } + }, + { + "ph": "f", "id": 241676289, "pid": 3, "tid": 7, "ts": 6302685363870.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685363597.891, "dur": 15.580, + "args": { + "External id": 127098, "cbid": 41, "correlation": 241676289 + } + }, + { + "ph": "s", "id": 241676289, "pid": 5717, "tid": 6759, "ts": 6302685363597.891, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685364095.728, "dur": 97.217, + "args": { + "External id": 127105, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676307, "pid": 3, "tid": 7, "ts": 6302685364095.728, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363642.341, "dur": 6.240, + "args": { + "External id": 127105, "cbid": 211, "correlation": 241676307 + } + }, + { + "ph": "s", "id": 241676307, "pid": 5717, "tid": 6759, "ts": 6302685363642.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685364193.617, "dur": 225.154, + "args": { + "External id": 127108, "device": 3, "context": 1, "stream": 7, "correlation": 241676314, "bytes": 100663296, "memory bandwidth (GB/s)": 447.0864208497295 + } + }, + { + "ph": "f", "id": 241676314, "pid": 3, "tid": 7, "ts": 6302685364193.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685363658.471, "dur": 8.660, + "args": { + "External id": 127108, "cbid": 41, "correlation": 241676314 + } + }, + { + "ph": "s", "id": 241676314, "pid": 5717, "tid": 6759, "ts": 6302685363658.471, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685364419.411, "dur": 325.026, + "args": { + "External id": 127109, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676321, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676321, "pid": 3, "tid": 7, "ts": 6302685364419.411, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363678.621, "dur": 4.530, + "args": { + "External id": 127109, "cbid": 211, "correlation": 241676321 + } + }, + { + "ph": "s", "id": 241676321, "pid": 5717, "tid": 6759, "ts": 6302685363678.621, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685364745.141, "dur": 2.624, + "args": { + "External id": 127113, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676341, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676341, "pid": 3, "tid": 7, "ts": 6302685364745.141, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685363714.421, "dur": 6.900, + "args": { + "External id": 127113, "cbid": 211, "correlation": 241676341 + } + }, + { + "ph": "s", "id": 241676341, "pid": 5717, "tid": 6759, "ts": 6302685363714.421, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 3, "tid": 7, + "ts": 6302685364748.693, "dur": 0.352, + "args": { + "External id": 127121, "device": 3, "context": 1, "stream": 7, "correlation": 241676359, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 241676359, "pid": 3, "tid": 7, "ts": 6302685364748.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685363787.011, "dur": 9.330, + "args": { + "External id": 127121, "cbid": 41, "correlation": 241676359 + } + }, + { + "ph": "s", "id": 241676359, "pid": 5717, "tid": 6759, "ts": 6302685363787.011, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685363796.671, "dur": 955.148, + "args": { + "External id": 127121, "cbid": 131, "correlation": 241676360 + } + }, + { + "ph": "s", "id": 241676360, "pid": 5717, "tid": 6759, "ts": 6302685363796.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685364785.654, "dur": 1.344, + "args": { + "External id": 127125, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676369, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676369, "pid": 3, "tid": 7, "ts": 6302685364785.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685364774.339, "dur": 11.890, + "args": { + "External id": 127125, "cbid": 211, "correlation": 241676369 + } + }, + { + "ph": "s", "id": 241676369, "pid": 5717, "tid": 6759, "ts": 6302685364774.339, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685364809.206, "dur": 0.992, + "args": { + "External id": 127128, "device": 3, "context": 1, "stream": 7, "correlation": 241676375, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 241676375, "pid": 3, "tid": 7, "ts": 6302685364809.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685364800.359, "dur": 8.130, + "args": { + "External id": 127128, "cbid": 41, "correlation": 241676375 + } + }, + { + "ph": "s", "id": 241676375, "pid": 5717, "tid": 6759, "ts": 6302685364800.359, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685364808.749, "dur": 3.680, + "args": { + "External id": 127128, "cbid": 131, "correlation": 241676376 + } + }, + { + "ph": "s", "id": 241676376, "pid": 5717, "tid": 6759, "ts": 6302685364808.749, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 
6302685364843.382, "dur": 159.681, + "args": { + "External id": 127129, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676383, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676383, "pid": 3, "tid": 7, "ts": 6302685364843.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685364835.819, "dur": 6.769, + "args": { + "External id": 127129, "cbid": 211, "correlation": 241676383 + } + }, + { + "ph": "s", "id": 241676383, "pid": 5717, "tid": 6759, "ts": 6302685364835.819, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685365003.799, "dur": 16.800, + "args": { + "External id": 127140, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676407, "pid": 3, "tid": 7, "ts": 6302685365003.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685364896.908, "dur": 7.520, + "args": { + "External id": 127140, "cbid": 211, "correlation": 241676407 + } + }, + { + "ph": "s", "id": 241676407, "pid": 5717, "tid": 6759, "ts": 6302685364896.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685365021.239, "dur": 44.577, + "args": { + "External id": 127143, "device": 3, "context": 1, "stream": 7, "correlation": 241676414, "bytes": 25165824, "memory bandwidth (GB/s)": 564.5472777441281 + } + }, + { + "ph": "f", "id": 241676414, "pid": 3, "tid": 7, "ts": 6302685365021.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685364915.648, "dur": 17.660, + "args": { + "External id": 127143, "cbid": 41, "correlation": 241676414 + } + }, + { + "ph": "s", "id": 241676414, "pid": 5717, "tid": 6759, "ts": 6302685364915.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685365066.488, "dur": 87.104, + "args": { + "External id": 127150, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676432, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676432, "pid": 3, "tid": 7, "ts": 6302685365066.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685364986.218, "dur": 7.100, + "args": { + "External id": 127150, "cbid": 211, "correlation": 241676432 + } + }, + { + "ph": "s", "id": 241676432, "pid": 5717, "tid": 6759, "ts": 6302685364986.218, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685365154.328, "dur": 70.241, + "args": { + "External id": 127153, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676440, "pid": 3, "tid": 7, "ts": 6302685365154.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685365008.428, "dur": 5.220, + "args": { + "External id": 127153, "cbid": 211, "correlation": 241676440 + } + }, + { + "ph": "s", "id": 241676440, "pid": 5717, "tid": 6759, "ts": 6302685365008.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685365225.209, "dur": 88.033, + "args": { + "External id": 127160, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676459, "pid": 3, "tid": 7, "ts": 6302685365225.209, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685365042.928, "dur": 5.720, + "args": { + "External id": 127160, "cbid": 211, "correlation": 241676459 + } + }, + { + "ph": "s", "id": 241676459, "pid": 5717, "tid": 6759, "ts": 6302685365042.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685365313.978, "dur": 227.041, + "args": { + "External id": 127163, "device": 3, "context": 1, "stream": 7, "correlation": 241676466, "bytes": 100663296, "memory bandwidth (GB/s)": 443.37056302606135 + } + }, + { + "ph": "f", "id": 241676466, "pid": 3, "tid": 7, "ts": 6302685365313.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685365058.118, "dur": 10.580, + "args": { + "External id": 127163, "cbid": 41, "correlation": 241676466 + } + }, + { + "ph": "s", "id": 241676466, "pid": 5717, "tid": 6759, "ts": 6302685365058.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685365541.723, "dur": 97.153, + "args": { + "External id": 127170, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676484, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676484, "pid": 3, "tid": 7, "ts": 6302685365541.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685365105.688, "dur": 8.660, + "args": { + "External id": 127170, "cbid": 211, "correlation": 241676484 + } + }, + { + "ph": "s", "id": 241676484, "pid": 5717, "tid": 6759, "ts": 6302685365105.688, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685365639.516, "dur": 225.378, + "args": { + "External id": 127173, "device": 3, "context": 1, "stream": 7, "correlation": 241676491, "bytes": 100663296, "memory bandwidth (GB/s)": 446.64206799243937 + } + }, + { + "ph": "f", "id": 241676491, "pid": 3, "tid": 7, "ts": 6302685365639.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685365125.528, "dur": 10.260, + "args": { + "External id": 127173, "cbid": 41, "correlation": 241676491 + } + }, + { + "ph": "s", "id": 241676491, "pid": 5717, "tid": 6759, "ts": 6302685365125.528, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685365865.566, "dur": 324.226, + "args": { + "External id": 127174, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676498, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676498, "pid": 3, "tid": 7, "ts": 6302685365865.566, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685365148.078, "dur": 4.920, + "args": { + "External id": 127174, "cbid": 211, "correlation": 241676498 + } + }, + { + "ph": "s", "id": 241676498, "pid": 5717, "tid": 6759, "ts": 6302685365148.078, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685366190.432, "dur": 3.200, + "args": { + "External id": 127178, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676518, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676518, "pid": 3, "tid": 7, "ts": 6302685366190.432, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685365185.008, "dur": 7.150, + "args": { + "External id": 127178, "cbid": 211, "correlation": 241676518 + } + }, + { + "ph": "s", "id": 241676518, "pid": 5717, "tid": 6759, "ts": 6302685365185.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 3, "tid": 7, + "ts": 6302685366194.560, "dur": 0.352, + "args": { + "External id": 127186, "device": 3, "context": 1, "stream": 7, "correlation": 241676536, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 241676536, "pid": 3, "tid": 7, "ts": 6302685366194.560, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685365261.718, "dur": 10.850, + "args": { + "External id": 127186, "cbid": 41, "correlation": 241676536 + } + }, + { + "ph": "s", "id": 241676536, "pid": 5717, "tid": 6759, "ts": 6302685365261.718, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685365272.888, "dur": 924.537, + "args": { + "External id": 127186, "cbid": 131, "correlation": 241676537 + } + }, + { + "ph": "s", "id": 241676537, "pid": 5717, "tid": 6759, "ts": 6302685365272.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685366229.024, "dur": 1.344, + "args": { + "External id": 127190, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676546, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241676546, "pid": 3, "tid": 7, "ts": 6302685366229.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366219.176, "dur": 9.909, + "args": { + "External id": 127190, "cbid": 211, "correlation": 241676546 + } + }, + { + "ph": "s", "id": 241676546, "pid": 5717, "tid": 6759, "ts": 6302685366219.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685366250.592, "dur": 0.864, + "args": { + "External id": 127193, "device": 3, "context": 1, "stream": 7, "correlation": 241676552, "bytes": 1, "memory bandwidth (GB/s)": 0.0011574074074074073 + } + }, + { + "ph": "f", "id": 241676552, "pid": 3, "tid": 7, "ts": 6302685366250.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685366241.696, "dur": 8.120, + "args": { + "External id": 127193, "cbid": 41, "correlation": 241676552 + } + }, + { + "ph": "s", "id": 241676552, "pid": 5717, "tid": 6759, "ts": 6302685366241.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 6759, + "ts": 6302685366250.085, "dur": 3.680, + "args": { + "External id": 127193, "cbid": 131, "correlation": 241676553 + } + }, + { + "ph": "s", "id": 241676553, "pid": 5717, "tid": 6759, "ts": 6302685366250.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 
6302685366284.545, "dur": 158.177, + "args": { + "External id": 127194, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676560, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676560, "pid": 3, "tid": 7, "ts": 6302685366284.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366276.815, "dur": 6.980, + "args": { + "External id": 127194, "cbid": 211, "correlation": 241676560 + } + }, + { + "ph": "s", "id": 241676560, "pid": 5717, "tid": 6759, "ts": 6302685366276.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685366443.330, "dur": 18.336, + "args": { + "External id": 127205, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676584, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676584, "pid": 3, "tid": 7, "ts": 6302685366443.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366342.975, "dur": 7.980, + "args": { + "External id": 127205, "cbid": 211, "correlation": 241676584 + } + }, + { + "ph": "s", "id": 241676584, "pid": 5717, "tid": 6759, "ts": 6302685366342.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685366462.370, "dur": 44.864, + "args": { + "External id": 127208, "device": 3, "context": 1, "stream": 7, "correlation": 241676591, "bytes": 25165824, "memory bandwidth (GB/s)": 560.9358059914408 + } + }, + { + "ph": "f", "id": 241676591, "pid": 3, "tid": 7, "ts": 6302685366462.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685366362.495, "dur": 13.910, + "args": { + "External id": 127208, "cbid": 41, "correlation": 241676591 + } + }, + { + "ph": "s", "id": 241676591, "pid": 5717, "tid": 6759, "ts": 6302685366362.495, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685366507.842, "dur": 86.881, + "args": { + "External id": 127215, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676609, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676609, "pid": 3, "tid": 7, "ts": 6302685366507.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366421.345, "dur": 7.750, + "args": { + "External id": 127215, "cbid": 211, "correlation": 241676609 + } + }, + { + "ph": "s", "id": 241676609, "pid": 5717, "tid": 6759, "ts": 6302685366421.345, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685366595.427, "dur": 68.768, + "args": { + "External id": 127218, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676617, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676617, "pid": 3, "tid": 7, "ts": 6302685366595.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366443.715, "dur": 5.020, + "args": { + "External id": 127218, "cbid": 211, "correlation": 241676617 + } + }, + { + "ph": "s", "id": 241676617, "pid": 5717, "tid": 6759, "ts": 6302685366443.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685366664.867, "dur": 87.617, + "args": { + "External id": 127225, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676636, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676636, "pid": 3, "tid": 7, "ts": 6302685366664.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366480.135, "dur": 6.130, + "args": { + "External id": 127225, "cbid": 211, "correlation": 241676636 + } + }, + { + "ph": "s", "id": 241676636, "pid": 5717, "tid": 6759, "ts": 6302685366480.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685366753.252, "dur": 227.842, + "args": { + "External id": 127228, "device": 3, "context": 1, "stream": 7, "correlation": 241676643, "bytes": 100663296, "memory bandwidth (GB/s)": 441.8118520729277 + } + }, + { + "ph": "f", "id": 241676643, "pid": 3, "tid": 7, "ts": 6302685366753.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685366501.615, "dur": 22.300, + "args": { + "External id": 127228, "cbid": 41, "correlation": 241676643 + } + }, + { + "ph": "s", "id": 241676643, "pid": 5717, "tid": 6759, "ts": 6302685366501.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685366981.734, "dur": 97.505, + "args": { + "External id": 127235, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676661, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676661, "pid": 3, "tid": 7, "ts": 6302685366981.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366562.465, "dur": 7.250, + "args": { + "External id": 127235, "cbid": 211, "correlation": 241676661 + } + }, + { + "ph": "s", "id": 241676661, "pid": 5717, "tid": 6759, "ts": 6302685366562.465, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685367079.975, "dur": 224.802, + "args": { + "External id": 127238, "device": 3, "context": 1, "stream": 7, "correlation": 241676668, "bytes": 100663296, "memory bandwidth (GB/s)": 447.7864787679825 + } + }, + { + "ph": "f", "id": 241676668, "pid": 3, "tid": 7, "ts": 6302685367079.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685366579.165, "dur": 9.700, + "args": { + "External id": 127238, "cbid": 41, "correlation": 241676668 + } + }, + { + "ph": "s", "id": 241676668, "pid": 5717, "tid": 6759, "ts": 6302685366579.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685367305.513, "dur": 323.298, + "args": { + "External id": 127239, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676675, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676675, "pid": 3, "tid": 7, "ts": 6302685367305.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366605.724, "dur": 6.271, + "args": { + "External id": 127239, "cbid": 211, "correlation": 241676675 + } + }, + { + "ph": "s", "id": 241676675, "pid": 5717, "tid": 6759, "ts": 6302685366605.724, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685367629.515, "dur": 197.793, + "args": { + "External id": 127242, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676696, "registers per thread": 35, "shared memory": 1024, "blocks per SM": 16.031250, "warps per SM": 64.125000, "grid": [2052, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676696, "pid": 3, "tid": 7, "ts": 6302685367629.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366834.104, "dur": 10.420, + "args": { + "External id": 127242, "cbid": 307, "correlation": 241676696 + } + }, + { + "ph": "s", "id": 241676696, "pid": 5717, "tid": 6759, "ts": 6302685366834.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685367827.980, "dur": 4.512, + "args": { + "External id": 127243, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676704, "registers per thread": 21, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241676704, "pid": 3, "tid": 7, "ts": 6302685367827.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366882.224, "dur": 8.640, + "args": { + "External id": 127243, "cbid": 307, "correlation": 241676704 + } + }, + { + "ph": "s", "id": 241676704, "pid": 5717, "tid": 6759, "ts": 6302685366882.224, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685367833.164, "dur": 311.523, + "args": { + "External id": 127244, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676715, "registers per thread": 24, "shared memory": 32, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676715, "pid": 3, "tid": 7, "ts": 6302685367833.164, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685366943.634, "dur": 12.930, + "args": { + "External id": 127244, "cbid": 307, "correlation": 241676715 + } + }, + { + "ph": "s", "id": 241676715, "pid": 5717, "tid": 6759, "ts": 6302685366943.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685368145.423, "dur": 327.778, + "args": { + "External id": 127273, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676752, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241676752, "pid": 3, "tid": 7, "ts": 6302685368145.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367243.933, "dur": 13.720, + "args": { + "External id": 127273, "cbid": 211, "correlation": 241676752 + } + }, + { + "ph": "s", "id": 241676752, "pid": 5717, "tid": 6759, "ts": 6302685367243.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685368473.873, "dur": 432.420, + "args": { + "External id": 127262, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676780, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676780, "pid": 3, "tid": 7, "ts": 6302685368473.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367360.103, "dur": 9.770, + "args": { + "External id": 127262, "cbid": 307, "correlation": 241676780 + } + }, + { + "ph": "s", "id": 241676780, "pid": 5717, "tid": 6759, "ts": 6302685367360.103, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685367506.702, "dur": 1.260, + "args": { + "External id": 127298, "cbid": 200, "correlation": 241676805 + } + }, + { + "ph": "f", "id": 241676805, "pid": 5717, "tid": 6759, "ts": 6302685367506.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685368907.189, "dur": 1.024, + "args": { + "External id": 127298, "device": 3, "context": 1, "stream": 7, "correlation": 
241676808, "bytes": 1536, "memory bandwidth (GB/s)": 1.5 + } + }, + { + "ph": "f", "id": 241676808, "pid": 3, "tid": 7, "ts": 6302685368907.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685367511.942, "dur": 14.140, + "args": { + "External id": 127298, "cbid": 51, "correlation": 241676808 + } + }, + { + "ph": "s", "id": 241676808, "pid": 5717, "tid": 6759, "ts": 6302685367511.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685368908.981, "dur": 368.482, + "args": { + "External id": 127298, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676809, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241676809, "pid": 3, "tid": 7, "ts": 6302685368908.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367526.442, "dur": 9.920, + "args": { + "External id": 127298, "cbid": 307, "correlation": 241676809 + } + }, + { + "ph": "s", "id": 241676809, "pid": 5717, "tid": 6759, "ts": 6302685367526.442, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685367659.332, "dur": 0.450, + "args": { + "External id": 127316, "cbid": 200, "correlation": 241676846 + } + }, + { + "ph": "f", "id": 241676846, "pid": 5717, "tid": 6759, "ts": 6302685367659.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685369278.359, "dur": 0.832, + "args": { + "External id": 127316, "device": 3, "context": 1, "stream": 7, "correlation": 241676849, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 241676849, "pid": 3, "tid": 7, "ts": 6302685369278.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685367661.252, "dur": 6.490, + "args": { + "External id": 127316, "cbid": 51, "correlation": 241676849 + } + }, + { + "ph": "s", "id": 241676849, "pid": 5717, "tid": 6759, "ts": 6302685367661.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685369280.343, "dur": 352.099, + "args": { + "External id": 127316, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676850, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241676850, "pid": 3, "tid": 7, "ts": 6302685369280.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367667.952, "dur": 8.080, + "args": { + "External id": 127316, "cbid": 307, "correlation": 241676850 + } + }, + { + "ph": "s", "id": 241676850, "pid": 5717, "tid": 6759, "ts": 6302685367667.952, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685367710.742, "dur": 0.240, + "args": { + "External id": 127323, "cbid": 200, "correlation": 241676875 + } + }, + { + "ph": "f", "id": 241676875, "pid": 5717, "tid": 6759, "ts": 6302685367710.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685369633.114, "dur": 353.923, + "args": { + "External id": 127323, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676878, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241676878, "pid": 3, "tid": 7, "ts": 6302685369633.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367712.802, "dur": 7.560, + "args": { + "External id": 127323, "cbid": 307, "correlation": 241676878 + } + }, + { + "ph": "s", "id": 241676878, "pid": 5717, "tid": 6759, "ts": 6302685367712.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685367822.302, "dur": 0.330, + "args": { + "External id": 127346, "cbid": 200, "correlation": 241676923 + } + }, + { + "ph": "f", "id": 241676923, "pid": 5717, "tid": 6759, "ts": 6302685367822.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685369987.933, "dur": 0.800, + "args": { + "External id": 127346, "device": 3, "context": 1, "stream": 7, "correlation": 241676926, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241676926, "pid": 3, "tid": 7, "ts": 6302685369987.933, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685367824.012, "dur": 5.970, + "args": { + "External id": 127346, "cbid": 51, "correlation": 241676926 + } + }, + { + "ph": "s", "id": 241676926, "pid": 5717, "tid": 6759, "ts": 6302685367824.012, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685369989.501, "dur": 353.443, + "args": { + "External id": 127346, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676927, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241676927, "pid": 3, "tid": 7, "ts": 6302685369989.501, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367830.182, "dur": 7.200, + "args": { + "External id": 127346, "cbid": 307, "correlation": 241676927 + } + }, + { + "ph": "s", "id": 241676927, "pid": 5717, "tid": 6759, "ts": 6302685367830.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685367872.792, "dur": 0.280, + "args": { + "External id": 127353, "cbid": 200, "correlation": 241676952 + } + }, + { + "ph": "f", "id": 241676952, "pid": 5717, "tid": 6759, "ts": 6302685367872.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685370343.584, "dur": 357.378, + "args": { + "External id": 127353, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676955, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241676955, "pid": 3, "tid": 7, "ts": 6302685370343.584, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367874.212, "dur": 6.510, + "args": { + "External id": 127353, "cbid": 307, "correlation": 241676955 + } + }, + { + "ph": "s", "id": 241676955, "pid": 5717, "tid": 6759, "ts": 6302685367874.212, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685370701.570, "dur": 51.073, + "args": { + "External id": 127358, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676969, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676969, "pid": 3, "tid": 7, "ts": 6302685370701.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685367926.532, "dur": 7.089, + "args": { + "External id": 127358, "cbid": 211, "correlation": 241676969 + } + }, + { + "ph": "s", "id": 241676969, "pid": 5717, "tid": 6759, "ts": 6302685367926.532, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685370753.379, "dur": 45.504, + "args": { + "External id": 127370, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241676993, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241676993, "pid": 3, "tid": 7, "ts": 6302685370753.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368007.821, "dur": 10.460, + "args": { + "External id": 127370, "cbid": 211, "correlation": 241676993 + } + }, + { + "ph": "s", "id": 241676993, "pid": 5717, "tid": 6759, "ts": 6302685368007.821, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685370799.587, "dur": 24.608, + "args": { + "External id": 127371, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677003, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677003, "pid": 3, "tid": 7, "ts": 6302685370799.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368028.531, "dur": 6.260, + "args": { + "External id": 127371, "cbid": 211, "correlation": 241677003 + } + }, + { + "ph": "s", "id": 241677003, "pid": 5717, "tid": 6759, "ts": 6302685368028.531, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685370824.995, "dur": 0.800, + "args": { + "External id": 127372, "device": 3, "context": 1, "stream": 7, "correlation": 241677018, "bytes": 24, "memory bandwidth (GB/s)": 0.03 + } + }, + { + "ph": "f", "id": 241677018, "pid": 3, "tid": 7, "ts": 6302685370824.995, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685368054.181, "dur": 7.610, + "args": { + "External id": 127372, "cbid": 51, "correlation": 241677018 + } + }, + { + "ph": "s", "id": 241677018, "pid": 5717, "tid": 6759, "ts": 6302685368054.181, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685370826.595, "dur": 43.200, + "args": { + "External id": 127372, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677020, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241677020, "pid": 3, "tid": 7, "ts": 6302685370826.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368063.491, "dur": 6.240, + "args": { + "External id": 127372, "cbid": 211, "correlation": 241677020 + } + }, + { + "ph": "s", "id": 241677020, "pid": 5717, "tid": 6759, "ts": 6302685368063.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685370870.531, "dur": 52.225, + "args": { + "External id": 127383, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677041, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677041, "pid": 3, "tid": 7, "ts": 6302685370870.531, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368130.401, "dur": 9.700, + "args": { + "External id": 127383, "cbid": 211, "correlation": 241677041 + } + }, + { + "ph": "s", "id": 241677041, "pid": 5717, "tid": 6759, "ts": 6302685368130.401, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685370923.364, "dur": 141.313, + "args": { + "External id": 127386, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677056, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677056, "pid": 3, "tid": 7, "ts": 6302685370923.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368176.191, "dur": 11.720, + "args": { + "External id": 127386, "cbid": 211, "correlation": 241677056 + } + }, + { + "ph": "s", "id": 241677056, "pid": 5717, "tid": 6759, "ts": 6302685368176.191, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685371065.381, "dur": 107.681, + "args": { + "External id": 127387, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677066, "pid": 3, "tid": 7, "ts": 6302685371065.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368206.591, "dur": 10.620, + "args": { + "External id": 127387, "cbid": 211, "correlation": 241677066 + } + }, + { + "ph": "s", "id": 241677066, "pid": 5717, "tid": 6759, "ts": 6302685368206.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685371173.702, "dur": 77.920, + "args": { + "External id": 127388, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677080, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677080, "pid": 3, "tid": 7, "ts": 6302685371173.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368240.281, "dur": 8.000, + "args": { + "External id": 127388, "cbid": 211, "correlation": 241677080 + } + }, + { + "ph": "s", "id": 241677080, "pid": 5717, "tid": 6759, "ts": 6302685368240.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371252.358, "dur": 1.440, + "args": { + "External id": 127391, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677094, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241677094, "pid": 3, "tid": 7, "ts": 6302685371252.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368295.241, "dur": 16.640, + "args": { + "External id": 127391, "cbid": 211, "correlation": 241677094 + } + }, + { + "ph": "s", "id": 241677094, "pid": 5717, "tid": 6759, "ts": 6302685368295.241, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371254.502, "dur": 1.216, + "args": { + "External id": 127395, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677104, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241677104, "pid": 3, "tid": 7, "ts": 6302685371254.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368343.591, "dur": 11.480, + "args": { + "External id": 127395, "cbid": 211, "correlation": 241677104 + } + }, + { + "ph": "s", "id": 241677104, "pid": 5717, "tid": 6759, "ts": 6302685368343.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371256.422, "dur": 1.056, + "args": { + "External id": 127396, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677114, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241677114, "pid": 3, "tid": 7, "ts": 6302685371256.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368368.451, "dur": 6.940, + "args": { + "External id": 127396, "cbid": 211, "correlation": 241677114 + } + }, + { + "ph": "s", "id": 241677114, "pid": 5717, "tid": 6759, "ts": 6302685368368.451, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685371258.086, "dur": 26.880, + "args": { + "External id": 127404, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677132, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677132, "pid": 3, "tid": 7, "ts": 6302685371258.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368443.740, "dur": 9.951, + "args": { + "External id": 127404, "cbid": 211, "correlation": 241677132 + } + }, + { + "ph": "s", "id": 241677132, "pid": 5717, "tid": 6759, "ts": 6302685368443.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685371285.702, "dur": 114.977, + "args": { + "External id": 127410, "device": 3, "context": 1, "stream": 7, "correlation": 241677146, "bytes": 50331648, "memory bandwidth (GB/s)": 437.75405515885785 + } + }, + { + "ph": "f", "id": 241677146, "pid": 3, "tid": 7, "ts": 6302685371285.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685368483.700, "dur": 17.180, + "args": { + "External id": 127410, "cbid": 41, "correlation": 241677146 + } + }, + { + "ph": "s", "id": 241677146, "pid": 5717, "tid": 6759, "ts": 6302685368483.700, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371401.383, "dur": 72.833, + "args": { + "External id": 127412, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677158, "pid": 3, "tid": 7, "ts": 6302685371401.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368514.150, "dur": 5.530, + "args": { + "External id": 127412, "cbid": 211, "correlation": 241677158 + } + }, + { + "ph": "s", "id": 241677158, "pid": 5717, "tid": 6759, "ts": 6302685368514.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371474.952, "dur": 149.505, + "args": { + "External id": 127413, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677168, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677168, "pid": 3, "tid": 7, "ts": 6302685371474.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368527.050, "dur": 3.860, + "args": { + "External id": 127413, "cbid": 211, "correlation": 241677168 + } + }, + { + "ph": "s", "id": 241677168, "pid": 5717, "tid": 6759, "ts": 6302685368527.050, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371625.097, "dur": 144.353, + "args": { + "External id": 127414, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677175, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677175, "pid": 3, "tid": 7, "ts": 6302685371625.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368546.530, "dur": 5.840, + "args": { + "External id": 127414, "cbid": 211, "correlation": 241677175 + } + }, + { + "ph": "s", "id": 241677175, "pid": 5717, "tid": 6759, "ts": 6302685368546.530, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685371770.186, "dur": 46.944, + "args": { + "External id": 127420, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677194, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677194, "pid": 3, "tid": 7, "ts": 6302685371770.186, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368585.870, "dur": 8.010, + "args": { + "External id": 127420, "cbid": 211, "correlation": 241677194 + } + }, + { + "ph": "s", "id": 241677194, "pid": 5717, "tid": 6759, "ts": 6302685368585.870, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685371817.866, "dur": 58.017, + "args": { + "External id": 127421, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677206, "pid": 3, "tid": 7, "ts": 6302685371817.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368608.750, "dur": 7.230, + "args": { + "External id": 127421, "cbid": 211, "correlation": 241677206 + } + }, + { + "ph": "s", "id": 241677206, "pid": 5717, "tid": 6759, "ts": 6302685368608.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685368694.130, "dur": 0.510, + "args": { + "External id": 127433, "cbid": 200, "correlation": 241677246 + } + }, + { + "ph": "f", "id": 241677246, "pid": 5717, "tid": 6759, "ts": 6302685368694.130, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685371876.715, "dur": 0.768, + "args": { + "External id": 127433, "device": 3, "context": 1, "stream": 7, "correlation": 241677249, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241677249, "pid": 3, "tid": 7, "ts": 6302685371876.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685368696.330, "dur": 6.960, + "args": { + "External id": 127433, "cbid": 51, "correlation": 241677249 + } + }, + { + "ph": "s", "id": 241677249, "pid": 5717, "tid": 6759, "ts": 6302685368696.330, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685371878.283, "dur": 138.241, + "args": { + "External id": 127433, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677250, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677250, "pid": 3, "tid": 7, "ts": 6302685371878.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368703.500, "dur": 7.150, + "args": { + "External id": 127433, "cbid": 307, "correlation": 241677250 + } + }, + { + "ph": "s", "id": 241677250, "pid": 5717, "tid": 6759, "ts": 6302685368703.500, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685372017.228, "dur": 121.985, + "args": { + "External id": 127440, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677272, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677272, "pid": 3, "tid": 7, "ts": 6302685372017.228, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685368744.570, "dur": 8.200, + "args": { + "External id": 127440, "cbid": 211, "correlation": 241677272 + } + }, + { + "ph": "s", "id": 241677272, "pid": 5717, "tid": 6759, "ts": 6302685368744.570, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685368985.289, "dur": 0.900, + "args": { + "External id": 127466, "cbid": 200, "correlation": 241677319 + } + }, + { + "ph": "f", "id": 241677319, "pid": 5717, "tid": 6759, "ts": 6302685368985.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685368986.299, "dur": 0.280, + "args": { + "External id": 127466, "cbid": 200, "correlation": 241677320 + } + }, + { + "ph": "f", "id": 241677320, "pid": 5717, "tid": 6759, "ts": 6302685368986.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685369009.379, "dur": 0.210, + "args": { + "External id": 127466, "cbid": 200, "correlation": 241677338 + } + }, + { + "ph": "f", "id": 241677338, "pid": 5717, "tid": 6759, "ts": 6302685369009.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685372139.853, "dur": 93.537, + "args": { + "External id": 127466, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677339, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677339, "pid": 3, "tid": 7, "ts": 6302685372139.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369011.599, "dur": 13.110, + "args": { + "External id": 127466, "cbid": 211, "correlation": 241677339 + } + }, + { + "ph": "s", "id": 241677339, "pid": 5717, "tid": 6759, "ts": 6302685369011.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685369026.589, "dur": 1.850, + "args": { + "External id": 127466, "cbid": 273, "correlation": 241677341 + } + }, + { + "ph": "f", "id": 241677341, "pid": 5717, "tid": 6759, "ts": 6302685369026.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685372234.126, "dur": 991.495, + "args": { + "External id": 127466, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677342, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241677342, "pid": 3, "tid": 7, "ts": 6302685372234.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369028.719, "dur": 5.820, + "args": { + "External id": 127466, "cbid": 211, "correlation": 241677342 + } + }, + { + "ph": "s", "id": 241677342, "pid": 5717, "tid": 6759, "ts": 6302685369028.719, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685373226.229, "dur": 71.521, + "args": { + "External id": 127466, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677344, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241677344, "pid": 3, "tid": 7, "ts": 6302685373226.229, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369035.529, "dur": 5.400, + "args": { + "External id": 127466, "cbid": 211, "correlation": 241677344 + } + }, + { + "ph": "s", "id": 241677344, "pid": 5717, "tid": 6759, "ts": 6302685369035.529, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685373298.486, "dur": 47.008, + "args": { + "External id": 127476, "device": 3, "context": 1, "stream": 7, "correlation": 241677370, "bytes": 25165824, "memory bandwidth (GB/s)": 535.3519400953029 + } + }, + { + "ph": "f", "id": 241677370, "pid": 3, "tid": 7, "ts": 6302685373298.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685369225.589, "dur": 21.230, + "args": { + "External id": 127476, "cbid": 41, "correlation": 241677370 + } + }, + { + "ph": "s", "id": 241677370, "pid": 5717, "tid": 6759, "ts": 6302685369225.589, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685373346.102, "dur": 33.312, + "args": { + "External id": 127473, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677388, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677388, "pid": 3, "tid": 7, "ts": 6302685373346.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369482.418, "dur": 13.770, + "args": { + "External id": 127473, "cbid": 307, "correlation": 241677388 + } + }, + { + "ph": "s", "id": 241677388, "pid": 5717, "tid": 6759, "ts": 6302685369482.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685373380.054, "dur": 38.912, + "args": { + "External id": 127483, "device": 3, "context": 1, "stream": 7, "correlation": 241677403, "bytes": 25165824, "memory bandwidth (GB/s)": 646.7368421052631 + } + }, + { + "ph": "f", "id": 241677403, "pid": 3, "tid": 7, "ts": 6302685373380.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685369563.408, "dur": 14.900, + "args": { + "External id": 127483, "cbid": 41, "correlation": 241677403 + } + }, + { + "ph": "s", "id": 241677403, "pid": 5717, "tid": 6759, "ts": 6302685369563.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685373419.638, "dur": 27.329, + "args": { + "External id": 127480, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677421, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677421, "pid": 3, "tid": 7, "ts": 6302685373419.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369667.448, "dur": 8.070, + "args": { + "External id": 127480, "cbid": 307, "correlation": 241677421 + } + }, + { + "ph": "s", "id": 241677421, "pid": 5717, "tid": 6759, "ts": 6302685369667.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685369803.787, "dur": 0.510, + "args": { + "External id": 127507, "cbid": 200, "correlation": 241677465 + } + }, + { + "ph": "f", "id": 241677465, "pid": 5717, "tid": 6759, "ts": 6302685369803.787, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685373447.799, "dur": 0.736, + "args": { + "External id": 127507, "device": 3, "context": 1, "stream": 7, "correlation": 241677468, "bytes": 576, "memory bandwidth (GB/s)": 0.782608695652174 + } + }, + { + "ph": "f", "id": 241677468, "pid": 3, "tid": 7, "ts": 6302685373447.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685369806.067, "dur": 7.050, + "args": { + "External id": 127507, "cbid": 51, "correlation": 241677468 + } + }, + { + "ph": "s", "id": 241677468, "pid": 5717, "tid": 6759, "ts": 6302685369806.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685373449.719, "dur": 141.345, + "args": { + "External id": 127507, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677469, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677469, "pid": 3, "tid": 7, "ts": 6302685373449.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369813.407, "dur": 7.770, + "args": { + "External id": 127507, "cbid": 307, "correlation": 241677469 + } + }, + { + "ph": "s", "id": 241677469, "pid": 5717, "tid": 6759, "ts": 6302685369813.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685373592.312, "dur": 121.921, + "args": { + "External id": 127514, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677491, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677491, "pid": 3, "tid": 7, "ts": 6302685373592.312, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685369852.137, "dur": 6.220, + "args": { + "External id": 127514, "cbid": 211, "correlation": 241677491 + } + }, + { + "ph": "s", "id": 241677491, "pid": 5717, "tid": 6759, "ts": 6302685369852.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685369991.367, "dur": 0.450, + "args": { + "External id": 127537, "cbid": 200, "correlation": 241677537 + } + }, + { + "ph": "f", "id": 241677537, "pid": 5717, "tid": 6759, "ts": 6302685369991.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685373715.129, "dur": 1.056, + "args": { + "External id": 127537, "device": 3, "context": 1, "stream": 7, "correlation": 241677540, "bytes": 576, "memory bandwidth (GB/s)": 0.5454545454545454 + } + }, + { + "ph": "f", "id": 241677540, "pid": 3, "tid": 7, "ts": 6302685373715.129, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685369993.277, "dur": 6.770, + "args": { + "External id": 127537, "cbid": 51, "correlation": 241677540 + } + }, + { + "ph": "s", "id": 241677540, "pid": 5717, "tid": 6759, "ts": 6302685369993.277, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685373716.953, "dur": 141.089, + "args": { + "External id": 127537, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677541, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677541, "pid": 3, "tid": 7, "ts": 6302685373716.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370000.257, "dur": 7.900, + "args": { + "External id": 127537, "cbid": 307, "correlation": 241677541 + } + }, + { + "ph": "s", "id": 241677541, "pid": 5717, "tid": 6759, "ts": 6302685370000.257, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685373858.714, "dur": 121.953, + "args": { + "External id": 127544, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677563, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677563, "pid": 3, "tid": 7, "ts": 6302685373858.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370037.887, "dur": 5.900, + "args": { + "External id": 127544, "cbid": 211, "correlation": 241677563 + } + }, + { + "ph": "s", "id": 241677563, "pid": 5717, "tid": 6759, "ts": 6302685370037.887, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685373981.339, "dur": 38.912, + "args": { + "External id": 127549, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677578, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677578, "pid": 3, "tid": 7, "ts": 6302685373981.339, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370085.497, "dur": 7.400, + "args": { + "External id": 127549, "cbid": 211, "correlation": 241677578 + } + }, + { + "ph": "s", "id": 241677578, "pid": 5717, "tid": 6759, "ts": 6302685370085.497, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685370176.416, "dur": 0.431, + "args": { + "External id": 127568, "cbid": 200, "correlation": 241677622 + } + }, + { + "ph": "f", "id": 241677622, "pid": 5717, "tid": 6759, "ts": 6302685370176.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685374021.019, "dur": 0.800, + "args": { + "External id": 127568, "device": 3, "context": 1, "stream": 7, "correlation": 241677625, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 241677625, "pid": 3, "tid": 7, "ts": 6302685374021.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685370178.236, "dur": 6.260, + "args": { + "External id": 127568, "cbid": 51, "correlation": 241677625 + } + }, + { + "ph": "s", "id": 241677625, "pid": 5717, "tid": 6759, "ts": 6302685370178.236, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685374023.547, "dur": 142.657, + "args": { + "External id": 127568, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677626, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677626, "pid": 3, "tid": 7, "ts": 6302685374023.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370184.676, "dur": 6.991, + "args": { + "External id": 127568, "cbid": 307, "correlation": 241677626 + } + }, + { + "ph": "s", "id": 241677626, "pid": 5717, "tid": 6759, "ts": 6302685370184.676, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685374166.908, "dur": 121.889, + "args": { + "External id": 127575, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677648, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677648, "pid": 3, "tid": 7, "ts": 6302685374166.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370219.247, "dur": 5.440, + "args": { + "External id": 127575, "cbid": 211, "correlation": 241677648 + } + }, + { + "ph": "s", "id": 241677648, "pid": 5717, "tid": 6759, "ts": 6302685370219.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374289.437, "dur": 37.888, + "args": { + "External id": 127580, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677659, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677659, "pid": 3, "tid": 7, "ts": 6302685374289.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370263.096, "dur": 6.680, + "args": { + "External id": 127580, "cbid": 211, "correlation": 241677659 + } + }, + { + "ph": "s", "id": 241677659, "pid": 5717, "tid": 6759, "ts": 6302685370263.096, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374327.997, "dur": 41.921, + "args": { + "External id": 127592, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677683, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677683, "pid": 3, "tid": 7, "ts": 6302685374327.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370331.466, "dur": 8.680, + "args": { + "External id": 127592, "cbid": 211, "correlation": 241677683 + } + }, + { + "ph": "s", "id": 241677683, "pid": 5717, "tid": 6759, "ts": 6302685370331.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685374371.166, "dur": 25.280, + "args": { + "External id": 127593, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677693, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677693, "pid": 3, "tid": 7, "ts": 6302685374371.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370349.046, "dur": 4.890, + "args": { + "External id": 127593, "cbid": 211, "correlation": 241677693 + } + }, + { + "ph": "s", "id": 241677693, "pid": 5717, "tid": 6759, "ts": 6302685370349.046, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685374397.374, "dur": 0.768, + "args": { + "External id": 127594, "device": 3, "context": 1, "stream": 7, "correlation": 241677708, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 241677708, "pid": 3, "tid": 7, "ts": 6302685374397.374, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685370368.316, "dur": 6.510, + "args": { + "External id": 127594, "cbid": 51, "correlation": 241677708 + } + }, + { + "ph": "s", "id": 241677708, "pid": 5717, "tid": 6759, "ts": 6302685370368.316, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685374399.358, "dur": 41.792, + "args": { + "External id": 127594, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677710, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241677710, "pid": 3, "tid": 7, "ts": 6302685374399.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370375.896, "dur": 5.140, + "args": { + "External id": 127594, "cbid": 211, "correlation": 241677710 + } + }, + { + "ph": "s", "id": 241677710, "pid": 5717, "tid": 6759, "ts": 6302685370375.896, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685374441.790, "dur": 50.401, + "args": { + "External id": 127605, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677731, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677731, "pid": 3, "tid": 7, "ts": 6302685374441.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370435.026, "dur": 8.100, + "args": { + "External id": 127605, "cbid": 211, "correlation": 241677731 + } + }, + { + "ph": "s", "id": 241677731, "pid": 5717, "tid": 6759, "ts": 6302685370435.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374492.895, "dur": 142.337, + "args": { + "External id": 127608, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677746, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677746, "pid": 3, "tid": 7, "ts": 6302685374492.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370473.986, "dur": 9.610, + "args": { + "External id": 127608, "cbid": 211, "correlation": 241677746 + } + }, + { + "ph": "s", "id": 241677746, "pid": 5717, "tid": 6759, "ts": 6302685370473.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685374635.904, "dur": 107.937, + "args": { + "External id": 127609, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677756, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677756, "pid": 3, "tid": 7, "ts": 6302685374635.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370492.406, "dur": 4.330, + "args": { + "External id": 127609, "cbid": 211, "correlation": 241677756 + } + }, + { + "ph": "s", "id": 241677756, "pid": 5717, "tid": 6759, "ts": 6302685370492.406, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685374744.449, "dur": 77.600, + "args": { + "External id": 127610, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677770, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677770, "pid": 3, "tid": 7, "ts": 6302685374744.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370510.936, "dur": 5.000, + "args": { + "External id": 127610, "cbid": 211, "correlation": 241677770 + } + }, + { + "ph": "s", "id": 241677770, "pid": 5717, "tid": 6759, "ts": 6302685370510.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374822.721, "dur": 1.472, + "args": { + "External id": 127613, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677784, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241677784, "pid": 3, "tid": 7, "ts": 6302685374822.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370545.446, "dur": 6.390, + "args": { + "External id": 127613, "cbid": 211, "correlation": 241677784 + } + }, + { + "ph": "s", "id": 241677784, "pid": 5717, "tid": 6759, "ts": 6302685370545.446, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374824.929, "dur": 1.216, + "args": { + "External id": 127617, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677794, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241677794, "pid": 3, "tid": 7, "ts": 6302685374824.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370563.756, "dur": 4.470, + "args": { + "External id": 127617, "cbid": 211, "correlation": 241677794 + } + }, + { + "ph": "s", "id": 241677794, "pid": 5717, "tid": 6759, "ts": 6302685370563.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374826.849, "dur": 1.024, + "args": { + "External id": 127618, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677804, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241677804, "pid": 3, "tid": 7, "ts": 6302685374826.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370575.456, "dur": 4.179, + "args": { + "External id": 127618, "cbid": 211, "correlation": 241677804 + } + }, + { + "ph": "s", "id": 241677804, "pid": 5717, "tid": 6759, "ts": 6302685370575.456, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685374828.545, "dur": 27.136, + "args": { + "External id": 127626, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677822, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677822, "pid": 3, "tid": 7, "ts": 6302685374828.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370624.246, "dur": 7.400, + "args": { + "External id": 127626, "cbid": 211, "correlation": 241677822 + } + }, + { + "ph": "s", "id": 241677822, "pid": 5717, "tid": 6759, "ts": 6302685370624.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685374856.321, "dur": 113.025, + "args": { + "External id": 127632, "device": 3, "context": 1, "stream": 7, "correlation": 241677836, "bytes": 50331648, "memory bandwidth (GB/s)": 445.3142932979429 + } + }, + { + "ph": "f", "id": 241677836, "pid": 3, "tid": 7, "ts": 6302685374856.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685370661.386, "dur": 14.469, + "args": { + "External id": 127632, "cbid": 41, "correlation": 241677836 + } + }, + { + "ph": "s", "id": 241677836, "pid": 5717, "tid": 6759, "ts": 6302685370661.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685374970.050, "dur": 70.177, + "args": { + "External id": 127634, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677848, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677848, "pid": 3, "tid": 7, "ts": 6302685374970.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370689.455, "dur": 5.730, + "args": { + "External id": 127634, "cbid": 211, "correlation": 241677848 + } + }, + { + "ph": "s", "id": 241677848, "pid": 5717, "tid": 6759, "ts": 6302685370689.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685375040.963, "dur": 149.377, + "args": { + "External id": 127635, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677858, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677858, "pid": 3, "tid": 7, "ts": 6302685375040.963, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370703.685, "dur": 4.350, + "args": { + "External id": 127635, "cbid": 211, "correlation": 241677858 + } + }, + { + "ph": "s", "id": 241677858, "pid": 5717, "tid": 6759, "ts": 6302685370703.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685375190.948, "dur": 139.649, + "args": { + "External id": 127636, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677865, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677865, "pid": 3, "tid": 7, "ts": 6302685375190.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370720.335, "dur": 4.390, + "args": { + "External id": 127636, "cbid": 211, "correlation": 241677865 + } + }, + { + "ph": "s", "id": 241677865, "pid": 5717, "tid": 6759, "ts": 6302685370720.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685375331.269, "dur": 46.304, + "args": { + "External id": 127642, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677884, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677884, "pid": 3, "tid": 7, "ts": 6302685375331.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370755.185, "dur": 6.360, + "args": { + "External id": 127642, "cbid": 211, "correlation": 241677884 + } + }, + { + "ph": "s", "id": 241677884, "pid": 5717, "tid": 6759, "ts": 6302685370755.185, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685375378.278, "dur": 40.287, + "args": { + "External id": 127643, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677892, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677892, "pid": 3, "tid": 7, "ts": 6302685375378.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370771.995, "dur": 4.270, + "args": { + "External id": 127643, "cbid": 211, "correlation": 241677892 + } + }, + { + "ph": "s", "id": 241677892, "pid": 5717, "tid": 6759, "ts": 6302685370771.995, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685375419.269, "dur": 327.331, + "args": { + "External id": 127658, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677925, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677925, "pid": 3, "tid": 7, "ts": 6302685375419.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370872.095, "dur": 9.560, + "args": { + "External id": 127658, "cbid": 211, "correlation": 241677925 + } + }, + { + "ph": "s", "id": 241677925, "pid": 5717, "tid": 6759, "ts": 6302685370872.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685375747.272, "dur": 428.995, + "args": { + "External id": 127647, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677953, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241677953, "pid": 3, "tid": 7, "ts": 6302685375747.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685370939.635, "dur": 6.880, + "args": { + "External id": 127647, "cbid": 307, "correlation": 241677953 + } + }, + { + "ph": "s", "id": 241677953, "pid": 5717, "tid": 6759, "ts": 6302685370939.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685371040.565, "dur": 0.529, + "args": { + "External id": 127683, "cbid": 200, "correlation": 241677978 + } + }, + { + "ph": "f", "id": 241677978, "pid": 5717, "tid": 6759, "ts": 6302685371040.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685376177.163, "dur": 0.768, + "args": { + "External id": 127683, "device": 3, "context": 1, "stream": 7, "correlation": 
241677981, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241677981, "pid": 3, "tid": 7, "ts": 6302685376177.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685371042.734, "dur": 7.171, + "args": { + "External id": 127683, "cbid": 51, "correlation": 241677981 + } + }, + { + "ph": "s", "id": 241677981, "pid": 5717, "tid": 6759, "ts": 6302685371042.734, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685376178.731, "dur": 369.155, + "args": { + "External id": 127683, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241677982, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241677982, "pid": 3, "tid": 7, "ts": 6302685376178.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371050.134, "dur": 7.540, + "args": { + "External id": 127683, "cbid": 307, "correlation": 241677982 + } + }, + { + "ph": "s", "id": 241677982, "pid": 5717, "tid": 6759, "ts": 6302685371050.134, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685371175.254, "dur": 0.450, + "args": { + "External id": 127701, "cbid": 200, "correlation": 241678019 + } + }, + { + "ph": "f", "id": 241678019, "pid": 5717, "tid": 6759, "ts": 6302685371175.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685376548.686, "dur": 0.800, + "args": { + "External id": 127701, "device": 3, "context": 1, "stream": 7, "correlation": 241678022, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241678022, "pid": 3, "tid": 7, "ts": 6302685376548.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685371177.174, "dur": 6.480, + "args": { + "External id": 127701, "cbid": 51, "correlation": 241678022 + } + }, + { + "ph": "s", "id": 241678022, "pid": 5717, "tid": 6759, "ts": 6302685371177.174, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685376550.638, "dur": 353.027, + "args": { + "External id": 127701, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678023, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678023, "pid": 3, "tid": 7, "ts": 6302685376550.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371183.874, "dur": 8.260, + "args": { + "External id": 127701, "cbid": 307, "correlation": 241678023 + } + }, + { + "ph": "s", "id": 241678023, "pid": 5717, "tid": 6759, "ts": 6302685371183.874, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685371223.044, "dur": 0.240, + "args": { + "External id": 127708, "cbid": 200, "correlation": 241678048 + } + }, + { + "ph": "f", "id": 241678048, "pid": 5717, "tid": 6759, "ts": 6302685371223.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685376904.817, "dur": 353.762, + "args": { + "External id": 127708, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678051, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678051, "pid": 3, "tid": 7, "ts": 6302685376904.817, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371224.424, "dur": 5.550, + "args": { + "External id": 127708, "cbid": 307, "correlation": 241678051 + } + }, + { + "ph": "s", "id": 241678051, "pid": 5717, "tid": 6759, "ts": 6302685371224.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685371335.884, "dur": 0.370, + "args": { + "External id": 127731, "cbid": 200, "correlation": 241678096 + } + }, + { + "ph": "f", "id": 241678096, "pid": 5717, "tid": 6759, "ts": 6302685371335.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685377259.475, "dur": 0.800, + "args": { + "External id": 127731, "device": 3, "context": 1, "stream": 7, "correlation": 241678099, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241678099, "pid": 3, "tid": 7, "ts": 6302685377259.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685371337.604, "dur": 5.960, + "args": { + "External id": 127731, "cbid": 51, "correlation": 241678099 + } + }, + { + "ph": "s", "id": 241678099, "pid": 5717, "tid": 6759, "ts": 6302685371337.604, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685377261.427, "dur": 353.155, + "args": { + "External id": 127731, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678100, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678100, "pid": 3, "tid": 7, "ts": 6302685377261.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371343.774, "dur": 7.610, + "args": { + "External id": 127731, "cbid": 307, "correlation": 241678100 + } + }, + { + "ph": "s", "id": 241678100, "pid": 5717, "tid": 6759, "ts": 6302685371343.774, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685371381.094, "dur": 0.260, + "args": { + "External id": 127738, "cbid": 200, "correlation": 241678125 + } + }, + { + "ph": "f", "id": 241678125, "pid": 5717, "tid": 6759, "ts": 6302685371381.094, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685377615.318, "dur": 359.555, + "args": { + "External id": 127738, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678128, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678128, "pid": 3, "tid": 7, "ts": 6302685377615.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371382.424, "dur": 6.350, + "args": { + "External id": 127738, "cbid": 307, "correlation": 241678128 + } + }, + { + "ph": "s", "id": 241678128, "pid": 5717, "tid": 6759, "ts": 6302685371382.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685377975.513, "dur": 52.544, + "args": { + "External id": 127743, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678142, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678142, "pid": 3, "tid": 7, "ts": 6302685377975.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371429.194, "dur": 7.190, + "args": { + "External id": 127743, "cbid": 211, "correlation": 241678142 + } + }, + { + "ph": "s", "id": 241678142, "pid": 5717, "tid": 6759, "ts": 6302685371429.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378028.761, "dur": 45.825, + "args": { + "External id": 127755, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678166, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678166, "pid": 3, "tid": 7, "ts": 6302685378028.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371488.624, "dur": 112.489, + "args": { + "External id": 127755, "cbid": 211, "correlation": 241678166 + } + }, + { + "ph": "s", "id": 241678166, "pid": 5717, "tid": 6759, "ts": 6302685371488.624, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685378075.290, "dur": 27.296, + "args": { + "External id": 127756, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678176, "pid": 3, "tid": 7, "ts": 6302685378075.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371609.893, "dur": 4.540, + "args": { + "External id": 127756, "cbid": 211, "correlation": 241678176 + } + }, + { + "ph": "s", "id": 241678176, "pid": 5717, "tid": 6759, "ts": 6302685371609.893, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685378103.386, "dur": 0.736, + "args": { + "External id": 127757, "device": 3, "context": 1, "stream": 7, "correlation": 241678191, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 241678191, "pid": 3, "tid": 7, "ts": 6302685378103.386, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685371631.703, "dur": 5.560, + "args": { + "External id": 127757, "cbid": 51, "correlation": 241678191 + } + }, + { + "ph": "s", "id": 241678191, "pid": 5717, "tid": 6759, "ts": 6302685371631.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685378104.954, "dur": 43.328, + "args": { + "External id": 127757, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678193, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241678193, "pid": 3, "tid": 7, "ts": 6302685378104.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371638.413, "dur": 5.320, + "args": { + "External id": 127757, "cbid": 211, "correlation": 241678193 + } + }, + { + "ph": "s", "id": 241678193, "pid": 5717, "tid": 6759, "ts": 6302685371638.413, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685378148.890, "dur": 50.913, + "args": { + "External id": 127768, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678214, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678214, "pid": 3, "tid": 7, "ts": 6302685378148.890, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371698.413, "dur": 8.020, + "args": { + "External id": 127768, "cbid": 211, "correlation": 241678214 + } + }, + { + "ph": "s", "id": 241678214, "pid": 5717, "tid": 6759, "ts": 6302685371698.413, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378200.411, "dur": 142.465, + "args": { + "External id": 127771, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678229, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678229, "pid": 3, "tid": 7, "ts": 6302685378200.411, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371730.773, "dur": 5.590, + "args": { + "External id": 127771, "cbid": 211, "correlation": 241678229 + } + }, + { + "ph": "s", "id": 241678229, "pid": 5717, "tid": 6759, "ts": 6302685371730.773, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685378343.516, "dur": 109.152, + "args": { + "External id": 127772, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678239, "pid": 3, "tid": 7, "ts": 6302685378343.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371744.833, "dur": 4.200, + "args": { + "External id": 127772, "cbid": 211, "correlation": 241678239 + } + }, + { + "ph": "s", "id": 241678239, "pid": 5717, "tid": 6759, "ts": 6302685371744.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685378453.404, "dur": 77.697, + "args": { + "External id": 127773, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678253, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678253, "pid": 3, "tid": 7, "ts": 6302685378453.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371761.523, "dur": 4.720, + "args": { + "External id": 127773, "cbid": 211, "correlation": 241678253 + } + }, + { + "ph": "s", "id": 241678253, "pid": 5717, "tid": 6759, "ts": 6302685371761.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378531.805, "dur": 1.440, + "args": { + "External id": 127776, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241678267, "pid": 3, "tid": 7, "ts": 6302685378531.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371798.303, "dur": 8.960, + "args": { + "External id": 127776, "cbid": 211, "correlation": 241678267 + } + }, + { + "ph": "s", "id": 241678267, "pid": 5717, "tid": 6759, "ts": 6302685371798.303, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378533.981, "dur": 1.120, + "args": { + "External id": 127780, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241678277, "pid": 3, "tid": 7, "ts": 6302685378533.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371830.133, "dur": 8.400, + "args": { + "External id": 127780, "cbid": 211, "correlation": 241678277 + } + }, + { + "ph": "s", "id": 241678277, "pid": 5717, "tid": 6759, "ts": 6302685371830.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378535.837, "dur": 1.024, + "args": { + "External id": 127781, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678287, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241678287, "pid": 3, "tid": 7, "ts": 6302685378535.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371849.933, "dur": 6.470, + "args": { + "External id": 127781, "cbid": 211, "correlation": 241678287 + } + }, + { + "ph": "s", "id": 241678287, "pid": 5717, "tid": 6759, "ts": 6302685371849.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685378537.469, "dur": 26.784, + "args": { + "External id": 127789, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678305, "pid": 3, "tid": 7, "ts": 6302685378537.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371911.403, "dur": 7.309, + "args": { + "External id": 127789, "cbid": 211, "correlation": 241678305 + } + }, + { + "ph": "s", "id": 241678305, "pid": 5717, "tid": 6759, "ts": 6302685371911.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685378564.829, "dur": 113.249, + "args": { + "External id": 127795, "device": 3, "context": 1, "stream": 7, "correlation": 241678319, "bytes": 50331648, "memory bandwidth (GB/s)": 444.4334872714108 + } + }, + { + "ph": "f", "id": 241678319, "pid": 3, "tid": 7, "ts": 6302685378564.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685371945.863, "dur": 16.260, + "args": { + "External id": 127795, "cbid": 41, "correlation": 241678319 + } + }, + { + "ph": "s", "id": 241678319, "pid": 5717, "tid": 6759, "ts": 6302685371945.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378678.814, "dur": 71.297, + "args": { + "External id": 127797, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678331, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678331, "pid": 3, "tid": 7, "ts": 6302685378678.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371976.323, "dur": 9.400, + "args": { + "External id": 127797, "cbid": 211, "correlation": 241678331 + } + }, + { + "ph": "s", "id": 241678331, "pid": 5717, "tid": 6759, "ts": 6302685371976.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378750.815, "dur": 150.529, + "args": { + "External id": 127798, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678341, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678341, "pid": 3, "tid": 7, "ts": 6302685378750.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685371993.783, "dur": 3.920, + "args": { + "External id": 127798, "cbid": 211, "correlation": 241678341 + } + }, + { + "ph": "s", "id": 241678341, "pid": 5717, "tid": 6759, "ts": 6302685371993.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685378901.984, "dur": 148.385, + "args": { + "External id": 127799, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678348, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678348, "pid": 3, "tid": 7, "ts": 6302685378901.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372009.922, "dur": 4.390, + "args": { + "External id": 127799, "cbid": 211, "correlation": 241678348 + } + }, + { + "ph": "s", "id": 241678348, "pid": 5717, "tid": 6759, "ts": 6302685372009.922, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685379051.105, "dur": 47.585, + "args": { + "External id": 127805, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678367, "pid": 3, "tid": 7, "ts": 6302685379051.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372044.942, "dur": 6.290, + "args": { + "External id": 127805, "cbid": 211, "correlation": 241678367 + } + }, + { + "ph": "s", "id": 241678367, "pid": 5717, "tid": 6759, "ts": 6302685372044.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685379099.394, "dur": 58.112, + "args": { + "External id": 127806, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678379, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678379, "pid": 3, "tid": 7, "ts": 6302685379099.394, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372065.812, "dur": 4.920, + "args": { + "External id": 127806, "cbid": 211, "correlation": 241678379 + } + }, + { + "ph": "s", "id": 241678379, "pid": 5717, "tid": 6759, "ts": 6302685372065.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685379158.274, "dur": 40.736, + "args": { + "External id": 127809, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678392, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678392, "pid": 3, "tid": 7, "ts": 6302685379158.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372095.552, "dur": 5.500, + "args": { + "External id": 127809, "cbid": 211, "correlation": 241678392 + } + }, + { + "ph": "s", "id": 241678392, "pid": 5717, "tid": 6759, "ts": 6302685372095.552, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685372158.042, "dur": 0.500, + "args": { + "External id": 127819, "cbid": 200, "correlation": 241678428 + } + }, + { + "ph": "f", "id": 241678428, "pid": 5717, "tid": 6759, "ts": 6302685372158.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685379200.002, "dur": 0.768, + "args": { + "External id": 127819, "device": 3, "context": 1, "stream": 7, "correlation": 241678431, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241678431, "pid": 3, "tid": 7, "ts": 6302685379200.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685372160.232, "dur": 6.670, + "args": { + "External id": 127819, "cbid": 51, "correlation": 241678431 + } + }, + { + "ph": "s", "id": 241678431, "pid": 5717, "tid": 6759, "ts": 6302685372160.232, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685379201.538, "dur": 138.017, + "args": { + "External id": 127819, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678432, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678432, "pid": 3, "tid": 7, "ts": 6302685379201.538, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372167.132, "dur": 6.740, + "args": { + "External id": 127819, "cbid": 307, "correlation": 241678432 + } + }, + { + "ph": "s", "id": 241678432, "pid": 5717, "tid": 6759, "ts": 6302685372167.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685379340.195, "dur": 122.177, + "args": { + "External id": 127826, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678454, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678454, "pid": 3, "tid": 7, "ts": 6302685379340.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372203.772, "dur": 7.760, + "args": { + "External id": 127826, "cbid": 211, "correlation": 241678454 + } + }, + { + "ph": "s", "id": 241678454, "pid": 5717, "tid": 6759, "ts": 6302685372203.772, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685372430.291, "dur": 0.511, + "args": { + "External id": 127852, "cbid": 200, "correlation": 241678501 + } + }, + { + "ph": "f", "id": 241678501, "pid": 5717, "tid": 6759, "ts": 6302685372430.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685372430.922, "dur": 0.200, + "args": { + "External id": 127852, "cbid": 200, "correlation": 241678502 + } + }, + { + "ph": "f", "id": 241678502, "pid": 5717, "tid": 6759, "ts": 6302685372430.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685372447.881, "dur": 0.220, + "args": { + "External id": 127852, "cbid": 200, "correlation": 241678520 + } + }, + { + "ph": "f", "id": 241678520, "pid": 5717, "tid": 6759, "ts": 6302685372447.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685379463.076, "dur": 95.553, + "args": { + "External id": 127852, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678521, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678521, "pid": 3, "tid": 7, "ts": 6302685379463.076, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372449.291, "dur": 11.510, + "args": { + "External id": 127852, "cbid": 211, "correlation": 241678521 + } + }, + { + "ph": "s", "id": 241678521, "pid": 5717, "tid": 6759, "ts": 6302685372449.291, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685372461.601, "dur": 1.060, + "args": { + "External id": 127852, "cbid": 273, "correlation": 241678523 + } + }, + { + "ph": "f", "id": 241678523, "pid": 5717, "tid": 6759, "ts": 6302685372461.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685379559.269, "dur": 989.991, + "args": { + "External id": 127852, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678524, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241678524, "pid": 3, "tid": 7, "ts": 6302685379559.269, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372462.981, "dur": 3.950, + "args": { + "External id": 127852, "cbid": 211, "correlation": 241678524 + } + }, + { + "ph": "s", "id": 241678524, "pid": 5717, "tid": 6759, "ts": 6302685372462.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685380549.932, "dur": 71.905, + "args": { + "External id": 127852, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678526, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241678526, "pid": 3, "tid": 7, "ts": 6302685380549.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372467.451, "dur": 3.670, + "args": { + "External id": 127852, "cbid": 211, "correlation": 241678526 + } + }, + { + "ph": "s", "id": 241678526, "pid": 5717, "tid": 6759, "ts": 6302685372467.451, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685380622.573, "dur": 46.848, + "args": { + "External id": 127862, "device": 3, "context": 1, "stream": 7, "correlation": 241678552, "bytes": 25165824, "memory bandwidth (GB/s)": 537.1803278688525 + } + }, + { + "ph": "f", "id": 241678552, "pid": 3, "tid": 7, "ts": 6302685380622.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685372592.331, "dur": 17.530, + "args": { + "External id": 127862, "cbid": 41, "correlation": 241678552 + } + }, + { + "ph": "s", "id": 241678552, "pid": 5717, "tid": 6759, "ts": 6302685372592.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685380670.125, "dur": 32.128, + "args": { + "External id": 127859, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678570, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678570, "pid": 3, "tid": 7, "ts": 6302685380670.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372705.631, "dur": 8.380, + "args": { + "External id": 127859, "cbid": 307, "correlation": 241678570 + } + }, + { + "ph": "s", "id": 241678570, "pid": 5717, "tid": 6759, "ts": 6302685372705.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685380702.893, "dur": 37.953, + "args": { + "External id": 127869, "device": 3, "context": 1, "stream": 7, "correlation": 241678585, "bytes": 25165824, "memory bandwidth (GB/s)": 663.0786499090981 + } + }, + { + "ph": "f", "id": 241678585, "pid": 3, "tid": 7, "ts": 6302685380702.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685372774.251, "dur": 14.830, + "args": { + "External id": 127869, "cbid": 41, "correlation": 241678585 + } + }, + { + "ph": "s", "id": 241678585, "pid": 5717, "tid": 6759, "ts": 6302685372774.251, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685380741.518, "dur": 26.624, + "args": { + "External id": 127866, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678603, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678603, "pid": 3, "tid": 7, "ts": 6302685380741.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685372874.581, "dur": 7.940, + "args": { + "External id": 127866, "cbid": 307, "correlation": 241678603 + } + }, + { + "ph": "s", "id": 241678603, "pid": 5717, "tid": 6759, "ts": 6302685372874.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685372997.170, "dur": 0.520, + "args": { + "External id": 127893, "cbid": 200, "correlation": 241678647 + } + }, + { + "ph": "f", "id": 241678647, "pid": 5717, "tid": 6759, "ts": 6302685372997.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685380769.070, "dur": 0.768, + "args": { + "External id": 127893, "device": 3, "context": 1, "stream": 7, "correlation": 241678650, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241678650, "pid": 3, "tid": 7, "ts": 6302685380769.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685372999.300, "dur": 7.180, + "args": { + "External id": 127893, "cbid": 51, "correlation": 241678650 + } + }, + { + "ph": "s", "id": 241678650, "pid": 5717, "tid": 6759, "ts": 6302685372999.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685380770.606, "dur": 141.025, + "args": { + "External id": 127893, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678651, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678651, "pid": 3, "tid": 7, "ts": 6302685380770.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373006.730, "dur": 7.800, + "args": { + "External id": 127893, "cbid": 307, "correlation": 241678651 + } + }, + { + "ph": "s", "id": 241678651, "pid": 5717, "tid": 6759, "ts": 6302685373006.730, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685380912.783, "dur": 121.985, + "args": { + "External id": 127900, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678673, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678673, "pid": 3, "tid": 7, "ts": 6302685380912.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373047.980, "dur": 10.150, + "args": { + "External id": 127900, "cbid": 211, "correlation": 241678673 + } + }, + { + "ph": "s", "id": 241678673, "pid": 5717, "tid": 6759, "ts": 6302685373047.980, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685373218.180, "dur": 0.370, + "args": { + "External id": 127923, "cbid": 200, "correlation": 241678719 + } + }, + { + "ph": "f", "id": 241678719, "pid": 5717, "tid": 6759, "ts": 6302685373218.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685381035.568, "dur": 0.768, + "args": { + "External id": 127923, "device": 3, "context": 1, "stream": 7, "correlation": 241678722, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241678722, "pid": 3, "tid": 7, "ts": 6302685381035.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685373221.240, "dur": 9.140, + "args": { + "External id": 127923, "cbid": 51, "correlation": 241678722 + } + }, + { + "ph": "s", "id": 241678722, "pid": 5717, "tid": 6759, "ts": 6302685373221.240, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685381037.488, "dur": 140.449, + "args": { + "External id": 127923, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678723, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678723, "pid": 3, "tid": 7, "ts": 6302685381037.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373230.609, "dur": 10.691, + "args": { + "External id": 127923, "cbid": 307, "correlation": 241678723 + } + }, + { + "ph": "s", "id": 241678723, "pid": 5717, "tid": 6759, "ts": 6302685373230.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685381178.673, "dur": 122.305, + "args": { + "External id": 127930, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678745, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678745, "pid": 3, "tid": 7, "ts": 6302685381178.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373286.069, "dur": 9.091, + "args": { + "External id": 127930, "cbid": 211, "correlation": 241678745 + } + }, + { + "ph": "s", "id": 241678745, "pid": 5717, "tid": 6759, "ts": 6302685373286.069, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685381301.810, "dur": 46.464, + "args": { + "External id": 127935, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678760, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678760, "pid": 3, "tid": 7, "ts": 6302685381301.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373354.829, "dur": 8.120, + "args": { + "External id": 127935, "cbid": 211, "correlation": 241678760 + } + }, + { + "ph": "s", "id": 241678760, "pid": 5717, "tid": 6759, "ts": 6302685373354.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685373446.849, "dur": 0.500, + "args": { + "External id": 127954, "cbid": 200, "correlation": 241678804 + } + }, + { + "ph": "f", "id": 241678804, "pid": 5717, "tid": 6759, "ts": 6302685373446.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685381349.106, "dur": 0.768, + "args": { + "External id": 127954, "device": 3, "context": 1, "stream": 7, "correlation": 241678807, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241678807, "pid": 3, "tid": 7, "ts": 6302685381349.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685373448.779, "dur": 6.000, + "args": { + "External id": 127954, "cbid": 51, "correlation": 241678807 + } + }, + { + "ph": "s", "id": 241678807, "pid": 5717, "tid": 6759, "ts": 6302685373448.779, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685381351.186, "dur": 145.569, + "args": { + "External id": 127954, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678808, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678808, "pid": 3, "tid": 7, "ts": 6302685381351.186, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373454.969, "dur": 7.120, + "args": { + "External id": 127954, "cbid": 307, "correlation": 241678808 + } + }, + { + "ph": "s", "id": 241678808, "pid": 5717, "tid": 6759, "ts": 6302685373454.969, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685381497.427, "dur": 122.657, + "args": { + "External id": 127961, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678830, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241678830, "pid": 3, "tid": 7, "ts": 6302685381497.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373489.589, "dur": 5.610, + "args": { + "External id": 127961, "cbid": 211, "correlation": 241678830 + } + }, + { + "ph": "s", "id": 241678830, "pid": 5717, "tid": 6759, "ts": 6302685373489.589, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685381620.788, "dur": 81.857, + "args": { + "External id": 127966, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678841, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678841, "pid": 3, "tid": 7, "ts": 6302685381620.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373531.989, "dur": 6.710, + "args": { + "External id": 127966, "cbid": 211, "correlation": 241678841 + } + }, + { + "ph": "s", "id": 241678841, "pid": 5717, "tid": 6759, "ts": 6302685373531.989, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685381719.285, "dur": 426.243, + "args": { + "External id": 127978, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678865, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678865, "pid": 3, "tid": 7, "ts": 6302685381719.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373588.689, "dur": 7.430, + "args": { + "External id": 127978, "cbid": 211, "correlation": 241678865 + } + }, + { + "ph": "s", "id": 241678865, "pid": 5717, "tid": 6759, "ts": 6302685373588.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685382174.360, "dur": 92.577, + "args": { + "External id": 127979, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678875, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678875, "pid": 3, "tid": 7, "ts": 6302685382174.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373604.949, "dur": 4.340, + "args": { + "External id": 127979, "cbid": 211, "correlation": 241678875 + } + }, + { + "ph": "s", "id": 241678875, "pid": 5717, "tid": 6759, "ts": 6302685373604.949, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685382268.345, "dur": 1.088, + "args": { + "External id": 127980, "device": 3, "context": 1, "stream": 7, "correlation": 241678890, "bytes": 24, "memory bandwidth (GB/s)": 0.022058823529411766 + } + }, + { + "ph": "f", "id": 241678890, "pid": 3, "tid": 7, "ts": 6302685382268.345, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685373624.739, "dur": 5.940, + "args": { + "External id": 127980, "cbid": 51, "correlation": 241678890 + } + }, + { + "ph": "s", "id": 241678890, "pid": 5717, "tid": 6759, "ts": 6302685373624.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685382270.905, "dur": 43.072, + "args": { + "External id": 127980, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678892, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241678892, "pid": 3, "tid": 7, "ts": 6302685382270.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373631.789, "dur": 5.180, + "args": { + "External id": 127980, "cbid": 211, "correlation": 241678892 + } + }, + { + "ph": "s", "id": 241678892, "pid": 5717, "tid": 6759, "ts": 6302685373631.789, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685382314.617, "dur": 50.017, + "args": { + "External id": 127991, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678913, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678913, "pid": 3, "tid": 7, "ts": 6302685382314.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373689.019, "dur": 8.069, + "args": { + "External id": 127991, "cbid": 211, "correlation": 241678913 + } + }, + { + "ph": "s", "id": 241678913, "pid": 5717, "tid": 6759, "ts": 6302685373689.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685382365.370, "dur": 151.777, + "args": { + "External id": 127994, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678928, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678928, "pid": 3, "tid": 7, "ts": 6302685382365.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373721.108, "dur": 6.131, + "args": { + "External id": 127994, "cbid": 211, "correlation": 241678928 + } + }, + { + "ph": "s", "id": 241678928, "pid": 5717, "tid": 6759, "ts": 6302685373721.108, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685382517.883, "dur": 110.593, + "args": { + "External id": 127995, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678938, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678938, "pid": 3, "tid": 7, "ts": 6302685382517.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373735.708, "dur": 4.220, + "args": { + "External id": 127995, "cbid": 211, "correlation": 241678938 + } + }, + { + "ph": "s", "id": 241678938, "pid": 5717, "tid": 6759, "ts": 6302685373735.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685382629.148, "dur": 79.136, + "args": { + "External id": 127996, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678952, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241678952, "pid": 3, "tid": 7, "ts": 6302685382629.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373753.648, "dur": 5.151, + "args": { + "External id": 127996, "cbid": 211, "correlation": 241678952 + } + }, + { + "ph": "s", "id": 241678952, "pid": 5717, "tid": 6759, "ts": 6302685373753.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685382708.860, "dur": 2.784, + "args": { + "External id": 127999, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241678966, "pid": 3, "tid": 7, "ts": 6302685382708.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373796.368, "dur": 6.730, + "args": { + "External id": 127999, "cbid": 211, "correlation": 241678966 + } + }, + { + "ph": "s", "id": 241678966, "pid": 5717, "tid": 6759, "ts": 6302685373796.368, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685382712.252, "dur": 2.912, + "args": { + "External id": 128003, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678976, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241678976, "pid": 3, "tid": 7, "ts": 6302685382712.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373815.518, "dur": 4.530, + "args": { + "External id": 128003, "cbid": 211, "correlation": 241678976 + } + }, + { + "ph": "s", "id": 241678976, "pid": 5717, "tid": 6759, "ts": 6302685373815.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685382715.772, "dur": 3.008, + "args": { + "External id": 128004, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241678986, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241678986, "pid": 3, "tid": 7, "ts": 6302685382715.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373827.618, "dur": 3.850, + "args": { + "External id": 128004, "cbid": 211, "correlation": 241678986 + } + }, + { + "ph": "s", "id": 241678986, "pid": 5717, "tid": 6759, "ts": 6302685373827.618, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685382719.388, "dur": 27.617, + "args": { + "External id": 128012, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679004, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679004, "pid": 3, "tid": 7, "ts": 6302685382719.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373880.298, "dur": 10.720, + "args": { + "External id": 128012, "cbid": 211, "correlation": 241679004 + } + }, + { + "ph": "s", "id": 241679004, "pid": 5717, "tid": 6759, "ts": 6302685373880.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685382747.645, "dur": 113.088, + "args": { + "External id": 128018, "device": 3, "context": 1, "stream": 7, "correlation": 241679018, "bytes": 50331648, "memory bandwidth (GB/s)": 445.0662139219015 + } + }, + { + "ph": "f", "id": 241679018, "pid": 3, "tid": 7, "ts": 6302685382747.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685373938.528, "dur": 18.900, + "args": { + "External id": 128018, "cbid": 41, "correlation": 241679018 + } + }, + { + "ph": "s", "id": 241679018, "pid": 5717, "tid": 6759, "ts": 6302685373938.528, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685382861.373, "dur": 74.625, + "args": { + "External id": 128020, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679030, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679030, "pid": 3, "tid": 7, "ts": 6302685382861.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373978.788, "dur": 6.210, + "args": { + "External id": 128020, "cbid": 211, "correlation": 241679030 + } + }, + { + "ph": "s", "id": 241679030, "pid": 5717, "tid": 6759, "ts": 6302685373978.788, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685382936.702, "dur": 150.177, + "args": { + "External id": 128021, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679040, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679040, "pid": 3, "tid": 7, "ts": 6302685382936.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685373993.768, "dur": 4.070, + "args": { + "External id": 128021, "cbid": 211, "correlation": 241679040 + } + }, + { + "ph": "s", "id": 241679040, "pid": 5717, "tid": 6759, "ts": 6302685373993.768, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685383087.519, "dur": 142.082, + "args": { + "External id": 128022, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679047, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679047, "pid": 3, "tid": 7, "ts": 6302685383087.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374010.398, "dur": 4.360, + "args": { + "External id": 128022, "cbid": 211, "correlation": 241679047 + } + }, + { + "ph": "s", "id": 241679047, "pid": 5717, "tid": 6759, "ts": 6302685374010.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685383230.273, "dur": 46.592, + "args": { + "External id": 128028, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679066, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679066, "pid": 3, "tid": 7, "ts": 6302685383230.273, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374045.508, "dur": 6.430, + "args": { + "External id": 128028, "cbid": 211, "correlation": 241679066 + } + }, + { + "ph": "s", "id": 241679066, "pid": 5717, "tid": 6759, "ts": 6302685374045.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685383277.505, "dur": 42.176, + "args": { + "External id": 128029, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679074, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679074, "pid": 3, "tid": 7, "ts": 6302685383277.505, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374062.428, "dur": 4.260, + "args": { + "External id": 128029, "cbid": 211, "correlation": 241679074 + } + }, + { + "ph": "s", "id": 241679074, "pid": 5717, "tid": 6759, "ts": 6302685374062.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685383320.353, "dur": 334.563, + "args": { + "External id": 128044, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679107, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679107, "pid": 3, "tid": 7, "ts": 6302685383320.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374160.087, "dur": 9.171, + "args": { + "External id": 128044, "cbid": 211, "correlation": 241679107 + } + }, + { + "ph": "s", "id": 241679107, "pid": 5717, "tid": 6759, "ts": 6302685374160.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685383655.620, "dur": 428.323, + "args": { + "External id": 128033, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679135, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679135, "pid": 3, "tid": 7, "ts": 6302685383655.620, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374223.417, "dur": 6.860, + "args": { + "External id": 128033, "cbid": 307, "correlation": 241679135 + } + }, + { + "ph": "s", "id": 241679135, "pid": 5717, "tid": 6759, "ts": 6302685374223.417, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685374345.327, "dur": 0.510, + "args": { + "External id": 128069, "cbid": 200, "correlation": 241679160 + } + }, + { + "ph": "f", "id": 241679160, "pid": 5717, "tid": 6759, "ts": 6302685374345.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685384085.159, "dur": 1.248, + "args": { + "External id": 128069, "device": 3, "context": 1, "stream": 7, "correlation": 
241679163, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 241679163, "pid": 3, "tid": 7, "ts": 6302685384085.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685374347.507, "dur": 7.630, + "args": { + "External id": 128069, "cbid": 51, "correlation": 241679163 + } + }, + { + "ph": "s", "id": 241679163, "pid": 5717, "tid": 6759, "ts": 6302685374347.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685384087.687, "dur": 360.898, + "args": { + "External id": 128069, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679164, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679164, "pid": 3, "tid": 7, "ts": 6302685384087.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374355.367, "dur": 9.550, + "args": { + "External id": 128069, "cbid": 307, "correlation": 241679164 + } + }, + { + "ph": "s", "id": 241679164, "pid": 5717, "tid": 6759, "ts": 6302685374355.367, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685374520.397, "dur": 0.450, + "args": { + "External id": 128087, "cbid": 200, "correlation": 241679201 + } + }, + { + "ph": "f", "id": 241679201, "pid": 5717, "tid": 6759, "ts": 6302685374520.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685384449.929, "dur": 1.216, + "args": { + "External id": 128087, "device": 3, "context": 1, "stream": 7, "correlation": 241679204, "bytes": 1536, "memory bandwidth (GB/s)": 1.263157894736842 + } + }, + { + "ph": "f", "id": 241679204, "pid": 3, "tid": 7, "ts": 6302685384449.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685374523.437, "dur": 9.960, + "args": { + "External id": 128087, "cbid": 51, "correlation": 241679204 + } + }, + { + "ph": "s", "id": 241679204, "pid": 5717, "tid": 6759, "ts": 6302685374523.437, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685384452.361, "dur": 350.563, + "args": { + "External id": 128087, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679205, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679205, "pid": 3, "tid": 7, "ts": 6302685384452.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374533.627, "dur": 12.820, + "args": { + "External id": 128087, "cbid": 307, "correlation": 241679205 + } + }, + { + "ph": "s", "id": 241679205, "pid": 5717, "tid": 6759, "ts": 6302685374533.627, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685374594.166, "dur": 0.260, + "args": { + "External id": 128094, "cbid": 200, "correlation": 241679230 + } + }, + { + "ph": "f", "id": 241679230, "pid": 5717, "tid": 6759, "ts": 6302685374594.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685384803.564, "dur": 359.363, + "args": { + "External id": 128094, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679233, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679233, "pid": 3, "tid": 7, "ts": 6302685384803.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374596.717, "dur": 8.080, + "args": { + "External id": 128094, "cbid": 307, "correlation": 241679233 + } + }, + { + "ph": "s", "id": 241679233, "pid": 5717, "tid": 6759, "ts": 6302685374596.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685374715.086, "dur": 0.340, + "args": { + "External id": 128117, "cbid": 200, "correlation": 241679278 + } + }, + { + "ph": "f", "id": 241679278, "pid": 5717, "tid": 6759, "ts": 6302685374715.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685385164.143, "dur": 1.056, + "args": { + "External id": 128117, "device": 3, "context": 1, "stream": 7, "correlation": 241679281, "bytes": 1536, "memory bandwidth (GB/s)": 1.4545454545454546 + } + }, + { + "ph": "f", "id": 241679281, "pid": 3, "tid": 7, "ts": 6302685385164.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685374718.026, "dur": 10.030, + "args": { + "External id": 128117, "cbid": 51, "correlation": 241679281 + } + }, + { + "ph": "s", "id": 241679281, "pid": 5717, "tid": 6759, "ts": 6302685374718.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685385166.671, "dur": 355.267, + "args": { + "External id": 128117, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679282, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679282, "pid": 3, "tid": 7, "ts": 6302685385166.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374728.256, "dur": 12.210, + "args": { + "External id": 128117, "cbid": 307, "correlation": 241679282 + } + }, + { + "ph": "s", "id": 241679282, "pid": 5717, "tid": 6759, "ts": 6302685374728.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685374780.146, "dur": 0.290, + "args": { + "External id": 128124, "cbid": 200, "correlation": 241679307 + } + }, + { + "ph": "f", "id": 241679307, "pid": 5717, "tid": 6759, "ts": 6302685374780.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685385522.642, "dur": 357.570, + "args": { + "External id": 128124, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679310, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679310, "pid": 3, "tid": 7, "ts": 6302685385522.642, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374781.536, "dur": 5.530, + "args": { + "External id": 128124, "cbid": 307, "correlation": 241679310 + } + }, + { + "ph": "s", "id": 241679310, "pid": 5717, "tid": 6759, "ts": 6302685374781.536, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685385880.948, "dur": 64.929, + "args": { + "External id": 128129, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679324, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679324, "pid": 3, "tid": 7, "ts": 6302685385880.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374832.236, "dur": 9.450, + "args": { + "External id": 128129, "cbid": 211, "correlation": 241679324 + } + }, + { + "ph": "s", "id": 241679324, "pid": 5717, "tid": 6759, "ts": 6302685374832.236, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685385946.517, "dur": 72.064, + "args": { + "External id": 128141, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679348, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679348, "pid": 3, "tid": 7, "ts": 6302685385946.517, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374923.936, "dur": 11.130, + "args": { + "External id": 128141, "cbid": 211, "correlation": 241679348 + } + }, + { + "ph": "s", "id": 241679348, "pid": 5717, "tid": 6759, "ts": 6302685374923.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685386019.189, "dur": 53.985, + "args": { + "External id": 128142, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679358, "pid": 3, "tid": 7, "ts": 6302685386019.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374950.026, "dur": 6.840, + "args": { + "External id": 128142, "cbid": 211, "correlation": 241679358 + } + }, + { + "ph": "s", "id": 241679358, "pid": 5717, "tid": 6759, "ts": 6302685374950.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685386079.158, "dur": 4.128, + "args": { + "External id": 128143, "device": 3, "context": 1, "stream": 7, "correlation": 241679373, "bytes": 24, "memory bandwidth (GB/s)": 0.005813953488372093 + } + }, + { + "ph": "f", "id": 241679373, "pid": 3, "tid": 7, "ts": 6302685386079.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685374982.236, "dur": 9.250, + "args": { + "External id": 128143, "cbid": 51, "correlation": 241679373 + } + }, + { + "ph": "s", "id": 241679373, "pid": 5717, "tid": 6759, "ts": 6302685374982.236, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685386089.142, "dur": 44.864, + "args": { + "External id": 128143, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679375, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241679375, "pid": 3, "tid": 7, "ts": 6302685386089.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685374992.596, "dur": 5.420, + "args": { + "External id": 128143, "cbid": 211, "correlation": 241679375 + } + }, + { + "ph": "s", "id": 241679375, "pid": 5717, "tid": 6759, "ts": 6302685374992.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685386134.678, "dur": 66.721, + "args": { + "External id": 128154, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679396, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679396, "pid": 3, "tid": 7, "ts": 6302685386134.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375051.036, "dur": 7.809, + "args": { + "External id": 128154, "cbid": 211, "correlation": 241679396 + } + }, + { + "ph": "s", "id": 241679396, "pid": 5717, "tid": 6759, "ts": 6302685375051.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685386201.975, "dur": 200.609, + "args": { + "External id": 128157, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679411, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679411, "pid": 3, "tid": 7, "ts": 6302685386201.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375085.136, "dur": 6.380, + "args": { + "External id": 128157, "cbid": 211, "correlation": 241679411 + } + }, + { + "ph": "s", "id": 241679411, "pid": 5717, "tid": 6759, "ts": 6302685375085.136, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685386403.224, "dur": 145.730, + "args": { + "External id": 128158, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679421, "pid": 3, "tid": 7, "ts": 6302685386403.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375103.915, "dur": 6.820, + "args": { + "External id": 128158, "cbid": 211, "correlation": 241679421 + } + }, + { + "ph": "s", "id": 241679421, "pid": 5717, "tid": 6759, "ts": 6302685375103.915, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685386549.530, "dur": 79.456, + "args": { + "External id": 128159, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679435, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679435, "pid": 3, "tid": 7, "ts": 6302685386549.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375131.415, "dur": 8.520, + "args": { + "External id": 128159, "cbid": 211, "correlation": 241679435 + } + }, + { + "ph": "s", "id": 241679435, "pid": 5717, "tid": 6759, "ts": 6302685375131.415, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685386629.658, "dur": 3.328, + "args": { + "External id": 128162, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241679449, "pid": 3, "tid": 7, "ts": 6302685386629.658, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375179.775, "dur": 7.160, + "args": { + "External id": 128162, "cbid": 211, "correlation": 241679449 + } + }, + { + "ph": "s", "id": 241679449, "pid": 5717, "tid": 6759, "ts": 6302685375179.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685386633.594, "dur": 2.752, + "args": { + "External id": 128166, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241679459, "pid": 3, "tid": 7, "ts": 6302685386633.594, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375201.235, "dur": 4.620, + "args": { + "External id": 128166, "cbid": 211, "correlation": 241679459 + } + }, + { + "ph": "s", "id": 241679459, "pid": 5717, "tid": 6759, "ts": 6302685375201.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685386636.954, "dur": 2.816, + "args": { + "External id": 128167, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679469, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241679469, "pid": 3, "tid": 7, "ts": 6302685386636.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375213.255, "dur": 4.050, + "args": { + "External id": 128167, "cbid": 211, "correlation": 241679469 + } + }, + { + "ph": "s", "id": 241679469, "pid": 5717, "tid": 6759, "ts": 6302685375213.255, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685386640.346, "dur": 28.160, + "args": { + "External id": 128175, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679487, "pid": 3, "tid": 7, "ts": 6302685386640.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375260.385, "dur": 7.170, + "args": { + "External id": 128175, "cbid": 211, "correlation": 241679487 + } + }, + { + "ph": "s", "id": 241679487, "pid": 5717, "tid": 6759, "ts": 6302685375260.385, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685386669.146, "dur": 113.889, + "args": { + "External id": 128181, "device": 3, "context": 1, "stream": 7, "correlation": 241679501, "bytes": 50331648, "memory bandwidth (GB/s)": 441.9359903063509 + } + }, + { + "ph": "f", "id": 241679501, "pid": 3, "tid": 7, "ts": 6302685386669.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685375294.905, "dur": 26.060, + "args": { + "External id": 128181, "cbid": 41, "correlation": 241679501 + } + }, + { + "ph": "s", "id": 241679501, "pid": 5717, "tid": 6759, "ts": 6302685375294.905, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685386783.707, "dur": 75.169, + "args": { + "External id": 128183, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679513, "pid": 3, "tid": 7, "ts": 6302685386783.707, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375336.165, "dur": 5.860, + "args": { + "External id": 128183, "cbid": 211, "correlation": 241679513 + } + }, + { + "ph": "s", "id": 241679513, "pid": 5717, "tid": 6759, "ts": 6302685375336.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685386859.516, "dur": 238.466, + "args": { + "External id": 128184, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679523, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679523, "pid": 3, "tid": 7, "ts": 6302685386859.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375349.755, "dur": 3.770, + "args": { + "External id": 128184, "cbid": 211, "correlation": 241679523 + } + }, + { + "ph": "s", "id": 241679523, "pid": 5717, "tid": 6759, "ts": 6302685375349.755, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685387098.622, "dur": 169.601, + "args": { + "External id": 128185, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679530, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679530, "pid": 3, "tid": 7, "ts": 6302685387098.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375366.135, "dur": 4.180, + "args": { + "External id": 128185, "cbid": 211, "correlation": 241679530 + } + }, + { + "ph": "s", "id": 241679530, "pid": 5717, "tid": 6759, "ts": 6302685375366.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685387268.927, "dur": 50.528, + "args": { + "External id": 128191, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679549, "pid": 3, "tid": 7, "ts": 6302685387268.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375404.635, "dur": 6.990, + "args": { + "External id": 128191, "cbid": 211, "correlation": 241679549 + } + }, + { + "ph": "s", "id": 241679549, "pid": 5717, "tid": 6759, "ts": 6302685375404.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685387320.095, "dur": 59.297, + "args": { + "External id": 128192, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679561, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679561, "pid": 3, "tid": 7, "ts": 6302685387320.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375427.295, "dur": 6.810, + "args": { + "External id": 128192, "cbid": 211, "correlation": 241679561 + } + }, + { + "ph": "s", "id": 241679561, "pid": 5717, "tid": 6759, "ts": 6302685375427.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685387380.064, "dur": 42.816, + "args": { + "External id": 128195, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679574, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679574, "pid": 3, "tid": 7, "ts": 6302685387380.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375462.495, "dur": 7.320, + "args": { + "External id": 128195, "cbid": 211, "correlation": 241679574 + } + }, + { + "ph": "s", "id": 241679574, "pid": 5717, "tid": 6759, "ts": 6302685375462.495, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685375528.335, "dur": 0.509, + "args": { + "External id": 128205, "cbid": 200, "correlation": 241679610 + } + }, + { + "ph": "f", "id": 241679610, "pid": 5717, "tid": 6759, "ts": 6302685375528.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685387424.000, "dur": 1.248, + "args": { + "External id": 128205, "device": 3, "context": 1, "stream": 7, "correlation": 241679613, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 241679613, "pid": 3, "tid": 7, "ts": 6302685387424.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685375530.484, "dur": 6.880, + "args": { + "External id": 128205, "cbid": 51, "correlation": 241679613 + } + }, + { + "ph": "s", "id": 241679613, "pid": 5717, "tid": 6759, "ts": 6302685375530.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685387426.816, "dur": 174.273, + "args": { + "External id": 128205, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679614, "registers per thread": 92, "shared memory": 
49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679614, "pid": 3, "tid": 7, "ts": 6302685387426.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375537.564, "dur": 6.800, + "args": { + "External id": 128205, "cbid": 307, "correlation": 241679614 + } + }, + { + "ph": "s", "id": 241679614, "pid": 5717, "tid": 6759, "ts": 6302685375537.564, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685387601.697, "dur": 155.841, + "args": { + "External id": 128212, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679636, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679636, "pid": 3, "tid": 7, "ts": 6302685387601.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375572.144, "dur": 5.930, + "args": { + "External id": 128212, "cbid": 211, "correlation": 241679636 + } + }, + { + "ph": "s", "id": 241679636, "pid": 5717, "tid": 6759, "ts": 6302685375572.144, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685375785.344, "dur": 0.470, + "args": { + "External id": 128238, "cbid": 200, "correlation": 241679683 + } + }, + { + "ph": "f", "id": 241679683, "pid": 5717, "tid": 6759, "ts": 6302685375785.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685375785.934, "dur": 0.200, + "args": { + "External id": 128238, "cbid": 200, "correlation": 241679684 + } + }, + { + "ph": "f", "id": 241679684, "pid": 5717, "tid": 6759, "ts": 6302685375785.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685375803.304, "dur": 0.230, + "args": { + "External id": 128238, "cbid": 200, "correlation": 241679702 + } + }, + { + "ph": "f", "id": 241679702, "pid": 5717, "tid": 6759, "ts": 6302685375803.304, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685387758.146, "dur": 97.249, + "args": { + "External id": 128238, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679703, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679703, "pid": 3, "tid": 7, "ts": 6302685387758.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375804.784, "dur": 11.380, + "args": { + "External id": 128238, "cbid": 211, "correlation": 241679703 + } + }, + { + "ph": "s", "id": 241679703, "pid": 5717, "tid": 6759, "ts": 6302685375804.784, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685375816.914, "dur": 0.960, + "args": { + "External id": 128238, "cbid": 273, "correlation": 241679705 + } + }, + { + "ph": "f", "id": 241679705, "pid": 5717, "tid": 6759, "ts": 6302685375816.914, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685387856.099, "dur": 986.088, + "args": { + "External id": 128238, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679706, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241679706, "pid": 3, "tid": 7, "ts": 6302685387856.099, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375818.204, "dur": 4.130, + "args": { + "External id": 128238, "cbid": 211, "correlation": 241679706 + } + }, + { + "ph": "s", "id": 241679706, "pid": 5717, "tid": 6759, "ts": 6302685375818.204, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685388842.891, "dur": 71.552, + "args": { + "External id": 128238, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679708, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241679708, "pid": 3, "tid": 7, "ts": 6302685388842.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685375822.874, "dur": 3.700, + "args": { + "External id": 128238, "cbid": 211, "correlation": 241679708 + } + }, + { + "ph": "s", "id": 241679708, "pid": 5717, "tid": 6759, "ts": 6302685375822.874, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685388915.115, "dur": 47.841, + "args": { + "External id": 128248, "device": 3, "context": 1, "stream": 7, "correlation": 241679734, "bytes": 25165824, "memory bandwidth (GB/s)": 526.0304759515897 + } + }, + { + "ph": "f", "id": 241679734, "pid": 3, "tid": 7, "ts": 6302685388915.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685375944.834, "dur": 15.420, + "args": { + "External id": 128248, "cbid": 41, "correlation": 241679734 + } + }, + { + "ph": "s", "id": 241679734, "pid": 5717, "tid": 6759, "ts": 6302685375944.834, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685388963.628, "dur": 32.160, + "args": { + "External id": 128245, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679752, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679752, "pid": 3, "tid": 7, "ts": 6302685388963.628, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376090.133, "dur": 8.880, + "args": { + "External id": 128245, "cbid": 307, "correlation": 241679752 + } + }, + { + "ph": "s", "id": 241679752, "pid": 5717, "tid": 6759, "ts": 6302685376090.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685388996.460, "dur": 38.080, + "args": { + "External id": 128255, "device": 3, "context": 1, "stream": 7, "correlation": 241679767, "bytes": 25165824, "memory bandwidth (GB/s)": 660.8672268907563 + } + }, + { + "ph": "f", "id": 241679767, "pid": 3, "tid": 7, "ts": 6302685388996.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685376194.733, "dur": 16.580, + "args": { + "External id": 128255, "cbid": 41, "correlation": 241679767 + } + }, + { + "ph": "s", "id": 241679767, "pid": 5717, "tid": 6759, "ts": 6302685376194.733, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685389035.148, "dur": 25.280, + "args": { + "External id": 128252, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679785, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679785, "pid": 3, "tid": 7, "ts": 6302685389035.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376313.283, "dur": 8.310, + "args": { + "External id": 128252, "cbid": 307, "correlation": 241679785 + } + }, + { + "ph": "s", "id": 241679785, "pid": 5717, "tid": 6759, "ts": 6302685376313.283, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685376464.502, "dur": 1.610, + "args": { + "External id": 128279, "cbid": 200, "correlation": 241679829 + } + }, + { + "ph": "f", "id": 241679829, "pid": 5717, "tid": 6759, "ts": 6302685376464.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685389061.260, "dur": 0.768, + "args": { + "External id": 128279, "device": 3, "context": 1, "stream": 7, "correlation": 241679832, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241679832, "pid": 3, "tid": 7, "ts": 6302685389061.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685376467.822, "dur": 11.670, + "args": { + "External id": 128279, "cbid": 51, "correlation": 241679832 + } + }, + { + "ph": "s", "id": 241679832, "pid": 5717, "tid": 6759, "ts": 6302685376467.822, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685389063.180, "dur": 140.193, + "args": { + "External id": 128279, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679833, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679833, "pid": 3, "tid": 7, "ts": 6302685389063.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376479.742, "dur": 9.480, + "args": { + "External id": 128279, "cbid": 307, "correlation": 241679833 + } + }, + { + "ph": "s", "id": 241679833, "pid": 5717, "tid": 6759, "ts": 6302685376479.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685389204.077, "dur": 120.321, + "args": { + "External id": 128286, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679855, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679855, "pid": 3, "tid": 7, "ts": 6302685389204.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376530.562, "dur": 10.470, + "args": { + "External id": 128286, "cbid": 211, "correlation": 241679855 + } + }, + { + "ph": "s", "id": 241679855, "pid": 5717, "tid": 6759, "ts": 6302685376530.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685376677.532, "dur": 0.440, + "args": { + "External id": 128309, "cbid": 200, "correlation": 241679901 + } + }, + { + "ph": "f", "id": 241679901, "pid": 5717, "tid": 6759, "ts": 6302685376677.532, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685389325.198, "dur": 0.768, + "args": { + "External id": 128309, "device": 3, "context": 1, "stream": 7, "correlation": 241679904, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241679904, "pid": 3, "tid": 7, "ts": 6302685389325.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685376679.482, "dur": 6.660, + "args": { + "External id": 128309, "cbid": 51, "correlation": 241679904 + } + }, + { + "ph": "s", "id": 241679904, "pid": 5717, "tid": 6759, "ts": 6302685376679.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685389327.118, "dur": 137.089, + "args": { + "External id": 128309, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679905, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679905, "pid": 3, "tid": 7, "ts": 6302685389327.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376686.342, "dur": 7.360, + "args": { + "External id": 128309, "cbid": 307, "correlation": 241679905 + } + }, + { + "ph": "s", "id": 241679905, "pid": 5717, "tid": 6759, "ts": 6302685376686.342, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685389464.943, "dur": 119.105, + "args": { + "External id": 128316, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679927, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679927, "pid": 3, "tid": 7, "ts": 6302685389464.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376721.502, "dur": 5.450, + "args": { + "External id": 128316, "cbid": 211, "correlation": 241679927 + } + }, + { + "ph": "s", "id": 241679927, "pid": 5717, "tid": 6759, "ts": 6302685376721.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685389584.720, "dur": 42.017, + "args": { + "External id": 128321, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679942, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241679942, "pid": 3, "tid": 7, "ts": 6302685389584.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376769.952, "dur": 7.889, + "args": { + "External id": 128321, "cbid": 211, "correlation": 241679942 + } + }, + { + "ph": "s", "id": 241679942, "pid": 5717, "tid": 6759, "ts": 6302685376769.952, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685376905.791, "dur": 0.450, + "args": { + "External id": 128340, "cbid": 200, "correlation": 241679986 + } + }, + { + "ph": "f", "id": 241679986, "pid": 5717, "tid": 6759, "ts": 6302685376905.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685389627.505, "dur": 0.768, + "args": { + "External id": 128340, "device": 3, "context": 1, "stream": 7, "correlation": 241679989, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241679989, "pid": 3, "tid": 7, "ts": 6302685389627.505, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685376908.941, "dur": 10.140, + "args": { + "External id": 128340, "cbid": 51, "correlation": 241679989 + } + }, + { + "ph": "s", "id": 241679989, "pid": 5717, "tid": 6759, "ts": 6302685376908.941, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685389629.937, "dur": 139.553, + "args": { + "External id": 128340, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241679990, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241679990, "pid": 3, "tid": 7, "ts": 6302685389629.937, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376919.281, "dur": 11.680, + "args": { + "External id": 128340, "cbid": 307, "correlation": 241679990 + } + }, + { + "ph": "s", "id": 241679990, "pid": 5717, "tid": 6759, "ts": 6302685376919.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685389770.098, "dur": 118.368, + "args": { + "External id": 128347, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680012, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680012, "pid": 3, "tid": 7, "ts": 6302685389770.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685376957.361, "dur": 5.830, + "args": { + "External id": 128347, "cbid": 211, "correlation": 241680012 + } + }, + { + "ph": "s", "id": 241680012, "pid": 5717, "tid": 6759, "ts": 6302685376957.361, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685389889.202, "dur": 38.241, + "args": { + "External id": 128352, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680023, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680023, "pid": 3, "tid": 7, "ts": 6302685389889.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377000.241, "dur": 6.830, + "args": { + "External id": 128352, "cbid": 211, "correlation": 241680023 + } + }, + { + "ph": "s", "id": 241680023, "pid": 5717, "tid": 6759, "ts": 6302685377000.241, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685389928.051, "dur": 42.624, + "args": { + "External id": 128364, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680047, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680047, "pid": 3, "tid": 7, "ts": 6302685389928.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377080.411, "dur": 12.880, + "args": { + "External id": 128364, "cbid": 211, "correlation": 241680047 + } + }, + { + "ph": "s", "id": 241680047, "pid": 5717, "tid": 6759, "ts": 6302685377080.411, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685389971.283, "dur": 23.809, + "args": { + "External id": 128365, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680057, "pid": 3, "tid": 7, "ts": 6302685389971.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377107.521, "dur": 8.010, + "args": { + "External id": 128365, "cbid": 211, "correlation": 241680057 + } + }, + { + "ph": "s", "id": 241680057, "pid": 5717, "tid": 6759, "ts": 6302685377107.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685389996.052, "dur": 0.736, + "args": { + "External id": 128366, "device": 3, "context": 1, "stream": 7, "correlation": 241680072, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 241680072, "pid": 3, "tid": 7, "ts": 6302685389996.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685377138.421, "dur": 5.860, + "args": { + "External id": 128366, "cbid": 51, "correlation": 241680072 + } + }, + { + "ph": "s", "id": 241680072, "pid": 5717, "tid": 6759, "ts": 6302685377138.421, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685389997.940, "dur": 42.048, + "args": { + "External id": 128366, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680074, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241680074, "pid": 3, "tid": 7, "ts": 6302685389997.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377145.401, "dur": 5.150, + "args": { + "External id": 128366, "cbid": 211, "correlation": 241680074 + } + }, + { + "ph": "s", "id": 241680074, "pid": 5717, "tid": 6759, "ts": 6302685377145.401, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685390040.724, "dur": 43.936, + "args": { + "External id": 128377, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680095, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680095, "pid": 3, "tid": 7, "ts": 6302685390040.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377207.831, "dur": 8.369, + "args": { + "External id": 128377, "cbid": 211, "correlation": 241680095 + } + }, + { + "ph": "s", "id": 241680095, "pid": 5717, "tid": 6759, "ts": 6302685377207.831, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390085.396, "dur": 146.689, + "args": { + "External id": 128380, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680110, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680110, "pid": 3, "tid": 7, "ts": 6302685390085.396, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377240.240, "dur": 9.020, + "args": { + "External id": 128380, "cbid": 211, "correlation": 241680110 + } + }, + { + "ph": "s", "id": 241680110, "pid": 5717, "tid": 6759, "ts": 6302685377240.240, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685390232.693, "dur": 108.897, + "args": { + "External id": 128381, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680120, "pid": 3, "tid": 7, "ts": 6302685390232.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377266.231, "dur": 6.889, + "args": { + "External id": 128381, "cbid": 211, "correlation": 241680120 + } + }, + { + "ph": "s", "id": 241680120, "pid": 5717, "tid": 6759, "ts": 6302685377266.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685390342.198, "dur": 77.985, + "args": { + "External id": 128382, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680134, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680134, "pid": 3, "tid": 7, "ts": 6302685390342.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377294.960, "dur": 14.910, + "args": { + "External id": 128382, "cbid": 211, "correlation": 241680134 + } + }, + { + "ph": "s", "id": 241680134, "pid": 5717, "tid": 6759, "ts": 6302685377294.960, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390420.919, "dur": 1.440, + "args": { + "External id": 128385, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680148, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241680148, "pid": 3, "tid": 7, "ts": 6302685390420.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377358.250, "dur": 11.780, + "args": { + "External id": 128385, "cbid": 211, "correlation": 241680148 + } + }, + { + "ph": "s", "id": 241680148, "pid": 5717, "tid": 6759, "ts": 6302685377358.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390423.031, "dur": 1.088, + "args": { + "External id": 128389, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680158, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241680158, "pid": 3, "tid": 7, "ts": 6302685390423.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377389.940, "dur": 6.880, + "args": { + "External id": 128389, "cbid": 211, "correlation": 241680158 + } + }, + { + "ph": "s", "id": 241680158, "pid": 5717, "tid": 6759, "ts": 6302685377389.940, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390424.759, "dur": 0.960, + "args": { + "External id": 128390, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680168, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241680168, "pid": 3, "tid": 7, "ts": 6302685390424.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377408.550, "dur": 3.980, + "args": { + "External id": 128390, "cbid": 211, "correlation": 241680168 + } + }, + { + "ph": "s", "id": 241680168, "pid": 5717, "tid": 6759, "ts": 6302685377408.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685390426.327, "dur": 26.176, + "args": { + "External id": 128398, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680186, "pid": 3, "tid": 7, "ts": 6302685390426.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377457.280, "dur": 7.280, + "args": { + "External id": 128398, "cbid": 211, "correlation": 241680186 + } + }, + { + "ph": "s", "id": 241680186, "pid": 5717, "tid": 6759, "ts": 6302685377457.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685390453.207, "dur": 112.993, + "args": { + "External id": 128404, "device": 3, "context": 1, "stream": 7, "correlation": 241680200, "bytes": 50331648, "memory bandwidth (GB/s)": 445.4404078128734 + } + }, + { + "ph": "f", "id": 241680200, "pid": 3, "tid": 7, "ts": 6302685390453.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685377491.800, "dur": 14.310, + "args": { + "External id": 128404, "cbid": 41, "correlation": 241680200 + } + }, + { + "ph": "s", "id": 241680200, "pid": 5717, "tid": 6759, "ts": 6302685377491.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390566.904, "dur": 73.376, + "args": { + "External id": 128406, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680212, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680212, "pid": 3, "tid": 7, "ts": 6302685390566.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377519.220, "dur": 5.460, + "args": { + "External id": 128406, "cbid": 211, "correlation": 241680212 + } + }, + { + "ph": "s", "id": 241680212, "pid": 5717, "tid": 6759, "ts": 6302685377519.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390640.920, "dur": 149.153, + "args": { + "External id": 128407, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680222, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680222, "pid": 3, "tid": 7, "ts": 6302685390640.920, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377533.140, "dur": 3.970, + "args": { + "External id": 128407, "cbid": 211, "correlation": 241680222 + } + }, + { + "ph": "s", "id": 241680222, "pid": 5717, "tid": 6759, "ts": 6302685377533.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390790.649, "dur": 151.842, + "args": { + "External id": 128408, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680229, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680229, "pid": 3, "tid": 7, "ts": 6302685390790.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377549.920, "dur": 4.780, + "args": { + "External id": 128408, "cbid": 211, "correlation": 241680229 + } + }, + { + "ph": "s", "id": 241680229, "pid": 5717, "tid": 6759, "ts": 6302685377549.920, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390943.131, "dur": 48.192, + "args": { + "External id": 128414, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680248, "pid": 3, "tid": 7, "ts": 6302685390943.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377585.050, "dur": 6.110, + "args": { + "External id": 128414, "cbid": 211, "correlation": 241680248 + } + }, + { + "ph": "s", "id": 241680248, "pid": 5717, "tid": 6759, "ts": 6302685377585.050, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685390991.931, "dur": 40.160, + "args": { + "External id": 128415, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680256, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680256, "pid": 3, "tid": 7, "ts": 6302685390991.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377601.790, "dur": 4.290, + "args": { + "External id": 128415, "cbid": 211, "correlation": 241680256 + } + }, + { + "ph": "s", "id": 241680256, "pid": 5717, "tid": 6759, "ts": 6302685377601.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685391032.763, "dur": 320.771, + "args": { + "External id": 128430, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680289, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680289, "pid": 3, "tid": 7, "ts": 6302685391032.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377697.719, "dur": 9.340, + "args": { + "External id": 128430, "cbid": 211, "correlation": 241680289 + } + }, + { + "ph": "s", "id": 241680289, "pid": 5717, "tid": 6759, "ts": 6302685377697.719, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 3, "tid": 7, + "ts": 6302685391354.206, "dur": 429.923, + "args": { + "External id": 128419, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680317, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680317, "pid": 3, "tid": 7, "ts": 6302685391354.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377780.599, "dur": 12.170, + "args": { + "External id": 128419, "cbid": 307, "correlation": 241680317 + } + }, + { + "ph": "s", "id": 241680317, "pid": 5717, "tid": 6759, "ts": 6302685377780.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685377911.009, "dur": 0.530, + "args": { + "External id": 128455, "cbid": 200, "correlation": 241680342 + } + }, + { + "ph": "f", "id": 241680342, "pid": 5717, "tid": 6759, "ts": 6302685377911.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685391785.025, "dur": 0.736, + "args": { + "External id": 128455, "device": 3, "context": 1, "stream": 7, "correlation": 
241680345, "bytes": 1536, "memory bandwidth (GB/s)": 2.0869565217391304 + } + }, + { + "ph": "f", "id": 241680345, "pid": 3, "tid": 7, "ts": 6302685391785.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685377914.399, "dur": 12.210, + "args": { + "External id": 128455, "cbid": 51, "correlation": 241680345 + } + }, + { + "ph": "s", "id": 241680345, "pid": 5717, "tid": 6759, "ts": 6302685377914.399, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685391786.913, "dur": 366.371, + "args": { + "External id": 128455, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680346, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680346, "pid": 3, "tid": 7, "ts": 6302685391786.913, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685377926.889, "dur": 12.400, + "args": { + "External id": 128455, "cbid": 307, "correlation": 241680346 + } + }, + { + "ph": "s", "id": 241680346, "pid": 5717, "tid": 6759, "ts": 6302685377926.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685378098.678, "dur": 0.480, + "args": { + "External id": 128473, "cbid": 200, "correlation": 241680383 + } + }, + { + "ph": "f", "id": 241680383, "pid": 5717, "tid": 6759, "ts": 6302685378098.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685392154.180, "dur": 0.768, + "args": { + "External id": 128473, "device": 3, "context": 1, "stream": 7, "correlation": 241680386, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241680386, "pid": 3, "tid": 7, "ts": 6302685392154.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685378100.678, "dur": 8.580, + "args": { + "External id": 128473, "cbid": 51, "correlation": 241680386 + } + }, + { + "ph": "s", "id": 241680386, "pid": 5717, "tid": 6759, "ts": 6302685378100.678, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685392156.100, "dur": 348.002, + "args": { + "External id": 128473, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680387, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680387, "pid": 3, "tid": 7, "ts": 6302685392156.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378109.618, "dur": 8.440, + "args": { + "External id": 128473, "cbid": 307, "correlation": 241680387 + } + }, + { + "ph": "s", "id": 241680387, "pid": 5717, "tid": 6759, "ts": 6302685378109.618, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685378164.678, "dur": 0.260, + "args": { + "External id": 128480, "cbid": 200, "correlation": 241680412 + } + }, + { + "ph": "f", "id": 241680412, "pid": 5717, "tid": 6759, "ts": 6302685378164.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685392505.254, "dur": 355.747, + "args": { + "External id": 128480, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680415, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680415, "pid": 3, "tid": 7, "ts": 6302685392505.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378166.109, "dur": 6.040, + "args": { + "External id": 128480, "cbid": 307, "correlation": 241680415 + } + }, + { + "ph": "s", "id": 241680415, "pid": 5717, "tid": 6759, "ts": 6302685378166.109, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685378277.078, "dur": 0.370, + "args": { + "External id": 128503, "cbid": 200, "correlation": 241680460 + } + }, + { + "ph": "f", "id": 241680460, "pid": 5717, "tid": 6759, "ts": 6302685378277.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685392861.801, "dur": 0.768, + "args": { + "External id": 128503, "device": 3, "context": 1, "stream": 7, "correlation": 241680463, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241680463, "pid": 3, "tid": 7, "ts": 6302685392861.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685378280.028, "dur": 9.930, + "args": { + "External id": 128503, "cbid": 51, "correlation": 241680463 + } + }, + { + "ph": "s", "id": 241680463, "pid": 5717, "tid": 6759, "ts": 6302685378280.028, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685392863.721, "dur": 352.995, + "args": { + "External id": 128503, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680464, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680464, "pid": 3, "tid": 7, "ts": 6302685392863.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378290.148, "dur": 18.960, + "args": { + "External id": 128503, "cbid": 307, "correlation": 241680464 + } + }, + { + "ph": "s", "id": 241680464, "pid": 5717, "tid": 6759, "ts": 6302685378290.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685378358.688, "dur": 0.330, + "args": { + "External id": 128510, "cbid": 200, "correlation": 241680489 + } + }, + { + "ph": "f", "id": 241680489, "pid": 5717, "tid": 6759, "ts": 6302685378358.688, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685393217.388, "dur": 354.850, + "args": { + "External id": 128510, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680492, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680492, "pid": 3, "tid": 7, "ts": 6302685393217.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378360.198, "dur": 6.970, + "args": { + "External id": 128510, "cbid": 307, "correlation": 241680492 + } + }, + { + "ph": "s", "id": 241680492, "pid": 5717, "tid": 6759, "ts": 6302685378360.198, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685393572.974, "dur": 51.649, + "args": { + "External id": 128515, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680506, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680506, "pid": 3, "tid": 7, "ts": 6302685393572.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378417.298, "dur": 7.650, + "args": { + "External id": 128515, "cbid": 211, "correlation": 241680506 + } + }, + { + "ph": "s", "id": 241680506, "pid": 5717, "tid": 6759, "ts": 6302685378417.298, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685393625.231, "dur": 43.712, + "args": { + "External id": 128527, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680530, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680530, "pid": 3, "tid": 7, "ts": 6302685393625.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378480.708, "dur": 8.380, + "args": { + "External id": 128527, "cbid": 211, "correlation": 241680530 + } + }, + { + "ph": "s", "id": 241680530, "pid": 5717, "tid": 6759, "ts": 6302685378480.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685393669.679, "dur": 25.600, + "args": { + "External id": 128528, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680540, "pid": 3, "tid": 7, "ts": 6302685393669.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378498.318, "dur": 4.350, + "args": { + "External id": 128528, "cbid": 211, "correlation": 241680540 + } + }, + { + "ph": "s", "id": 241680540, "pid": 5717, "tid": 6759, "ts": 6302685378498.318, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685393696.079, "dur": 0.736, + "args": { + "External id": 128529, "device": 3, "context": 1, "stream": 7, "correlation": 241680555, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 241680555, "pid": 3, "tid": 7, "ts": 6302685393696.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685378518.358, "dur": 5.400, + "args": { + "External id": 128529, "cbid": 51, "correlation": 241680555 + } + }, + { + "ph": "s", "id": 241680555, "pid": 5717, "tid": 6759, "ts": 6302685378518.358, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685393697.967, "dur": 42.432, + "args": { + "External id": 128529, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680557, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241680557, "pid": 3, "tid": 7, "ts": 6302685393697.967, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378524.848, "dur": 5.009, + "args": { + "External id": 128529, "cbid": 211, "correlation": 241680557 + } + }, + { + "ph": "s", "id": 241680557, "pid": 5717, "tid": 6759, "ts": 6302685378524.848, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685393741.007, "dur": 48.705, + "args": { + "External id": 128540, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680578, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680578, "pid": 3, "tid": 7, "ts": 6302685393741.007, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378582.697, "dur": 10.100, + "args": { + "External id": 128540, "cbid": 211, "correlation": 241680578 + } + }, + { + "ph": "s", "id": 241680578, "pid": 5717, "tid": 6759, "ts": 6302685378582.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685393790.384, "dur": 143.361, + "args": { + "External id": 128543, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680593, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680593, "pid": 3, "tid": 7, "ts": 6302685393790.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378633.787, "dur": 9.390, + "args": { + "External id": 128543, "cbid": 211, "correlation": 241680593 + } + }, + { + "ph": "s", "id": 241680593, "pid": 5717, "tid": 6759, "ts": 6302685378633.787, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685393934.353, "dur": 108.769, + "args": { + "External id": 128544, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680603, "pid": 3, "tid": 7, "ts": 6302685393934.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378656.507, "dur": 7.210, + "args": { + "External id": 128544, "cbid": 211, "correlation": 241680603 + } + }, + { + "ph": "s", "id": 241680603, "pid": 5717, "tid": 6759, "ts": 6302685378656.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685394043.730, "dur": 78.145, + "args": { + "External id": 128545, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680617, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680617, "pid": 3, "tid": 7, "ts": 6302685394043.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378684.897, "dur": 7.280, + "args": { + "External id": 128545, "cbid": 211, "correlation": 241680617 + } + }, + { + "ph": "s", "id": 241680617, "pid": 5717, "tid": 6759, "ts": 6302685378684.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394122.579, "dur": 1.472, + "args": { + "External id": 128548, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680631, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241680631, "pid": 3, "tid": 7, "ts": 6302685394122.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378734.747, "dur": 10.080, + "args": { + "External id": 128548, "cbid": 211, "correlation": 241680631 + } + }, + { + "ph": "s", "id": 241680631, "pid": 5717, "tid": 6759, "ts": 6302685378734.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394124.723, "dur": 0.992, + "args": { + "External id": 128552, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241680641, "pid": 3, "tid": 7, "ts": 6302685394124.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378767.097, "dur": 7.410, + "args": { + "External id": 128552, "cbid": 211, "correlation": 241680641 + } + }, + { + "ph": "s", "id": 241680641, "pid": 5717, "tid": 6759, "ts": 6302685378767.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394126.419, "dur": 0.992, + "args": { + "External id": 128553, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680651, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241680651, "pid": 3, "tid": 7, "ts": 6302685394126.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378784.677, "dur": 3.880, + "args": { + "External id": 128553, "cbid": 211, "correlation": 241680651 + } + }, + { + "ph": "s", "id": 241680651, "pid": 5717, "tid": 6759, "ts": 6302685378784.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685394128.019, "dur": 26.528, + "args": { + "External id": 128561, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680669, "pid": 3, "tid": 7, "ts": 6302685394128.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378831.767, "dur": 7.530, + "args": { + "External id": 128561, "cbid": 211, "correlation": 241680669 + } + }, + { + "ph": "s", "id": 241680669, "pid": 5717, "tid": 6759, "ts": 6302685378831.767, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685394155.667, "dur": 111.425, + "args": { + "External id": 128567, "device": 3, "context": 1, "stream": 7, "correlation": 241680683, "bytes": 50331648, "memory bandwidth (GB/s)": 451.70875476778104 + } + }, + { + "ph": "f", "id": 241680683, "pid": 3, "tid": 7, "ts": 6302685394155.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685378866.817, "dur": 15.880, + "args": { + "External id": 128567, "cbid": 41, "correlation": 241680683 + } + }, + { + "ph": "s", "id": 241680683, "pid": 5717, "tid": 6759, "ts": 6302685378866.817, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394267.668, "dur": 68.352, + "args": { + "External id": 128569, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680695, "pid": 3, "tid": 7, "ts": 6302685394267.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378896.067, "dur": 5.390, + "args": { + "External id": 128569, "cbid": 211, "correlation": 241680695 + } + }, + { + "ph": "s", "id": 241680695, "pid": 5717, "tid": 6759, "ts": 6302685378896.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394336.628, "dur": 149.377, + "args": { + "External id": 128570, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680705, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680705, "pid": 3, "tid": 7, "ts": 6302685394336.628, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378910.017, "dur": 3.880, + "args": { + "External id": 128570, "cbid": 211, "correlation": 241680705 + } + }, + { + "ph": "s", "id": 241680705, "pid": 5717, "tid": 6759, "ts": 6302685378910.017, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394486.741, "dur": 136.929, + "args": { + "External id": 128571, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680712, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680712, "pid": 3, "tid": 7, "ts": 6302685394486.741, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378925.987, "dur": 4.370, + "args": { + "External id": 128571, "cbid": 211, "correlation": 241680712 + } + }, + { + "ph": "s", "id": 241680712, "pid": 5717, "tid": 6759, "ts": 6302685378925.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394624.278, "dur": 42.657, + "args": { + "External id": 128577, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680731, "pid": 3, "tid": 7, "ts": 6302685394624.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378960.767, "dur": 5.930, + "args": { + "External id": 128577, "cbid": 211, "correlation": 241680731 + } + }, + { + "ph": "s", "id": 241680731, "pid": 5717, "tid": 6759, "ts": 6302685378960.767, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685394667.575, "dur": 58.016, + "args": { + "External id": 128578, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680743, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680743, "pid": 3, "tid": 7, "ts": 6302685394667.575, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685378980.976, "dur": 4.920, + "args": { + "External id": 128578, "cbid": 211, "correlation": 241680743 + } + }, + { + "ph": "s", "id": 241680743, "pid": 5717, "tid": 6759, "ts": 6302685378980.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685394726.327, "dur": 41.984, + "args": { + "External id": 128581, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680756, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680756, "pid": 3, "tid": 7, "ts": 6302685394726.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379009.796, "dur": 5.680, + "args": { + "External id": 128581, "cbid": 211, "correlation": 241680756 + } + }, + { + "ph": "s", "id": 241680756, "pid": 5717, "tid": 6759, "ts": 6302685379009.796, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379070.336, "dur": 0.500, + "args": { + "External id": 128591, "cbid": 200, "correlation": 241680792 + } + }, + { + "ph": "f", "id": 241680792, "pid": 5717, "tid": 6759, "ts": 6302685379070.336, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685394769.175, "dur": 0.928, + "args": { + "External id": 128591, "device": 3, "context": 1, "stream": 7, "correlation": 241680795, "bytes": 576, "memory bandwidth (GB/s)": 0.6206896551724138 + } + }, + { + "ph": "f", "id": 241680795, "pid": 3, "tid": 7, "ts": 6302685394769.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685379072.456, "dur": 6.730, + "args": { + "External id": 128591, "cbid": 51, "correlation": 241680795 + } + }, + { + "ph": "s", "id": 241680795, "pid": 5717, "tid": 6759, "ts": 6302685379072.456, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685394771.255, "dur": 135.841, + "args": { + "External id": 128591, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680796, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680796, "pid": 3, "tid": 7, "ts": 6302685394771.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379079.416, "dur": 6.440, + "args": { + "External id": 128591, "cbid": 307, "correlation": 241680796 + } + }, + { + "ph": "s", "id": 241680796, "pid": 5717, "tid": 6759, "ts": 6302685379079.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685394907.768, "dur": 118.401, + "args": { + "External id": 128598, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680818, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241680818, "pid": 3, "tid": 7, "ts": 6302685394907.768, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379113.376, "dur": 5.820, + "args": { + "External id": 128598, "cbid": 211, "correlation": 241680818 + } + }, + { + "ph": "s", "id": 241680818, "pid": 5717, "tid": 6759, "ts": 6302685379113.376, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379283.066, "dur": 0.460, + "args": { + "External id": 128624, "cbid": 200, "correlation": 241680865 + } + }, + { + "ph": "f", "id": 241680865, "pid": 5717, "tid": 6759, "ts": 6302685379283.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379283.636, "dur": 0.200, + "args": { + "External id": 128624, "cbid": 200, "correlation": 241680866 + } + }, + { + "ph": "f", "id": 241680866, "pid": 5717, "tid": 6759, "ts": 6302685379283.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379308.476, "dur": 0.230, + "args": { + "External id": 128624, "cbid": 200, "correlation": 241680884 + } + }, + { + "ph": "f", "id": 241680884, "pid": 5717, "tid": 6759, "ts": 6302685379308.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685395026.841, "dur": 96.513, + "args": { + "External id": 128624, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680885, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680885, "pid": 3, "tid": 7, "ts": 6302685395026.841, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379309.806, "dur": 11.210, + "args": { + "External id": 128624, "cbid": 211, "correlation": 241680885 + } + }, + { + "ph": "s", "id": 241680885, "pid": 5717, "tid": 6759, "ts": 6302685379309.806, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379321.766, "dur": 0.920, + "args": { + "External id": 128624, "cbid": 273, "correlation": 241680887 + } + }, + { + "ph": "f", "id": 241680887, "pid": 5717, "tid": 6759, "ts": 6302685379321.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685395124.058, "dur": 964.071, + "args": { + "External id": 128624, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680888, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241680888, "pid": 3, "tid": 7, "ts": 6302685395124.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379322.996, "dur": 4.010, + "args": { + "External id": 128624, "cbid": 211, "correlation": 241680888 + } + }, + { + "ph": "s", "id": 241680888, "pid": 5717, "tid": 6759, "ts": 6302685379322.996, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685396088.865, "dur": 71.361, + "args": { + "External id": 128624, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680890, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241680890, "pid": 3, "tid": 7, "ts": 6302685396088.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379327.536, "dur": 3.480, + "args": { + "External id": 128624, "cbid": 211, "correlation": 241680890 + } + }, + { + "ph": "s", "id": 241680890, "pid": 5717, "tid": 6759, "ts": 6302685379327.536, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685396160.866, "dur": 48.192, + "args": { + "External id": 128634, "device": 3, "context": 1, "stream": 7, "correlation": 241680916, "bytes": 25165824, "memory bandwidth (GB/s)": 522.199203187251 + } + }, + { + "ph": "f", "id": 241680916, "pid": 3, "tid": 7, "ts": 6302685396160.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685379454.435, "dur": 18.971, + "args": { + "External id": 128634, "cbid": 41, "correlation": 241680916 + } + }, + { + "ph": "s", "id": 241680916, "pid": 5717, "tid": 6759, "ts": 6302685379454.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685396209.730, "dur": 29.760, + "args": { + "External id": 128631, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680934, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680934, "pid": 3, "tid": 7, "ts": 6302685396209.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379567.775, "dur": 8.560, + "args": { + "External id": 128631, "cbid": 307, "correlation": 241680934 + } + }, + { + "ph": "s", "id": 241680934, "pid": 5717, "tid": 6759, "ts": 6302685379567.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685396240.130, "dur": 38.881, + "args": { + "External id": 128641, "device": 3, "context": 1, "stream": 7, "correlation": 241680949, "bytes": 25165824, "memory bandwidth (GB/s)": 647.2524883619249 + } + }, + { + "ph": "f", "id": 241680949, "pid": 3, "tid": 7, "ts": 6302685396240.130, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685379635.325, "dur": 13.270, + "args": { + "External id": 128641, "cbid": 41, "correlation": 241680949 + } + }, + { + "ph": "s", "id": 241680949, "pid": 5717, "tid": 6759, "ts": 6302685379635.325, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685396279.651, "dur": 26.720, + "args": { + "External id": 128638, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241680967, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241680967, "pid": 3, "tid": 7, "ts": 6302685396279.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379733.705, "dur": 8.090, + "args": { + "External id": 128638, "cbid": 307, "correlation": 241680967 + } + }, + { + "ph": "s", "id": 241680967, "pid": 5717, "tid": 6759, "ts": 6302685379733.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379850.625, "dur": 0.490, + "args": { + "External id": 128665, "cbid": 200, "correlation": 241681011 + } + }, + { + "ph": "f", "id": 241681011, "pid": 5717, "tid": 6759, "ts": 6302685379850.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685396307.235, "dur": 0.768, + "args": { + "External id": 128665, "device": 3, "context": 1, "stream": 7, "correlation": 241681014, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241681014, "pid": 3, "tid": 7, "ts": 6302685396307.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685379852.745, "dur": 6.980, + "args": { + "External id": 128665, "cbid": 51, "correlation": 241681014 + } + }, + { + "ph": "s", "id": 241681014, "pid": 5717, "tid": 6759, "ts": 6302685379852.745, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685396309.155, "dur": 138.721, + "args": { + "External id": 128665, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681015, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681015, "pid": 3, "tid": 7, "ts": 6302685396309.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379859.954, "dur": 7.640, + "args": { + "External id": 128665, "cbid": 307, "correlation": 241681015 + } + }, + { + "ph": "s", "id": 241681015, "pid": 5717, "tid": 6759, "ts": 6302685379859.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685396449.092, "dur": 119.585, + "args": { + "External id": 128672, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681037, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681037, "pid": 3, "tid": 7, "ts": 6302685396449.092, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685379896.754, "dur": 5.691, + "args": { + "External id": 128672, "cbid": 211, "correlation": 241681037 + } + }, + { + "ph": "s", "id": 241681037, "pid": 5717, "tid": 6759, "ts": 6302685379896.754, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685379997.254, "dur": 0.390, + "args": { + "External id": 128695, "cbid": 200, "correlation": 241681083 + } + }, + { + "ph": "f", "id": 241681083, "pid": 5717, "tid": 6759, "ts": 6302685379997.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685396569.605, "dur": 0.768, + "args": { + "External id": 128695, "device": 3, "context": 1, "stream": 7, "correlation": 241681086, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241681086, "pid": 3, "tid": 7, "ts": 6302685396569.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685379999.034, "dur": 5.870, + "args": { + "External id": 128695, "cbid": 51, "correlation": 241681086 + } + }, + { + "ph": "s", "id": 241681086, "pid": 5717, "tid": 6759, "ts": 6302685379999.034, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685396571.493, "dur": 139.297, + "args": { + "External id": 128695, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681087, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681087, "pid": 3, "tid": 7, "ts": 6302685396571.493, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380005.084, "dur": 6.930, + "args": { + "External id": 128695, "cbid": 307, "correlation": 241681087 + } + }, + { + "ph": "s", "id": 241681087, "pid": 5717, "tid": 6759, "ts": 6302685380005.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685396711.430, "dur": 118.113, + "args": { + "External id": 128702, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681109, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681109, "pid": 3, "tid": 7, "ts": 6302685396711.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380038.624, "dur": 5.660, + "args": { + "External id": 128702, "cbid": 211, "correlation": 241681109 + } + }, + { + "ph": "s", "id": 241681109, "pid": 5717, "tid": 6759, "ts": 6302685380038.624, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685396830.279, "dur": 41.216, + "args": { + "External id": 128707, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681124, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681124, "pid": 3, "tid": 7, "ts": 6302685396830.279, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380085.374, "dur": 10.940, + "args": { + "External id": 128707, "cbid": 211, "correlation": 241681124 + } + }, + { + "ph": "s", "id": 241681124, "pid": 5717, "tid": 6759, "ts": 6302685380085.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685380179.894, "dur": 0.440, + "args": { + "External id": 128726, "cbid": 200, "correlation": 241681168 + } + }, + { + "ph": "f", "id": 241681168, "pid": 5717, "tid": 6759, "ts": 6302685380179.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685396872.359, "dur": 0.768, + "args": { + "External id": 128726, "device": 3, "context": 1, "stream": 7, "correlation": 241681171, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241681171, "pid": 3, "tid": 7, "ts": 6302685396872.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685380181.784, "dur": 6.320, + "args": { + "External id": 128726, "cbid": 51, "correlation": 241681171 + } + }, + { + "ph": "s", "id": 241681171, "pid": 5717, "tid": 6759, "ts": 6302685380181.784, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685396874.247, "dur": 141.121, + "args": { + "External id": 128726, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681172, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681172, "pid": 3, "tid": 7, "ts": 6302685396874.247, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380188.304, "dur": 6.890, + "args": { + "External id": 128726, "cbid": 307, "correlation": 241681172 + } + }, + { + "ph": "s", "id": 241681172, "pid": 5717, "tid": 6759, "ts": 6302685380188.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685397016.008, "dur": 118.561, + "args": { + "External id": 128733, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681194, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681194, "pid": 3, "tid": 7, "ts": 6302685397016.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380223.184, "dur": 5.540, + "args": { + "External id": 128733, "cbid": 211, "correlation": 241681194 + } + }, + { + "ph": "s", "id": 241681194, "pid": 5717, "tid": 6759, "ts": 6302685380223.184, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397135.241, "dur": 38.272, + "args": { + "External id": 128738, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681205, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681205, "pid": 3, "tid": 7, "ts": 6302685397135.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380265.844, "dur": 6.540, + "args": { + "External id": 128738, "cbid": 211, "correlation": 241681205 + } + }, + { + "ph": "s", "id": 241681205, "pid": 5717, "tid": 6759, "ts": 6302685380265.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397174.121, "dur": 48.897, + "args": { + "External id": 128750, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681229, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681229, "pid": 3, "tid": 7, "ts": 6302685397174.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380343.433, "dur": 8.491, + "args": { + "External id": 128750, "cbid": 211, "correlation": 241681229 + } + }, + { + "ph": "s", "id": 241681229, "pid": 5717, "tid": 6759, "ts": 6302685380343.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685397224.170, "dur": 30.560, + "args": { + "External id": 128751, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681239, "pid": 3, "tid": 7, "ts": 6302685397224.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380361.524, "dur": 4.360, + "args": { + "External id": 128751, "cbid": 211, "correlation": 241681239 + } + }, + { + "ph": "s", "id": 241681239, "pid": 5717, "tid": 6759, "ts": 6302685380361.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685397255.626, "dur": 0.736, + "args": { + "External id": 128752, "device": 3, "context": 1, "stream": 7, "correlation": 241681254, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 241681254, "pid": 3, "tid": 7, "ts": 6302685397255.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685380381.484, "dur": 5.929, + "args": { + "External id": 128752, "cbid": 51, "correlation": 241681254 + } + }, + { + "ph": "s", "id": 241681254, "pid": 5717, "tid": 6759, "ts": 6302685380381.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 3, "tid": 7, + "ts": 6302685397257.514, "dur": 43.649, + "args": { + "External id": 128752, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681256, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241681256, "pid": 3, "tid": 7, "ts": 6302685397257.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380388.524, "dur": 4.999, + "args": { + "External id": 128752, "cbid": 211, "correlation": 241681256 + } + }, + { + "ph": "s", "id": 241681256, "pid": 5717, "tid": 6759, "ts": 6302685380388.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 3, "tid": 7, + "ts": 6302685397301.835, "dur": 52.704, + "args": { + "External id": 128763, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681277, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681277, "pid": 3, "tid": 7, "ts": 6302685397301.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380445.103, "dur": 8.080, + "args": { + "External id": 128763, "cbid": 211, "correlation": 241681277 + } + }, + { + "ph": "s", "id": 241681277, "pid": 5717, "tid": 6759, "ts": 6302685380445.103, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397355.179, "dur": 148.897, + "args": { + "External id": 128766, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681292, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681292, "pid": 3, "tid": 7, "ts": 6302685397355.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380476.653, "dur": 5.660, + "args": { + "External id": 128766, "cbid": 211, "correlation": 241681292 + } + }, + { + "ph": "s", "id": 241681292, "pid": 5717, "tid": 6759, "ts": 6302685380476.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685397504.684, "dur": 108.065, + "args": { + "External id": 128767, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681302, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681302, "pid": 3, "tid": 7, "ts": 6302685397504.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380491.753, "dur": 12.340, + "args": { + "External id": 128767, "cbid": 211, "correlation": 241681302 + } + }, + { + "ph": "s", "id": 241681302, "pid": 5717, "tid": 6759, "ts": 6302685380491.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685397613.389, "dur": 78.048, + "args": { + "External id": 128768, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681316, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681316, "pid": 3, "tid": 7, "ts": 6302685397613.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380518.793, "dur": 4.670, + "args": { + "External id": 128768, "cbid": 211, "correlation": 241681316 + } + }, + { + "ph": "s", "id": 241681316, "pid": 5717, "tid": 6759, "ts": 6302685380518.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397692.109, "dur": 1.376, + "args": { + "External id": 128771, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241681330, "pid": 3, "tid": 7, "ts": 6302685397692.109, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380549.933, "dur": 6.040, + "args": { + "External id": 128771, "cbid": 211, "correlation": 241681330 + } + }, + { + "ph": "s", "id": 241681330, "pid": 5717, "tid": 6759, "ts": 6302685380549.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397694.093, "dur": 0.992, + "args": { + "External id": 128775, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241681340, "pid": 3, "tid": 7, "ts": 6302685397694.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380568.523, "dur": 4.740, + "args": { + "External id": 128775, "cbid": 211, "correlation": 241681340 + } + }, + { + "ph": "s", "id": 241681340, "pid": 5717, "tid": 6759, "ts": 6302685380568.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397695.661, "dur": 0.992, + "args": { + "External id": 128776, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681350, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241681350, "pid": 3, "tid": 7, "ts": 6302685397695.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380580.423, "dur": 3.810, + "args": { + "External id": 128776, "cbid": 211, "correlation": 241681350 + } + }, + { + "ph": "s", "id": 241681350, "pid": 5717, "tid": 6759, "ts": 6302685380580.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 3, "tid": 7, + "ts": 6302685397697.325, "dur": 27.905, + "args": { + "External id": 128784, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681368, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681368, "pid": 3, "tid": 7, "ts": 6302685397697.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380626.493, "dur": 7.390, + "args": { + "External id": 128784, "cbid": 211, "correlation": 241681368 + } + }, + { + "ph": "s", "id": 241681368, "pid": 5717, "tid": 6759, "ts": 6302685380626.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685397725.838, "dur": 113.633, + "args": { + "External id": 128790, "device": 3, "context": 1, "stream": 7, "correlation": 241681382, "bytes": 50331648, "memory bandwidth (GB/s)": 442.93161317575 + } + }, + { + "ph": "f", "id": 241681382, "pid": 3, "tid": 7, "ts": 6302685397725.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685380660.353, "dur": 15.280, + "args": { + "External id": 128790, "cbid": 41, "correlation": 241681382 + } + }, + { + "ph": "s", "id": 241681382, "pid": 5717, "tid": 6759, "ts": 6302685380660.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397840.079, "dur": 69.504, + "args": { + "External id": 128792, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681394, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681394, "pid": 3, "tid": 7, "ts": 6302685397840.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380690.383, "dur": 5.380, + "args": { + "External id": 128792, "cbid": 211, "correlation": 241681394 + } + }, + { + "ph": "s", "id": 241681394, "pid": 5717, "tid": 6759, "ts": 6302685380690.383, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685397910.191, "dur": 144.321, + "args": { + "External id": 128793, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681404, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681404, "pid": 3, "tid": 7, "ts": 6302685397910.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380703.343, "dur": 4.010, + "args": { + "External id": 128793, "cbid": 211, "correlation": 241681404 + } + }, + { + "ph": "s", "id": 241681404, "pid": 5717, "tid": 6759, "ts": 6302685380703.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685398055.216, "dur": 141.857, + "args": { + "External id": 128794, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681411, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681411, "pid": 3, "tid": 7, "ts": 6302685398055.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380721.523, "dur": 5.260, + "args": { + "External id": 128794, "cbid": 211, "correlation": 241681411 + } + }, + { + "ph": "s", "id": 241681411, "pid": 5717, "tid": 6759, "ts": 6302685380721.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685398197.745, "dur": 47.712, + "args": { + "External id": 128800, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681430, "pid": 3, "tid": 7, "ts": 6302685398197.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380770.063, "dur": 9.900, + "args": { + "External id": 128800, "cbid": 211, "correlation": 241681430 + } + }, + { + "ph": "s", "id": 241681430, "pid": 5717, "tid": 6759, "ts": 6302685380770.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685398246.033, "dur": 40.289, + "args": { + "External id": 128801, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681438, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681438, "pid": 3, "tid": 7, "ts": 6302685398246.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685380798.732, "dur": 7.720, + "args": { + "External id": 128801, "cbid": 211, "correlation": 241681438 + } + }, + { + "ph": "s", "id": 241681438, "pid": 5717, "tid": 6759, "ts": 6302685380798.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685381129.200, "dur": 40.065, + "args": { + "External id": 128817, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241681461, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241681461, "pid": 3, "tid": 17, "ts": 6302685381129.200, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381115.142, "dur": 14.390, + "args": { + "External id": 128817, "cbid": 211, "correlation": 241681461 + } + }, + { + "ph": "s", "id": 241681461, "pid": 5717, "tid": 6759, "ts": 6302685381115.142, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685381281.234, "dur": 11.712, + "args": { + "External id": 128833, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241681474, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241681474, "pid": 3, "tid": 17, "ts": 6302685381281.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381265.651, "dur": 10.960, + "args": { + "External id": 128833, "cbid": 211, "correlation": 241681474 + } + }, + { + "ph": "s", "id": 241681474, "pid": 5717, "tid": 6759, "ts": 6302685381265.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381314.471, "dur": 1.850, + "args": { + "External id": 128802, "cbid": 135, "correlation": 241681484 + } + }, + { + "ph": "f", "id": 241681484, "pid": 5717, "tid": 6759, "ts": 6302685381314.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685381319.321, "dur": 2.320, + "args": { + "External id": 128802, "cbid": 147, "correlation": 241681488 + } + }, + { + "ph": "s", "id": 241681488, "pid": 5717, "tid": 6759, "ts": 6302685381319.321, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685381422.341, "dur": 1.820, + "args": { + "External id": 128835, "cbid": 317, "correlation": 241681501 + } + }, + { + "ph": "f", "id": 241681501, "pid": 5717, "tid": 6759, "ts": 6302685381422.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381428.131, "dur": 1.590, + "args": { + "External id": 128835, "cbid": 135, "correlation": 241681503 + } + }, + { + "ph": "f", "id": 241681503, "pid": 5717, "tid": 6759, "ts": 6302685381428.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685381431.551, "dur": 1.310, + "args": { + "External id": 128835, "cbid": 147, "correlation": 241681507 + } + }, + { + "ph": "s", "id": 241681507, "pid": 5717, "tid": 6759, "ts": 6302685381431.551, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685381465.551, "dur": 1.470, + "args": { + "External id": 128835, "cbid": 409, "correlation": 241681510 + } + }, + { + "ph": "f", "id": 241681510, "pid": 5717, "tid": 6759, "ts": 6302685381465.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381478.791, "dur": 2.520, + "args": { + "External id": 128835, "cbid": 135, "correlation": 241681513 + } + }, + { + "ph": "f", "id": 241681513, "pid": 5717, "tid": 6759, "ts": 6302685381478.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685381481.491, "dur": 1.110, + "args": { + "External id": 128835, "cbid": 147, "correlation": 241681514 + } + }, + { + "ph": "s", "id": 241681514, "pid": 5717, "tid": 6759, "ts": 6302685381481.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685381617.588, "dur": 6992.853, + "args": { + "External id": 128835, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241681516, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241681516, "pid": 3, "tid": 20, "ts": 6302685381617.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685381486.351, "dur": 17.200, + "args": { + "External id": 128835, "cbid": 430, "correlation": 241681516 + } + }, + { + "ph": "s", "id": 241681516, "pid": 5717, "tid": 6759, "ts": 6302685381486.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381506.291, "dur": 0.500, + "args": { + "External id": 128835, "cbid": 135, "correlation": 241681518 + } + }, + { + "ph": "f", "id": 241681518, "pid": 5717, "tid": 6759, "ts": 6302685381506.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685381506.921, "dur": 0.600, + "args": { + "External id": 128835, "cbid": 147, "correlation": 241681519 + } + }, + { + "ph": "s", "id": 241681519, "pid": 5717, "tid": 6759, "ts": 6302685381506.921, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381510.911, "dur": 0.800, + "args": { + "External id": 128835, "cbid": 135, "correlation": 241681522 + } + }, + { + "ph": "f", "id": 241681522, "pid": 5717, "tid": 6759, "ts": 6302685381510.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381527.451, "dur": 0.440, + "args": { + "External 
id": 128835, "cbid": 135, "correlation": 241681529 + } + }, + { + "ph": "f", "id": 241681529, "pid": 5717, "tid": 6759, "ts": 6302685381527.451, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685381571.351, "dur": 1.210, + "args": { + "External id": 128837, "cbid": 147, "correlation": 241681534 + } + }, + { + "ph": "s", "id": 241681534, "pid": 5717, "tid": 6759, "ts": 6302685381571.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685381593.331, "dur": 0.930, + "args": { + "External id": 128802, "cbid": 135, "correlation": 241681549 + } + }, + { + "ph": "f", "id": 241681549, "pid": 5717, "tid": 6759, "ts": 6302685381593.331, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685398286.962, "dur": 315.618, + "args": { + "External id": 128839, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681574, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681574, "pid": 3, "tid": 7, "ts": 6302685398286.962, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381800.610, "dur": 11.220, + "args": { + "External id": 128839, "cbid": 211, "correlation": 241681574 + } + }, + { + "ph": "s", "id": 241681574, "pid": 5717, "tid": 6759, "ts": 6302685381800.610, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685398603.252, "dur": 432.324, + "args": { + "External id": 128840, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681597, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241681597, "pid": 3, "tid": 7, "ts": 6302685398603.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381868.250, "dur": 7.430, + "args": { + "External id": 128840, "cbid": 307, "correlation": 241681597 + } + }, + { + "ph": "s", "id": 241681597, "pid": 5717, "tid": 6759, "ts": 6302685381868.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685381914.370, "dur": 0.500, + "args": { + "External id": 128841, "cbid": 200, "correlation": 241681620 + } + }, + { + "ph": "f", "id": 241681620, "pid": 5717, "tid": 6759, "ts": 6302685381914.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685399036.343, "dur": 0.768, + "args": { + "External id": 128841, "device": 3, "context": 1, "stream": 7, 
"correlation": 241681623, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241681623, "pid": 3, "tid": 7, "ts": 6302685399036.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685381916.530, "dur": 6.540, + "args": { + "External id": 128841, "cbid": 51, "correlation": 241681623 + } + }, + { + "ph": "s", "id": 241681623, "pid": 5717, "tid": 6759, "ts": 6302685381916.530, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685399038.263, "dur": 354.787, + "args": { + "External id": 128841, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681624, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681624, "pid": 3, "tid": 7, "ts": 6302685399038.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381923.280, "dur": 6.060, + "args": { + "External id": 128841, "cbid": 307, "correlation": 241681624 + } + }, + { + "ph": "s", "id": 241681624, "pid": 5717, "tid": 6759, "ts": 6302685381923.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685381957.140, "dur": 0.280, + "args": { + "External id": 128842, "cbid": 200, "correlation": 241681649 + } + }, + { + "ph": "f", "id": 241681649, "pid": 5717, "tid": 6759, "ts": 6302685381957.140, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685399393.914, "dur": 0.768, + "args": { + "External id": 128842, "device": 3, "context": 1, "stream": 7, "correlation": 241681652, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 241681652, "pid": 3, "tid": 7, "ts": 6302685399393.914, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685381958.470, "dur": 4.660, + "args": { + "External id": 128842, "cbid": 51, "correlation": 241681652 + } + }, + { + "ph": "s", "id": 241681652, "pid": 5717, "tid": 6759, "ts": 6302685381958.470, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685399395.802, "dur": 347.843, + "args": { + "External id": 128842, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681653, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681653, "pid": 3, "tid": 7, "ts": 6302685399395.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381963.300, "dur": 5.010, + "args": { + "External id": 128842, "cbid": 307, "correlation": 241681653 + } + }, + { + "ph": "s", "id": 241681653, "pid": 5717, "tid": 6759, "ts": 6302685381963.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685381991.780, "dur": 0.280, + "args": { + "External id": 128843, "cbid": 200, "correlation": 241681678 + } + }, + { + "ph": "f", "id": 241681678, "pid": 5717, "tid": 6759, "ts": 6302685381991.780, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685399744.221, "dur": 356.642, + "args": { + "External id": 128843, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681681, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681681, "pid": 3, "tid": 7, "ts": 6302685399744.221, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685381993.260, "dur": 5.170, + "args": { + "External id": 128843, "cbid": 307, "correlation": 241681681 + } + }, + { + "ph": "s", "id": 241681681, "pid": 5717, "tid": 6759, "ts": 6302685381993.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382021.560, "dur": 0.250, + "args": { + "External id": 128844, "cbid": 200, "correlation": 241681706 + } + }, + { + "ph": "f", "id": 241681706, "pid": 5717, "tid": 6759, "ts": 6302685382021.560, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685400101.727, "dur": 0.800, + "args": { + "External id": 128844, "device": 3, "context": 1, "stream": 7, "correlation": 241681709, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241681709, "pid": 3, "tid": 7, "ts": 6302685400101.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685382022.880, "dur": 4.800, + "args": { + "External id": 128844, "cbid": 51, "correlation": 241681709 + } + }, + { + "ph": "s", "id": 241681709, "pid": 5717, "tid": 6759, "ts": 6302685382022.880, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685400103.647, "dur": 353.539, + "args": { + "External id": 128844, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681710, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681710, "pid": 3, "tid": 7, "ts": 6302685400103.647, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382027.840, "dur": 4.950, + "args": { + "External id": 128844, "cbid": 307, "correlation": 241681710 + } + }, + { + "ph": "s", "id": 241681710, "pid": 5717, "tid": 6759, "ts": 6302685382027.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382056.000, "dur": 0.320, + "args": { + "External id": 128845, "cbid": 200, "correlation": 241681735 + } + }, + { + "ph": "f", "id": 241681735, "pid": 5717, "tid": 6759, "ts": 6302685382056.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685400457.858, "dur": 353.827, + "args": { + "External id": 128845, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681738, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681738, "pid": 3, "tid": 7, "ts": 6302685400457.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382057.420, "dur": 5.000, + "args": { + "External id": 128845, "cbid": 307, "correlation": 241681738 + } + }, + { + "ph": "s", "id": 241681738, "pid": 5717, "tid": 6759, "ts": 6302685382057.420, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685400812.389, "dur": 78.017, + "args": { + "External id": 128846, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681751, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681751, "pid": 3, "tid": 7, "ts": 6302685400812.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382105.020, "dur": 7.140, + "args": { + "External id": 128846, "cbid": 307, "correlation": 241681751 + } + }, + { + "ph": "s", "id": 241681751, "pid": 5717, "tid": 6759, "ts": 6302685382105.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685400891.046, "dur": 1.920, + "args": { + "External id": 128847, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681759, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241681759, "pid": 3, "tid": 7, "ts": 6302685400891.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382140.800, "dur": 6.749, + "args": { + "External id": 128847, "cbid": 307, "correlation": 241681759 + } + }, + { + "ph": "s", "id": 241681759, "pid": 5717, "tid": 6759, "ts": 6302685382140.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685400893.574, "dur": 112.160, + "args": { + "External id": 128848, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681767, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681767, "pid": 3, "tid": 7, "ts": 6302685400893.574, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382180.709, "dur": 7.100, + "args": { + "External id": 128848, "cbid": 307, "correlation": 241681767 + } + }, + { + "ph": "s", "id": 241681767, "pid": 5717, "tid": 6759, "ts": 6302685382180.709, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382448.769, "dur": 0.510, + "args": { + "External id": 128867, "cbid": 200, "correlation": 241681813 + } + }, + { + "ph": "f", "id": 241681813, "pid": 5717, "tid": 6759, "ts": 6302685382448.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685401006.534, "dur": 0.736, + "args": { + "External id": 128867, "device": 3, "context": 1, "stream": 7, 
"correlation": 241681816, "bytes": 576, "memory bandwidth (GB/s)": 0.782608695652174 + } + }, + { + "ph": "f", "id": 241681816, "pid": 3, "tid": 7, "ts": 6302685401006.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685382450.969, "dur": 7.610, + "args": { + "External id": 128867, "cbid": 51, "correlation": 241681816 + } + }, + { + "ph": "s", "id": 241681816, "pid": 5717, "tid": 6759, "ts": 6302685382450.969, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685401008.454, "dur": 135.425, + "args": { + "External id": 128867, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681817, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681817, "pid": 3, "tid": 7, "ts": 6302685401008.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382458.809, "dur": 8.550, + "args": { + "External id": 128867, "cbid": 307, "correlation": 241681817 + } + }, + { + "ph": "s", "id": 241681817, "pid": 5717, "tid": 6759, "ts": 6302685382458.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685401144.551, "dur": 118.689, + "args": { + "External id": 128868, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681839, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681839, "pid": 3, "tid": 7, "ts": 6302685401144.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382494.659, "dur": 5.530, + "args": { + "External id": 128868, "cbid": 211, "correlation": 241681839 + } + }, + { + "ph": "s", "id": 241681839, "pid": 5717, "tid": 6759, "ts": 6302685382494.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382580.348, "dur": 0.411, + "args": { + "External id": 128869, "cbid": 200, "correlation": 241681857 + } + }, + { + "ph": "f", "id": 241681857, "pid": 5717, "tid": 6759, "ts": 6302685382580.348, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382580.868, "dur": 0.200, + "args": { + "External id": 128869, "cbid": 200, "correlation": 241681858 + } + }, + { + "ph": "f", "id": 241681858, "pid": 5717, "tid": 6759, "ts": 6302685382580.868, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382599.718, "dur": 0.220, + "args": { + "External id": 128869, "cbid": 200, "correlation": 241681876 + } + }, + { + "ph": "f", "id": 241681876, "pid": 5717, "tid": 6759, "ts": 6302685382599.718, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685401263.848, "dur": 92.257, + "args": { + "External id": 128869, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681877, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681877, "pid": 3, "tid": 7, "ts": 6302685401263.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382601.138, "dur": 9.280, + "args": { + "External id": 128869, "cbid": 211, "correlation": 241681877 + } + }, + { + "ph": "s", "id": 241681877, "pid": 5717, "tid": 6759, "ts": 6302685382601.138, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685382611.168, "dur": 0.970, + "args": { + "External id": 128869, "cbid": 273, "correlation": 241681879 + } + }, + { + "ph": "f", "id": 241681879, "pid": 5717, "tid": 6759, "ts": 6302685382611.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685401356.745, "dur": 964.135, + "args": { + "External id": 128869, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681880, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241681880, "pid": 3, "tid": 7, "ts": 6302685401356.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382612.458, "dur": 4.140, + "args": { + "External id": 128869, "cbid": 211, "correlation": 241681880 + } + }, + { + "ph": "s", "id": 241681880, "pid": 5717, "tid": 6759, "ts": 6302685382612.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685402321.488, "dur": 71.265, + "args": { + "External id": 128869, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681882, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241681882, "pid": 3, "tid": 7, "ts": 6302685402321.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382617.148, "dur": 3.740, + "args": { + "External id": 128869, "cbid": 211, "correlation": 241681882 + } + }, + { + "ph": "s", "id": 241681882, "pid": 5717, "tid": 6759, "ts": 6302685382617.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685402393.361, "dur": 47.168, + "args": { + "External id": 128880, "device": 3, "context": 1, "stream": 7, "correlation": 241681904, "bytes": 25165824, "memory bandwidth (GB/s)": 533.5359565807327 + } + }, + { + "ph": "f", "id": 241681904, "pid": 3, "tid": 7, "ts": 6302685402393.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685382751.678, "dur": 17.570, + "args": { + "External id": 128880, "cbid": 41, "correlation": 241681904 + } + }, + { + "ph": "s", "id": 241681904, "pid": 5717, "tid": 6759, "ts": 6302685382751.678, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685402441.201, "dur": 32.160, + "args": { + "External id": 128877, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681922, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681922, "pid": 3, "tid": 7, "ts": 6302685402441.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685382864.718, "dur": 8.300, + "args": { + "External id": 128877, "cbid": 307, "correlation": 241681922 + } + }, + { + "ph": "s", "id": 241681922, "pid": 5717, "tid": 6759, "ts": 6302685382864.718, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685402474.001, "dur": 38.913, + "args": { + "External id": 128887, "device": 3, "context": 1, "stream": 7, "correlation": 241681937, "bytes": 25165824, "memory bandwidth (GB/s)": 646.7202220337676 + } + }, + { + "ph": "f", "id": 241681937, "pid": 3, "tid": 7, "ts": 6302685402474.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685382930.268, "dur": 13.550, + "args": { + "External id": 128887, "cbid": 41, "correlation": 241681937 + } + }, + { + "ph": "s", "id": 241681937, "pid": 5717, "tid": 6759, "ts": 6302685382930.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685402513.618, "dur": 26.624, + "args": { + "External id": 128884, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681955, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241681955, "pid": 3, "tid": 7, "ts": 6302685402513.618, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383029.447, "dur": 8.000, + "args": { + "External id": 128884, "cbid": 307, "correlation": 241681955 + } + }, + { + "ph": "s", "id": 241681955, "pid": 5717, "tid": 6759, "ts": 6302685383029.447, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685383172.447, "dur": 0.520, + "args": { + "External id": 128892, "cbid": 200, "correlation": 241681985 + } + }, + { + "ph": "f", "id": 241681985, "pid": 5717, "tid": 6759, "ts": 6302685383172.447, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685402541.106, "dur": 0.736, + "args": { + "External id": 128892, "device": 3, "context": 1, "stream": 7, "correlation": 241681988, "bytes": 576, "memory bandwidth (GB/s)": 0.782608695652174 + } + }, + { + "ph": "f", "id": 241681988, "pid": 3, "tid": 7, "ts": 6302685402541.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685383174.587, "dur": 7.150, + "args": { + "External id": 128892, "cbid": 51, "correlation": 241681988 + } + }, + { + "ph": "s", "id": 241681988, "pid": 5717, "tid": 6759, "ts": 6302685383174.587, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685402542.994, "dur": 138.017, + "args": { + "External id": 128892, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241681989, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241681989, "pid": 3, "tid": 7, "ts": 6302685402542.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383181.957, "dur": 7.600, + "args": { + "External id": 128892, "cbid": 307, "correlation": 241681989 + } + }, + { + "ph": "s", "id": 241681989, "pid": 5717, "tid": 6759, "ts": 6302685383181.957, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685383222.097, "dur": 0.300, + "args": { + "External id": 128893, "cbid": 200, "correlation": 241682014 + } + }, + { + "ph": "f", "id": 241682014, "pid": 5717, "tid": 6759, "ts": 6302685383222.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685402681.811, "dur": 0.768, + "args": { + "External id": 128893, "device": 3, "context": 1, "stream": 7, "correlation": 241682017, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241682017, "pid": 3, "tid": 7, "ts": 6302685402681.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685383224.877, "dur": 8.480, + "args": { + "External id": 128893, "cbid": 51, "correlation": 241682017 + } + }, + { + "ph": "s", "id": 241682017, "pid": 5717, "tid": 
6759, "ts": 6302685383224.877, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685402683.699, "dur": 137.313, + "args": { + "External id": 128893, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682018, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682018, "pid": 3, "tid": 7, "ts": 6302685402683.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383234.607, "dur": 8.090, + "args": { + "External id": 128893, "cbid": 307, "correlation": 241682018 + } + }, + { + "ph": "s", "id": 241682018, "pid": 5717, "tid": 6759, "ts": 6302685383234.607, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685383270.017, "dur": 0.320, + "args": { + "External id": 128894, "cbid": 200, "correlation": 241682043 + } + }, + { + "ph": "f", "id": 241682043, "pid": 5717, "tid": 6759, "ts": 6302685383270.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685402821.908, "dur": 0.768, + "args": { + "External id": 128894, "device": 3, "context": 1, "stream": 7, "correlation": 241682046, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241682046, "pid": 3, "tid": 7, "ts": 6302685402821.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685383271.397, "dur": 7.330, + "args": { + "External id": 128894, "cbid": 51, 
"correlation": 241682046 + } + }, + { + "ph": "s", "id": 241682046, "pid": 5717, "tid": 6759, "ts": 6302685383271.397, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685402823.828, "dur": 134.497, + "args": { + "External id": 128894, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682047, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682047, "pid": 3, "tid": 7, "ts": 6302685402823.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383278.897, "dur": 8.420, + "args": { + "External id": 128894, "cbid": 307, "correlation": 241682047 + } + }, + { + "ph": "s", "id": 241682047, "pid": 5717, "tid": 6759, "ts": 6302685383278.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685402959.061, "dur": 118.753, + "args": { + "External id": 128895, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682069, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682069, "pid": 3, "tid": 7, "ts": 6302685402959.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383329.327, "dur": 11.000, + "args": { + "External id": 128895, "cbid": 211, "correlation": 241682069 + } + }, + { + "ph": "s", "id": 241682069, "pid": 5717, "tid": 6759, "ts": 6302685383329.327, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685403078.518, "dur": 118.849, + "args": { + "External id": 128896, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682092, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682092, "pid": 3, "tid": 7, "ts": 6302685403078.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383366.977, "dur": 5.260, + "args": { + "External id": 128896, "cbid": 211, "correlation": 241682092 + } + }, + { + "ph": "s", "id": 241682092, "pid": 5717, "tid": 6759, "ts": 6302685383366.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685403197.975, "dur": 121.377, + "args": { + "External id": 128897, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682115, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682115, "pid": 3, "tid": 7, "ts": 6302685403197.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383390.637, "dur": 4.740, + "args": { + "External id": 128897, "cbid": 211, "correlation": 241682115 + } + }, + { + "ph": "s", "id": 241682115, "pid": 5717, "tid": 6759, "ts": 6302685383390.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685403320.024, "dur": 80.544, + "args": { + "External id": 128898, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682123, "pid": 3, "tid": 7, "ts": 6302685403320.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383435.037, "dur": 6.649, + "args": { + "External id": 128898, "cbid": 307, "correlation": 241682123 + } + }, + { + "ph": "s", "id": 241682123, "pid": 5717, "tid": 6759, "ts": 6302685383435.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685403401.304, "dur": 42.465, + "args": { + "External id": 128913, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682152, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682152, "pid": 3, "tid": 7, "ts": 6302685403401.304, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383667.846, "dur": 16.410, + "args": { + "External id": 128913, "cbid": 307, "correlation": 241682152 + } + }, + { + "ph": "s", "id": 241682152, "pid": 5717, "tid": 6759, "ts": 6302685383667.846, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685403444.377, "dur": 1.920, + "args": { + "External id": 128914, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682160, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241682160, "pid": 3, "tid": 7, "ts": 6302685403444.377, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383729.776, "dur": 12.130, + "args": { + "External id": 128914, "cbid": 307, "correlation": 241682160 + } + }, + { + "ph": "s", "id": 241682160, "pid": 5717, "tid": 6759, "ts": 6302685383729.776, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685403446.937, "dur": 47.840, + "args": { + "External id": 128915, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682171, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682171, "pid": 3, "tid": 7, "ts": 6302685403446.937, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383773.876, "dur": 7.520, + "args": { + "External id": 128915, "cbid": 307, "correlation": 241682171 + } + }, + { + "ph": "s", "id": 241682171, "pid": 5717, "tid": 6759, "ts": 6302685383773.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685403495.385, "dur": 45.696, + "args": { + "External id": 128916, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682176, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682176, "pid": 3, "tid": 7, "ts": 6302685403495.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685383817.436, "dur": 7.000, + "args": { + "External id": 128916, "cbid": 211, "correlation": 241682176 + } + }, + { + "ph": "s", "id": 241682176, "pid": 5717, "tid": 6759, "ts": 6302685383817.436, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685403545.625, "dur": 2.368, + "args": { + "External id": 128934, "device": 3, "context": 1, "stream": 7, "correlation": 241682207, "bytes": 28112, "memory bandwidth (GB/s)": 11.871621621621621 + } + }, + { + "ph": "f", "id": 241682207, "pid": 3, "tid": 7, "ts": 6302685403545.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685384156.095, "dur": 12.140, + "args": { + "External id": 128934, "cbid": 41, "correlation": 241682207 + } + }, + { + "ph": "s", "id": 241682207, "pid": 5717, "tid": 6759, "ts": 6302685384156.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384173.775, "dur": 2.060, + "args": { + "External id": 128929, "cbid": 135, "correlation": 241682211 + } + }, + { + "ph": "f", "id": 241682211, "pid": 5717, "tid": 6759, "ts": 6302685384173.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685403549.881, "dur": 34.721, + "args": { + "External id": 128929, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682215, 
"registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682215, "pid": 3, "tid": 7, "ts": 6302685403549.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685384182.565, "dur": 12.030, + "args": { + "External id": 128929, "cbid": 211, "correlation": 241682215 + } + }, + { + "ph": "s", "id": 241682215, "pid": 5717, "tid": 6759, "ts": 6302685384182.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384261.815, "dur": 1.090, + "args": { + "External id": 128922, "cbid": 135, "correlation": 241682226 + } + }, + { + "ph": "f", "id": 241682226, "pid": 5717, "tid": 6759, "ts": 6302685384261.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685384267.975, "dur": 1.649, + "args": { + "External id": 128922, "cbid": 147, "correlation": 241682230 + } + }, + { + "ph": "s", "id": 241682230, "pid": 5717, "tid": 6759, "ts": 6302685384267.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685384395.384, "dur": 1.400, + "args": { + "External id": 128938, "cbid": 317, "correlation": 241682250 + } + }, + { + "ph": "f", "id": 241682250, "pid": 5717, "tid": 6759, "ts": 6302685384395.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384401.694, "dur": 2.930, + "args": { + "External id": 128938, "cbid": 135, "correlation": 241682252 + } + }, + { + "ph": "f", "id": 241682252, "pid": 5717, "tid": 6759, "ts": 
6302685384401.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685384406.104, "dur": 2.290, + "args": { + "External id": 128938, "cbid": 147, "correlation": 241682256 + } + }, + { + "ph": "s", "id": 241682256, "pid": 5717, "tid": 6759, "ts": 6302685384406.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685384437.534, "dur": 1.270, + "args": { + "External id": 128938, "cbid": 409, "correlation": 241682259 + } + }, + { + "ph": "f", "id": 241682259, "pid": 5717, "tid": 6759, "ts": 6302685384437.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384445.494, "dur": 1.130, + "args": { + "External id": 128938, "cbid": 135, "correlation": 241682262 + } + }, + { + "ph": "f", "id": 241682262, "pid": 5717, "tid": 6759, "ts": 6302685384445.494, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685384446.804, "dur": 0.930, + "args": { + "External id": 128938, "cbid": 147, "correlation": 241682263 + } + }, + { + "ph": "s", "id": 241682263, "pid": 5717, "tid": 6759, "ts": 6302685384446.804, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685403586.234, "dur": 13650.407, + "args": { + "External id": 128938, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241682265, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241682265, "pid": 3, "tid": 20, "ts": 6302685403586.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685384449.944, "dur": 13.260, + "args": { + "External id": 128938, "cbid": 430, "correlation": 241682265 + } + }, + { + "ph": "s", "id": 241682265, "pid": 5717, "tid": 6759, "ts": 6302685384449.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384464.514, "dur": 0.440, + "args": { + "External id": 128938, "cbid": 135, "correlation": 241682267 + } + }, + { + "ph": "f", "id": 241682267, "pid": 5717, "tid": 6759, "ts": 6302685384464.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685384465.074, "dur": 0.520, + "args": { + "External id": 128938, "cbid": 147, "correlation": 241682268 + } + }, + { + "ph": "s", "id": 241682268, "pid": 5717, "tid": 6759, "ts": 6302685384465.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384467.264, "dur": 0.780, + "args": { + "External id": 128938, "cbid": 135, "correlation": 241682271 + } + }, + { + "ph": "f", "id": 241682271, "pid": 5717, "tid": 6759, "ts": 6302685384467.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384477.274, "dur": 0.420, + "args": { + "External 
id": 128938, "cbid": 135, "correlation": 241682278 + } + }, + { + "ph": "f", "id": 241682278, "pid": 5717, "tid": 6759, "ts": 6302685384477.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685384507.484, "dur": 1.380, + "args": { + "External id": 128940, "cbid": 147, "correlation": 241682283 + } + }, + { + "ph": "s", "id": 241682283, "pid": 5717, "tid": 6759, "ts": 6302685384507.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384526.684, "dur": 0.720, + "args": { + "External id": 128922, "cbid": 135, "correlation": 241682298 + } + }, + { + "ph": "f", "id": 241682298, "pid": 5717, "tid": 6759, "ts": 6302685384526.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685384742.834, "dur": 1.240, + "args": { + "External id": 128922, "cbid": 135, "correlation": 241682311 + } + }, + { + "ph": "f", "id": 241682311, "pid": 5717, "tid": 6759, "ts": 6302685384742.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685384861.383, "dur": 3.700, + "args": { + "External id": 128950, "cbid": 147, "correlation": 241682322 + } + }, + { + "ph": "s", "id": 241682322, "pid": 5717, "tid": 6759, "ts": 6302685384861.383, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685385006.683, "dur": 1.200, + "args": { + "External id": 128964, "cbid": 317, "correlation": 241682363 + } + }, + { + "ph": "f", "id": 241682363, "pid": 5717, "tid": 6759, "ts": 6302685385006.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685385023.313, "dur": 4.400, + "args": { + "External id": 128965, "cbid": 138, "correlation": 241682366 + } + }, + { + "ph": "f", "id": 241682366, "pid": 5717, "tid": 6759, "ts": 6302685385023.313, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685403587.738, "dur": 1.472, + "args": { + "External id": 128969, "device": 3, "context": 1, "stream": 7, "correlation": 241682377, "bytes": 7224, "memory bandwidth (GB/s)": 4.907608695652174 + } + }, + { + "ph": "f", "id": 241682377, "pid": 3, "tid": 7, "ts": 6302685403587.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685385055.993, "dur": 14.140, + "args": { + "External id": 128969, "cbid": 41, "correlation": 241682377 + } + }, + { + "ph": "s", "id": 241682377, "pid": 5717, "tid": 6759, "ts": 6302685385055.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385075.683, "dur": 1.920, + "args": { + "External id": 128964, "cbid": 135, "correlation": 241682381 + } + }, + { + "ph": "f", "id": 241682381, "pid": 5717, "tid": 6759, "ts": 6302685385075.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685403591.066, "dur": 262.274, + "args": { + "External id": 128964, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682385, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682385, "pid": 3, "tid": 7, "ts": 6302685403591.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685385080.563, "dur": 17.910, + "args": { + "External id": 128964, "cbid": 211, "correlation": 241682385 + } + }, + { + "ph": "s", "id": 241682385, "pid": 5717, "tid": 6759, "ts": 6302685385080.563, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385226.962, "dur": 2.620, + "args": { + "External id": 128950, "cbid": 135, "correlation": 241682396 + } + }, + { + "ph": "f", "id": 241682396, "pid": 5717, "tid": 6759, "ts": 6302685385226.962, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685385235.862, "dur": 2.420, + "args": { + "External id": 128950, "cbid": 147, "correlation": 241682400 + } + }, + { + "ph": "s", "id": 241682400, "pid": 5717, "tid": 6759, "ts": 6302685385235.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685385241.182, "dur": 0.980, + "args": { + "External id": 128950, "cbid": 147, "correlation": 241682404 + } + }, + { + "ph": "s", "id": 241682404, "pid": 5717, "tid": 6759, "ts": 6302685385241.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685404400.384, "dur": 406.691, + "args": { + "External id": 128983, "queued": 0, "device": 3, "context": 1, "stream": 
17, "correlation": 241682428, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241682428, "pid": 3, "tid": 17, "ts": 6302685404400.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685385417.632, "dur": 13.430, + "args": { + "External id": 128983, "cbid": 211, "correlation": 241682428 + } + }, + { + "ph": "s", "id": 241682428, "pid": 5717, "tid": 6759, "ts": 6302685385417.632, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685404808.067, "dur": 11.680, + "args": { + "External id": 128999, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241682441, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241682441, "pid": 3, "tid": 17, "ts": 6302685404808.067, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685385545.272, "dur": 10.520, + "args": { + "External id": 128999, "cbid": 211, "correlation": 241682441 + } + }, + { + "ph": "s", "id": 241682441, "pid": 5717, "tid": 6759, "ts": 6302685385545.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385580.662, "dur": 1.270, + "args": { + "External id": 128950, "cbid": 135, "correlation": 241682451 + } + }, + { + "ph": "f", "id": 241682451, "pid": 5717, "tid": 6759, "ts": 6302685385580.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685385583.982, "dur": 1.210, + "args": { + "External id": 128950, "cbid": 147, "correlation": 241682455 + } + }, + { + "ph": "s", "id": 241682455, "pid": 5717, "tid": 6759, "ts": 6302685385583.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685385640.572, "dur": 0.889, + "args": { + "External id": 129001, "cbid": 317, "correlation": 241682468 + } + }, + { + "ph": "f", "id": 241682468, "pid": 5717, "tid": 6759, "ts": 6302685385640.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385643.892, "dur": 1.269, + "args": { + "External id": 129001, "cbid": 135, "correlation": 241682470 + } + }, + { + "ph": "f", "id": 241682470, "pid": 5717, "tid": 6759, "ts": 6302685385643.892, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685385646.501, "dur": 1.080, + "args": { + "External id": 129001, "cbid": 147, "correlation": 241682474 + } + }, + { + "ph": "s", "id": 241682474, "pid": 5717, "tid": 6759, "ts": 6302685385646.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685385664.201, "dur": 1.120, + "args": { + "External id": 129001, "cbid": 409, "correlation": 241682477 + } + }, + { + "ph": "f", "id": 241682477, "pid": 5717, "tid": 6759, "ts": 6302685385664.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385670.181, "dur": 0.800, + "args": { + "External id": 129001, "cbid": 135, "correlation": 241682480 + } + }, + { + "ph": "f", "id": 241682480, "pid": 5717, "tid": 6759, "ts": 6302685385670.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685385671.161, "dur": 0.871, + "args": { + "External id": 129001, "cbid": 147, "correlation": 241682481 + } + }, + { + "ph": "s", "id": 241682481, "pid": 5717, "tid": 6759, "ts": 6302685385671.161, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685417237.697, "dur": 3731.227, + "args": { + "External id": 129001, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241682483, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241682483, "pid": 3, "tid": 20, "ts": 6302685417237.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685385673.632, "dur": 11.629, + "args": { + "External id": 129001, "cbid": 430, "correlation": 241682483 + } + }, + { + "ph": "s", "id": 241682483, "pid": 5717, "tid": 6759, "ts": 6302685385673.632, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385686.632, "dur": 0.369, + "args": { + "External id": 129001, "cbid": 135, "correlation": 241682485 + } + }, + { + "ph": "f", "id": 241682485, "pid": 5717, "tid": 6759, "ts": 6302685385686.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685385687.132, "dur": 0.489, + "args": { + "External id": 129001, "cbid": 147, "correlation": 241682486 + } + }, + { + "ph": "s", "id": 241682486, "pid": 5717, "tid": 6759, "ts": 6302685385687.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385689.161, "dur": 0.971, + "args": { + "External id": 129001, "cbid": 135, "correlation": 241682489 + } + }, + { + "ph": "f", "id": 241682489, "pid": 5717, "tid": 6759, "ts": 6302685385689.161, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385697.901, "dur": 0.410, + "args": { + "External 
id": 129001, "cbid": 135, "correlation": 241682496 + } + }, + { + "ph": "f", "id": 241682496, "pid": 5717, "tid": 6759, "ts": 6302685385697.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685385726.001, "dur": 0.960, + "args": { + "External id": 129003, "cbid": 147, "correlation": 241682501 + } + }, + { + "ph": "s", "id": 241682501, "pid": 5717, "tid": 6759, "ts": 6302685385726.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685385743.421, "dur": 0.860, + "args": { + "External id": 128950, "cbid": 135, "correlation": 241682516 + } + }, + { + "ph": "f", "id": 241682516, "pid": 5717, "tid": 6759, "ts": 6302685385743.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685403857.788, "dur": 1202.025, + "args": { + "External id": 129005, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682541, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682541, "pid": 3, "tid": 7, "ts": 6302685403857.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685385892.951, "dur": 11.050, + "args": { + "External id": 129005, "cbid": 211, "correlation": 241682541 + } + }, + { + "ph": "s", "id": 241682541, "pid": 5717, "tid": 6759, "ts": 6302685385892.951, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685405060.389, "dur": 429.443, + "args": { + "External id": 129006, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682564, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241682564, "pid": 3, "tid": 7, "ts": 6302685405060.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685385955.031, "dur": 7.280, + "args": { + "External id": 129006, "cbid": 307, "correlation": 241682564 + } + }, + { + "ph": "s", "id": 241682564, "pid": 5717, "tid": 6759, "ts": 6302685385955.031, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386006.341, "dur": 0.490, + "args": { + "External id": 129007, "cbid": 200, "correlation": 241682587 + } + }, + { + "ph": "f", "id": 241682587, "pid": 5717, "tid": 6759, "ts": 6302685386006.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685405491.048, "dur": 1.280, + "args": { + "External id": 129007, "device": 3, "context": 1, "stream": 7, 
"correlation": 241682590, "bytes": 1536, "memory bandwidth (GB/s)": 1.2 + } + }, + { + "ph": "f", "id": 241682590, "pid": 3, "tid": 7, "ts": 6302685405491.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685386008.511, "dur": 6.590, + "args": { + "External id": 129007, "cbid": 51, "correlation": 241682590 + } + }, + { + "ph": "s", "id": 241682590, "pid": 5717, "tid": 6759, "ts": 6302685386008.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685405493.544, "dur": 351.907, + "args": { + "External id": 129007, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682591, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682591, "pid": 3, "tid": 7, "ts": 6302685405493.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386015.341, "dur": 6.070, + "args": { + "External id": 129007, "cbid": 307, "correlation": 241682591 + } + }, + { + "ph": "s", "id": 241682591, "pid": 5717, "tid": 6759, "ts": 6302685386015.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386050.140, "dur": 0.300, + "args": { + "External id": 129008, "cbid": 200, "correlation": 241682616 + } + }, + { + "ph": "f", "id": 241682616, "pid": 5717, "tid": 6759, "ts": 6302685386050.140, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685405846.987, "dur": 1.216, + "args": { + "External id": 129008, "device": 3, "context": 1, "stream": 7, "correlation": 241682619, "bytes": 1536, "memory bandwidth (GB/s)": 1.263157894736842 + } + }, + { + "ph": "f", "id": 241682619, "pid": 3, "tid": 7, "ts": 6302685405846.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685386051.580, "dur": 4.700, + "args": { + "External id": 129008, "cbid": 51, "correlation": 241682619 + } + }, + { + "ph": "s", "id": 241682619, "pid": 5717, "tid": 6759, "ts": 6302685386051.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685405849.355, "dur": 353.698, + "args": { + "External id": 129008, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682620, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682620, "pid": 3, "tid": 7, "ts": 6302685405849.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386056.431, "dur": 5.160, + "args": { + "External id": 129008, "cbid": 307, "correlation": 241682620 + } + }, + { + "ph": "s", "id": 241682620, "pid": 5717, "tid": 6759, "ts": 6302685386056.431, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386085.231, "dur": 0.269, + "args": { + "External id": 129009, "cbid": 200, "correlation": 241682645 + } + }, + { + "ph": "f", "id": 241682645, "pid": 5717, "tid": 6759, "ts": 6302685386085.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685406203.757, "dur": 357.187, + "args": { + "External id": 129009, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682648, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682648, "pid": 3, "tid": 7, "ts": 6302685406203.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386086.711, "dur": 5.369, + "args": { + "External id": 129009, "cbid": 307, "correlation": 241682648 + } + }, + { + "ph": "s", "id": 241682648, "pid": 5717, "tid": 6759, "ts": 6302685386086.711, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386113.600, "dur": 0.251, + "args": { + "External id": 129010, "cbid": 200, "correlation": 241682673 + } + }, + { + "ph": "f", "id": 241682673, "pid": 5717, "tid": 6759, "ts": 6302685386113.600, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685406562.160, "dur": 1.184, + "args": { + "External id": 129010, "device": 3, "context": 1, "stream": 7, "correlation": 241682676, "bytes": 1536, "memory bandwidth (GB/s)": 1.2972972972972974 + } + }, + { + "ph": "f", "id": 241682676, "pid": 3, "tid": 7, "ts": 6302685406562.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685386114.820, "dur": 4.911, + "args": { + "External id": 129010, "cbid": 51, "correlation": 241682676 + } + }, + { + "ph": "s", "id": 241682676, "pid": 5717, "tid": 6759, "ts": 6302685386114.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685406565.008, "dur": 354.371, + "args": { + "External id": 129010, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682677, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682677, "pid": 3, "tid": 7, "ts": 6302685406565.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386119.880, "dur": 4.911, + "args": { + "External id": 129010, "cbid": 307, "correlation": 241682677 + } + }, + { + "ph": "s", "id": 241682677, "pid": 5717, "tid": 6759, "ts": 6302685386119.880, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386147.230, "dur": 0.270, + "args": { + "External id": 129011, "cbid": 200, "correlation": 241682702 + } + }, + { + "ph": "f", "id": 241682702, "pid": 5717, "tid": 6759, "ts": 6302685386147.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685406920.083, "dur": 355.906, + "args": { + "External id": 129011, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682705, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682705, "pid": 3, "tid": 7, "ts": 6302685406920.083, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386148.550, "dur": 5.110, + "args": { + "External id": 129011, "cbid": 307, "correlation": 241682705 + } + }, + { + "ph": "s", "id": 241682705, "pid": 5717, "tid": 6759, "ts": 6302685386148.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685407276.629, "dur": 84.897, + "args": { + "External id": 129012, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682718, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682718, "pid": 3, "tid": 7, "ts": 6302685407276.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386192.140, "dur": 6.700, + "args": { + "External id": 129012, "cbid": 307, "correlation": 241682718 + } + }, + { + "ph": "s", "id": 241682718, "pid": 5717, "tid": 6759, "ts": 6302685386192.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685407362.198, "dur": 3.360, + "args": { + "External id": 129013, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682726, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241682726, "pid": 3, "tid": 7, "ts": 6302685407362.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386224.930, "dur": 6.190, + "args": { + "External id": 129013, "cbid": 307, "correlation": 241682726 + } + }, + { + "ph": "s", "id": 241682726, "pid": 5717, "tid": 6759, "ts": 6302685386224.930, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685407366.230, "dur": 113.729, + "args": { + "External id": 129014, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682734, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682734, "pid": 3, "tid": 7, "ts": 6302685407366.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386259.400, "dur": 6.390, + "args": { + "External id": 129014, "cbid": 307, "correlation": 241682734 + } + }, + { + "ph": "s", "id": 241682734, "pid": 5717, "tid": 6759, "ts": 6302685386259.400, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386462.960, "dur": 0.510, + "args": { + "External id": 129033, "cbid": 200, "correlation": 241682780 + } + }, + { + "ph": "f", "id": 241682780, "pid": 5717, "tid": 6759, "ts": 6302685386462.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685407481.207, "dur": 1.088, + "args": { + "External id": 129033, "device": 3, "context": 1, "stream": 7, 
"correlation": 241682783, "bytes": 576, "memory bandwidth (GB/s)": 0.5294117647058824 + } + }, + { + "ph": "f", "id": 241682783, "pid": 3, "tid": 7, "ts": 6302685407481.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685386465.140, "dur": 7.230, + "args": { + "External id": 129033, "cbid": 51, "correlation": 241682783 + } + }, + { + "ph": "s", "id": 241682783, "pid": 5717, "tid": 6759, "ts": 6302685386465.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685407483.799, "dur": 142.177, + "args": { + "External id": 129033, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682784, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682784, "pid": 3, "tid": 7, "ts": 6302685407483.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386472.580, "dur": 8.370, + "args": { + "External id": 129033, "cbid": 307, "correlation": 241682784 + } + }, + { + "ph": "s", "id": 241682784, "pid": 5717, "tid": 6759, "ts": 6302685386472.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685407626.584, "dur": 139.489, + "args": { + "External id": 129034, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682806, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682806, "pid": 3, "tid": 7, "ts": 6302685407626.584, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386507.210, "dur": 5.809, + "args": { + "External id": 129034, "cbid": 211, "correlation": 241682806 + } + }, + { + "ph": "s", "id": 241682806, "pid": 5717, "tid": 6759, "ts": 6302685386507.210, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386608.629, "dur": 0.430, + "args": { + "External id": 129035, "cbid": 200, "correlation": 241682824 + } + }, + { + "ph": "f", "id": 241682824, "pid": 5717, "tid": 6759, "ts": 6302685386608.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386609.229, "dur": 0.200, + "args": { + "External id": 129035, "cbid": 200, "correlation": 241682825 + } + }, + { + "ph": "f", "id": 241682825, "pid": 5717, "tid": 6759, "ts": 6302685386609.229, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386641.139, "dur": 0.220, + "args": { + "External id": 129035, "cbid": 200, "correlation": 241682843 + } + }, + { + "ph": "f", "id": 241682843, "pid": 5717, "tid": 6759, "ts": 6302685386641.139, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685407766.745, "dur": 92.097, + "args": { + "External id": 129035, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682844, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682844, "pid": 3, "tid": 7, "ts": 6302685407766.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386643.619, "dur": 14.370, + "args": { + "External id": 129035, "cbid": 211, "correlation": 241682844 + } + }, + { + "ph": "s", "id": 241682844, "pid": 5717, "tid": 6759, "ts": 6302685386643.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685386658.749, "dur": 0.940, + "args": { + "External id": 129035, "cbid": 273, "correlation": 241682846 + } + }, + { + "ph": "f", "id": 241682846, "pid": 5717, "tid": 6759, "ts": 6302685386658.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685407859.546, "dur": 993.639, + "args": { + "External id": 129035, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682847, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241682847, "pid": 3, "tid": 7, "ts": 6302685407859.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386660.019, "dur": 4.460, + "args": { + "External id": 129035, "cbid": 211, "correlation": 241682847 + } + }, + { + "ph": "s", "id": 241682847, "pid": 5717, "tid": 6759, "ts": 6302685386660.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685408853.793, "dur": 75.137, + "args": { + "External id": 129035, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682849, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241682849, "pid": 3, "tid": 7, "ts": 6302685408853.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386665.009, "dur": 3.820, + "args": { + "External id": 129035, "cbid": 211, "correlation": 241682849 + } + }, + { + "ph": "s", "id": 241682849, "pid": 5717, "tid": 6759, "ts": 6302685386665.009, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685408929.506, "dur": 48.192, + "args": { + "External id": 129046, "device": 3, "context": 1, "stream": 7, "correlation": 241682871, "bytes": 25165824, "memory bandwidth (GB/s)": 522.199203187251 + } + }, + { + "ph": "f", "id": 241682871, "pid": 3, "tid": 7, "ts": 6302685408929.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685386814.179, "dur": 27.400, + "args": { + "External id": 129046, "cbid": 41, "correlation": 241682871 + } + }, + { + "ph": "s", "id": 241682871, "pid": 5717, "tid": 6759, "ts": 6302685386814.179, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685408978.338, "dur": 34.081, + "args": { + "External id": 129043, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682889, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682889, "pid": 3, "tid": 7, "ts": 6302685408978.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685386986.818, "dur": 12.860, + "args": { + "External id": 129043, "cbid": 307, "correlation": 241682889 + } + }, + { + "ph": "s", "id": 241682889, "pid": 5717, "tid": 6759, "ts": 6302685386986.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685409013.091, "dur": 40.480, + "args": { + "External id": 129053, "device": 3, "context": 1, "stream": 7, "correlation": 241682904, "bytes": 25165824, "memory bandwidth (GB/s)": 621.6853754940712 + } + }, + { + "ph": "f", "id": 241682904, "pid": 3, "tid": 7, "ts": 6302685409013.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685387066.128, "dur": 14.280, + "args": { + "External id": 129053, "cbid": 41, "correlation": 241682904 + } + }, + { + "ph": "s", "id": 241682904, "pid": 5717, "tid": 6759, "ts": 6302685387066.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685409054.275, "dur": 30.784, + "args": { + "External id": 129050, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682922, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241682922, "pid": 3, "tid": 7, "ts": 6302685409054.275, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387170.958, "dur": 9.110, + "args": { + "External id": 129050, "cbid": 307, "correlation": 241682922 + } + }, + { + "ph": "s", "id": 241682922, "pid": 5717, "tid": 6759, "ts": 6302685387170.958, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685387337.018, "dur": 0.580, + "args": { + "External id": 129058, "cbid": 200, "correlation": 241682952 + } + }, + { + "ph": "f", "id": 241682952, "pid": 5717, "tid": 6759, "ts": 6302685387337.018, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685409086.179, "dur": 0.800, + "args": { + "External id": 129058, "device": 3, "context": 1, "stream": 7, "correlation": 241682955, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 241682955, "pid": 3, "tid": 7, "ts": 6302685409086.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685387339.348, "dur": 7.890, + "args": { + "External id": 129058, "cbid": 51, "correlation": 241682955 + } + }, + { + "ph": "s", "id": 241682955, "pid": 5717, "tid": 6759, "ts": 6302685387339.348, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685409088.131, "dur": 144.129, + "args": { + "External id": 129058, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682956, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682956, "pid": 3, "tid": 7, "ts": 6302685409088.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387347.468, "dur": 8.500, + "args": { + "External id": 129058, "cbid": 307, "correlation": 241682956 + } + }, + { + "ph": "s", "id": 241682956, "pid": 5717, "tid": 6759, "ts": 6302685387347.468, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685387385.637, "dur": 0.300, + "args": { + "External id": 129059, "cbid": 200, "correlation": 241682981 + } + }, + { + "ph": "f", "id": 241682981, "pid": 5717, "tid": 6759, "ts": 6302685387385.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685409233.508, "dur": 0.928, + "args": { + "External id": 129059, "device": 3, "context": 1, "stream": 7, "correlation": 241682984, "bytes": 576, "memory bandwidth (GB/s)": 0.6206896551724138 + } + }, + { + "ph": "f", "id": 241682984, "pid": 3, "tid": 7, "ts": 6302685409233.508, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685387387.048, "dur": 4.700, + "args": { + "External id": 129059, "cbid": 51, "correlation": 241682984 + } + }, + { + "ph": "s", "id": 241682984, "pid": 5717, 
"tid": 6759, "ts": 6302685387387.048, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685409236.004, "dur": 140.609, + "args": { + "External id": 129059, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241682985, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241682985, "pid": 3, "tid": 7, "ts": 6302685409236.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387391.908, "dur": 5.540, + "args": { + "External id": 129059, "cbid": 307, "correlation": 241682985 + } + }, + { + "ph": "s", "id": 241682985, "pid": 5717, "tid": 6759, "ts": 6302685387391.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685387424.257, "dur": 0.291, + "args": { + "External id": 129060, "cbid": 200, "correlation": 241683010 + } + }, + { + "ph": "f", "id": 241683010, "pid": 5717, "tid": 6759, "ts": 6302685387424.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685409377.797, "dur": 1.024, + "args": { + "External id": 129060, "device": 3, "context": 1, "stream": 7, "correlation": 241683013, "bytes": 576, "memory bandwidth (GB/s)": 0.5625 + } + }, + { + "ph": "f", "id": 241683013, "pid": 3, "tid": 7, "ts": 6302685409377.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685387425.708, "dur": 4.349, + "args": { + "External id": 129060, 
"cbid": 51, "correlation": 241683013 + } + }, + { + "ph": "s", "id": 241683013, "pid": 5717, "tid": 6759, "ts": 6302685387425.708, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685409380.261, "dur": 140.801, + "args": { + "External id": 129060, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683014, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683014, "pid": 3, "tid": 7, "ts": 6302685409380.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387430.188, "dur": 5.429, + "args": { + "External id": 129060, "cbid": 307, "correlation": 241683014 + } + }, + { + "ph": "s", "id": 241683014, "pid": 5717, "tid": 6759, "ts": 6302685387430.188, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685409521.702, "dur": 139.265, + "args": { + "External id": 129061, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683036, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683036, "pid": 3, "tid": 7, "ts": 6302685409521.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387460.857, "dur": 5.690, + "args": { + "External id": 129061, "cbid": 211, "correlation": 241683036 + } + }, + { + "ph": "s", "id": 241683036, "pid": 5717, "tid": 6759, "ts": 6302685387460.857, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685409661.639, "dur": 139.009, + "args": { + "External id": 129062, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683059, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683059, "pid": 3, "tid": 7, "ts": 6302685409661.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387487.097, "dur": 5.290, + "args": { + "External id": 129062, "cbid": 211, "correlation": 241683059 + } + }, + { + "ph": "s", "id": 241683059, "pid": 5717, "tid": 6759, "ts": 6302685387487.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685409801.256, "dur": 141.025, + "args": { + "External id": 129063, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683082, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683082, "pid": 3, "tid": 7, "ts": 6302685409801.256, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387513.397, "dur": 4.970, + "args": { + "External id": 129063, "cbid": 211, "correlation": 241683082 + } + }, + { + "ph": "s", "id": 241683082, "pid": 5717, "tid": 6759, "ts": 6302685387513.397, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685409942.985, "dur": 81.473, + "args": { + "External id": 129064, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683090, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683090, "pid": 3, "tid": 7, "ts": 6302685409942.985, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387557.917, "dur": 6.670, + "args": { + "External id": 129064, "cbid": 307, "correlation": 241683090 + } + }, + { + "ph": "s", "id": 241683090, "pid": 5717, "tid": 6759, "ts": 6302685387557.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685410025.162, "dur": 47.425, + "args": { + "External id": 129079, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683119, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683119, "pid": 3, "tid": 7, "ts": 6302685410025.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387735.087, "dur": 10.800, + "args": { + "External id": 129079, "cbid": 307, "correlation": 241683119 + } + }, + { + "ph": "s", "id": 241683119, "pid": 5717, "tid": 6759, "ts": 6302685387735.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685410073.227, "dur": 3.679, + "args": { + "External id": 129080, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683127, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241683127, "pid": 3, "tid": 7, "ts": 6302685410073.227, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387776.127, "dur": 6.680, + "args": { + "External id": 129080, "cbid": 307, "correlation": 241683127 + } + }, + { + "ph": "s", "id": 241683127, "pid": 5717, "tid": 6759, "ts": 6302685387776.127, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685410077.578, "dur": 52.257, + "args": { + "External id": 129081, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683138, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683138, "pid": 3, "tid": 7, "ts": 6302685410077.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387815.527, "dur": 10.789, + "args": { + "External id": 129081, "cbid": 307, "correlation": 241683138 + } + }, + { + "ph": "s", "id": 241683138, "pid": 5717, "tid": 6759, "ts": 6302685387815.527, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685410130.539, "dur": 45.472, + "args": { + "External id": 129082, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683143, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683143, "pid": 3, "tid": 7, "ts": 6302685410130.539, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685387870.056, "dur": 11.051, + "args": { + "External id": 129082, "cbid": 211, "correlation": 241683143 + } + }, + { + "ph": "s", "id": 241683143, "pid": 5717, "tid": 6759, "ts": 6302685387870.056, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388093.556, "dur": 2.860, + "args": { + "External id": 129088, "cbid": 147, "correlation": 241683160 + } + }, + { + "ph": "s", "id": 241683160, "pid": 5717, "tid": 6759, "ts": 6302685388093.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685388241.086, "dur": 4.080, + "args": { + "External id": 129096, "cbid": 138, "correlation": 241683175 + } + }, + { + "ph": "f", "id": 241683175, "pid": 5717, "tid": 6759, "ts": 6302685388241.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685417239.681, "dur": 2.559, + "args": { + "External id": 129100, "device": 3, "context": 1, "stream": 7, "correlation": 241683186, "bytes": 28112, "memory bandwidth (GB/s)": 10.985541227041812 + } + }, + { + "ph": "f", "id": 241683186, "pid": 3, "tid": 7, "ts": 6302685417239.681, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685388274.455, "dur": 16.160, + "args": { + "External id": 129100, "cbid": 41, "correlation": 241683186 + } + }, + { + "ph": "s", "id": 241683186, "pid": 5717, "tid": 6759, "ts": 6302685388274.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388303.846, "dur": 2.420, + "args": { + "External id": 129095, "cbid": 135, "correlation": 241683190 + } + }, + { + "ph": "f", "id": 241683190, "pid": 5717, "tid": 6759, "ts": 6302685388303.846, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685417244.448, "dur": 373.987, + "args": { + "External id": 129095, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683194, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683194, "pid": 3, "tid": 7, "ts": 6302685417244.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685388310.615, "dur": 12.420, + "args": { + "External id": 129095, "cbid": 211, "correlation": 241683194 + } + }, + { + "ph": "s", "id": 241683194, "pid": 5717, "tid": 6759, "ts": 6302685388310.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388367.825, "dur": 1.170, + "args": { + "External id": 129088, "cbid": 135, "correlation": 241683205 + } + }, + { + "ph": "f", "id": 241683205, "pid": 5717, "tid": 6759, "ts": 6302685388367.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388371.175, "dur": 1.350, + "args": { + "External id": 129088, "cbid": 147, "correlation": 241683209 + } + }, + { + "ph": "s", "id": 241683209, "pid": 5717, "tid": 6759, "ts": 
6302685388371.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685388457.865, "dur": 1.120, + "args": { + "External id": 129104, "cbid": 317, "correlation": 241683229 + } + }, + { + "ph": "f", "id": 241683229, "pid": 5717, "tid": 6759, "ts": 6302685388457.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388462.085, "dur": 1.700, + "args": { + "External id": 129104, "cbid": 135, "correlation": 241683231 + } + }, + { + "ph": "f", "id": 241683231, "pid": 5717, "tid": 6759, "ts": 6302685388462.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388465.275, "dur": 0.920, + "args": { + "External id": 129104, "cbid": 147, "correlation": 241683235 + } + }, + { + "ph": "s", "id": 241683235, "pid": 5717, "tid": 6759, "ts": 6302685388465.275, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685388484.495, "dur": 1.160, + "args": { + "External id": 129104, "cbid": 409, "correlation": 241683238 + } + }, + { + "ph": "f", "id": 241683238, "pid": 5717, "tid": 6759, "ts": 6302685388484.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388490.775, "dur": 0.940, + "args": { + "External id": 129104, "cbid": 135, "correlation": 241683241 + } + }, + { + "ph": "f", "id": 241683241, "pid": 5717, "tid": 6759, "ts": 6302685388490.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388491.895, "dur": 1.020, + "args": { + "External id": 129104, 
"cbid": 147, "correlation": 241683242 + } + }, + { + "ph": "s", "id": 241683242, "pid": 5717, "tid": 6759, "ts": 6302685388491.895, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685420970.172, "dur": 9501.096, + "args": { + "External id": 129104, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241683244, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241683244, "pid": 3, "tid": 20, "ts": 6302685420970.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685388494.655, "dur": 12.550, + "args": { + "External id": 129104, "cbid": 430, "correlation": 241683244 + } + }, + { + "ph": "s", "id": 241683244, "pid": 5717, "tid": 6759, "ts": 6302685388494.655, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388508.615, "dur": 0.440, + "args": { + "External id": 129104, "cbid": 135, "correlation": 241683246 + } + }, + { + "ph": "f", "id": 241683246, "pid": 5717, "tid": 6759, "ts": 6302685388508.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388509.195, "dur": 0.630, + "args": { + "External id": 129104, "cbid": 147, "correlation": 241683247 + } + }, 
+ { + "ph": "s", "id": 241683247, "pid": 5717, "tid": 6759, "ts": 6302685388509.195, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388511.625, "dur": 0.860, + "args": { + "External id": 129104, "cbid": 135, "correlation": 241683250 + } + }, + { + "ph": "f", "id": 241683250, "pid": 5717, "tid": 6759, "ts": 6302685388511.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388521.705, "dur": 0.490, + "args": { + "External id": 129104, "cbid": 135, "correlation": 241683257 + } + }, + { + "ph": "f", "id": 241683257, "pid": 5717, "tid": 6759, "ts": 6302685388521.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388553.115, "dur": 1.080, + "args": { + "External id": 129106, "cbid": 147, "correlation": 241683262 + } + }, + { + "ph": "s", "id": 241683262, "pid": 5717, "tid": 6759, "ts": 6302685388553.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388572.685, "dur": 0.920, + "args": { + "External id": 129088, "cbid": 135, "correlation": 241683277 + } + }, + { + "ph": "f", "id": 241683277, "pid": 5717, "tid": 6759, "ts": 6302685388572.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685388784.444, "dur": 1.280, + "args": { + "External id": 129088, "cbid": 135, "correlation": 241683290 + } + }, + { + "ph": "f", "id": 241683290, "pid": 5717, "tid": 6759, "ts": 6302685388784.444, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685388906.904, 
"dur": 3.600, + "args": { + "External id": 129116, "cbid": 147, "correlation": 241683301 + } + }, + { + "ph": "s", "id": 241683301, "pid": 5717, "tid": 6759, "ts": 6302685388906.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685389036.594, "dur": 1.210, + "args": { + "External id": 129130, "cbid": 317, "correlation": 241683342 + } + }, + { + "ph": "f", "id": 241683342, "pid": 5717, "tid": 6759, "ts": 6302685389036.594, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685389047.704, "dur": 3.030, + "args": { + "External id": 129131, "cbid": 138, "correlation": 241683345 + } + }, + { + "ph": "f", "id": 241683345, "pid": 5717, "tid": 6759, "ts": 6302685389047.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685420971.164, "dur": 2.048, + "args": { + "External id": 129135, "device": 3, "context": 1, "stream": 7, "correlation": 241683356, "bytes": 7224, "memory bandwidth (GB/s)": 3.52734375 + } + }, + { + "ph": "f", "id": 241683356, "pid": 3, "tid": 7, "ts": 6302685420971.164, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685389075.854, "dur": 13.250, + "args": { + "External id": 129135, "cbid": 41, "correlation": 241683356 + } + }, + { + "ph": "s", "id": 241683356, "pid": 5717, "tid": 6759, "ts": 6302685389075.854, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389094.114, "dur": 1.960, + "args": { + "External id": 129130, "cbid": 135, "correlation": 241683360 + } + }, + { + "ph": "f", "id": 241683360, "pid": 5717, "tid": 6759, "ts": 
6302685389094.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685420975.196, "dur": 237.923, + "args": { + "External id": 129130, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683364, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683364, "pid": 3, "tid": 7, "ts": 6302685420975.196, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685389099.324, "dur": 12.590, + "args": { + "External id": 129130, "cbid": 211, "correlation": 241683364 + } + }, + { + "ph": "s", "id": 241683364, "pid": 5717, "tid": 6759, "ts": 6302685389099.324, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389229.543, "dur": 1.960, + "args": { + "External id": 129116, "cbid": 135, "correlation": 241683375 + } + }, + { + "ph": "f", "id": 241683375, "pid": 5717, "tid": 6759, "ts": 6302685389229.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685389236.073, "dur": 1.720, + "args": { + "External id": 129116, "cbid": 147, "correlation": 241683379 + } + }, + { + "ph": "s", "id": 241683379, "pid": 5717, "tid": 6759, "ts": 6302685389236.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685389241.813, "dur": 1.130, + "args": { + "External id": 129116, "cbid": 147, "correlation": 241683383 + } + }, + { + "ph": 
"s", "id": 241683383, "pid": 5717, "tid": 6759, "ts": 6302685389241.813, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685421722.626, "dur": 453.924, + "args": { + "External id": 129149, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241683407, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241683407, "pid": 3, "tid": 17, "ts": 6302685421722.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685389451.593, "dur": 15.390, + "args": { + "External id": 129149, "cbid": 211, "correlation": 241683407 + } + }, + { + "ph": "s", "id": 241683407, "pid": 5717, "tid": 6759, "ts": 6302685389451.593, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685422178.342, "dur": 11.680, + "args": { + "External id": 129165, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241683420, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241683420, "pid": 3, "tid": 17, "ts": 6302685422178.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685389590.712, "dur": 11.931, + "args": { + "External id": 129165, "cbid": 211, "correlation": 241683420 + } + }, + { + "ph": "s", "id": 241683420, "pid": 5717, "tid": 6759, "ts": 6302685389590.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389631.943, "dur": 1.569, + "args": { + "External id": 129116, "cbid": 135, "correlation": 241683430 + } + }, + { + "ph": "f", "id": 241683430, "pid": 5717, "tid": 6759, "ts": 6302685389631.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685389635.892, "dur": 1.511, + "args": { + "External id": 129116, "cbid": 147, "correlation": 241683434 + } + }, + { + "ph": "s", "id": 241683434, "pid": 5717, "tid": 6759, "ts": 6302685389635.892, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685389700.902, "dur": 1.020, + "args": { + "External id": 129167, "cbid": 317, "correlation": 241683447 + } + }, + { + "ph": "f", "id": 241683447, "pid": 5717, "tid": 6759, "ts": 6302685389700.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389704.162, "dur": 1.420, + "args": { + "External id": 129167, "cbid": 135, "correlation": 241683449 + } + }, + { + "ph": "f", "id": 241683449, "pid": 5717, "tid": 6759, "ts": 6302685389704.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685389707.252, "dur": 1.370, + "args": { + "External id": 129167, "cbid": 147, "correlation": 241683453 + } + }, + { + "ph": "s", "id": 241683453, "pid": 5717, "tid": 6759, "ts": 6302685389707.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685389726.592, "dur": 1.020, + "args": { + "External id": 129167, "cbid": 409, "correlation": 241683456 + } + }, + { + "ph": "f", "id": 241683456, "pid": 5717, "tid": 6759, "ts": 6302685389726.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389732.932, "dur": 0.910, + "args": { + "External id": 129167, "cbid": 135, "correlation": 241683459 + } + }, + { + "ph": "f", "id": 241683459, "pid": 5717, "tid": 6759, "ts": 6302685389732.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685389734.082, "dur": 1.030, + "args": { + "External id": 129167, "cbid": 147, "correlation": 241683460 + } + }, + { + "ph": "s", "id": 241683460, "pid": 5717, "tid": 6759, "ts": 6302685389734.082, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685430472.260, "dur": 4429.217, + "args": { + "External id": 129167, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241683462, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241683462, "pid": 3, "tid": 20, "ts": 6302685430472.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685389736.722, "dur": 12.790, + "args": { + "External id": 129167, "cbid": 430, "correlation": 241683462 + } + }, + { + "ph": "s", "id": 241683462, "pid": 5717, "tid": 6759, "ts": 6302685389736.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389751.042, "dur": 0.500, + "args": { + "External id": 129167, "cbid": 135, "correlation": 241683464 + } + }, + { + "ph": "f", "id": 241683464, "pid": 5717, "tid": 6759, "ts": 6302685389751.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685389751.692, "dur": 0.660, + "args": { + "External id": 129167, "cbid": 147, "correlation": 241683465 + } + }, + { + "ph": "s", "id": 241683465, "pid": 5717, "tid": 6759, "ts": 6302685389751.692, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389754.272, "dur": 0.980, + "args": { + "External id": 129167, "cbid": 135, "correlation": 241683468 + } + }, + { + "ph": "f", "id": 241683468, "pid": 5717, "tid": 6759, "ts": 6302685389754.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389764.052, "dur": 0.510, + "args": { + "External 
id": 129167, "cbid": 135, "correlation": 241683475 + } + }, + { + "ph": "f", "id": 241683475, "pid": 5717, "tid": 6759, "ts": 6302685389764.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685389796.172, "dur": 1.160, + "args": { + "External id": 129169, "cbid": 147, "correlation": 241683480 + } + }, + { + "ph": "s", "id": 241683480, "pid": 5717, "tid": 6759, "ts": 6302685389796.172, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685389817.222, "dur": 1.170, + "args": { + "External id": 129116, "cbid": 135, "correlation": 241683495 + } + }, + { + "ph": "f", "id": 241683495, "pid": 5717, "tid": 6759, "ts": 6302685389817.222, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685421228.607, "dur": 1203.336, + "args": { + "External id": 129171, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683520, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683520, "pid": 3, "tid": 7, "ts": 6302685421228.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390024.602, "dur": 13.749, + "args": { + "External id": 129171, "cbid": 211, "correlation": 241683520 + } + }, + { + "ph": "s", "id": 241683520, "pid": 5717, "tid": 6759, "ts": 6302685390024.602, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685422432.583, "dur": 430.276, + "args": { + "External id": 129172, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683543, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241683543, "pid": 3, "tid": 7, "ts": 6302685422432.583, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390096.722, "dur": 8.640, + "args": { + "External id": 129172, "cbid": 307, "correlation": 241683543 + } + }, + { + "ph": "s", "id": 241683543, "pid": 5717, "tid": 6759, "ts": 6302685390096.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390149.211, "dur": 0.650, + "args": { + "External id": 129173, "cbid": 200, "correlation": 241683566 + } + }, + { + "ph": "f", "id": 241683566, "pid": 5717, "tid": 6759, "ts": 6302685390149.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685422864.043, "dur": 0.832, + "args": { + "External id": 129173, "device": 3, "context": 1, "stream": 7, 
"correlation": 241683569, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 241683569, "pid": 3, "tid": 7, "ts": 6302685422864.043, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685390151.911, "dur": 8.040, + "args": { + "External id": 129173, "cbid": 51, "correlation": 241683569 + } + }, + { + "ph": "s", "id": 241683569, "pid": 5717, "tid": 6759, "ts": 6302685390151.911, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685422866.027, "dur": 357.443, + "args": { + "External id": 129173, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683570, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683570, "pid": 3, "tid": 7, "ts": 6302685422866.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390160.201, "dur": 7.420, + "args": { + "External id": 129173, "cbid": 307, "correlation": 241683570 + } + }, + { + "ph": "s", "id": 241683570, "pid": 5717, "tid": 6759, "ts": 6302685390160.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390202.861, "dur": 0.340, + "args": { + "External id": 129174, "cbid": 200, "correlation": 241683595 + } + }, + { + "ph": "f", "id": 241683595, "pid": 5717, "tid": 6759, "ts": 6302685390202.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685423224.814, "dur": 1.120, + "args": { + "External id": 129174, "device": 3, "context": 1, "stream": 7, "correlation": 241683598, "bytes": 1536, "memory bandwidth (GB/s)": 1.3714285714285714 + } + }, + { + "ph": "f", "id": 241683598, "pid": 3, "tid": 7, "ts": 6302685423224.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685390204.571, "dur": 5.390, + "args": { + "External id": 129174, "cbid": 51, "correlation": 241683598 + } + }, + { + "ph": "s", "id": 241683598, "pid": 5717, "tid": 6759, "ts": 6302685390204.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685423227.694, "dur": 349.410, + "args": { + "External id": 129174, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683599, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683599, "pid": 3, "tid": 7, "ts": 6302685423227.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390210.151, "dur": 6.300, + "args": { + "External id": 129174, "cbid": 307, "correlation": 241683599 + } + }, + { + "ph": "s", "id": 241683599, "pid": 5717, "tid": 6759, "ts": 6302685390210.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390247.201, "dur": 0.290, + "args": { + "External id": 129175, "cbid": 200, "correlation": 241683624 + } + }, + { + "ph": "f", "id": 241683624, "pid": 5717, "tid": 6759, "ts": 6302685390247.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685423577.776, "dur": 392.099, + "args": { + "External id": 129175, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683627, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683627, "pid": 3, "tid": 7, "ts": 6302685423577.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390248.991, "dur": 6.190, + "args": { + "External id": 129175, "cbid": 307, "correlation": 241683627 + } + }, + { + "ph": "s", "id": 241683627, "pid": 5717, "tid": 6759, "ts": 6302685390248.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390282.641, "dur": 0.310, + "args": { + "External id": 129176, "cbid": 200, "correlation": 241683652 + } + }, + { + "ph": "f", "id": 241683652, "pid": 5717, "tid": 6759, "ts": 6302685390282.641, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685424002.931, "dur": 35.649, + "args": { + "External id": 129176, "device": 3, "context": 1, "stream": 7, "correlation": 241683655, "bytes": 1536, "memory bandwidth (GB/s)": 0.043086762602036525 + } + }, + { + "ph": "f", "id": 241683655, "pid": 3, "tid": 7, "ts": 6302685424002.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685390284.221, "dur": 5.460, + "args": { + "External id": 129176, "cbid": 51, "correlation": 241683655 + } + }, + { + "ph": "s", "id": 241683655, "pid": 5717, "tid": 6759, "ts": 6302685390284.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685424073.876, "dur": 438.819, + "args": { + "External id": 129176, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683656, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683656, "pid": 3, "tid": 7, "ts": 6302685424073.876, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390289.881, "dur": 6.230, + "args": { + "External id": 129176, "cbid": 307, "correlation": 241683656 + } + }, + { + "ph": "s", "id": 241683656, "pid": 5717, "tid": 6759, "ts": 6302685390289.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390339.341, "dur": 0.310, + "args": { + "External id": 129177, "cbid": 200, "correlation": 241683681 + } + }, + { + "ph": "f", "id": 241683681, "pid": 5717, "tid": 6759, "ts": 6302685390339.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685424513.367, "dur": 416.260, + "args": { + "External id": 129177, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683684, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683684, "pid": 3, "tid": 7, "ts": 6302685424513.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390341.131, "dur": 7.670, + "args": { + "External id": 129177, "cbid": 307, "correlation": 241683684 + } + }, + { + "ph": "s", "id": 241683684, "pid": 5717, "tid": 6759, "ts": 6302685390341.131, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685424930.203, "dur": 204.865, + "args": { + "External id": 129178, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683697, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683697, "pid": 3, "tid": 7, "ts": 6302685424930.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390395.441, "dur": 7.330, + "args": { + "External id": 129178, "cbid": 307, "correlation": 241683697 + } + }, + { + "ph": "s", "id": 241683697, "pid": 5717, "tid": 6759, "ts": 6302685390395.441, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685425135.708, "dur": 63.936, + "args": { + "External id": 129179, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683705, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241683705, "pid": 3, "tid": 7, "ts": 6302685425135.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390434.351, "dur": 6.880, + "args": { + "External id": 129179, "cbid": 307, "correlation": 241683705 + } + }, + { + "ph": "s", "id": 241683705, "pid": 5717, "tid": 6759, "ts": 6302685390434.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685425200.220, "dur": 168.002, + "args": { + "External id": 129180, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683713, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683713, "pid": 3, "tid": 7, "ts": 6302685425200.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390472.930, "dur": 6.600, + "args": { + "External id": 129180, "cbid": 307, "correlation": 241683713 + } + }, + { + "ph": "s", "id": 241683713, "pid": 5717, "tid": 6759, "ts": 6302685390472.930, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390687.710, "dur": 0.600, + "args": { + "External id": 129199, "cbid": 200, "correlation": 241683759 + } + }, + { + "ph": "f", "id": 241683759, "pid": 5717, "tid": 6759, "ts": 6302685390687.710, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685425381.054, "dur": 3.168, + "args": { + "External id": 129199, "device": 3, "context": 1, "stream": 7, 
"correlation": 241683762, "bytes": 576, "memory bandwidth (GB/s)": 0.18181818181818182 + } + }, + { + "ph": "f", "id": 241683762, "pid": 3, "tid": 7, "ts": 6302685425381.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685390690.310, "dur": 8.710, + "args": { + "External id": 129199, "cbid": 51, "correlation": 241683762 + } + }, + { + "ph": "s", "id": 241683762, "pid": 5717, "tid": 6759, "ts": 6302685390690.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685425385.790, "dur": 142.561, + "args": { + "External id": 129199, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683763, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683763, "pid": 3, "tid": 7, "ts": 6302685425385.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390699.300, "dur": 9.870, + "args": { + "External id": 129199, "cbid": 307, "correlation": 241683763 + } + }, + { + "ph": "s", "id": 241683763, "pid": 5717, "tid": 6759, "ts": 6302685390699.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685425529.087, "dur": 140.321, + "args": { + "External id": 129200, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683785, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683785, "pid": 3, "tid": 7, "ts": 6302685425529.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390744.560, "dur": 7.070, + "args": { + "External id": 129200, "cbid": 211, "correlation": 241683785 + } + }, + { + "ph": "s", "id": 241683785, "pid": 5717, "tid": 6759, "ts": 6302685390744.560, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390840.430, "dur": 0.500, + "args": { + "External id": 129201, "cbid": 200, "correlation": 241683803 + } + }, + { + "ph": "f", "id": 241683803, "pid": 5717, "tid": 6759, "ts": 6302685390840.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390841.070, "dur": 0.250, + "args": { + "External id": 129201, "cbid": 200, "correlation": 241683804 + } + }, + { + "ph": "f", "id": 241683804, "pid": 5717, "tid": 6759, "ts": 6302685390841.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390863.220, "dur": 0.280, + "args": { + "External id": 129201, "cbid": 200, "correlation": 241683822 + } + }, + { + "ph": "f", "id": 241683822, "pid": 5717, "tid": 6759, "ts": 6302685390863.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685425670.080, "dur": 90.593, + "args": { + "External id": 129201, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683823, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683823, "pid": 3, "tid": 7, "ts": 6302685425670.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390864.930, "dur": 11.250, + "args": { + "External id": 129201, "cbid": 211, "correlation": 241683823 + } + }, + { + "ph": "s", "id": 241683823, "pid": 5717, "tid": 6759, "ts": 6302685390864.930, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685390877.080, "dur": 1.140, + "args": { + "External id": 129201, "cbid": 273, "correlation": 241683825 + } + }, + { + "ph": "f", "id": 241683825, "pid": 5717, "tid": 6759, "ts": 6302685390877.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685425761.281, "dur": 1283.209, + "args": { + "External id": 129201, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683826, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241683826, "pid": 3, "tid": 7, "ts": 6302685425761.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390878.600, "dur": 4.970, + "args": { + "External id": 129201, "cbid": 211, "correlation": 241683826 + } + }, + { + "ph": "s", "id": 241683826, "pid": 5717, "tid": 6759, "ts": 6302685390878.600, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685427045.130, "dur": 72.833, + "args": { + "External id": 129201, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683828, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241683828, "pid": 3, "tid": 7, "ts": 6302685427045.130, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685390884.220, "dur": 4.380, + "args": { + "External id": 129201, "cbid": 211, "correlation": 241683828 + } + }, + { + "ph": "s", "id": 241683828, "pid": 5717, "tid": 6759, "ts": 6302685390884.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685427118.667, "dur": 49.632, + "args": { + "External id": 129212, "device": 3, "context": 1, "stream": 7, "correlation": 241683850, "bytes": 25165824, "memory bandwidth (GB/s)": 507.04835589941973 + } + }, + { + "ph": "f", "id": 241683850, "pid": 3, "tid": 7, "ts": 6302685427118.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685391045.819, "dur": 20.970, + "args": { + "External id": 129212, "cbid": 41, "correlation": 241683850 + } + }, + { + "ph": "s", "id": 241683850, "pid": 5717, "tid": 6759, "ts": 6302685391045.819, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685427168.971, "dur": 33.409, + "args": { + "External id": 129209, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683868, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683868, "pid": 3, "tid": 7, "ts": 6302685427168.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391184.659, "dur": 10.380, + "args": { + "External id": 129209, "cbid": 307, "correlation": 241683868 + } + }, + { + "ph": "s", "id": 241683868, "pid": 5717, "tid": 6759, "ts": 6302685391184.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685427202.988, "dur": 39.616, + "args": { + "External id": 129219, "device": 3, "context": 1, "stream": 7, "correlation": 241683883, "bytes": 25165824, "memory bandwidth (GB/s)": 635.2439418416801 + } + }, + { + "ph": "f", "id": 241683883, "pid": 3, "tid": 7, "ts": 6302685427202.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685391267.609, "dur": 16.960, + "args": { + "External id": 129219, "cbid": 41, "correlation": 241683883 + } + }, + { + "ph": "s", "id": 241683883, "pid": 5717, "tid": 6759, "ts": 6302685391267.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685427243.308, "dur": 29.568, + "args": { + "External id": 129216, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683901, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241683901, "pid": 3, "tid": 7, "ts": 6302685427243.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391395.039, "dur": 9.609, + "args": { + "External id": 129216, "cbid": 307, "correlation": 241683901 + } + }, + { + "ph": "s", "id": 241683901, "pid": 5717, "tid": 6759, "ts": 6302685391395.039, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685391544.848, "dur": 0.610, + "args": { + "External id": 129224, "cbid": 200, "correlation": 241683931 + } + }, + { + "ph": "f", "id": 241683931, "pid": 5717, "tid": 6759, "ts": 6302685391544.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685427274.188, "dur": 0.864, + "args": { + "External id": 129224, "device": 3, "context": 1, "stream": 7, "correlation": 241683934, "bytes": 576, "memory bandwidth (GB/s)": 0.6666666666666666 + } + }, + { + "ph": "f", "id": 241683934, "pid": 3, "tid": 7, "ts": 6302685427274.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685391547.398, "dur": 8.450, + "args": { + "External id": 129224, "cbid": 51, "correlation": 241683934 + } + }, + { + "ph": "s", "id": 241683934, "pid": 5717, "tid": 6759, "ts": 6302685391547.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685427276.268, "dur": 143.169, + "args": { + "External id": 129224, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683935, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683935, "pid": 3, "tid": 7, "ts": 6302685427276.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391556.128, "dur": 9.020, + "args": { + "External id": 129224, "cbid": 307, "correlation": 241683935 + } + }, + { + "ph": "s", "id": 241683935, "pid": 5717, "tid": 6759, "ts": 6302685391556.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685391596.428, "dur": 0.320, + "args": { + "External id": 129225, "cbid": 200, "correlation": 241683960 + } + }, + { + "ph": "f", "id": 241683960, "pid": 5717, "tid": 6759, "ts": 6302685391596.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685427420.749, "dur": 0.768, + "args": { + "External id": 129225, "device": 3, "context": 1, "stream": 7, "correlation": 241683963, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 241683963, "pid": 3, "tid": 7, "ts": 6302685427420.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685391598.018, "dur": 4.860, + "args": { + "External id": 129225, "cbid": 51, "correlation": 241683963 + } + }, + { + "ph": "s", "id": 241683963, "pid": 5717, "tid": 
6759, "ts": 6302685391598.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685427422.733, "dur": 306.210, + "args": { + "External id": 129225, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683964, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683964, "pid": 3, "tid": 7, "ts": 6302685427422.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391603.068, "dur": 5.680, + "args": { + "External id": 129225, "cbid": 307, "correlation": 241683964 + } + }, + { + "ph": "s", "id": 241683964, "pid": 5717, "tid": 6759, "ts": 6302685391603.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685391634.668, "dur": 0.320, + "args": { + "External id": 129226, "cbid": 200, "correlation": 241683989 + } + }, + { + "ph": "f", "id": 241683989, "pid": 5717, "tid": 6759, "ts": 6302685391634.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685427767.376, "dur": 32.928, + "args": { + "External id": 129226, "device": 3, "context": 1, "stream": 7, "correlation": 241683992, "bytes": 576, "memory bandwidth (GB/s)": 0.01749271137026239 + } + }, + { + "ph": "f", "id": 241683992, "pid": 3, "tid": 7, "ts": 6302685427767.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685391636.228, "dur": 4.560, + "args": { + "External id": 
129226, "cbid": 51, "correlation": 241683992 + } + }, + { + "ph": "s", "id": 241683992, "pid": 5717, "tid": 6759, "ts": 6302685391636.228, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685427856.656, "dur": 368.067, + "args": { + "External id": 129226, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241683993, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241683993, "pid": 3, "tid": 7, "ts": 6302685427856.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391640.948, "dur": 5.490, + "args": { + "External id": 129226, "cbid": 307, "correlation": 241683993 + } + }, + { + "ph": "s", "id": 241683993, "pid": 5717, "tid": 6759, "ts": 6302685391640.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685428225.331, "dur": 140.513, + "args": { + "External id": 129227, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684015, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684015, "pid": 3, "tid": 7, "ts": 6302685428225.331, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391674.728, "dur": 6.270, + "args": { + "External id": 129227, "cbid": 211, "correlation": 241684015 + } + }, + { + "ph": "s", "id": 241684015, "pid": 5717, "tid": 6759, "ts": 6302685391674.728, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685428366.484, "dur": 139.073, + "args": { + "External id": 129228, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684038, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684038, "pid": 3, "tid": 7, "ts": 6302685428366.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391702.588, "dur": 5.630, + "args": { + "External id": 129228, "cbid": 211, "correlation": 241684038 + } + }, + { + "ph": "s", "id": 241684038, "pid": 5717, "tid": 6759, "ts": 6302685391702.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685428506.229, "dur": 139.681, + "args": { + "External id": 129229, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684061, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684061, "pid": 3, "tid": 7, "ts": 6302685428506.229, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391728.088, "dur": 5.580, + "args": { + "External id": 129229, "cbid": 211, "correlation": 241684061 + } + }, + { + "ph": "s", "id": 241684061, "pid": 5717, "tid": 6759, "ts": 6302685391728.088, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685428646.614, "dur": 81.345, + "args": { + "External id": 129230, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684069, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684069, "pid": 3, "tid": 7, "ts": 6302685428646.614, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391772.918, "dur": 6.769, + "args": { + "External id": 129230, "cbid": 307, "correlation": 241684069 + } + }, + { + "ph": "s", "id": 241684069, "pid": 5717, "tid": 6759, "ts": 6302685391772.918, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685428728.599, "dur": 46.336, + "args": { + "External id": 129245, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684098, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684098, "pid": 3, "tid": 7, "ts": 6302685428728.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685391958.577, "dur": 11.490, + "args": { + "External id": 129245, "cbid": 307, "correlation": 241684098 + } + }, + { + "ph": "s", "id": 241684098, "pid": 5717, "tid": 6759, "ts": 6302685391958.577, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685428775.639, "dur": 3.680, + "args": { + "External id": 129246, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684106, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241684106, "pid": 3, "tid": 7, "ts": 6302685428775.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685392000.377, "dur": 6.570, + "args": { + "External id": 129246, "cbid": 307, "correlation": 241684106 + } + }, + { + "ph": "s", "id": 241684106, "pid": 5717, "tid": 6759, "ts": 6302685392000.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685428780.023, "dur": 51.297, + "args": { + "External id": 129247, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684117, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684117, "pid": 3, "tid": 7, "ts": 6302685428780.023, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685392039.037, "dur": 7.820, + "args": { + "External id": 129247, "cbid": 307, "correlation": 241684117 + } + }, + { + "ph": "s", "id": 241684117, "pid": 5717, "tid": 6759, "ts": 6302685392039.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685428831.928, "dur": 46.656, + "args": { + "External id": 129248, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684122, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684122, "pid": 3, "tid": 7, "ts": 6302685428831.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685392087.187, "dur": 7.920, + "args": { + "External id": 129248, "cbid": 211, "correlation": 241684122 + } + }, + { + "ph": "s", "id": 241684122, "pid": 5717, "tid": 6759, "ts": 6302685392087.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685392290.226, "dur": 2.911, + "args": { + "External id": 129254, "cbid": 147, "correlation": 241684139 + } + }, + { + "ph": "s", "id": 241684139, "pid": 5717, "tid": 6759, "ts": 6302685392290.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685392449.016, "dur": 4.480, + "args": { + "External id": 129262, "cbid": 138, "correlation": 241684154 + } + }, + { + "ph": "f", "id": 241684154, "pid": 5717, "tid": 6759, "ts": 6302685392449.016, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685430472.932, "dur": 3.744, + "args": { + "External id": 129266, "device": 3, "context": 1, "stream": 7, "correlation": 241684165, "bytes": 28112, "memory bandwidth (GB/s)": 7.5085470085470085 + } + }, + { + "ph": "f", "id": 241684165, "pid": 3, "tid": 7, "ts": 6302685430472.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685392477.896, "dur": 13.250, + "args": { + "External id": 129266, "cbid": 41, "correlation": 241684165 + } + }, + { + "ph": "s", "id": 241684165, "pid": 5717, "tid": 6759, "ts": 6302685392477.896, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392495.866, "dur": 2.090, + "args": { + "External id": 129261, "cbid": 135, "correlation": 241684169 + } + }, + { + "ph": "f", "id": 241684169, "pid": 5717, "tid": 6759, "ts": 6302685392495.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685430478.596, "dur": 386.787, + "args": { + "External id": 129261, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684173, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684173, "pid": 3, "tid": 7, "ts": 6302685430478.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685392503.396, "dur": 12.210, + "args": { + "External id": 129261, "cbid": 211, "correlation": 241684173 + } + }, + { + "ph": "s", "id": 241684173, "pid": 5717, "tid": 6759, "ts": 6302685392503.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392571.716, "dur": 1.280, + "args": { + "External id": 129254, "cbid": 135, "correlation": 241684184 + } + }, + { + "ph": "f", "id": 241684184, "pid": 5717, "tid": 6759, "ts": 6302685392571.716, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685392575.306, "dur": 1.430, + "args": { + "External id": 129254, "cbid": 147, "correlation": 241684188 + } + }, + { + "ph": "s", "id": 241684188, "pid": 5717, "tid": 6759, "ts": 
6302685392575.306, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685392680.925, "dur": 1.191, + "args": { + "External id": 129270, "cbid": 317, "correlation": 241684208 + } + }, + { + "ph": "f", "id": 241684208, "pid": 5717, "tid": 6759, "ts": 6302685392680.925, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392684.765, "dur": 1.620, + "args": { + "External id": 129270, "cbid": 135, "correlation": 241684210 + } + }, + { + "ph": "f", "id": 241684210, "pid": 5717, "tid": 6759, "ts": 6302685392684.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685392687.976, "dur": 0.860, + "args": { + "External id": 129270, "cbid": 147, "correlation": 241684214 + } + }, + { + "ph": "s", "id": 241684214, "pid": 5717, "tid": 6759, "ts": 6302685392687.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685392706.536, "dur": 0.980, + "args": { + "External id": 129270, "cbid": 409, "correlation": 241684217 + } + }, + { + "ph": "f", "id": 241684217, "pid": 5717, "tid": 6759, "ts": 6302685392706.536, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392712.016, "dur": 0.880, + "args": { + "External id": 129270, "cbid": 135, "correlation": 241684220 + } + }, + { + "ph": "f", "id": 241684220, "pid": 5717, "tid": 6759, "ts": 6302685392712.016, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685392713.105, "dur": 1.031, + "args": { + "External id": 129270, 
"cbid": 147, "correlation": 241684221 + } + }, + { + "ph": "s", "id": 241684221, "pid": 5717, "tid": 6759, "ts": 6302685392713.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685434903.205, "dur": 9371.847, + "args": { + "External id": 129270, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241684223, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241684223, "pid": 3, "tid": 20, "ts": 6302685434903.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685392715.545, "dur": 12.160, + "args": { + "External id": 129270, "cbid": 430, "correlation": 241684223 + } + }, + { + "ph": "s", "id": 241684223, "pid": 5717, "tid": 6759, "ts": 6302685392715.545, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392729.005, "dur": 0.431, + "args": { + "External id": 129270, "cbid": 135, "correlation": 241684225 + } + }, + { + "ph": "f", "id": 241684225, "pid": 5717, "tid": 6759, "ts": 6302685392729.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685392729.625, "dur": 0.540, + "args": { + "External id": 129270, "cbid": 147, "correlation": 241684226 + } + }, 
+ { + "ph": "s", "id": 241684226, "pid": 5717, "tid": 6759, "ts": 6302685392729.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392731.945, "dur": 0.711, + "args": { + "External id": 129270, "cbid": 135, "correlation": 241684229 + } + }, + { + "ph": "f", "id": 241684229, "pid": 5717, "tid": 6759, "ts": 6302685392731.945, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392741.856, "dur": 0.440, + "args": { + "External id": 129270, "cbid": 135, "correlation": 241684236 + } + }, + { + "ph": "f", "id": 241684236, "pid": 5717, "tid": 6759, "ts": 6302685392741.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685392780.665, "dur": 2.270, + "args": { + "External id": 129272, "cbid": 147, "correlation": 241684241 + } + }, + { + "ph": "s", "id": 241684241, "pid": 5717, "tid": 6759, "ts": 6302685392780.665, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685392810.685, "dur": 0.960, + "args": { + "External id": 129254, "cbid": 135, "correlation": 241684256 + } + }, + { + "ph": "f", "id": 241684256, "pid": 5717, "tid": 6759, "ts": 6302685392810.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393029.595, "dur": 1.360, + "args": { + "External id": 129254, "cbid": 135, "correlation": 241684269 + } + }, + { + "ph": "f", "id": 241684269, "pid": 5717, "tid": 6759, "ts": 6302685393029.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393150.524, 
"dur": 4.440, + "args": { + "External id": 129282, "cbid": 147, "correlation": 241684280 + } + }, + { + "ph": "s", "id": 241684280, "pid": 5717, "tid": 6759, "ts": 6302685393150.524, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685393307.404, "dur": 1.360, + "args": { + "External id": 129296, "cbid": 317, "correlation": 241684321 + } + }, + { + "ph": "f", "id": 241684321, "pid": 5717, "tid": 6759, "ts": 6302685393307.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685393318.364, "dur": 3.150, + "args": { + "External id": 129297, "cbid": 138, "correlation": 241684324 + } + }, + { + "ph": "f", "id": 241684324, "pid": 5717, "tid": 6759, "ts": 6302685393318.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685434909.957, "dur": 1.633, + "args": { + "External id": 129301, "device": 3, "context": 1, "stream": 7, "correlation": 241684335, "bytes": 7224, "memory bandwidth (GB/s)": 4.42375995101041 + } + }, + { + "ph": "f", "id": 241684335, "pid": 3, "tid": 7, "ts": 6302685434909.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685393352.514, "dur": 19.220, + "args": { + "External id": 129301, "cbid": 41, "correlation": 241684335 + } + }, + { + "ph": "s", "id": 241684335, "pid": 5717, "tid": 6759, "ts": 6302685393352.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393380.444, "dur": 2.290, + "args": { + "External id": 129296, "cbid": 135, "correlation": 241684339 + } + }, + { + "ph": "f", "id": 241684339, "pid": 5717, "tid": 6759, "ts": 
6302685393380.444, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685435018.790, "dur": 471.812, + "args": { + "External id": 129296, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684343, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684343, "pid": 3, "tid": 7, "ts": 6302685435018.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685393387.414, "dur": 18.100, + "args": { + "External id": 129296, "cbid": 211, "correlation": 241684343 + } + }, + { + "ph": "s", "id": 241684343, "pid": 5717, "tid": 6759, "ts": 6302685393387.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393517.474, "dur": 1.430, + "args": { + "External id": 129282, "cbid": 135, "correlation": 241684354 + } + }, + { + "ph": "f", "id": 241684354, "pid": 5717, "tid": 6759, "ts": 6302685393517.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393523.594, "dur": 1.260, + "args": { + "External id": 129282, "cbid": 147, "correlation": 241684358 + } + }, + { + "ph": "s", "id": 241684358, "pid": 5717, "tid": 6759, "ts": 6302685393523.594, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393527.694, "dur": 0.790, + "args": { + "External id": 129282, "cbid": 147, "correlation": 241684362 + } + }, + { + "ph": 
"s", "id": 241684362, "pid": 5717, "tid": 6759, "ts": 6302685393527.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685436082.574, "dur": 258.594, + "args": { + "External id": 129315, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241684386, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241684386, "pid": 3, "tid": 17, "ts": 6302685436082.574, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685393702.153, "dur": 12.680, + "args": { + "External id": 129315, "cbid": 211, "correlation": 241684386 + } + }, + { + "ph": "s", "id": 241684386, "pid": 5717, "tid": 6759, "ts": 6302685393702.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685436344.368, "dur": 13.088, + "args": { + "External id": 129331, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241684399, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241684399, "pid": 3, "tid": 17, "ts": 6302685436344.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685393817.813, "dur": 10.510, + "args": { + "External id": 129331, "cbid": 211, "correlation": 241684399 + } + }, + { + "ph": "s", "id": 241684399, "pid": 5717, "tid": 6759, "ts": 6302685393817.813, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393851.883, "dur": 1.190, + "args": { + "External id": 129282, "cbid": 135, "correlation": 241684409 + } + }, + { + "ph": "f", "id": 241684409, "pid": 5717, "tid": 6759, "ts": 6302685393851.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393855.013, "dur": 1.230, + "args": { + "External id": 129282, "cbid": 147, "correlation": 241684413 + } + }, + { + "ph": "s", "id": 241684413, "pid": 5717, "tid": 6759, "ts": 6302685393855.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685393908.283, "dur": 0.900, + "args": { + "External id": 129333, "cbid": 317, "correlation": 241684426 + } + }, + { + "ph": "f", "id": 241684426, "pid": 5717, "tid": 6759, "ts": 6302685393908.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393910.993, "dur": 1.170, + "args": { + "External id": 129333, "cbid": 135, "correlation": 241684428 + } + }, + { + "ph": "f", "id": 241684428, "pid": 5717, "tid": 6759, "ts": 6302685393910.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685393913.543, "dur": 1.090, + "args": { + "External id": 129333, "cbid": 147, "correlation": 241684432 + } + }, + { + "ph": "s", "id": 241684432, "pid": 5717, "tid": 6759, "ts": 6302685393913.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685393929.343, "dur": 0.880, + "args": { + "External id": 129333, "cbid": 409, "correlation": 241684435 + } + }, + { + "ph": "f", "id": 241684435, "pid": 5717, "tid": 6759, "ts": 6302685393929.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393934.423, "dur": 0.760, + "args": { + "External id": 129333, "cbid": 135, "correlation": 241684438 + } + }, + { + "ph": "f", "id": 241684438, "pid": 5717, "tid": 6759, "ts": 6302685393934.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393935.363, "dur": 0.810, + "args": { + "External id": 129333, "cbid": 147, "correlation": 241684439 + } + }, + { + "ph": "s", "id": 241684439, "pid": 5717, "tid": 6759, "ts": 6302685393935.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685444276.556, "dur": 4945.189, + "args": { + "External id": 129333, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241684441, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241684441, "pid": 3, "tid": 20, "ts": 6302685444276.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685393937.493, "dur": 10.590, + "args": { + "External id": 129333, "cbid": 430, "correlation": 241684441 + } + }, + { + "ph": "s", "id": 241684441, "pid": 5717, "tid": 6759, "ts": 6302685393937.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393949.223, "dur": 0.400, + "args": { + "External id": 129333, "cbid": 135, "correlation": 241684443 + } + }, + { + "ph": "f", "id": 241684443, "pid": 5717, "tid": 6759, "ts": 6302685393949.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393949.743, "dur": 0.500, + "args": { + "External id": 129333, "cbid": 147, "correlation": 241684444 + } + }, + { + "ph": "s", "id": 241684444, "pid": 5717, "tid": 6759, "ts": 6302685393949.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393951.923, "dur": 0.970, + "args": { + "External id": 129333, "cbid": 135, "correlation": 241684447 + } + }, + { + "ph": "f", "id": 241684447, "pid": 5717, "tid": 6759, "ts": 6302685393951.923, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685393961.463, "dur": 0.450, + "args": { + "External 
id": 129333, "cbid": 135, "correlation": 241684454 + } + }, + { + "ph": "f", "id": 241684454, "pid": 5717, "tid": 6759, "ts": 6302685393961.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685393987.793, "dur": 1.020, + "args": { + "External id": 129335, "cbid": 147, "correlation": 241684459 + } + }, + { + "ph": "s", "id": 241684459, "pid": 5717, "tid": 6759, "ts": 6302685393987.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685394006.202, "dur": 0.991, + "args": { + "External id": 129282, "cbid": 135, "correlation": 241684474 + } + }, + { + "ph": "f", "id": 241684474, "pid": 5717, "tid": 6759, "ts": 6302685394006.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685435504.330, "dur": 1100.424, + "args": { + "External id": 129337, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684499, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684499, "pid": 3, "tid": 7, "ts": 6302685435504.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394154.562, "dur": 10.830, + "args": { + "External id": 129337, "cbid": 211, "correlation": 241684499 + } + }, + { + "ph": "s", "id": 241684499, "pid": 5717, "tid": 6759, "ts": 6302685394154.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685436605.426, "dur": 430.339, + "args": { + "External id": 129338, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684522, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241684522, "pid": 3, "tid": 7, "ts": 6302685436605.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394216.972, "dur": 10.630, + "args": { + "External id": 129338, "cbid": 307, "correlation": 241684522 + } + }, + { + "ph": "s", "id": 241684522, "pid": 5717, "tid": 6759, "ts": 6302685394216.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394286.032, "dur": 0.560, + "args": { + "External id": 129339, "cbid": 200, "correlation": 241684545 + } + }, + { + "ph": "f", "id": 241684545, "pid": 5717, "tid": 6759, "ts": 6302685394286.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685437037.013, "dur": 1.184, + "args": { + "External id": 129339, "device": 3, "context": 1, "stream": 7, 
"correlation": 241684548, "bytes": 1536, "memory bandwidth (GB/s)": 1.2972972972972974 + } + }, + { + "ph": "f", "id": 241684548, "pid": 3, "tid": 7, "ts": 6302685437037.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685394289.322, "dur": 15.020, + "args": { + "External id": 129339, "cbid": 51, "correlation": 241684548 + } + }, + { + "ph": "s", "id": 241684548, "pid": 5717, "tid": 6759, "ts": 6302685394289.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685437039.605, "dur": 464.868, + "args": { + "External id": 129339, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684549, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684549, "pid": 3, "tid": 7, "ts": 6302685437039.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394304.582, "dur": 6.560, + "args": { + "External id": 129339, "cbid": 307, "correlation": 241684549 + } + }, + { + "ph": "s", "id": 241684549, "pid": 5717, "tid": 6759, "ts": 6302685394304.582, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394352.612, "dur": 0.280, + "args": { + "External id": 129340, "cbid": 200, "correlation": 241684574 + } + }, + { + "ph": "f", "id": 241684574, "pid": 5717, "tid": 6759, "ts": 6302685394352.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685437551.930, "dur": 46.688, + "args": { + "External id": 129340, "device": 3, "context": 1, "stream": 7, "correlation": 241684577, "bytes": 1536, "memory bandwidth (GB/s)": 0.03289924605894448 + } + }, + { + "ph": "f", "id": 241684577, "pid": 3, "tid": 7, "ts": 6302685437551.930, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685394354.042, "dur": 4.940, + "args": { + "External id": 129340, "cbid": 51, "correlation": 241684577 + } + }, + { + "ph": "s", "id": 241684577, "pid": 5717, "tid": 6759, "ts": 6302685394354.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685437652.026, "dur": 428.483, + "args": { + "External id": 129340, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684578, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684578, "pid": 3, "tid": 7, "ts": 6302685437652.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394359.162, "dur": 5.480, + "args": { + "External id": 129340, "cbid": 307, "correlation": 241684578 + } + }, + { + "ph": "s", "id": 241684578, "pid": 5717, "tid": 6759, "ts": 6302685394359.162, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394388.132, "dur": 0.260, + "args": { + "External id": 129341, "cbid": 200, "correlation": 241684603 + } + }, + { + "ph": "f", "id": 241684603, "pid": 5717, "tid": 6759, "ts": 6302685394388.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685438081.181, "dur": 368.803, + "args": { + "External id": 129341, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684606, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684606, "pid": 3, "tid": 7, "ts": 6302685438081.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394389.602, "dur": 5.370, + "args": { + "External id": 129341, "cbid": 307, "correlation": 241684606 + } + }, + { + "ph": "s", "id": 241684606, "pid": 5717, "tid": 6759, "ts": 6302685394389.602, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394416.312, "dur": 0.270, + "args": { + "External id": 129342, "cbid": 200, "correlation": 241684631 + } + }, + { + "ph": "f", "id": 241684631, "pid": 5717, "tid": 6759, "ts": 6302685394416.312, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685438451.040, "dur": 1.344, + "args": { + "External id": 129342, "device": 3, "context": 1, "stream": 7, "correlation": 241684634, "bytes": 1536, "memory bandwidth (GB/s)": 1.1428571428571428 + } + }, + { + "ph": "f", "id": 241684634, "pid": 3, "tid": 7, "ts": 6302685438451.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685394417.712, "dur": 4.690, + "args": { + "External id": 129342, "cbid": 51, "correlation": 241684634 + } + }, + { + "ph": "s", "id": 241684634, "pid": 5717, "tid": 6759, "ts": 6302685394417.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685438453.824, "dur": 415.716, + "args": { + "External id": 129342, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684635, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684635, "pid": 3, "tid": 7, "ts": 6302685438453.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394422.562, "dur": 4.919, + "args": { + "External id": 129342, "cbid": 307, "correlation": 241684635 + } + }, + { + "ph": "s", "id": 241684635, "pid": 5717, "tid": 6759, "ts": 6302685394422.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394449.872, "dur": 0.269, + "args": { + "External id": 129343, "cbid": 200, "correlation": 241684660 + } + }, + { + "ph": "f", "id": 241684660, "pid": 5717, "tid": 6759, "ts": 6302685394449.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685438870.244, "dur": 367.842, + "args": { + "External id": 129343, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684663, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684663, "pid": 3, "tid": 7, "ts": 6302685438870.244, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394451.172, "dur": 5.220, + "args": { + "External id": 129343, "cbid": 307, "correlation": 241684663 + } + }, + { + "ph": "s", "id": 241684663, "pid": 5717, "tid": 6759, "ts": 6302685394451.172, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685439238.694, "dur": 89.825, + "args": { + "External id": 129344, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684676, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684676, "pid": 3, "tid": 7, "ts": 6302685439238.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394495.361, "dur": 6.840, + "args": { + "External id": 129344, "cbid": 307, "correlation": 241684676 + } + }, + { + "ph": "s", "id": 241684676, "pid": 5717, "tid": 6759, "ts": 6302685394495.361, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685439329.159, "dur": 3.552, + "args": { + "External id": 129345, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684684, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241684684, "pid": 3, "tid": 7, "ts": 6302685439329.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394528.821, "dur": 7.880, + "args": { + "External id": 129345, "cbid": 307, "correlation": 241684684 + } + }, + { + "ph": "s", "id": 241684684, "pid": 5717, "tid": 6759, "ts": 6302685394528.821, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685439333.319, "dur": 115.041, + "args": { + "External id": 129346, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684692, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684692, "pid": 3, "tid": 7, "ts": 6302685439333.319, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394571.371, "dur": 5.920, + "args": { + "External id": 129346, "cbid": 307, "correlation": 241684692 + } + }, + { + "ph": "s", "id": 241684692, "pid": 5717, "tid": 6759, "ts": 6302685394571.371, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394784.511, "dur": 0.530, + "args": { + "External id": 129365, "cbid": 200, "correlation": 241684738 + } + }, + { + "ph": "f", "id": 241684738, "pid": 5717, "tid": 6759, "ts": 6302685394784.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685439449.608, "dur": 1.152, + "args": { + "External id": 129365, "device": 3, "context": 1, "stream": 7, 
"correlation": 241684741, "bytes": 576, "memory bandwidth (GB/s)": 0.5 + } + }, + { + "ph": "f", "id": 241684741, "pid": 3, "tid": 7, "ts": 6302685439449.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685394787.811, "dur": 7.760, + "args": { + "External id": 129365, "cbid": 51, "correlation": 241684741 + } + }, + { + "ph": "s", "id": 241684741, "pid": 5717, "tid": 6759, "ts": 6302685394787.811, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685439452.200, "dur": 143.265, + "args": { + "External id": 129365, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684742, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684742, "pid": 3, "tid": 7, "ts": 6302685439452.200, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394795.841, "dur": 8.280, + "args": { + "External id": 129365, "cbid": 307, "correlation": 241684742 + } + }, + { + "ph": "s", "id": 241684742, "pid": 5717, "tid": 6759, "ts": 6302685394795.841, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685439596.137, "dur": 138.529, + "args": { + "External id": 129366, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684764, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684764, "pid": 3, "tid": 7, "ts": 6302685439596.137, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394841.631, "dur": 5.950, + "args": { + "External id": 129366, "cbid": 211, "correlation": 241684764 + } + }, + { + "ph": "s", "id": 241684764, "pid": 5717, "tid": 6759, "ts": 6302685394841.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394923.360, "dur": 0.431, + "args": { + "External id": 129367, "cbid": 200, "correlation": 241684782 + } + }, + { + "ph": "f", "id": 241684782, "pid": 5717, "tid": 6759, "ts": 6302685394923.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394923.991, "dur": 0.200, + "args": { + "External id": 129367, "cbid": 200, "correlation": 241684783 + } + }, + { + "ph": "f", "id": 241684783, "pid": 5717, "tid": 6759, "ts": 6302685394923.991, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394942.831, "dur": 0.229, + "args": { + "External id": 129367, "cbid": 200, "correlation": 241684801 + } + }, + { + "ph": "f", "id": 241684801, "pid": 5717, "tid": 6759, "ts": 6302685394942.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685439735.274, "dur": 92.641, + "args": { + "External id": 129367, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684802, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684802, "pid": 3, "tid": 7, "ts": 6302685439735.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394944.260, "dur": 9.371, + "args": { + "External id": 129367, "cbid": 211, "correlation": 241684802 + } + }, + { + "ph": "s", "id": 241684802, "pid": 5717, "tid": 6759, "ts": 6302685394944.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685394954.371, "dur": 0.940, + "args": { + "External id": 129367, "cbid": 273, "correlation": 241684804 + } + }, + { + "ph": "f", "id": 241684804, "pid": 5717, "tid": 6759, "ts": 6302685394954.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685439828.587, "dur": 1109.672, + "args": { + "External id": 129367, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684805, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241684805, "pid": 3, "tid": 7, "ts": 6302685439828.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394956.331, "dur": 12.479, + "args": { + "External id": 129367, "cbid": 211, "correlation": 241684805 + } + }, + { + "ph": "s", "id": 241684805, "pid": 5717, "tid": 6759, "ts": 6302685394956.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685440938.899, "dur": 73.345, + "args": { + "External id": 129367, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684807, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241684807, "pid": 3, "tid": 7, "ts": 6302685440938.899, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685394969.390, "dur": 3.800, + "args": { + "External id": 129367, "cbid": 211, "correlation": 241684807 + } + }, + { + "ph": "s", "id": 241684807, "pid": 5717, "tid": 6759, "ts": 6302685394969.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685441012.820, "dur": 48.224, + "args": { + "External id": 129378, "device": 3, "context": 1, "stream": 7, "correlation": 241684829, "bytes": 25165824, "memory bandwidth (GB/s)": 521.8526874585269 + } + }, + { + "ph": "f", "id": 241684829, "pid": 3, "tid": 7, "ts": 6302685441012.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685395101.950, "dur": 18.080, + "args": { + "External id": 129378, "cbid": 41, "correlation": 241684829 + } + }, + { + "ph": "s", "id": 241684829, "pid": 5717, "tid": 6759, "ts": 6302685395101.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685441061.684, "dur": 33.440, + "args": { + "External id": 129375, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684847, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684847, "pid": 3, "tid": 7, "ts": 6302685441061.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395216.390, "dur": 8.420, + "args": { + "External id": 129375, "cbid": 307, "correlation": 241684847 + } + }, + { + "ph": "s", "id": 241684847, "pid": 5717, "tid": 6759, "ts": 6302685395216.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685441095.732, "dur": 40.129, + "args": { + "External id": 129385, "device": 3, "context": 1, "stream": 7, "correlation": 241684862, "bytes": 25165824, "memory bandwidth (GB/s)": 627.1231279124822 + } + }, + { + "ph": "f", "id": 241684862, "pid": 3, "tid": 7, "ts": 6302685441095.732, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685395284.170, "dur": 21.550, + "args": { + "External id": 129385, "cbid": 41, "correlation": 241684862 + } + }, + { + "ph": "s", "id": 241684862, "pid": 5717, "tid": 6759, "ts": 6302685395284.170, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685441136.533, "dur": 27.808, + "args": { + "External id": 129382, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684880, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241684880, "pid": 3, "tid": 7, "ts": 6302685441136.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395398.779, "dur": 10.750, + "args": { + "External id": 129382, "cbid": 307, "correlation": 241684880 + } + }, + { + "ph": "s", "id": 241684880, "pid": 5717, "tid": 6759, "ts": 6302685395398.779, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685395570.009, "dur": 0.530, + "args": { + "External id": 129390, "cbid": 200, "correlation": 241684910 + } + }, + { + "ph": "f", "id": 241684910, "pid": 5717, "tid": 6759, "ts": 6302685395570.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685441165.589, "dur": 1.280, + "args": { + "External id": 129390, "device": 3, "context": 1, "stream": 7, "correlation": 241684913, "bytes": 576, "memory bandwidth (GB/s)": 0.45 + } + }, + { + "ph": "f", "id": 241684913, "pid": 3, "tid": 7, "ts": 6302685441165.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685395572.239, "dur": 8.810, + "args": { + "External id": 129390, "cbid": 51, "correlation": 241684913 + } + }, + { + "ph": "s", "id": 241684913, "pid": 5717, "tid": 6759, "ts": 6302685395572.239, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685441168.149, "dur": 162.209, + "args": { + "External id": 129390, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684914, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684914, "pid": 3, "tid": 7, "ts": 6302685441168.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395582.339, "dur": 11.310, + "args": { + "External id": 129390, "cbid": 307, "correlation": 241684914 + } + }, + { + "ph": "s", "id": 241684914, "pid": 5717, "tid": 6759, "ts": 6302685395582.339, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685395639.639, "dur": 0.280, + "args": { + "External id": 129391, "cbid": 200, "correlation": 241684939 + } + }, + { + "ph": "f", "id": 241684939, "pid": 5717, "tid": 6759, "ts": 6302685395639.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685441343.958, "dur": 17.152, + "args": { + "External id": 129391, "device": 3, "context": 1, "stream": 7, "correlation": 241684942, "bytes": 576, "memory bandwidth (GB/s)": 0.033582089552238806 + } + }, + { + "ph": "f", "id": 241684942, "pid": 3, "tid": 7, "ts": 6302685441343.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685395640.979, "dur": 10.150, + "args": { + "External id": 129391, "cbid": 51, "correlation": 241684942 + } + }, + { + "ph": "s", "id": 241684942, "pid": 
5717, "tid": 6759, "ts": 6302685395640.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685441389.206, "dur": 332.259, + "args": { + "External id": 129391, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684943, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684943, "pid": 3, "tid": 7, "ts": 6302685441389.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395651.309, "dur": 8.850, + "args": { + "External id": 129391, "cbid": 307, "correlation": 241684943 + } + }, + { + "ph": "s", "id": 241684943, "pid": 5717, "tid": 6759, "ts": 6302685395651.309, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685395684.009, "dur": 0.260, + "args": { + "External id": 129392, "cbid": 200, "correlation": 241684968 + } + }, + { + "ph": "f", "id": 241684968, "pid": 5717, "tid": 6759, "ts": 6302685395684.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685441775.353, "dur": 53.761, + "args": { + "External id": 129392, "device": 3, "context": 1, "stream": 7, "correlation": 241684971, "bytes": 576, "memory bandwidth (GB/s)": 0.010714086419523447 + } + }, + { + "ph": "f", "id": 241684971, "pid": 3, "tid": 7, "ts": 6302685441775.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685395685.269, "dur": 4.190, + "args": { + 
"External id": 129392, "cbid": 51, "correlation": 241684971 + } + }, + { + "ph": "s", "id": 241684971, "pid": 5717, "tid": 6759, "ts": 6302685395685.269, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685441867.226, "dur": 197.953, + "args": { + "External id": 129392, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684972, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684972, "pid": 3, "tid": 7, "ts": 6302685441867.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395689.589, "dur": 4.770, + "args": { + "External id": 129392, "cbid": 307, "correlation": 241684972 + } + }, + { + "ph": "s", "id": 241684972, "pid": 5717, "tid": 6759, "ts": 6302685395689.589, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685442065.883, "dur": 139.777, + "args": { + "External id": 129393, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241684994, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241684994, "pid": 3, "tid": 7, "ts": 6302685442065.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395718.109, "dur": 5.540, + "args": { + "External id": 129393, "cbid": 211, "correlation": 241684994 + } + }, + { + "ph": "s", "id": 241684994, "pid": 5717, "tid": 6759, "ts": 6302685395718.109, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685442206.332, "dur": 139.233, + "args": { + "External id": 129394, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685017, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685017, "pid": 3, "tid": 7, "ts": 6302685442206.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395742.809, "dur": 5.020, + "args": { + "External id": 129394, "cbid": 211, "correlation": 241685017 + } + }, + { + "ph": "s", "id": 241685017, "pid": 5717, "tid": 6759, "ts": 6302685395742.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685442346.205, "dur": 139.905, + "args": { + "External id": 129395, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685040, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685040, "pid": 3, "tid": 7, "ts": 6302685442346.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395765.429, "dur": 4.700, + "args": { + "External id": 129395, "cbid": 211, "correlation": 241685040 + } + }, + { + "ph": "s", "id": 241685040, "pid": 5717, "tid": 6759, "ts": 6302685395765.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685442486.814, "dur": 81.409, + "args": { + "External id": 129396, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685048, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685048, "pid": 3, "tid": 7, "ts": 6302685442486.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395803.698, "dur": 5.940, + "args": { + "External id": 129396, "cbid": 307, "correlation": 241685048 + } + }, + { + "ph": "s", "id": 241685048, "pid": 5717, "tid": 6759, "ts": 6302685395803.698, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685442568.927, "dur": 45.952, + "args": { + "External id": 129411, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685077, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685077, "pid": 3, "tid": 7, "ts": 6302685442568.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395961.358, "dur": 10.340, + "args": { + "External id": 129411, "cbid": 307, "correlation": 241685077 + } + }, + { + "ph": "s", "id": 241685077, "pid": 5717, "tid": 6759, "ts": 6302685395961.358, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685442615.487, "dur": 3.520, + "args": { + "External id": 129412, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685085, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241685085, "pid": 3, "tid": 7, "ts": 6302685442615.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685395998.628, "dur": 5.500, + "args": { + "External id": 129412, "cbid": 307, "correlation": 241685085 + } + }, + { + "ph": "s", "id": 241685085, "pid": 5717, "tid": 6759, "ts": 6302685395998.628, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685442619.711, "dur": 49.953, + "args": { + "External id": 129413, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685096, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685096, "pid": 3, "tid": 7, "ts": 6302685442619.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685396031.018, "dur": 6.450, + "args": { + "External id": 129413, "cbid": 307, "correlation": 241685096 + } + }, + { + "ph": "s", "id": 241685096, "pid": 5717, "tid": 6759, "ts": 6302685396031.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685442670.272, "dur": 48.321, + "args": { + "External id": 129414, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685101, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685101, "pid": 3, "tid": 7, "ts": 6302685442670.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685396072.498, "dur": 7.110, + "args": { + "External id": 129414, "cbid": 211, "correlation": 241685101 + } + }, + { + "ph": "s", "id": 241685101, "pid": 5717, "tid": 6759, "ts": 6302685396072.498, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396250.157, "dur": 2.651, + "args": { + "External id": 129420, "cbid": 147, "correlation": 241685118 + } + }, + { + "ph": "s", "id": 241685118, "pid": 5717, "tid": 6759, "ts": 6302685396250.157, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685396362.687, "dur": 2.560, + "args": { + "External id": 129428, "cbid": 138, "correlation": 241685133 + } + }, + { + "ph": "f", "id": 241685133, "pid": 5717, "tid": 6759, "ts": 6302685396362.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685444276.556, "dur": 3.072, + "args": { + "External id": 129432, "device": 3, "context": 1, "stream": 7, "correlation": 241685144, "bytes": 28112, "memory bandwidth (GB/s)": 9.151041666666666 + } + }, + { + "ph": "f", "id": 241685144, "pid": 3, "tid": 7, "ts": 6302685444276.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685396385.417, "dur": 11.840, + "args": { + "External id": 129432, "cbid": 41, "correlation": 241685144 + } + }, + { + "ph": "s", "id": 241685144, "pid": 5717, "tid": 6759, "ts": 6302685396385.417, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396401.387, "dur": 1.980, + "args": { + "External id": 129427, "cbid": 135, "correlation": 241685148 + } + }, + { + "ph": "f", "id": 241685148, "pid": 5717, "tid": 6759, "ts": 6302685396401.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685444281.612, "dur": 369.539, + "args": { + "External id": 129427, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685152, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685152, "pid": 3, "tid": 7, "ts": 6302685444281.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685396406.897, "dur": 10.490, + "args": { + "External id": 129427, "cbid": 211, "correlation": 241685152 + } + }, + { + "ph": "s", "id": 241685152, "pid": 5717, "tid": 6759, "ts": 6302685396406.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396456.067, "dur": 1.120, + "args": { + "External id": 129420, "cbid": 135, "correlation": 241685163 + } + }, + { + "ph": "f", "id": 241685163, "pid": 5717, "tid": 6759, "ts": 6302685396456.067, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396459.187, "dur": 1.230, + "args": { + "External id": 129420, "cbid": 147, "correlation": 241685167 + } + }, + { + "ph": "s", "id": 241685167, "pid": 5717, "tid": 6759, "ts": 
6302685396459.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685396530.127, "dur": 0.950, + "args": { + "External id": 129436, "cbid": 317, "correlation": 241685187 + } + }, + { + "ph": "f", "id": 241685187, "pid": 5717, "tid": 6759, "ts": 6302685396530.127, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396532.877, "dur": 1.330, + "args": { + "External id": 129436, "cbid": 135, "correlation": 241685189 + } + }, + { + "ph": "f", "id": 241685189, "pid": 5717, "tid": 6759, "ts": 6302685396532.877, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396535.477, "dur": 0.790, + "args": { + "External id": 129436, "cbid": 147, "correlation": 241685193 + } + }, + { + "ph": "s", "id": 241685193, "pid": 5717, "tid": 6759, "ts": 6302685396535.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685396553.907, "dur": 0.900, + "args": { + "External id": 129436, "cbid": 409, "correlation": 241685196 + } + }, + { + "ph": "f", "id": 241685196, "pid": 5717, "tid": 6759, "ts": 6302685396553.907, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396558.887, "dur": 0.920, + "args": { + "External id": 129436, "cbid": 135, "correlation": 241685199 + } + }, + { + "ph": "f", "id": 241685199, "pid": 5717, "tid": 6759, "ts": 6302685396558.887, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396559.987, "dur": 0.930, + "args": { + "External id": 129436, 
"cbid": 147, "correlation": 241685200 + } + }, + { + "ph": "s", "id": 241685200, "pid": 5717, "tid": 6759, "ts": 6302685396559.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685449223.409, "dur": 9259.334, + "args": { + "External id": 129436, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241685202, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241685202, "pid": 3, "tid": 20, "ts": 6302685449223.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685396562.197, "dur": 17.010, + "args": { + "External id": 129436, "cbid": 430, "correlation": 241685202 + } + }, + { + "ph": "s", "id": 241685202, "pid": 5717, "tid": 6759, "ts": 6302685396562.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396581.487, "dur": 0.420, + "args": { + "External id": 129436, "cbid": 135, "correlation": 241685204 + } + }, + { + "ph": "f", "id": 241685204, "pid": 5717, "tid": 6759, "ts": 6302685396581.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396582.037, "dur": 0.500, + "args": { + "External id": 129436, "cbid": 147, "correlation": 241685205 + } + }, 
+ { + "ph": "s", "id": 241685205, "pid": 5717, "tid": 6759, "ts": 6302685396582.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396585.437, "dur": 1.970, + "args": { + "External id": 129436, "cbid": 135, "correlation": 241685208 + } + }, + { + "ph": "f", "id": 241685208, "pid": 5717, "tid": 6759, "ts": 6302685396585.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396595.567, "dur": 0.420, + "args": { + "External id": 129436, "cbid": 135, "correlation": 241685215 + } + }, + { + "ph": "f", "id": 241685215, "pid": 5717, "tid": 6759, "ts": 6302685396595.567, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396623.407, "dur": 1.020, + "args": { + "External id": 129438, "cbid": 147, "correlation": 241685220 + } + }, + { + "ph": "s", "id": 241685220, "pid": 5717, "tid": 6759, "ts": 6302685396623.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396640.787, "dur": 0.929, + "args": { + "External id": 129420, "cbid": 135, "correlation": 241685235 + } + }, + { + "ph": "f", "id": 241685235, "pid": 5717, "tid": 6759, "ts": 6302685396640.787, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685396880.786, "dur": 3.520, + "args": { + "External id": 129420, "cbid": 135, "correlation": 241685248 + } + }, + { + "ph": "f", "id": 241685248, "pid": 5717, "tid": 6759, "ts": 6302685396880.786, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685396997.756, 
"dur": 3.480, + "args": { + "External id": 129448, "cbid": 147, "correlation": 241685259 + } + }, + { + "ph": "s", "id": 241685259, "pid": 5717, "tid": 6759, "ts": 6302685396997.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685397112.155, "dur": 1.071, + "args": { + "External id": 129462, "cbid": 317, "correlation": 241685300 + } + }, + { + "ph": "f", "id": 241685300, "pid": 5717, "tid": 6759, "ts": 6302685397112.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685397120.255, "dur": 2.151, + "args": { + "External id": 129463, "cbid": 138, "correlation": 241685303 + } + }, + { + "ph": "f", "id": 241685303, "pid": 5717, "tid": 6759, "ts": 6302685397120.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685449228.657, "dur": 3.680, + "args": { + "External id": 129467, "device": 3, "context": 1, "stream": 7, "correlation": 241685314, "bytes": 7224, "memory bandwidth (GB/s)": 1.9630434782608697 + } + }, + { + "ph": "f", "id": 241685314, "pid": 3, "tid": 7, "ts": 6302685449228.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685397142.826, "dur": 11.969, + "args": { + "External id": 129467, "cbid": 41, "correlation": 241685314 + } + }, + { + "ph": "s", "id": 241685314, "pid": 5717, "tid": 6759, "ts": 6302685397142.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397161.935, "dur": 2.100, + "args": { + "External id": 129462, "cbid": 135, "correlation": 241685318 + } + }, + { + "ph": "f", "id": 241685318, "pid": 5717, "tid": 6759, "ts": 
6302685397161.935, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685449390.131, "dur": 387.554, + "args": { + "External id": 129462, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685322, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685322, "pid": 3, "tid": 7, "ts": 6302685449390.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685397166.815, "dur": 10.680, + "args": { + "External id": 129462, "cbid": 211, "correlation": 241685322 + } + }, + { + "ph": "s", "id": 241685322, "pid": 5717, "tid": 6759, "ts": 6302685397166.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397318.215, "dur": 1.630, + "args": { + "External id": 129448, "cbid": 135, "correlation": 241685333 + } + }, + { + "ph": "f", "id": 241685333, "pid": 5717, "tid": 6759, "ts": 6302685397318.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685397325.395, "dur": 2.300, + "args": { + "External id": 129448, "cbid": 147, "correlation": 241685337 + } + }, + { + "ph": "s", "id": 241685337, "pid": 5717, "tid": 6759, "ts": 6302685397325.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685397330.875, "dur": 0.760, + "args": { + "External id": 129448, "cbid": 147, "correlation": 241685341 + } + }, + { + "ph": 
"s", "id": 241685341, "pid": 5717, "tid": 6759, "ts": 6302685397330.875, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685450348.858, "dur": 315.618, + "args": { + "External id": 129481, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241685365, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241685365, "pid": 3, "tid": 17, "ts": 6302685450348.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685397487.115, "dur": 12.220, + "args": { + "External id": 129481, "cbid": 211, "correlation": 241685365 + } + }, + { + "ph": "s", "id": 241685365, "pid": 5717, "tid": 6759, "ts": 6302685397487.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685450665.884, "dur": 13.248, + "args": { + "External id": 129497, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241685378, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241685378, "pid": 3, "tid": 17, "ts": 6302685450665.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685397597.974, "dur": 9.651, + "args": { + "External id": 129497, "cbid": 211, "correlation": 241685378 + } + }, + { + "ph": "s", "id": 241685378, "pid": 5717, "tid": 6759, "ts": 6302685397597.974, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397630.684, "dur": 1.200, + "args": { + "External id": 129448, "cbid": 135, "correlation": 241685388 + } + }, + { + "ph": "f", "id": 241685388, "pid": 5717, "tid": 6759, "ts": 6302685397630.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685397633.744, "dur": 1.220, + "args": { + "External id": 129448, "cbid": 147, "correlation": 241685392 + } + }, + { + "ph": "s", "id": 241685392, "pid": 5717, "tid": 6759, "ts": 6302685397633.744, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685397683.474, "dur": 0.890, + "args": { + "External id": 129499, "cbid": 317, "correlation": 241685405 + } + }, + { + "ph": "f", "id": 241685405, "pid": 5717, "tid": 6759, "ts": 6302685397683.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397686.154, "dur": 1.170, + "args": { + "External id": 129499, "cbid": 135, "correlation": 241685407 + } + }, + { + "ph": "f", "id": 241685407, "pid": 5717, "tid": 6759, "ts": 6302685397686.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685397688.704, "dur": 1.090, + "args": { + "External id": 129499, "cbid": 147, "correlation": 241685411 + } + }, + { + "ph": "s", "id": 241685411, "pid": 5717, "tid": 6759, "ts": 6302685397688.704, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685397706.974, "dur": 0.870, + "args": { + "External id": 129499, "cbid": 409, "correlation": 241685414 + } + }, + { + "ph": "f", "id": 241685414, "pid": 5717, "tid": 6759, "ts": 6302685397706.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397711.784, "dur": 0.830, + "args": { + "External id": 129499, "cbid": 135, "correlation": 241685417 + } + }, + { + "ph": "f", "id": 241685417, "pid": 5717, "tid": 6759, "ts": 6302685397711.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685397712.784, "dur": 0.900, + "args": { + "External id": 129499, "cbid": 147, "correlation": 241685418 + } + }, + { + "ph": "s", "id": 241685418, "pid": 5717, "tid": 6759, "ts": 6302685397712.784, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685458483.959, "dur": 4540.322, + "args": { + "External id": 129499, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241685420, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241685420, "pid": 3, "tid": 20, "ts": 6302685458483.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685397714.854, "dur": 14.080, + "args": { + "External id": 129499, "cbid": 430, "correlation": 241685420 + } + }, + { + "ph": "s", "id": 241685420, "pid": 5717, "tid": 6759, "ts": 6302685397714.854, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397731.114, "dur": 0.390, + "args": { + "External id": 129499, "cbid": 135, "correlation": 241685422 + } + }, + { + "ph": "f", "id": 241685422, "pid": 5717, "tid": 6759, "ts": 6302685397731.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685397731.634, "dur": 0.480, + "args": { + "External id": 129499, "cbid": 147, "correlation": 241685423 + } + }, + { + "ph": "s", "id": 241685423, "pid": 5717, "tid": 6759, "ts": 6302685397731.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397735.034, "dur": 0.890, + "args": { + "External id": 129499, "cbid": 135, "correlation": 241685426 + } + }, + { + "ph": "f", "id": 241685426, "pid": 5717, "tid": 6759, "ts": 6302685397735.034, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397748.024, "dur": 0.410, + "args": { + "External 
id": 129499, "cbid": 135, "correlation": 241685433 + } + }, + { + "ph": "f", "id": 241685433, "pid": 5717, "tid": 6759, "ts": 6302685397748.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685397789.484, "dur": 1.070, + "args": { + "External id": 129501, "cbid": 147, "correlation": 241685438 + } + }, + { + "ph": "s", "id": 241685438, "pid": 5717, "tid": 6759, "ts": 6302685397789.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685397815.894, "dur": 2.040, + "args": { + "External id": 129448, "cbid": 135, "correlation": 241685453 + } + }, + { + "ph": "f", "id": 241685453, "pid": 5717, "tid": 6759, "ts": 6302685397815.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685449796.950, "dur": 1130.216, + "args": { + "External id": 129503, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685478, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685478, "pid": 3, "tid": 7, "ts": 6302685449796.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685397961.313, "dur": 10.920, + "args": { + "External id": 129503, "cbid": 211, "correlation": 241685478 + } + }, + { + "ph": "s", "id": 241685478, "pid": 5717, "tid": 6759, "ts": 6302685397961.313, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685450927.902, "dur": 431.459, + "args": { + "External id": 129504, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685501, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241685501, "pid": 3, "tid": 7, "ts": 6302685450927.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398020.353, "dur": 6.731, + "args": { + "External id": 129504, "cbid": 307, "correlation": 241685501 + } + }, + { + "ph": "s", "id": 241685501, "pid": 5717, "tid": 6759, "ts": 6302685398020.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398061.793, "dur": 0.500, + "args": { + "External id": 129505, "cbid": 200, "correlation": 241685524 + } + }, + { + "ph": "f", "id": 241685524, "pid": 5717, "tid": 6759, "ts": 6302685398061.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685451360.609, "dur": 1.248, + "args": { + "External id": 129505, "device": 3, "context": 1, "stream": 7, 
"correlation": 241685527, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 241685527, "pid": 3, "tid": 7, "ts": 6302685451360.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685398063.993, "dur": 6.480, + "args": { + "External id": 129505, "cbid": 51, "correlation": 241685527 + } + }, + { + "ph": "s", "id": 241685527, "pid": 5717, "tid": 6759, "ts": 6302685398063.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685451363.201, "dur": 594.565, + "args": { + "External id": 129505, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685528, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685528, "pid": 3, "tid": 7, "ts": 6302685451363.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398070.703, "dur": 5.800, + "args": { + "External id": 129505, "cbid": 307, "correlation": 241685528 + } + }, + { + "ph": "s", "id": 241685528, "pid": 5717, "tid": 6759, "ts": 6302685398070.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398103.603, "dur": 0.310, + "args": { + "External id": 129506, "cbid": 200, "correlation": 241685553 + } + }, + { + "ph": "f", "id": 241685553, "pid": 5717, "tid": 6759, "ts": 6302685398103.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685452036.359, "dur": 28.960, + "args": { + "External id": 129506, "device": 3, "context": 1, "stream": 7, "correlation": 241685556, "bytes": 1536, "memory bandwidth (GB/s)": 0.05303867403314917 + } + }, + { + "ph": "f", "id": 241685556, "pid": 3, "tid": 7, "ts": 6302685452036.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685398105.013, "dur": 4.230, + "args": { + "External id": 129506, "cbid": 51, "correlation": 241685556 + } + }, + { + "ph": "s", "id": 241685556, "pid": 5717, "tid": 6759, "ts": 6302685398105.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685452081.831, "dur": 358.787, + "args": { + "External id": 129506, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685557, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685557, "pid": 3, "tid": 7, "ts": 6302685452081.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398109.423, "dur": 5.350, + "args": { + "External id": 129506, "cbid": 307, "correlation": 241685557 + } + }, + { + "ph": "s", "id": 241685557, "pid": 5717, "tid": 6759, "ts": 6302685398109.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398138.123, "dur": 0.250, + "args": { + "External id": 129507, "cbid": 200, "correlation": 241685582 + } + }, + { + "ph": "f", "id": 241685582, "pid": 5717, "tid": 6759, "ts": 6302685398138.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685452441.258, "dur": 354.050, + "args": { + "External id": 129507, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685585, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685585, "pid": 3, "tid": 7, "ts": 6302685452441.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398139.543, "dur": 5.260, + "args": { + "External id": 129507, "cbid": 307, "correlation": 241685585 + } + }, + { + "ph": "s", "id": 241685585, "pid": 5717, "tid": 6759, "ts": 6302685398139.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398173.113, "dur": 0.280, + "args": { + "External id": 129508, "cbid": 200, "correlation": 241685610 + } + }, + { + "ph": "f", "id": 241685610, "pid": 5717, "tid": 6759, "ts": 6302685398173.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685452796.556, "dur": 0.800, + "args": { + "External id": 129508, "device": 3, "context": 1, "stream": 7, "correlation": 241685613, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241685613, "pid": 3, "tid": 7, "ts": 6302685452796.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685398174.513, "dur": 8.770, + "args": { + "External id": 129508, "cbid": 51, "correlation": 241685613 + } + }, + { + "ph": "s", "id": 241685613, "pid": 5717, "tid": 6759, "ts": 6302685398174.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685452798.540, "dur": 445.412, + "args": { + "External id": 129508, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685614, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685614, "pid": 3, "tid": 7, "ts": 6302685452798.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398183.423, "dur": 7.770, + "args": { + "External id": 129508, "cbid": 307, "correlation": 241685614 + } + }, + { + "ph": "s", "id": 241685614, "pid": 5717, "tid": 6759, "ts": 6302685398183.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398217.893, "dur": 0.260, + "args": { + "External id": 129509, "cbid": 200, "correlation": 241685639 + } + }, + { + "ph": "f", "id": 241685639, "pid": 5717, "tid": 6759, "ts": 6302685398217.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685453244.592, "dur": 393.923, + "args": { + "External id": 129509, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685642, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685642, "pid": 3, "tid": 7, "ts": 6302685453244.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398219.263, "dur": 5.380, + "args": { + "External id": 129509, "cbid": 307, "correlation": 241685642 + } + }, + { + "ph": "s", "id": 241685642, "pid": 5717, "tid": 6759, "ts": 6302685398219.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685453639.187, "dur": 90.880, + "args": { + "External id": 129510, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685655, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685655, "pid": 3, "tid": 7, "ts": 6302685453639.187, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398271.803, "dur": 8.820, + "args": { + "External id": 129510, "cbid": 307, "correlation": 241685655 + } + }, + { + "ph": "s", "id": 241685655, "pid": 5717, "tid": 6759, "ts": 6302685398271.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685453730.675, "dur": 3.936, + "args": { + "External id": 129511, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685663, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241685663, "pid": 3, "tid": 7, "ts": 6302685453730.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398316.683, "dur": 6.590, + "args": { + "External id": 129511, "cbid": 307, "correlation": 241685663 + } + }, + { + "ph": "s", "id": 241685663, "pid": 5717, "tid": 6759, "ts": 6302685398316.683, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685453735.347, "dur": 113.793, + "args": { + "External id": 129512, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685671, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685671, "pid": 3, "tid": 7, "ts": 6302685453735.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398350.333, "dur": 5.510, + "args": { + "External id": 129512, "cbid": 307, "correlation": 241685671 + } + }, + { + "ph": "s", "id": 241685671, "pid": 5717, "tid": 6759, "ts": 6302685398350.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398544.932, "dur": 0.490, + "args": { + "External id": 129531, "cbid": 200, "correlation": 241685717 + } + }, + { + "ph": "f", "id": 241685717, "pid": 5717, "tid": 6759, "ts": 6302685398544.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685453850.388, "dur": 0.800, + "args": { + "External id": 129531, "device": 3, "context": 1, "stream": 7, 
"correlation": 241685720, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 241685720, "pid": 3, "tid": 7, "ts": 6302685453850.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685398547.112, "dur": 7.720, + "args": { + "External id": 129531, "cbid": 51, "correlation": 241685720 + } + }, + { + "ph": "s", "id": 241685720, "pid": 5717, "tid": 6759, "ts": 6302685398547.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685453852.372, "dur": 142.465, + "args": { + "External id": 129531, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685721, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685721, "pid": 3, "tid": 7, "ts": 6302685453852.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398555.052, "dur": 8.200, + "args": { + "External id": 129531, "cbid": 307, "correlation": 241685721 + } + }, + { + "ph": "s", "id": 241685721, "pid": 5717, "tid": 6759, "ts": 6302685398555.052, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685453995.509, "dur": 139.233, + "args": { + "External id": 129532, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685743, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685743, "pid": 3, "tid": 7, "ts": 6302685453995.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398590.592, "dur": 5.770, + "args": { + "External id": 129532, "cbid": 211, "correlation": 241685743 + } + }, + { + "ph": "s", "id": 241685743, "pid": 5717, "tid": 6759, "ts": 6302685398590.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398667.562, "dur": 0.400, + "args": { + "External id": 129533, "cbid": 200, "correlation": 241685761 + } + }, + { + "ph": "f", "id": 241685761, "pid": 5717, "tid": 6759, "ts": 6302685398667.562, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398668.082, "dur": 0.190, + "args": { + "External id": 129533, "cbid": 200, "correlation": 241685762 + } + }, + { + "ph": "f", "id": 241685762, "pid": 5717, "tid": 6759, "ts": 6302685398668.082, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398686.262, "dur": 0.210, + "args": { + "External id": 129533, "cbid": 200, "correlation": 241685780 + } + }, + { + "ph": "f", "id": 241685780, "pid": 5717, "tid": 6759, "ts": 6302685398686.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685454135.350, "dur": 184.450, + "args": { + "External id": 129533, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685781, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685781, "pid": 3, "tid": 7, "ts": 6302685454135.350, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398687.642, "dur": 11.410, + "args": { + "External id": 129533, "cbid": 211, "correlation": 241685781 + } + }, + { + "ph": "s", "id": 241685781, "pid": 5717, "tid": 6759, "ts": 6302685398687.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685398699.782, "dur": 0.940, + "args": { + "External id": 129533, "cbid": 273, "correlation": 241685783 + } + }, + { + "ph": "f", "id": 241685783, "pid": 5717, "tid": 6759, "ts": 6302685398699.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685454320.504, "dur": 1358.250, + "args": { + "External id": 129533, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685784, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241685784, "pid": 3, "tid": 7, "ts": 6302685454320.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398701.062, "dur": 4.050, + "args": { + "External id": 129533, "cbid": 211, "correlation": 241685784 + } + }, + { + "ph": "s", "id": 241685784, "pid": 5717, "tid": 6759, "ts": 6302685398701.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685455679.426, "dur": 164.577, + "args": { + "External id": 129533, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685786, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241685786, "pid": 3, "tid": 7, "ts": 6302685455679.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398705.642, "dur": 3.620, + "args": { + "External id": 129533, "cbid": 211, "correlation": 241685786 + } + }, + { + "ph": "s", "id": 241685786, "pid": 5717, "tid": 6759, "ts": 6302685398705.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685455844.611, "dur": 145.313, + "args": { + "External id": 129544, "device": 3, "context": 1, "stream": 7, "correlation": 241685808, "bytes": 25165824, "memory bandwidth (GB/s)": 173.18356926083695 + } + }, + { + "ph": "f", "id": 241685808, "pid": 3, "tid": 7, "ts": 6302685455844.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, 
+ "ts": 6302685398843.302, "dur": 17.389, + "args": { + "External id": 129544, "cbid": 41, "correlation": 241685808 + } + }, + { + "ph": "s", "id": 241685808, "pid": 5717, "tid": 6759, "ts": 6302685398843.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685455990.596, "dur": 170.658, + "args": { + "External id": 129541, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685826, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685826, "pid": 3, "tid": 7, "ts": 6302685455990.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685398957.401, "dur": 8.440, + "args": { + "External id": 129541, "cbid": 307, "correlation": 241685826 + } + }, + { + "ph": "s", "id": 241685826, "pid": 5717, "tid": 6759, "ts": 6302685398957.401, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685456161.926, "dur": 105.184, + "args": { + "External id": 129551, "device": 3, "context": 1, "stream": 7, "correlation": 241685841, "bytes": 25165824, "memory bandwidth (GB/s)": 239.25524794645574 + } + }, + { + "ph": "f", "id": 241685841, "pid": 3, "tid": 7, "ts": 6302685456161.926, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685399023.801, "dur": 16.360, + "args": { + "External id": 129551, "cbid": 41, "correlation": 241685841 + } + }, + { + "ph": "s", "id": 241685841, "pid": 5717, "tid": 6759, "ts": 6302685399023.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685456267.750, "dur": 25.633, + "args": { + "External id": 129548, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685859, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241685859, "pid": 3, "tid": 7, "ts": 6302685456267.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399164.971, "dur": 8.600, + "args": { + "External id": 129548, "cbid": 307, "correlation": 241685859 + } + }, + { + "ph": "s", "id": 241685859, "pid": 5717, "tid": 6759, "ts": 6302685399164.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685399309.450, "dur": 1.600, + "args": { + "External id": 129556, "cbid": 200, "correlation": 241685889 + } + }, + { + "ph": "f", "id": 241685889, "pid": 5717, "tid": 6759, "ts": 6302685399309.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685456294.695, "dur": 1.312, + "args": { + "External id": 129556, "device": 3, "context": 1, "stream": 7, "correlation": 241685892, "bytes": 576, "memory bandwidth (GB/s)": 0.43902439024390244 + } + }, + { + "ph": "f", "id": 241685892, "pid": 3, "tid": 7, "ts": 6302685456294.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685399312.750, "dur": 12.271, + "args": { + "External id": 129556, "cbid": 51, "correlation": 241685892 + } + }, + { + "ph": "s", "id": 241685892, "pid": 5717, "tid": 6759, "ts": 6302685399312.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685456297.319, "dur": 144.897, + "args": { + "External id": 129556, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685893, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685893, "pid": 3, "tid": 7, "ts": 6302685456297.319, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399325.241, "dur": 12.569, + "args": { + "External id": 129556, "cbid": 307, "correlation": 241685893 + } + }, + { + "ph": "s", "id": 241685893, "pid": 5717, "tid": 6759, "ts": 6302685399325.241, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685399383.430, "dur": 0.280, + "args": { + "External id": 129557, "cbid": 200, "correlation": 241685918 + } + }, + { + "ph": "f", "id": 241685918, "pid": 5717, "tid": 6759, "ts": 6302685399383.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685456443.496, "dur": 1.216, + "args": { + "External id": 129557, "device": 3, "context": 1, "stream": 7, "correlation": 241685921, "bytes": 576, "memory bandwidth (GB/s)": 0.47368421052631576 + } + }, + { + "ph": "f", "id": 241685921, "pid": 3, "tid": 7, "ts": 6302685456443.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685399385.850, "dur": 5.830, + "args": { + "External id": 129557, "cbid": 51, "correlation": 241685921 + } + }, + { + "ph": "s", "id": 
241685921, "pid": 5717, "tid": 6759, "ts": 6302685399385.850, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685456446.152, "dur": 141.921, + "args": { + "External id": 129557, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685922, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685922, "pid": 3, "tid": 7, "ts": 6302685456446.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399391.880, "dur": 5.180, + "args": { + "External id": 129557, "cbid": 307, "correlation": 241685922 + } + }, + { + "ph": "s", "id": 241685922, "pid": 5717, "tid": 6759, "ts": 6302685399391.880, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685399421.850, "dur": 0.280, + "args": { + "External id": 129558, "cbid": 200, "correlation": 241685947 + } + }, + { + "ph": "f", "id": 241685947, "pid": 5717, "tid": 6759, "ts": 6302685399421.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685456589.321, "dur": 1.216, + "args": { + "External id": 129558, "device": 3, "context": 1, "stream": 7, "correlation": 241685950, "bytes": 576, "memory bandwidth (GB/s)": 0.47368421052631576 + } + }, + { + "ph": "f", "id": 241685950, "pid": 3, "tid": 7, "ts": 6302685456589.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685399423.160, "dur": 4.340, + 
"args": { + "External id": 129558, "cbid": 51, "correlation": 241685950 + } + }, + { + "ph": "s", "id": 241685950, "pid": 5717, "tid": 6759, "ts": 6302685399423.160, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685456592.105, "dur": 140.417, + "args": { + "External id": 129558, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685951, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685951, "pid": 3, "tid": 7, "ts": 6302685456592.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399427.640, "dur": 4.830, + "args": { + "External id": 129558, "cbid": 307, "correlation": 241685951 + } + }, + { + "ph": "s", "id": 241685951, "pid": 5717, "tid": 6759, "ts": 6302685399427.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685456733.098, "dur": 138.689, + "args": { + "External id": 129559, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685973, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685973, "pid": 3, "tid": 7, "ts": 6302685456733.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399456.520, "dur": 5.430, + "args": { + "External id": 129559, "cbid": 211, "correlation": 241685973 + } + }, + { + "ph": "s", "id": 241685973, "pid": 5717, "tid": 6759, "ts": 6302685399456.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685456872.363, "dur": 139.905, + "args": { + "External id": 129560, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241685996, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241685996, "pid": 3, "tid": 7, "ts": 6302685456872.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399489.930, "dur": 8.750, + "args": { + "External id": 129560, "cbid": 211, "correlation": 241685996 + } + }, + { + "ph": "s", "id": 241685996, "pid": 5717, "tid": 6759, "ts": 6302685399489.930, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685457012.908, "dur": 140.385, + "args": { + "External id": 129561, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686019, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686019, "pid": 3, "tid": 7, "ts": 6302685457012.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399519.290, "dur": 4.770, + "args": { + "External id": 129561, "cbid": 211, "correlation": 241686019 + } + }, + { + "ph": "s", "id": 241686019, "pid": 5717, "tid": 6759, "ts": 6302685399519.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685457153.997, "dur": 80.928, + "args": { + "External id": 129562, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686027, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686027, "pid": 3, "tid": 7, "ts": 6302685457153.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399556.740, "dur": 6.070, + "args": { + "External id": 129562, "cbid": 307, "correlation": 241686027 + } + }, + { + "ph": "s", "id": 241686027, "pid": 5717, "tid": 6759, "ts": 6302685399556.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685457235.565, "dur": 46.945, + "args": { + "External id": 129577, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686056, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686056, "pid": 3, "tid": 7, "ts": 6302685457235.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399714.430, "dur": 9.479, + "args": { + "External id": 129577, "cbid": 307, "correlation": 241686056 + } + }, + { + "ph": "s", "id": 241686056, "pid": 5717, "tid": 6759, "ts": 6302685399714.430, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685457283.182, "dur": 3.808, + "args": { + "External id": 129578, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686064, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241686064, "pid": 3, "tid": 7, "ts": 6302685457283.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399749.440, "dur": 5.520, + "args": { + "External id": 129578, "cbid": 307, "correlation": 241686064 + } + }, + { + "ph": "s", "id": 241686064, "pid": 5717, "tid": 6759, "ts": 6302685399749.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685457287.694, "dur": 49.921, + "args": { + "External id": 129579, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686075, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686075, "pid": 3, "tid": 7, "ts": 6302685457287.694, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399782.209, "dur": 6.391, + "args": { + "External id": 129579, "cbid": 307, "correlation": 241686075 + } + }, + { + "ph": "s", "id": 241686075, "pid": 5717, "tid": 6759, "ts": 6302685399782.209, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685457338.223, "dur": 45.856, + "args": { + "External id": 129580, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686080, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686080, "pid": 3, "tid": 7, "ts": 6302685457338.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685399829.539, "dur": 11.710, + "args": { + "External id": 129580, "cbid": 211, "correlation": 241686080 + } + }, + { + "ph": "s", "id": 241686080, "pid": 5717, "tid": 6759, "ts": 6302685399829.539, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400022.959, "dur": 2.910, + "args": { + "External id": 129586, "cbid": 147, "correlation": 241686097 + } + }, + { + "ph": "s", "id": 241686097, "pid": 5717, "tid": 6759, "ts": 6302685400022.959, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685400127.169, "dur": 2.220, + "args": { + "External id": 129594, "cbid": 138, "correlation": 241686112 + } + }, + { + "ph": "f", "id": 241686112, "pid": 5717, "tid": 6759, "ts": 6302685400127.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685458485.687, "dur": 3.136, + "args": { + "External id": 129598, "device": 3, "context": 1, "stream": 7, "correlation": 241686123, "bytes": 28112, "memory bandwidth (GB/s)": 8.964285714285714 + } + }, + { + "ph": "f", "id": 241686123, "pid": 3, "tid": 7, "ts": 6302685458485.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685400149.799, "dur": 11.790, + "args": { + "External id": 129598, "cbid": 41, "correlation": 241686123 + } + }, + { + "ph": "s", "id": 241686123, "pid": 5717, "tid": 6759, "ts": 6302685400149.799, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400165.959, "dur": 1.829, + "args": { + "External id": 129593, "cbid": 135, "correlation": 241686127 + } + }, + { + "ph": "f", "id": 241686127, "pid": 5717, "tid": 6759, "ts": 6302685400165.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685458491.543, "dur": 460.515, + "args": { + "External id": 129593, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686131, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686131, "pid": 3, "tid": 7, "ts": 6302685458491.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685400170.979, "dur": 11.120, + "args": { + "External id": 129593, "cbid": 211, "correlation": 241686131 + } + }, + { + "ph": "s", "id": 241686131, "pid": 5717, "tid": 6759, "ts": 6302685400170.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400220.828, "dur": 0.971, + "args": { + "External id": 129586, "cbid": 135, "correlation": 241686142 + } + }, + { + "ph": "f", "id": 241686142, "pid": 5717, "tid": 6759, "ts": 6302685400220.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400223.748, "dur": 1.260, + "args": { + "External id": 129586, "cbid": 147, "correlation": 241686146 + } + }, + { + "ph": "s", "id": 241686146, "pid": 5717, "tid": 6759, "ts": 
6302685400223.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685400295.378, "dur": 7.970, + "args": { + "External id": 129602, "cbid": 317, "correlation": 241686166 + } + }, + { + "ph": "f", "id": 241686166, "pid": 5717, "tid": 6759, "ts": 6302685400295.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400305.378, "dur": 1.500, + "args": { + "External id": 129602, "cbid": 135, "correlation": 241686168 + } + }, + { + "ph": "f", "id": 241686168, "pid": 5717, "tid": 6759, "ts": 6302685400305.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400308.228, "dur": 1.060, + "args": { + "External id": 129602, "cbid": 147, "correlation": 241686172 + } + }, + { + "ph": "s", "id": 241686172, "pid": 5717, "tid": 6759, "ts": 6302685400308.228, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685400323.508, "dur": 1.080, + "args": { + "External id": 129602, "cbid": 409, "correlation": 241686175 + } + }, + { + "ph": "f", "id": 241686175, "pid": 5717, "tid": 6759, "ts": 6302685400323.508, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400328.478, "dur": 0.830, + "args": { + "External id": 129602, "cbid": 135, "correlation": 241686178 + } + }, + { + "ph": "f", "id": 241686178, "pid": 5717, "tid": 6759, "ts": 6302685400328.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400329.498, "dur": 0.820, + "args": { + "External id": 129602, 
"cbid": 147, "correlation": 241686179 + } + }, + { + "ph": "s", "id": 241686179, "pid": 5717, "tid": 6759, "ts": 6302685400329.498, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685463027.129, "dur": 10256.045, + "args": { + "External id": 129602, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241686181, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241686181, "pid": 3, "tid": 20, "ts": 6302685463027.129, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685400331.378, "dur": 10.680, + "args": { + "External id": 129602, "cbid": 430, "correlation": 241686181 + } + }, + { + "ph": "s", "id": 241686181, "pid": 5717, "tid": 6759, "ts": 6302685400331.378, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400343.078, "dur": 0.400, + "args": { + "External id": 129602, "cbid": 135, "correlation": 241686183 + } + }, + { + "ph": "f", "id": 241686183, "pid": 5717, "tid": 6759, "ts": 6302685400343.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400343.608, "dur": 0.580, + "args": { + "External id": 129602, "cbid": 147, "correlation": 241686184 + } + }, 
+ { + "ph": "s", "id": 241686184, "pid": 5717, "tid": 6759, "ts": 6302685400343.608, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400345.758, "dur": 0.830, + "args": { + "External id": 129602, "cbid": 135, "correlation": 241686187 + } + }, + { + "ph": "f", "id": 241686187, "pid": 5717, "tid": 6759, "ts": 6302685400345.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400354.258, "dur": 0.410, + "args": { + "External id": 129602, "cbid": 135, "correlation": 241686194 + } + }, + { + "ph": "f", "id": 241686194, "pid": 5717, "tid": 6759, "ts": 6302685400354.258, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400381.618, "dur": 0.990, + "args": { + "External id": 129604, "cbid": 147, "correlation": 241686199 + } + }, + { + "ph": "s", "id": 241686199, "pid": 5717, "tid": 6759, "ts": 6302685400381.618, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400398.598, "dur": 0.840, + "args": { + "External id": 129586, "cbid": 135, "correlation": 241686214 + } + }, + { + "ph": "f", "id": 241686214, "pid": 5717, "tid": 6759, "ts": 6302685400398.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400585.918, "dur": 1.170, + "args": { + "External id": 129586, "cbid": 135, "correlation": 241686227 + } + }, + { + "ph": "f", "id": 241686227, "pid": 5717, "tid": 6759, "ts": 6302685400585.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685400720.097, 
"dur": 3.470, + "args": { + "External id": 129614, "cbid": 147, "correlation": 241686238 + } + }, + { + "ph": "s", "id": 241686238, "pid": 5717, "tid": 6759, "ts": 6302685400720.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685400856.067, "dur": 1.270, + "args": { + "External id": 129628, "cbid": 317, "correlation": 241686279 + } + }, + { + "ph": "f", "id": 241686279, "pid": 5717, "tid": 6759, "ts": 6302685400856.067, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685400864.587, "dur": 2.220, + "args": { + "External id": 129629, "cbid": 138, "correlation": 241686282 + } + }, + { + "ph": "f", "id": 241686282, "pid": 5717, "tid": 6759, "ts": 6302685400864.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685463027.577, "dur": 2.144, + "args": { + "External id": 129633, "device": 3, "context": 1, "stream": 7, "correlation": 241686293, "bytes": 7224, "memory bandwidth (GB/s)": 3.3694029850746268 + } + }, + { + "ph": "f", "id": 241686293, "pid": 3, "tid": 7, "ts": 6302685463027.577, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685400887.017, "dur": 11.890, + "args": { + "External id": 129633, "cbid": 41, "correlation": 241686293 + } + }, + { + "ph": "s", "id": 241686293, "pid": 5717, "tid": 6759, "ts": 6302685400887.017, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685400903.097, "dur": 1.720, + "args": { + "External id": 129628, "cbid": 135, "correlation": 241686297 + } + }, + { + "ph": "f", "id": 241686297, "pid": 5717, "tid": 6759, "ts": 
6302685400903.097, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685463031.769, "dur": 227.426, + "args": { + "External id": 129628, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686301, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686301, "pid": 3, "tid": 7, "ts": 6302685463031.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685400907.217, "dur": 10.460, + "args": { + "External id": 129628, "cbid": 211, "correlation": 241686301 + } + }, + { + "ph": "s", "id": 241686301, "pid": 5717, "tid": 6759, "ts": 6302685400907.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401008.797, "dur": 1.310, + "args": { + "External id": 129614, "cbid": 135, "correlation": 241686312 + } + }, + { + "ph": "f", "id": 241686312, "pid": 5717, "tid": 6759, "ts": 6302685401008.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685401013.237, "dur": 1.190, + "args": { + "External id": 129614, "cbid": 147, "correlation": 241686316 + } + }, + { + "ph": "s", "id": 241686316, "pid": 5717, "tid": 6759, "ts": 6302685401013.237, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685401016.087, "dur": 0.740, + "args": { + "External id": 129614, "cbid": 147, "correlation": 241686320 + } + }, + { + "ph": 
"s", "id": 241686320, "pid": 5717, "tid": 6759, "ts": 6302685401016.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685463865.376, "dur": 565.444, + "args": { + "External id": 129647, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241686344, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241686344, "pid": 3, "tid": 17, "ts": 6302685463865.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401160.616, "dur": 11.980, + "args": { + "External id": 129647, "cbid": 211, "correlation": 241686344 + } + }, + { + "ph": "s", "id": 241686344, "pid": 5717, "tid": 6759, "ts": 6302685401160.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685464456.292, "dur": 15.328, + "args": { + "External id": 129663, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241686357, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241686357, "pid": 3, "tid": 17, "ts": 6302685464456.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401270.366, "dur": 9.580, + "args": { + "External id": 129663, "cbid": 211, "correlation": 241686357 + } + }, + { + "ph": "s", "id": 241686357, "pid": 5717, "tid": 6759, "ts": 6302685401270.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401311.066, "dur": 1.230, + "args": { + "External id": 129614, "cbid": 135, "correlation": 241686367 + } + }, + { + "ph": "f", "id": 241686367, "pid": 5717, "tid": 6759, "ts": 6302685401311.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685401314.256, "dur": 1.620, + "args": { + "External id": 129614, "cbid": 147, "correlation": 241686371 + } + }, + { + "ph": "s", "id": 241686371, "pid": 5717, "tid": 6759, "ts": 6302685401314.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685401368.356, "dur": 0.890, + "args": { + "External id": 129665, "cbid": 317, "correlation": 241686384 + } + }, + { + "ph": "f", "id": 241686384, "pid": 5717, "tid": 6759, "ts": 6302685401368.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401371.086, "dur": 1.220, + "args": { + "External id": 129665, "cbid": 135, "correlation": 241686386 + } + }, + { + "ph": "f", "id": 241686386, "pid": 5717, "tid": 6759, "ts": 6302685401371.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685401373.636, "dur": 1.070, + "args": { + "External id": 129665, "cbid": 147, "correlation": 241686390 + } + }, + { + "ph": "s", "id": 241686390, "pid": 5717, "tid": 6759, "ts": 6302685401373.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685401388.236, "dur": 0.700, + "args": { + "External id": 129665, "cbid": 409, "correlation": 241686393 + } + }, + { + "ph": "f", "id": 241686393, "pid": 5717, "tid": 6759, "ts": 6302685401388.236, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401392.716, "dur": 0.720, + "args": { + "External id": 129665, "cbid": 135, "correlation": 241686396 + } + }, + { + "ph": "f", "id": 241686396, "pid": 5717, "tid": 6759, "ts": 6302685401392.716, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685401393.616, "dur": 0.780, + "args": { + "External id": 129665, "cbid": 147, "correlation": 241686397 + } + }, + { + "ph": "s", "id": 241686397, "pid": 5717, "tid": 6759, "ts": 6302685401393.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685473285.766, "dur": 5162.567, + "args": { + "External id": 129665, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241686399, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241686399, "pid": 3, "tid": 20, "ts": 6302685473285.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685401395.526, "dur": 10.150, + "args": { + "External id": 129665, "cbid": 430, "correlation": 241686399 + } + }, + { + "ph": "s", "id": 241686399, "pid": 5717, "tid": 6759, "ts": 6302685401395.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401406.656, "dur": 0.400, + "args": { + "External id": 129665, "cbid": 135, "correlation": 241686401 + } + }, + { + "ph": "f", "id": 241686401, "pid": 5717, "tid": 6759, "ts": 6302685401406.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685401407.176, "dur": 0.500, + "args": { + "External id": 129665, "cbid": 147, "correlation": 241686402 + } + }, + { + "ph": "s", "id": 241686402, "pid": 5717, "tid": 6759, "ts": 6302685401407.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401409.576, "dur": 0.600, + "args": { + "External id": 129665, "cbid": 135, "correlation": 241686405 + } + }, + { + "ph": "f", "id": 241686405, "pid": 5717, "tid": 6759, "ts": 6302685401409.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401417.676, "dur": 0.410, + "args": { + "External 
id": 129665, "cbid": 135, "correlation": 241686412 + } + }, + { + "ph": "f", "id": 241686412, "pid": 5717, "tid": 6759, "ts": 6302685401417.676, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685401443.656, "dur": 0.980, + "args": { + "External id": 129667, "cbid": 147, "correlation": 241686417 + } + }, + { + "ph": "s", "id": 241686417, "pid": 5717, "tid": 6759, "ts": 6302685401443.656, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685401460.496, "dur": 0.870, + "args": { + "External id": 129614, "cbid": 135, "correlation": 241686432 + } + }, + { + "ph": "f", "id": 241686432, "pid": 5717, "tid": 6759, "ts": 6302685401460.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685463285.115, "dur": 1431.147, + "args": { + "External id": 129669, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686457, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686457, "pid": 3, "tid": 7, "ts": 6302685463285.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401611.715, "dur": 10.940, + "args": { + "External id": 129669, "cbid": 211, "correlation": 241686457 + } + }, + { + "ph": "s", "id": 241686457, "pid": 5717, "tid": 6759, "ts": 6302685401611.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685464716.902, "dur": 434.851, + "args": { + "External id": 129670, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686480, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241686480, "pid": 3, "tid": 7, "ts": 6302685464716.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401671.815, "dur": 6.350, + "args": { + "External id": 129670, "cbid": 307, "correlation": 241686480 + } + }, + { + "ph": "s", "id": 241686480, "pid": 5717, "tid": 6759, "ts": 6302685401671.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685401713.165, "dur": 0.500, + "args": { + "External id": 129671, "cbid": 200, "correlation": 241686503 + } + }, + { + "ph": "f", "id": 241686503, "pid": 5717, "tid": 6759, "ts": 6302685401713.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685465153.001, "dur": 1.120, + "args": { + "External id": 129671, "device": 3, "context": 1, "stream": 7, 
"correlation": 241686506, "bytes": 1536, "memory bandwidth (GB/s)": 1.3714285714285714 + } + }, + { + "ph": "f", "id": 241686506, "pid": 3, "tid": 7, "ts": 6302685465153.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685401715.295, "dur": 6.490, + "args": { + "External id": 129671, "cbid": 51, "correlation": 241686506 + } + }, + { + "ph": "s", "id": 241686506, "pid": 5717, "tid": 6759, "ts": 6302685401715.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685465155.561, "dur": 374.595, + "args": { + "External id": 129671, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686507, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686507, "pid": 3, "tid": 7, "ts": 6302685465155.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401721.985, "dur": 6.170, + "args": { + "External id": 129671, "cbid": 307, "correlation": 241686507 + } + }, + { + "ph": "s", "id": 241686507, "pid": 5717, "tid": 6759, "ts": 6302685401721.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685401773.305, "dur": 0.300, + "args": { + "External id": 129672, "cbid": 200, "correlation": 241686532 + } + }, + { + "ph": "f", "id": 241686532, "pid": 5717, "tid": 6759, "ts": 6302685401773.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685465533.100, "dur": 11.264, + "args": { + "External id": 129672, "device": 3, "context": 1, "stream": 7, "correlation": 241686535, "bytes": 1536, "memory bandwidth (GB/s)": 0.13636363636363635 + } + }, + { + "ph": "f", "id": 241686535, "pid": 3, "tid": 7, "ts": 6302685465533.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685401774.735, "dur": 6.750, + "args": { + "External id": 129672, "cbid": 51, "correlation": 241686535 + } + }, + { + "ph": "s", "id": 241686535, "pid": 5717, "tid": 6759, "ts": 6302685401774.735, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685465556.268, "dur": 722.662, + "args": { + "External id": 129672, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686536, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686536, "pid": 3, "tid": 7, "ts": 6302685465556.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401781.625, "dur": 5.490, + "args": { + "External id": 129672, "cbid": 307, "correlation": 241686536 + } + }, + { + "ph": "s", "id": 241686536, "pid": 5717, "tid": 6759, "ts": 6302685401781.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685401811.195, "dur": 0.290, + "args": { + "External id": 129673, "cbid": 200, "correlation": 241686561 + } + }, + { + "ph": "f", "id": 241686561, "pid": 5717, "tid": 6759, "ts": 6302685401811.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685466279.570, "dur": 402.467, + "args": { + "External id": 129673, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686564, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686564, "pid": 3, "tid": 7, "ts": 6302685466279.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401812.685, "dur": 5.420, + "args": { + "External id": 129673, "cbid": 307, "correlation": 241686564 + } + }, + { + "ph": "s", "id": 241686564, "pid": 5717, "tid": 6759, "ts": 6302685401812.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685401839.105, "dur": 0.230, + "args": { + "External id": 129674, "cbid": 200, "correlation": 241686589 + } + }, + { + "ph": "f", "id": 241686589, "pid": 5717, "tid": 6759, "ts": 6302685401839.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685466683.445, "dur": 1.248, + "args": { + "External id": 129674, "device": 3, "context": 1, "stream": 7, "correlation": 241686592, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 241686592, "pid": 3, "tid": 7, "ts": 6302685466683.445, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685401840.395, "dur": 4.430, + "args": { + "External id": 129674, "cbid": 51, "correlation": 241686592 + } + }, + { + "ph": "s", "id": 241686592, "pid": 5717, "tid": 6759, "ts": 6302685401840.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685466686.261, "dur": 456.067, + "args": { + "External id": 129674, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686593, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686593, "pid": 3, "tid": 7, "ts": 6302685466686.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401844.945, "dur": 4.820, + "args": { + "External id": 129674, "cbid": 307, "correlation": 241686593 + } + }, + { + "ph": "s", "id": 241686593, "pid": 5717, "tid": 6759, "ts": 6302685401844.945, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685401871.455, "dur": 0.290, + "args": { + "External id": 129675, "cbid": 200, "correlation": 241686618 + } + }, + { + "ph": "f", "id": 241686618, "pid": 5717, "tid": 6759, "ts": 6302685401871.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685467142.936, "dur": 604.101, + "args": { + "External id": 129675, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686621, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686621, "pid": 3, "tid": 7, "ts": 6302685467142.936, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401872.765, "dur": 5.040, + "args": { + "External id": 129675, "cbid": 307, "correlation": 241686621 + } + }, + { + "ph": "s", "id": 241686621, "pid": 5717, "tid": 6759, "ts": 6302685401872.765, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685467747.741, "dur": 89.152, + "args": { + "External id": 129676, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686634, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686634, "pid": 3, "tid": 7, "ts": 6302685467747.741, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401915.545, "dur": 6.030, + "args": { + "External id": 129676, "cbid": 307, "correlation": 241686634 + } + }, + { + "ph": "s", "id": 241686634, "pid": 5717, "tid": 6759, "ts": 6302685401915.545, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685467837.501, "dur": 3.840, + "args": { + "External id": 129677, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686642, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241686642, "pid": 3, "tid": 7, "ts": 6302685467837.501, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401944.515, "dur": 5.200, + "args": { + "External id": 129677, "cbid": 307, "correlation": 241686642 + } + }, + { + "ph": "s", "id": 241686642, "pid": 5717, "tid": 6759, "ts": 6302685401944.515, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685467842.013, "dur": 113.858, + "args": { + "External id": 129678, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686650, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686650, "pid": 3, "tid": 7, "ts": 6302685467842.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685401974.715, "dur": 4.969, + "args": { + "External id": 129678, "cbid": 307, "correlation": 241686650 + } + }, + { + "ph": "s", "id": 241686650, "pid": 5717, "tid": 6759, "ts": 6302685401974.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402169.884, "dur": 0.550, + "args": { + "External id": 129697, "cbid": 200, "correlation": 241686696 + } + }, + { + "ph": "f", "id": 241686696, "pid": 5717, "tid": 6759, "ts": 6302685402169.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685467957.119, "dur": 1.088, + "args": { + "External id": 129697, "device": 3, "context": 1, "stream": 7, 
"correlation": 241686699, "bytes": 576, "memory bandwidth (GB/s)": 0.5294117647058824 + } + }, + { + "ph": "f", "id": 241686699, "pid": 3, "tid": 7, "ts": 6302685467957.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685402172.124, "dur": 9.560, + "args": { + "External id": 129697, "cbid": 51, "correlation": 241686699 + } + }, + { + "ph": "s", "id": 241686699, "pid": 5717, "tid": 6759, "ts": 6302685402172.124, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685467959.647, "dur": 141.249, + "args": { + "External id": 129697, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686700, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686700, "pid": 3, "tid": 7, "ts": 6302685467959.647, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402182.054, "dur": 8.440, + "args": { + "External id": 129697, "cbid": 307, "correlation": 241686700 + } + }, + { + "ph": "s", "id": 241686700, "pid": 5717, "tid": 6759, "ts": 6302685402182.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685468101.568, "dur": 138.241, + "args": { + "External id": 129698, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686722, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686722, "pid": 3, "tid": 7, "ts": 6302685468101.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402219.764, "dur": 7.240, + "args": { + "External id": 129698, "cbid": 211, "correlation": 241686722 + } + }, + { + "ph": "s", "id": 241686722, "pid": 5717, "tid": 6759, "ts": 6302685402219.764, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402353.674, "dur": 0.460, + "args": { + "External id": 129699, "cbid": 200, "correlation": 241686740 + } + }, + { + "ph": "f", "id": 241686740, "pid": 5717, "tid": 6759, "ts": 6302685402353.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402354.254, "dur": 0.270, + "args": { + "External id": 129699, "cbid": 200, "correlation": 241686741 + } + }, + { + "ph": "f", "id": 241686741, "pid": 5717, "tid": 6759, "ts": 6302685402354.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402382.983, "dur": 0.220, + "args": { + "External id": 129699, "cbid": 200, "correlation": 241686759 + } + }, + { + "ph": "f", "id": 241686759, "pid": 5717, "tid": 6759, "ts": 6302685402382.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685468240.417, "dur": 93.856, + "args": { + "External id": 129699, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686760, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686760, "pid": 3, "tid": 7, "ts": 6302685468240.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402384.403, "dur": 9.831, + "args": { + "External id": 129699, "cbid": 211, "correlation": 241686760 + } + }, + { + "ph": "s", "id": 241686760, "pid": 5717, "tid": 6759, "ts": 6302685402384.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402394.974, "dur": 0.989, + "args": { + "External id": 129699, "cbid": 273, "correlation": 241686762 + } + }, + { + "ph": "f", "id": 241686762, "pid": 5717, "tid": 6759, "ts": 6302685402394.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685468334.913, "dur": 1125.033, + "args": { + "External id": 129699, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686763, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241686763, "pid": 3, "tid": 7, "ts": 6302685468334.913, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402396.283, "dur": 4.671, + "args": { + "External id": 129699, "cbid": 211, "correlation": 241686763 + } + }, + { + "ph": "s", "id": 241686763, "pid": 5717, "tid": 6759, "ts": 6302685402396.283, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685469460.554, "dur": 73.857, + "args": { + "External id": 129699, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686765, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241686765, "pid": 3, "tid": 7, "ts": 6302685469460.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402401.483, "dur": 3.791, + "args": { + "External id": 129699, "cbid": 211, "correlation": 241686765 + } + }, + { + "ph": "s", "id": 241686765, "pid": 5717, "tid": 6759, "ts": 6302685402401.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685469535.019, "dur": 48.288, + "args": { + "External id": 129710, "device": 3, "context": 1, "stream": 7, "correlation": 241686787, "bytes": 25165824, "memory bandwidth (GB/s)": 521.1610337972166 + } + }, + { + "ph": "f", "id": 241686787, "pid": 3, "tid": 7, "ts": 6302685469535.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685402540.363, "dur": 18.280, + "args": { + "External id": 129710, "cbid": 41, "correlation": 241686787 + } + }, + { + "ph": "s", "id": 241686787, "pid": 5717, "tid": 6759, "ts": 6302685402540.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685469583.883, "dur": 34.720, + "args": { + "External id": 129707, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686805, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686805, "pid": 3, "tid": 7, "ts": 6302685469583.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402654.803, "dur": 8.510, + "args": { + "External id": 129707, "cbid": 307, "correlation": 241686805 + } + }, + { + "ph": "s", "id": 241686805, "pid": 5717, "tid": 6759, "ts": 6302685402654.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685469619.307, "dur": 40.320, + "args": { + "External id": 129717, "device": 3, "context": 1, "stream": 7, "correlation": 241686820, "bytes": 25165824, "memory bandwidth (GB/s)": 624.152380952381 + } + }, + { + "ph": "f", "id": 241686820, "pid": 3, "tid": 7, "ts": 6302685469619.307, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685402722.063, "dur": 13.530, + "args": { + "External id": 129717, "cbid": 41, "correlation": 241686820 + } + }, + { + "ph": "s", "id": 241686820, "pid": 5717, "tid": 6759, "ts": 6302685402722.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685469660.299, "dur": 27.265, + "args": { + "External id": 129714, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686838, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241686838, "pid": 3, "tid": 7, "ts": 6302685469660.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402820.773, "dur": 8.080, + "args": { + "External id": 129714, "cbid": 307, "correlation": 241686838 + } + }, + { + "ph": "s", "id": 241686838, "pid": 5717, "tid": 6759, "ts": 6302685402820.773, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402940.392, "dur": 0.530, + "args": { + "External id": 129722, "cbid": 200, "correlation": 241686868 + } + }, + { + "ph": "f", "id": 241686868, "pid": 5717, "tid": 6759, "ts": 6302685402940.392, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685469689.036, "dur": 1.248, + "args": { + "External id": 129722, "device": 3, "context": 1, "stream": 7, "correlation": 241686871, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 241686871, "pid": 3, "tid": 7, "ts": 6302685469689.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685402942.582, "dur": 7.290, + "args": { + "External id": 129722, "cbid": 51, "correlation": 241686871 + } + }, + { + "ph": "s", "id": 241686871, "pid": 5717, "tid": 6759, "ts": 6302685402942.582, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685469691.499, "dur": 145.410, + "args": { + "External id": 129722, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686872, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686872, "pid": 3, "tid": 7, "ts": 6302685469691.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402950.122, "dur": 7.720, + "args": { + "External id": 129722, "cbid": 307, "correlation": 241686872 + } + }, + { + "ph": "s", "id": 241686872, "pid": 5717, "tid": 6759, "ts": 6302685402950.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685402984.372, "dur": 0.270, + "args": { + "External id": 129723, "cbid": 200, "correlation": 241686897 + } + }, + { + "ph": "f", "id": 241686897, "pid": 5717, "tid": 6759, "ts": 6302685402984.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685469838.189, "dur": 1.119, + "args": { + "External id": 129723, "device": 3, "context": 1, "stream": 7, "correlation": 241686900, "bytes": 576, "memory bandwidth (GB/s)": 0.514745308310992 + } + }, + { + "ph": "f", "id": 241686900, "pid": 3, "tid": 7, "ts": 6302685469838.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685402985.772, "dur": 4.640, + "args": { + "External id": 129723, "cbid": 51, "correlation": 241686900 + } + }, + { + "ph": "s", "id": 241686900, 
"pid": 5717, "tid": 6759, "ts": 6302685402985.772, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685469840.716, "dur": 140.002, + "args": { + "External id": 129723, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686901, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686901, "pid": 3, "tid": 7, "ts": 6302685469840.716, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685402990.542, "dur": 5.140, + "args": { + "External id": 129723, "cbid": 307, "correlation": 241686901 + } + }, + { + "ph": "s", "id": 241686901, "pid": 5717, "tid": 6759, "ts": 6302685402990.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685403017.512, "dur": 0.290, + "args": { + "External id": 129724, "cbid": 200, "correlation": 241686926 + } + }, + { + "ph": "f", "id": 241686926, "pid": 5717, "tid": 6759, "ts": 6302685403017.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685469982.094, "dur": 1.248, + "args": { + "External id": 129724, "device": 3, "context": 1, "stream": 7, "correlation": 241686929, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 241686929, "pid": 3, "tid": 7, "ts": 6302685469982.094, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685403018.942, "dur": 4.300, + "args": { + 
"External id": 129724, "cbid": 51, "correlation": 241686929 + } + }, + { + "ph": "s", "id": 241686929, "pid": 5717, "tid": 6759, "ts": 6302685403018.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685469984.750, "dur": 274.690, + "args": { + "External id": 129724, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686930, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686930, "pid": 3, "tid": 7, "ts": 6302685469984.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403023.402, "dur": 4.890, + "args": { + "External id": 129724, "cbid": 307, "correlation": 241686930 + } + }, + { + "ph": "s", "id": 241686930, "pid": 5717, "tid": 6759, "ts": 6302685403023.402, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685470272.080, "dur": 607.108, + "args": { + "External id": 129725, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686952, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686952, "pid": 3, "tid": 7, "ts": 6302685470272.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403050.492, "dur": 5.570, + "args": { + "External id": 129725, "cbid": 211, "correlation": 241686952 + } + }, + { + "ph": "s", "id": 241686952, "pid": 5717, "tid": 6759, "ts": 6302685403050.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685470879.796, "dur": 157.474, + "args": { + "External id": 129726, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686975, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686975, "pid": 3, "tid": 7, "ts": 6302685470879.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403074.412, "dur": 4.890, + "args": { + "External id": 129726, "cbid": 211, "correlation": 241686975 + } + }, + { + "ph": "s", "id": 241686975, "pid": 5717, "tid": 6759, "ts": 6302685403074.412, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685471037.974, "dur": 141.153, + "args": { + "External id": 129727, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241686998, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241686998, "pid": 3, "tid": 7, "ts": 6302685471037.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403097.172, "dur": 4.590, + "args": { + "External id": 129727, "cbid": 211, "correlation": 241686998 + } + }, + { + "ph": "s", "id": 241686998, "pid": 5717, "tid": 6759, "ts": 6302685403097.172, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685471179.735, "dur": 80.161, + "args": { + "External id": 129728, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687006, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687006, "pid": 3, "tid": 7, "ts": 6302685471179.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403134.682, "dur": 5.410, + "args": { + "External id": 129728, "cbid": 307, "correlation": 241687006 + } + }, + { + "ph": "s", "id": 241687006, "pid": 5717, "tid": 6759, "ts": 6302685403134.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685471260.536, "dur": 46.176, + "args": { + "External id": 129743, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687035, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687035, "pid": 3, "tid": 7, "ts": 6302685471260.536, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403286.161, "dur": 8.900, + "args": { + "External id": 129743, "cbid": 307, "correlation": 241687035 + } + }, + { + "ph": "s", "id": 241687035, "pid": 5717, "tid": 6759, "ts": 6302685403286.161, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685471307.416, "dur": 3.648, + "args": { + "External id": 129744, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687043, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241687043, "pid": 3, "tid": 7, "ts": 6302685471307.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403333.332, "dur": 6.780, + "args": { + "External id": 129744, "cbid": 307, "correlation": 241687043 + } + }, + { + "ph": "s", "id": 241687043, "pid": 5717, "tid": 6759, "ts": 6302685403333.332, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685471311.640, "dur": 52.256, + "args": { + "External id": 129745, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687054, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687054, "pid": 3, "tid": 7, "ts": 6302685471311.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403370.851, "dur": 5.730, + "args": { + "External id": 129745, "cbid": 307, "correlation": 241687054 + } + }, + { + "ph": "s", "id": 241687054, "pid": 5717, "tid": 6759, "ts": 6302685403370.851, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685471364.504, "dur": 47.456, + "args": { + "External id": 129746, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687059, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687059, "pid": 3, "tid": 7, "ts": 6302685471364.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403416.461, "dur": 7.340, + "args": { + "External id": 129746, "cbid": 211, "correlation": 241687059 + } + }, + { + "ph": "s", "id": 241687059, "pid": 5717, "tid": 6759, "ts": 6302685403416.461, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685403616.281, "dur": 4.100, + "args": { + "External id": 129752, "cbid": 147, "correlation": 241687076 + } + }, + { + "ph": "s", "id": 241687076, "pid": 5717, "tid": 6759, "ts": 6302685403616.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685403746.611, "dur": 2.860, + "args": { + "External id": 129760, "cbid": 138, "correlation": 241687091 + } + }, + { + "ph": "f", "id": 241687091, "pid": 5717, "tid": 6759, "ts": 6302685403746.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685403750.160, "dur": 1.051, + "args": { + "External id": 129760, "cbid": 138, "correlation": 241687092 + } + }, + { + "ph": "f", "id": 241687092, "pid": 5717, "tid": 6759, "ts": 6302685403750.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685403751.840, "dur": 0.631, + "args": { + "External id": 129760, "cbid": 138, "correlation": 241687093 + } + }, + { + "ph": "f", "id": 241687093, "pid": 5717, "tid": 6759, "ts": 6302685403751.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + 
"ts": 6302685473285.478, "dur": 3.232, + "args": { + "External id": 129764, "device": 3, "context": 1, "stream": 7, "correlation": 241687104, "bytes": 28112, "memory bandwidth (GB/s)": 8.698019801980198 + } + }, + { + "ph": "f", "id": 241687104, "pid": 3, "tid": 7, "ts": 6302685473285.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685403773.280, "dur": 11.600, + "args": { + "External id": 129764, "cbid": 41, "correlation": 241687104 + } + }, + { + "ph": "s", "id": 241687104, "pid": 5717, "tid": 6759, "ts": 6302685403773.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403788.840, "dur": 1.720, + "args": { + "External id": 129759, "cbid": 135, "correlation": 241687108 + } + }, + { + "ph": "f", "id": 241687108, "pid": 5717, "tid": 6759, "ts": 6302685403788.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685473290.598, "dur": 595.941, + "args": { + "External id": 129759, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687112, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687112, "pid": 3, "tid": 7, "ts": 6302685473290.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685403793.680, "dur": 10.300, + "args": { + "External id": 129759, "cbid": 211, "correlation": 241687112 + } + }, + { + "ph": "s", "id": 241687112, "pid": 5717, "tid": 6759, "ts": 6302685403793.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403855.980, "dur": 2.220, + "args": { + "External id": 129752, "cbid": 135, "correlation": 241687123 + } + }, + { + "ph": "f", "id": 241687123, "pid": 5717, "tid": 6759, "ts": 6302685403855.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685403861.340, "dur": 2.360, + "args": { + "External id": 129752, "cbid": 147, "correlation": 241687127 + } + }, + { + "ph": "s", "id": 241687127, "pid": 5717, "tid": 6759, "ts": 6302685403861.340, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685403940.830, "dur": 0.980, + "args": { + "External id": 129768, "cbid": 317, "correlation": 241687147 + } + }, + { + "ph": "f", "id": 241687147, "pid": 5717, "tid": 6759, "ts": 6302685403940.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403943.810, "dur": 1.410, + "args": { + "External id": 129768, "cbid": 135, "correlation": 241687149 + } + }, + { + "ph": "f", "id": 241687149, "pid": 5717, "tid": 6759, "ts": 6302685403943.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685403946.590, "dur": 1.000, + "args": { + "External id": 129768, "cbid": 147, "correlation": 241687153 + } + }, + { + "ph": "s", "id": 241687153, "pid": 5717, "tid": 6759, "ts": 6302685403946.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685403961.820, "dur": 0.790, + "args": { + "External id": 129768, "cbid": 409, "correlation": 241687156 + } + }, + { + "ph": "f", "id": 241687156, "pid": 5717, "tid": 6759, "ts": 6302685403961.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403966.440, "dur": 0.790, + "args": { + "External id": 129768, "cbid": 135, "correlation": 241687159 + } + }, + { + "ph": "f", "id": 241687159, "pid": 5717, "tid": 6759, "ts": 6302685403966.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685403967.390, "dur": 0.860, + "args": { + "External id": 129768, "cbid": 147, "correlation": 241687160 + } + }, + { + "ph": "s", "id": 241687160, "pid": 5717, "tid": 6759, "ts": 6302685403967.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685478451.181, "dur": 10507.440, + "args": { + "External id": 129768, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241687162, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241687162, "pid": 3, "tid": 20, "ts": 6302685478451.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685403969.350, "dur": 9.940, + "args": { + "External id": 129768, "cbid": 430, "correlation": 241687162 + } + }, + { + "ph": "s", "id": 241687162, "pid": 5717, "tid": 6759, "ts": 6302685403969.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403980.290, "dur": 0.410, + "args": { + "External id": 129768, "cbid": 135, "correlation": 241687164 + } + }, + { + "ph": "f", "id": 241687164, "pid": 5717, "tid": 6759, "ts": 6302685403980.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685403980.820, "dur": 0.620, + "args": { + "External id": 129768, "cbid": 147, "correlation": 241687165 + } + }, + { + "ph": "s", "id": 241687165, "pid": 5717, "tid": 6759, "ts": 6302685403980.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403983.000, "dur": 0.900, + "args": { + "External id": 129768, "cbid": 135, "correlation": 241687168 + } + }, + { + "ph": "f", "id": 241687168, "pid": 5717, "tid": 6759, "ts": 6302685403983.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685403991.470, "dur": 0.420, + "args": { + "External 
id": 129768, "cbid": 135, "correlation": 241687175 + } + }, + { + "ph": "f", "id": 241687175, "pid": 5717, "tid": 6759, "ts": 6302685403991.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404017.470, "dur": 0.950, + "args": { + "External id": 129770, "cbid": 147, "correlation": 241687180 + } + }, + { + "ph": "s", "id": 241687180, "pid": 5717, "tid": 6759, "ts": 6302685404017.470, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404034.210, "dur": 0.870, + "args": { + "External id": 129752, "cbid": 135, "correlation": 241687195 + } + }, + { + "ph": "f", "id": 241687195, "pid": 5717, "tid": 6759, "ts": 6302685404034.210, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404214.590, "dur": 1.320, + "args": { + "External id": 129752, "cbid": 135, "correlation": 241687208 + } + }, + { + "ph": "f", "id": 241687208, "pid": 5717, "tid": 6759, "ts": 6302685404214.590, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404325.459, "dur": 3.320, + "args": { + "External id": 129780, "cbid": 147, "correlation": 241687219 + } + }, + { + "ph": "s", "id": 241687219, "pid": 5717, "tid": 6759, "ts": 6302685404325.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685404436.869, "dur": 1.150, + "args": { + "External id": 129794, "cbid": 317, "correlation": 241687260 + } + }, + { + "ph": "f", "id": 241687260, "pid": 5717, "tid": 6759, "ts": 6302685404436.869, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685404445.119, "dur": 2.220, + "args": { + "External id": 129795, "cbid": 138, "correlation": 241687263 + } + }, + { + "ph": "f", "id": 241687263, "pid": 5717, "tid": 6759, "ts": 6302685404445.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685478450.637, "dur": 1.888, + "args": { + "External id": 129799, "device": 3, "context": 1, "stream": 7, "correlation": 241687274, "bytes": 7224, "memory bandwidth (GB/s)": 3.8262711864406778 + } + }, + { + "ph": "f", "id": 241687274, "pid": 3, "tid": 7, "ts": 6302685478450.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685404467.849, "dur": 11.840, + "args": { + "External id": 129799, "cbid": 41, "correlation": 241687274 + } + }, + { + "ph": "s", "id": 241687274, "pid": 5717, "tid": 6759, "ts": 6302685404467.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404483.939, "dur": 1.730, + "args": { + "External id": 129794, "cbid": 135, "correlation": 241687278 + } + }, + { + "ph": "f", "id": 241687278, "pid": 5717, "tid": 6759, "ts": 6302685404483.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685478454.669, "dur": 209.986, + "args": { + "External id": 129794, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687282, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687282, "pid": 3, "tid": 7, "ts": 6302685478454.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685404488.029, "dur": 10.110, + "args": { + "External id": 129794, "cbid": 211, "correlation": 241687282 + } + }, + { + "ph": "s", "id": 241687282, "pid": 5717, "tid": 6759, "ts": 6302685404488.029, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404586.838, "dur": 1.211, + "args": { + "External id": 129780, "cbid": 135, "correlation": 241687293 + } + }, + { + "ph": "f", "id": 241687293, "pid": 5717, "tid": 6759, "ts": 6302685404586.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404591.049, "dur": 1.200, + "args": { + "External id": 129780, "cbid": 147, "correlation": 241687297 + } + }, + { + "ph": "s", "id": 241687297, "pid": 5717, "tid": 6759, "ts": 6302685404591.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404593.909, "dur": 0.700, + "args": { + "External id": 129780, "cbid": 147, "correlation": 241687301 + } + }, + { + "ph": "s", "id": 241687301, "pid": 5717, "tid": 6759, "ts": 6302685404593.909, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685479368.276, "dur": 634.245, + "args": { + "External id": 129813, "queued": 0, "device": 3, "context": 1, "stream": 
17, "correlation": 241687325, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241687325, "pid": 3, "tid": 17, "ts": 6302685479368.276, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685404736.688, "dur": 11.400, + "args": { + "External id": 129813, "cbid": 211, "correlation": 241687325 + } + }, + { + "ph": "s", "id": 241687325, "pid": 5717, "tid": 6759, "ts": 6302685404736.688, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685480112.762, "dur": 51.168, + "args": { + "External id": 129829, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241687338, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241687338, "pid": 3, "tid": 17, "ts": 6302685480112.762, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685404844.038, "dur": 9.150, + "args": { + "External id": 129829, "cbid": 211, "correlation": 241687338 + } + }, + { + "ph": "s", "id": 241687338, "pid": 5717, "tid": 6759, "ts": 6302685404844.038, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404876.248, "dur": 1.270, + "args": { + "External id": 129780, "cbid": 135, "correlation": 241687348 + } + }, + { + "ph": "f", "id": 241687348, "pid": 5717, "tid": 6759, "ts": 6302685404876.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404879.398, "dur": 1.250, + "args": { + "External id": 129780, "cbid": 147, "correlation": 241687352 + } + }, + { + "ph": "s", "id": 241687352, "pid": 5717, "tid": 6759, "ts": 6302685404879.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685404930.208, "dur": 0.870, + "args": { + "External id": 129831, "cbid": 317, "correlation": 241687365 + } + }, + { + "ph": "f", "id": 241687365, "pid": 5717, "tid": 6759, "ts": 6302685404930.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404932.918, "dur": 1.180, + "args": { + "External id": 129831, "cbid": 135, "correlation": 241687367 + } + }, + { + "ph": "f", "id": 241687367, "pid": 5717, "tid": 6759, "ts": 6302685404932.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685404935.498, "dur": 1.120, + "args": { + "External id": 129831, "cbid": 147, "correlation": 241687371 + } + }, + { + "ph": "s", "id": 241687371, "pid": 5717, "tid": 6759, "ts": 6302685404935.498, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685404949.978, "dur": 0.680, + "args": { + "External id": 129831, "cbid": 409, "correlation": 241687374 + } + }, + { + "ph": "f", "id": 241687374, "pid": 5717, "tid": 6759, "ts": 6302685404949.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404954.508, "dur": 0.770, + "args": { + "External id": 129831, "cbid": 135, "correlation": 241687377 + } + }, + { + "ph": "f", "id": 241687377, "pid": 5717, "tid": 6759, "ts": 6302685404954.508, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404955.448, "dur": 0.820, + "args": { + "External id": 129831, "cbid": 147, "correlation": 241687378 + } + }, + { + "ph": "s", "id": 241687378, "pid": 5717, "tid": 6759, "ts": 6302685404955.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685488961.085, "dur": 4832.644, + "args": { + "External id": 129831, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241687380, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241687380, "pid": 3, "tid": 20, "ts": 6302685488961.085, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685404957.558, "dur": 9.490, + "args": { + "External id": 129831, "cbid": 430, "correlation": 241687380 + } + }, + { + "ph": "s", "id": 241687380, "pid": 5717, "tid": 6759, "ts": 6302685404957.558, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404968.028, "dur": 0.360, + "args": { + "External id": 129831, "cbid": 135, "correlation": 241687382 + } + }, + { + "ph": "f", "id": 241687382, "pid": 5717, "tid": 6759, "ts": 6302685404968.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685404968.508, "dur": 0.490, + "args": { + "External id": 129831, "cbid": 147, "correlation": 241687383 + } + }, + { + "ph": "s", "id": 241687383, "pid": 5717, "tid": 6759, "ts": 6302685404968.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404970.558, "dur": 0.860, + "args": { + "External id": 129831, "cbid": 135, "correlation": 241687386 + } + }, + { + "ph": "f", "id": 241687386, "pid": 5717, "tid": 6759, "ts": 6302685404970.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685404978.718, "dur": 0.460, + "args": { + "External id": 
129831, "cbid": 135, "correlation": 241687393 + } + }, + { + "ph": "f", "id": 241687393, "pid": 5717, "tid": 6759, "ts": 6302685404978.718, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685405004.258, "dur": 1.090, + "args": { + "External id": 129833, "cbid": 147, "correlation": 241687398 + } + }, + { + "ph": "s", "id": 241687398, "pid": 5717, "tid": 6759, "ts": 6302685405004.258, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685405021.138, "dur": 0.900, + "args": { + "External id": 129780, "cbid": 135, "correlation": 241687413 + } + }, + { + "ph": "f", "id": 241687413, "pid": 5717, "tid": 6759, "ts": 6302685405021.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685478692.911, "dur": 1709.389, + "args": { + "External id": 129835, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687438, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687438, "pid": 3, "tid": 7, "ts": 6302685478692.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405159.717, "dur": 10.750, + "args": { + "External id": 129835, "cbid": 211, "correlation": 241687438 + } + }, + { + "ph": "s", "id": 241687438, "pid": 5717, "tid": 6759, "ts": 6302685405159.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685480402.972, "dur": 431.716, + "args": { + "External id": 129836, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687461, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241687461, "pid": 3, "tid": 7, "ts": 6302685480402.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405218.557, "dur": 6.370, + "args": { + "External id": 129836, "cbid": 307, "correlation": 241687461 + } + }, + { + "ph": "s", "id": 241687461, "pid": 5717, "tid": 6759, "ts": 6302685405218.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405258.677, "dur": 0.510, + "args": { + "External id": 129837, "cbid": 200, "correlation": 241687484 + } + }, + { + "ph": "f", "id": 241687484, "pid": 5717, "tid": 6759, "ts": 6302685405258.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685480836.288, "dur": 0.800, + "args": { + "External id": 129837, "device": 3, "context": 1, "stream": 7, 
"correlation": 241687487, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 241687487, "pid": 3, "tid": 7, "ts": 6302685480836.288, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685405260.777, "dur": 6.690, + "args": { + "External id": 129837, "cbid": 51, "correlation": 241687487 + } + }, + { + "ph": "s", "id": 241687487, "pid": 5717, "tid": 6759, "ts": 6302685405260.777, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685480838.368, "dur": 494.819, + "args": { + "External id": 129837, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687488, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687488, "pid": 3, "tid": 7, "ts": 6302685480838.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405267.667, "dur": 5.830, + "args": { + "External id": 129837, "cbid": 307, "correlation": 241687488 + } + }, + { + "ph": "s", "id": 241687488, "pid": 5717, "tid": 6759, "ts": 6302685405267.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405308.817, "dur": 0.300, + "args": { + "External id": 129838, "cbid": 200, "correlation": 241687513 + } + }, + { + "ph": "f", "id": 241687513, "pid": 5717, "tid": 6759, "ts": 6302685405308.817, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685481367.747, "dur": 58.977, + "args": { + "External id": 129838, "device": 3, "context": 1, "stream": 7, "correlation": 241687516, "bytes": 1536, "memory bandwidth (GB/s)": 0.026044051070756396 + } + }, + { + "ph": "f", "id": 241687516, "pid": 3, "tid": 7, "ts": 6302685481367.747, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685405310.297, "dur": 5.080, + "args": { + "External id": 129838, "cbid": 51, "correlation": 241687516 + } + }, + { + "ph": "s", "id": 241687516, "pid": 5717, "tid": 6759, "ts": 6302685405310.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685481485.476, "dur": 405.795, + "args": { + "External id": 129838, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687517, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687517, "pid": 3, "tid": 7, "ts": 6302685481485.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405315.577, "dur": 5.920, + "args": { + "External id": 129838, "cbid": 307, "correlation": 241687517 + } + }, + { + "ph": "s", "id": 241687517, "pid": 5717, "tid": 6759, "ts": 6302685405315.577, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405348.057, "dur": 0.280, + "args": { + "External id": 129839, "cbid": 200, "correlation": 241687542 + } + }, + { + "ph": "f", "id": 241687542, "pid": 5717, "tid": 6759, "ts": 6302685405348.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685481891.943, "dur": 354.595, + "args": { + "External id": 129839, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687545, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687545, "pid": 3, "tid": 7, "ts": 6302685481891.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405349.537, "dur": 5.420, + "args": { + "External id": 129839, "cbid": 307, "correlation": 241687545 + } + }, + { + "ph": "s", "id": 241687545, "pid": 5717, "tid": 6759, "ts": 6302685405349.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405376.477, "dur": 0.240, + "args": { + "External id": 129840, "cbid": 200, "correlation": 241687570 + } + }, + { + "ph": "f", "id": 241687570, "pid": 5717, "tid": 6759, "ts": 6302685405376.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685482248.266, "dur": 1.216, + "args": { + "External id": 129840, "device": 3, "context": 1, "stream": 7, "correlation": 241687573, "bytes": 1536, "memory bandwidth (GB/s)": 1.263157894736842 + } + }, + { + "ph": "f", "id": 241687573, "pid": 3, "tid": 7, "ts": 6302685482248.266, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685405377.747, "dur": 4.390, + "args": { + "External id": 129840, "cbid": 51, "correlation": 241687573 + } + }, + { + "ph": "s", "id": 241687573, "pid": 5717, "tid": 6759, "ts": 6302685405377.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685482250.666, "dur": 685.765, + "args": { + "External id": 129840, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687574, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687574, "pid": 3, "tid": 7, "ts": 6302685482250.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405382.297, "dur": 4.930, + "args": { + "External id": 129840, "cbid": 307, "correlation": 241687574 + } + }, + { + "ph": "s", "id": 241687574, "pid": 5717, "tid": 6759, "ts": 6302685405382.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405409.617, "dur": 0.310, + "args": { + "External id": 129841, "cbid": 200, "correlation": 241687599 + } + }, + { + "ph": "f", "id": 241687599, "pid": 5717, "tid": 6759, "ts": 6302685405409.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685482937.071, "dur": 393.955, + "args": { + "External id": 129841, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687602, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687602, "pid": 3, "tid": 7, "ts": 6302685482937.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405410.967, "dur": 5.130, + "args": { + "External id": 129841, "cbid": 307, "correlation": 241687602 + } + }, + { + "ph": "s", "id": 241687602, "pid": 5717, "tid": 6759, "ts": 6302685405410.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685483331.634, "dur": 89.953, + "args": { + "External id": 129842, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687615, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687615, "pid": 3, "tid": 7, "ts": 6302685483331.634, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405453.047, "dur": 6.100, + "args": { + "External id": 129842, "cbid": 307, "correlation": 241687615 + } + }, + { + "ph": "s", "id": 241687615, "pid": 5717, "tid": 6759, "ts": 6302685405453.047, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685483422.291, "dur": 3.424, + "args": { + "External id": 129843, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687623, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241687623, "pid": 3, "tid": 7, "ts": 6302685483422.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405483.376, "dur": 5.400, + "args": { + "External id": 129843, "cbid": 307, "correlation": 241687623 + } + }, + { + "ph": "s", "id": 241687623, "pid": 5717, "tid": 6759, "ts": 6302685405483.376, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685483426.387, "dur": 114.433, + "args": { + "External id": 129844, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687631, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687631, "pid": 3, "tid": 7, "ts": 6302685483426.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405513.867, "dur": 5.300, + "args": { + "External id": 129844, "cbid": 307, "correlation": 241687631 + } + }, + { + "ph": "s", "id": 241687631, "pid": 5717, "tid": 6759, "ts": 6302685405513.867, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405685.316, "dur": 0.500, + "args": { + "External id": 129863, "cbid": 200, "correlation": 241687677 + } + }, + { + "ph": "f", "id": 241687677, "pid": 5717, "tid": 6759, "ts": 6302685405685.316, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685483542.068, "dur": 1.600, + "args": { + "External id": 129863, "device": 3, "context": 1, "stream": 7, 
"correlation": 241687680, "bytes": 576, "memory bandwidth (GB/s)": 0.36 + } + }, + { + "ph": "f", "id": 241687680, "pid": 3, "tid": 7, "ts": 6302685483542.068, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685405687.386, "dur": 7.010, + "args": { + "External id": 129863, "cbid": 51, "correlation": 241687680 + } + }, + { + "ph": "s", "id": 241687680, "pid": 5717, "tid": 6759, "ts": 6302685405687.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685483544.820, "dur": 140.833, + "args": { + "External id": 129863, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687681, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687681, "pid": 3, "tid": 7, "ts": 6302685483544.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405694.596, "dur": 8.000, + "args": { + "External id": 129863, "cbid": 307, "correlation": 241687681 + } + }, + { + "ph": "s", "id": 241687681, "pid": 5717, "tid": 6759, "ts": 6302685405694.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685483686.325, "dur": 139.169, + "args": { + "External id": 129864, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687703, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687703, "pid": 3, "tid": 7, "ts": 6302685483686.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405728.266, "dur": 5.800, + "args": { + "External id": 129864, "cbid": 211, "correlation": 241687703 + } + }, + { + "ph": "s", "id": 241687703, "pid": 5717, "tid": 6759, "ts": 6302685405728.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405803.556, "dur": 0.390, + "args": { + "External id": 129865, "cbid": 200, "correlation": 241687721 + } + }, + { + "ph": "f", "id": 241687721, "pid": 5717, "tid": 6759, "ts": 6302685405803.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405804.056, "dur": 0.200, + "args": { + "External id": 129865, "cbid": 200, "correlation": 241687722 + } + }, + { + "ph": "f", "id": 241687722, "pid": 5717, "tid": 6759, "ts": 6302685405804.056, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405822.916, "dur": 0.240, + "args": { + "External id": 129865, "cbid": 200, "correlation": 241687740 + } + }, + { + "ph": "f", "id": 241687740, "pid": 5717, "tid": 6759, "ts": 6302685405822.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685483826.198, "dur": 93.665, + "args": { + "External id": 129865, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687741, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687741, "pid": 3, "tid": 7, "ts": 6302685483826.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405824.336, "dur": 9.320, + "args": { + "External id": 129865, "cbid": 211, "correlation": 241687741 + } + }, + { + "ph": "s", "id": 241687741, "pid": 5717, "tid": 6759, "ts": 6302685405824.336, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685405834.406, "dur": 0.930, + "args": { + "External id": 129865, "cbid": 273, "correlation": 241687743 + } + }, + { + "ph": "f", "id": 241687743, "pid": 5717, "tid": 6759, "ts": 6302685405834.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685483920.503, "dur": 1378.154, + "args": { + "External id": 129865, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687744, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241687744, "pid": 3, "tid": 7, "ts": 6302685483920.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405835.626, "dur": 4.120, + "args": { + "External id": 129865, "cbid": 211, "correlation": 241687744 + } + }, + { + "ph": "s", "id": 241687744, "pid": 5717, "tid": 6759, "ts": 6302685405835.626, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685485299.393, "dur": 73.249, + "args": { + "External id": 129865, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687746, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241687746, "pid": 3, "tid": 7, "ts": 6302685485299.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685405840.286, "dur": 3.670, + "args": { + "External id": 129865, "cbid": 211, "correlation": 241687746 + } + }, + { + "ph": "s", "id": 241687746, "pid": 5717, "tid": 6759, "ts": 6302685405840.286, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685485373.346, "dur": 49.728, + "args": { + "External id": 129876, "device": 3, "context": 1, "stream": 7, "correlation": 241687768, "bytes": 25165824, "memory bandwidth (GB/s)": 506.0694980694981 + } + }, + { + "ph": "f", "id": 241687768, "pid": 3, "tid": 7, "ts": 6302685485373.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685405969.875, "dur": 17.780, + "args": { + "External id": 129876, "cbid": 41, "correlation": 241687768 + } + }, + { + "ph": "s", "id": 241687768, "pid": 5717, "tid": 6759, "ts": 6302685405969.875, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685485423.714, "dur": 32.640, + "args": { + "External id": 129873, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687786, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687786, "pid": 3, "tid": 7, "ts": 6302685485423.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406083.505, "dur": 8.430, + "args": { + "External id": 129873, "cbid": 307, "correlation": 241687786 + } + }, + { + "ph": "s", "id": 241687786, "pid": 5717, "tid": 6759, "ts": 6302685406083.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685485456.962, "dur": 39.968, + "args": { + "External id": 129883, "device": 3, "context": 1, "stream": 7, "correlation": 241687801, "bytes": 25165824, "memory bandwidth (GB/s)": 629.6493194555644 + } + }, + { + "ph": "f", "id": 241687801, "pid": 3, "tid": 7, "ts": 6302685485456.962, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685406150.165, "dur": 13.350, + "args": { + "External id": 129883, "cbid": 41, "correlation": 241687801 + } + }, + { + "ph": "s", "id": 241687801, "pid": 5717, "tid": 6759, "ts": 6302685406150.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685485497.634, "dur": 28.513, + "args": { + "External id": 129880, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687819, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687819, "pid": 3, "tid": 7, "ts": 6302685485497.634, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406248.325, "dur": 8.010, + "args": { + "External id": 129880, "cbid": 307, "correlation": 241687819 + } + }, + { + "ph": "s", "id": 241687819, "pid": 5717, "tid": 6759, "ts": 6302685406248.325, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685406425.225, "dur": 0.560, + "args": { + "External id": 129888, "cbid": 200, "correlation": 241687849 + } + }, + { + "ph": "f", "id": 241687849, "pid": 5717, "tid": 6759, "ts": 6302685406425.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685485527.395, "dur": 1.152, + "args": { + "External id": 129888, "device": 3, "context": 1, "stream": 7, "correlation": 241687852, "bytes": 576, "memory bandwidth (GB/s)": 0.5 + } + }, + { + "ph": "f", "id": 241687852, "pid": 3, "tid": 7, "ts": 6302685485527.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685406428.574, "dur": 12.131, + "args": { + "External id": 129888, "cbid": 51, "correlation": 241687852 + } + }, + { + "ph": "s", "id": 241687852, "pid": 5717, "tid": 6759, "ts": 6302685406428.574, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685485529.987, "dur": 308.194, + "args": { + "External id": 129888, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687853, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687853, "pid": 3, "tid": 7, "ts": 6302685485529.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406440.994, "dur": 11.700, + "args": { + "External id": 129888, "cbid": 307, "correlation": 241687853 + } + }, + { + "ph": "s", "id": 241687853, "pid": 5717, "tid": 6759, "ts": 6302685406440.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685406479.874, "dur": 0.270, + "args": { + "External id": 129889, "cbid": 200, "correlation": 241687878 + } + }, + { + "ph": "f", "id": 241687878, "pid": 5717, "tid": 6759, "ts": 6302685406479.874, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685485887.333, "dur": 59.105, + "args": { + "External id": 129889, "device": 3, "context": 1, "stream": 7, "correlation": 241687881, "bytes": 576, "memory bandwidth (GB/s)": 0.009745368412147872 + } + }, + { + "ph": "f", "id": 241687881, "pid": 3, "tid": 7, "ts": 6302685485887.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685406481.194, "dur": 4.690, + "args": { + "External id": 129889, "cbid": 51, "correlation": 241687881 + } + }, + { + "ph": "s", "id": 241687881, "pid": 
5717, "tid": 6759, "ts": 6302685406481.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685485997.606, "dur": 389.731, + "args": { + "External id": 129889, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687882, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687882, "pid": 3, "tid": 7, "ts": 6302685485997.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406486.024, "dur": 5.340, + "args": { + "External id": 129889, "cbid": 307, "correlation": 241687882 + } + }, + { + "ph": "s", "id": 241687882, "pid": 5717, "tid": 6759, "ts": 6302685406486.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685406514.044, "dur": 0.280, + "args": { + "External id": 129890, "cbid": 200, "correlation": 241687907 + } + }, + { + "ph": "f", "id": 241687907, "pid": 5717, "tid": 6759, "ts": 6302685406514.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685486388.393, "dur": 0.832, + "args": { + "External id": 129890, "device": 3, "context": 1, "stream": 7, "correlation": 241687910, "bytes": 576, "memory bandwidth (GB/s)": 0.6923076923076923 + } + }, + { + "ph": "f", "id": 241687910, "pid": 3, "tid": 7, "ts": 6302685486388.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685406515.434, "dur": 4.280, + "args": { + "External 
id": 129890, "cbid": 51, "correlation": 241687910 + } + }, + { + "ph": "s", "id": 241687910, "pid": 5717, "tid": 6759, "ts": 6302685406515.434, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685486390.345, "dur": 140.033, + "args": { + "External id": 129890, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687911, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687911, "pid": 3, "tid": 7, "ts": 6302685486390.345, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406519.864, "dur": 4.770, + "args": { + "External id": 129890, "cbid": 307, "correlation": 241687911 + } + }, + { + "ph": "s", "id": 241687911, "pid": 5717, "tid": 6759, "ts": 6302685406519.864, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685486531.018, "dur": 139.265, + "args": { + "External id": 129891, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687933, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687933, "pid": 3, "tid": 7, "ts": 6302685486531.018, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406547.634, "dur": 5.620, + "args": { + "External id": 129891, "cbid": 211, "correlation": 241687933 + } + }, + { + "ph": "s", "id": 241687933, "pid": 5717, "tid": 6759, "ts": 6302685406547.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685486670.955, "dur": 139.617, + "args": { + "External id": 129892, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687956, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687956, "pid": 3, "tid": 7, "ts": 6302685486670.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406571.944, "dur": 5.030, + "args": { + "External id": 129892, "cbid": 211, "correlation": 241687956 + } + }, + { + "ph": "s", "id": 241687956, "pid": 5717, "tid": 6759, "ts": 6302685406571.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685486811.180, "dur": 140.001, + "args": { + "External id": 129893, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687979, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241687979, "pid": 3, "tid": 7, "ts": 6302685486811.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406594.394, "dur": 4.930, + "args": { + "External id": 129893, "cbid": 211, "correlation": 241687979 + } + }, + { + "ph": "s", "id": 241687979, "pid": 5717, "tid": 6759, "ts": 6302685406594.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685486951.789, "dur": 81.025, + "args": { + "External id": 129894, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241687987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241687987, "pid": 3, "tid": 7, "ts": 6302685486951.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406632.974, "dur": 5.350, + "args": { + "External id": 129894, "cbid": 307, "correlation": 241687987 + } + }, + { + "ph": "s", "id": 241687987, "pid": 5717, "tid": 6759, "ts": 6302685406632.974, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685487033.422, "dur": 46.944, + "args": { + "External id": 129909, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688016, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688016, "pid": 3, "tid": 7, "ts": 6302685487033.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406789.173, "dur": 8.851, + "args": { + "External id": 129909, "cbid": 307, "correlation": 241688016 + } + }, + { + "ph": "s", "id": 241688016, "pid": 5717, "tid": 6759, "ts": 6302685406789.173, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685487081.038, "dur": 3.456, + "args": { + "External id": 129910, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688024, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241688024, "pid": 3, "tid": 7, "ts": 6302685487081.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406822.673, "dur": 5.091, + "args": { + "External id": 129910, "cbid": 307, "correlation": 241688024 + } + }, + { + "ph": "s", "id": 241688024, "pid": 5717, "tid": 6759, "ts": 6302685406822.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685487085.134, "dur": 50.785, + "args": { + "External id": 129911, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688035, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688035, "pid": 3, "tid": 7, "ts": 6302685487085.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406854.013, "dur": 5.560, + "args": { + "External id": 129911, "cbid": 307, "correlation": 241688035 + } + }, + { + "ph": "s", "id": 241688035, "pid": 5717, "tid": 6759, "ts": 6302685406854.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685487136.527, "dur": 65.984, + "args": { + "External id": 129912, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688040, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688040, "pid": 3, "tid": 7, "ts": 6302685487136.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685406896.663, "dur": 6.750, + "args": { + "External id": 129912, "cbid": 211, "correlation": 241688040 + } + }, + { + "ph": "s", "id": 241688040, "pid": 5717, "tid": 6759, "ts": 6302685406896.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407067.283, "dur": 2.710, + "args": { + "External id": 129918, "cbid": 147, "correlation": 241688057 + } + }, + { + "ph": "s", "id": 241688057, "pid": 5717, "tid": 6759, "ts": 6302685407067.283, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685407167.453, "dur": 2.520, + "args": { + "External id": 129926, "cbid": 138, "correlation": 241688072 + } + }, + { + "ph": "f", "id": 241688072, "pid": 5717, "tid": 6759, "ts": 6302685407167.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685488963.485, "dur": 3.744, + "args": { + "External id": 129930, "device": 3, "context": 1, "stream": 7, "correlation": 241688083, "bytes": 28112, "memory bandwidth (GB/s)": 7.5085470085470085 + } + }, + { + "ph": "f", "id": 241688083, "pid": 3, "tid": 7, "ts": 6302685488963.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685407190.793, "dur": 11.310, + "args": { + "External id": 129930, "cbid": 41, "correlation": 241688083 + } + }, + { + "ph": "s", "id": 241688083, "pid": 5717, "tid": 6759, "ts": 6302685407190.793, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407206.283, "dur": 1.790, + "args": { + "External id": 129925, "cbid": 135, "correlation": 241688087 + } + }, + { + "ph": "f", "id": 241688087, "pid": 5717, "tid": 6759, "ts": 6302685407206.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685488969.373, "dur": 500.067, + "args": { + "External id": 129925, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688091, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688091, "pid": 3, "tid": 7, "ts": 6302685488969.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685407210.953, "dur": 9.710, + "args": { + "External id": 129925, "cbid": 211, "correlation": 241688091 + } + }, + { + "ph": "s", "id": 241688091, "pid": 5717, "tid": 6759, "ts": 6302685407210.953, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407258.952, "dur": 0.911, + "args": { + "External id": 129918, "cbid": 135, "correlation": 241688102 + } + }, + { + "ph": "f", "id": 241688102, "pid": 5717, "tid": 6759, "ts": 6302685407258.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407261.752, "dur": 1.231, + "args": { + "External id": 129918, "cbid": 147, "correlation": 241688106 + } + }, + { + "ph": "s", "id": 241688106, "pid": 5717, "tid": 6759, "ts": 
6302685407261.752, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685407336.992, "dur": 1.010, + "args": { + "External id": 129934, "cbid": 317, "correlation": 241688126 + } + }, + { + "ph": "f", "id": 241688126, "pid": 5717, "tid": 6759, "ts": 6302685407336.992, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407339.852, "dur": 1.420, + "args": { + "External id": 129934, "cbid": 135, "correlation": 241688128 + } + }, + { + "ph": "f", "id": 241688128, "pid": 5717, "tid": 6759, "ts": 6302685407339.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407342.662, "dur": 1.000, + "args": { + "External id": 129934, "cbid": 147, "correlation": 241688132 + } + }, + { + "ph": "s", "id": 241688132, "pid": 5717, "tid": 6759, "ts": 6302685407342.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685407357.392, "dur": 0.690, + "args": { + "External id": 129934, "cbid": 409, "correlation": 241688135 + } + }, + { + "ph": "f", "id": 241688135, "pid": 5717, "tid": 6759, "ts": 6302685407357.392, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407361.942, "dur": 0.800, + "args": { + "External id": 129934, "cbid": 135, "correlation": 241688138 + } + }, + { + "ph": "f", "id": 241688138, "pid": 5717, "tid": 6759, "ts": 6302685407361.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407362.902, "dur": 0.860, + "args": { + "External id": 129934, 
"cbid": 147, "correlation": 241688139 + } + }, + { + "ph": "s", "id": 241688139, "pid": 5717, "tid": 6759, "ts": 6302685407362.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685493795.425, "dur": 10559.760, + "args": { + "External id": 129934, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241688141, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241688141, "pid": 3, "tid": 20, "ts": 6302685493795.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685407364.872, "dur": 10.030, + "args": { + "External id": 129934, "cbid": 430, "correlation": 241688141 + } + }, + { + "ph": "s", "id": 241688141, "pid": 5717, "tid": 6759, "ts": 6302685407364.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407375.872, "dur": 0.410, + "args": { + "External id": 129934, "cbid": 135, "correlation": 241688143 + } + }, + { + "ph": "f", "id": 241688143, "pid": 5717, "tid": 6759, "ts": 6302685407375.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407376.392, "dur": 0.480, + "args": { + "External id": 129934, "cbid": 147, "correlation": 241688144 + } + }, 
+ { + "ph": "s", "id": 241688144, "pid": 5717, "tid": 6759, "ts": 6302685407376.392, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407378.352, "dur": 0.870, + "args": { + "External id": 129934, "cbid": 135, "correlation": 241688147 + } + }, + { + "ph": "f", "id": 241688147, "pid": 5717, "tid": 6759, "ts": 6302685407378.352, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407387.712, "dur": 0.410, + "args": { + "External id": 129934, "cbid": 135, "correlation": 241688154 + } + }, + { + "ph": "f", "id": 241688154, "pid": 5717, "tid": 6759, "ts": 6302685407387.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407413.192, "dur": 0.930, + "args": { + "External id": 129936, "cbid": 147, "correlation": 241688159 + } + }, + { + "ph": "s", "id": 241688159, "pid": 5717, "tid": 6759, "ts": 6302685407413.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407430.062, "dur": 0.840, + "args": { + "External id": 129918, "cbid": 135, "correlation": 241688174 + } + }, + { + "ph": "f", "id": 241688174, "pid": 5717, "tid": 6759, "ts": 6302685407430.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407612.412, "dur": 1.280, + "args": { + "External id": 129918, "cbid": 135, "correlation": 241688187 + } + }, + { + "ph": "f", "id": 241688187, "pid": 5717, "tid": 6759, "ts": 6302685407612.412, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407713.571, 
"dur": 2.991, + "args": { + "External id": 129946, "cbid": 147, "correlation": 241688198 + } + }, + { + "ph": "s", "id": 241688198, "pid": 5717, "tid": 6759, "ts": 6302685407713.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685407822.101, "dur": 1.100, + "args": { + "External id": 129960, "cbid": 317, "correlation": 241688239 + } + }, + { + "ph": "f", "id": 241688239, "pid": 5717, "tid": 6759, "ts": 6302685407822.101, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685407830.361, "dur": 2.320, + "args": { + "External id": 129961, "cbid": 138, "correlation": 241688242 + } + }, + { + "ph": "f", "id": 241688242, "pid": 5717, "tid": 6759, "ts": 6302685407830.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685493796.673, "dur": 1.856, + "args": { + "External id": 129965, "device": 3, "context": 1, "stream": 7, "correlation": 241688253, "bytes": 7224, "memory bandwidth (GB/s)": 3.8922413793103448 + } + }, + { + "ph": "f", "id": 241688253, "pid": 3, "tid": 7, "ts": 6302685493796.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685407853.051, "dur": 11.500, + "args": { + "External id": 129965, "cbid": 41, "correlation": 241688253 + } + }, + { + "ph": "s", "id": 241688253, "pid": 5717, "tid": 6759, "ts": 6302685407853.051, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407868.921, "dur": 1.960, + "args": { + "External id": 129960, "cbid": 135, "correlation": 241688257 + } + }, + { + "ph": "f", "id": 241688257, "pid": 5717, "tid": 6759, "ts": 
6302685407868.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685493800.481, "dur": 228.450, + "args": { + "External id": 129960, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688261, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688261, "pid": 3, "tid": 7, "ts": 6302685493800.481, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685407873.341, "dur": 10.340, + "args": { + "External id": 129960, "cbid": 211, "correlation": 241688261 + } + }, + { + "ph": "s", "id": 241688261, "pid": 5717, "tid": 6759, "ts": 6302685407873.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685407971.681, "dur": 1.270, + "args": { + "External id": 129946, "cbid": 135, "correlation": 241688272 + } + }, + { + "ph": "f", "id": 241688272, "pid": 5717, "tid": 6759, "ts": 6302685407971.681, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407975.971, "dur": 1.110, + "args": { + "External id": 129946, "cbid": 147, "correlation": 241688276 + } + }, + { + "ph": "s", "id": 241688276, "pid": 5717, "tid": 6759, "ts": 6302685407975.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685407978.791, "dur": 0.740, + "args": { + "External id": 129946, "cbid": 147, "correlation": 241688280 + } + }, + { + "ph": 
"s", "id": 241688280, "pid": 5717, "tid": 6759, "ts": 6302685407978.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685494654.151, "dur": 583.077, + "args": { + "External id": 129979, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241688304, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241688304, "pid": 3, "tid": 17, "ts": 6302685494654.151, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408147.310, "dur": 18.120, + "args": { + "External id": 129979, "cbid": 211, "correlation": 241688304 + } + }, + { + "ph": "s", "id": 241688304, "pid": 5717, "tid": 6759, "ts": 6302685408147.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685495239.436, "dur": 12.288, + "args": { + "External id": 129995, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241688317, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241688317, "pid": 3, "tid": 17, "ts": 6302685495239.436, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408276.140, "dur": 9.350, + "args": { + "External id": 129995, "cbid": 211, "correlation": 241688317 + } + }, + { + "ph": "s", "id": 241688317, "pid": 5717, "tid": 6759, "ts": 6302685408276.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408315.700, "dur": 1.210, + "args": { + "External id": 129946, "cbid": 135, "correlation": 241688327 + } + }, + { + "ph": "f", "id": 241688327, "pid": 5717, "tid": 6759, "ts": 6302685408315.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685408318.820, "dur": 1.260, + "args": { + "External id": 129946, "cbid": 147, "correlation": 241688331 + } + }, + { + "ph": "s", "id": 241688331, "pid": 5717, "tid": 6759, "ts": 6302685408318.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685408378.500, "dur": 0.980, + "args": { + "External id": 129997, "cbid": 317, "correlation": 241688344 + } + }, + { + "ph": "f", "id": 241688344, "pid": 5717, "tid": 6759, "ts": 6302685408378.500, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408381.290, "dur": 1.230, + "args": { + "External id": 129997, "cbid": 135, "correlation": 241688346 + } + }, + { + "ph": "f", "id": 241688346, "pid": 5717, "tid": 6759, "ts": 6302685408381.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685408383.830, "dur": 1.050, + "args": { + "External id": 129997, "cbid": 147, "correlation": 241688350 + } + }, + { + "ph": "s", "id": 241688350, "pid": 5717, "tid": 6759, "ts": 6302685408383.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685408398.650, "dur": 0.660, + "args": { + "External id": 129997, "cbid": 409, "correlation": 241688353 + } + }, + { + "ph": "f", "id": 241688353, "pid": 5717, "tid": 6759, "ts": 6302685408398.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408403.210, "dur": 0.760, + "args": { + "External id": 129997, "cbid": 135, "correlation": 241688356 + } + }, + { + "ph": "f", "id": 241688356, "pid": 5717, "tid": 6759, "ts": 6302685408403.210, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685408404.140, "dur": 0.760, + "args": { + "External id": 129997, "cbid": 147, "correlation": 241688357 + } + }, + { + "ph": "s", "id": 241688357, "pid": 5717, "tid": 6759, "ts": 6302685408404.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685504356.689, "dur": 5074.598, + "args": { + "External id": 129997, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241688359, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241688359, "pid": 3, "tid": 20, "ts": 6302685504356.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685408406.100, "dur": 10.260, + "args": { + "External id": 129997, "cbid": 430, "correlation": 241688359 + } + }, + { + "ph": "s", "id": 241688359, "pid": 5717, "tid": 6759, "ts": 6302685408406.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408417.320, "dur": 0.390, + "args": { + "External id": 129997, "cbid": 135, "correlation": 241688361 + } + }, + { + "ph": "f", "id": 241688361, "pid": 5717, "tid": 6759, "ts": 6302685408417.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685408417.820, "dur": 0.480, + "args": { + "External id": 129997, "cbid": 147, "correlation": 241688362 + } + }, + { + "ph": "s", "id": 241688362, "pid": 5717, "tid": 6759, "ts": 6302685408417.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408419.870, "dur": 0.740, + "args": { + "External id": 129997, "cbid": 135, "correlation": 241688365 + } + }, + { + "ph": "f", "id": 241688365, "pid": 5717, "tid": 6759, "ts": 6302685408419.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408427.850, "dur": 0.420, + "args": { + "External 
id": 129997, "cbid": 135, "correlation": 241688372 + } + }, + { + "ph": "f", "id": 241688372, "pid": 5717, "tid": 6759, "ts": 6302685408427.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685408454.010, "dur": 1.000, + "args": { + "External id": 129999, "cbid": 147, "correlation": 241688377 + } + }, + { + "ph": "s", "id": 241688377, "pid": 5717, "tid": 6759, "ts": 6302685408454.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685408470.910, "dur": 0.890, + "args": { + "External id": 129946, "cbid": 135, "correlation": 241688392 + } + }, + { + "ph": "f", "id": 241688392, "pid": 5717, "tid": 6759, "ts": 6302685408470.910, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685494038.915, "dur": 1449.515, + "args": { + "External id": 130001, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688417, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688417, "pid": 3, "tid": 7, "ts": 6302685494038.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408611.349, "dur": 11.431, + "args": { + "External id": 130001, "cbid": 211, "correlation": 241688417 + } + }, + { + "ph": "s", "id": 241688417, "pid": 5717, "tid": 6759, "ts": 6302685408611.349, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685495489.102, "dur": 432.899, + "args": { + "External id": 130002, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688440, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241688440, "pid": 3, "tid": 7, "ts": 6302685495489.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408668.829, "dur": 6.040, + "args": { + "External id": 130002, "cbid": 307, "correlation": 241688440 + } + }, + { + "ph": "s", "id": 241688440, "pid": 5717, "tid": 6759, "ts": 6302685408668.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685408708.139, "dur": 0.530, + "args": { + "External id": 130003, "cbid": 200, "correlation": 241688463 + } + }, + { + "ph": "f", "id": 241688463, "pid": 5717, "tid": 6759, "ts": 6302685408708.139, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685495923.249, "dur": 1.344, + "args": { + "External id": 130003, "device": 3, "context": 1, "stream": 7, 
"correlation": 241688466, "bytes": 1536, "memory bandwidth (GB/s)": 1.1428571428571428 + } + }, + { + "ph": "f", "id": 241688466, "pid": 3, "tid": 7, "ts": 6302685495923.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685408710.369, "dur": 6.490, + "args": { + "External id": 130003, "cbid": 51, "correlation": 241688466 + } + }, + { + "ph": "s", "id": 241688466, "pid": 5717, "tid": 6759, "ts": 6302685408710.369, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685495926.001, "dur": 372.099, + "args": { + "External id": 130003, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688467, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688467, "pid": 3, "tid": 7, "ts": 6302685495926.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408717.049, "dur": 5.990, + "args": { + "External id": 130003, "cbid": 307, "correlation": 241688467 + } + }, + { + "ph": "s", "id": 241688467, "pid": 5717, "tid": 6759, "ts": 6302685408717.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685408749.639, "dur": 0.320, + "args": { + "External id": 130004, "cbid": 200, "correlation": 241688492 + } + }, + { + "ph": "f", "id": 241688492, "pid": 5717, "tid": 6759, "ts": 6302685408749.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685496304.372, "dur": 24.032, + "args": { + "External id": 130004, "device": 3, "context": 1, "stream": 7, "correlation": 241688495, "bytes": 1536, "memory bandwidth (GB/s)": 0.06391478029294274 + } + }, + { + "ph": "f", "id": 241688495, "pid": 3, "tid": 7, "ts": 6302685496304.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685408751.119, "dur": 4.340, + "args": { + "External id": 130004, "cbid": 51, "correlation": 241688495 + } + }, + { + "ph": "s", "id": 241688495, "pid": 5717, "tid": 6759, "ts": 6302685408751.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685496352.276, "dur": 440.867, + "args": { + "External id": 130004, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688496, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688496, "pid": 3, "tid": 7, "ts": 6302685496352.276, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408755.619, "dur": 5.140, + "args": { + "External id": 130004, "cbid": 307, "correlation": 241688496 + } + }, + { + "ph": "s", "id": 241688496, "pid": 5717, "tid": 6759, "ts": 6302685408755.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685408783.649, "dur": 0.340, + "args": { + "External id": 130005, "cbid": 200, "correlation": 241688521 + } + }, + { + "ph": "f", "id": 241688521, "pid": 5717, "tid": 6759, "ts": 6302685408783.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685496793.783, "dur": 358.147, + "args": { + "External id": 130005, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688524, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688524, "pid": 3, "tid": 7, "ts": 6302685496793.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408785.229, "dur": 5.200, + "args": { + "External id": 130005, "cbid": 307, "correlation": 241688524 + } + }, + { + "ph": "s", "id": 241688524, "pid": 5717, "tid": 6759, "ts": 6302685408785.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685408811.569, "dur": 0.240, + "args": { + "External id": 130006, "cbid": 200, "correlation": 241688549 + } + }, + { + "ph": "f", "id": 241688549, "pid": 5717, "tid": 6759, "ts": 6302685408811.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685497153.178, "dur": 0.896, + "args": { + "External id": 130006, "device": 3, "context": 1, "stream": 7, "correlation": 241688552, "bytes": 1536, "memory bandwidth (GB/s)": 1.7142857142857142 + } + }, + { + "ph": "f", "id": 241688552, "pid": 3, "tid": 7, "ts": 6302685497153.178, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685408812.819, "dur": 4.270, + "args": { + "External id": 130006, "cbid": 51, "correlation": 241688552 + } + }, + { + "ph": "s", "id": 241688552, "pid": 5717, "tid": 6759, "ts": 6302685408812.819, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685497155.226, "dur": 352.099, + "args": { + "External id": 130006, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688553, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688553, "pid": 3, "tid": 7, "ts": 6302685497155.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408817.219, "dur": 5.260, + "args": { + "External id": 130006, "cbid": 307, "correlation": 241688553 + } + }, + { + "ph": "s", "id": 241688553, "pid": 5717, "tid": 6759, "ts": 6302685408817.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685408844.799, "dur": 0.260, + "args": { + "External id": 130007, "cbid": 200, "correlation": 241688578 + } + }, + { + "ph": "f", "id": 241688578, "pid": 5717, "tid": 6759, "ts": 6302685408844.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685497508.029, "dur": 601.732, + "args": { + "External id": 130007, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688581, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688581, "pid": 3, "tid": 7, "ts": 6302685497508.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408846.089, "dur": 5.050, + "args": { + "External id": 130007, "cbid": 307, "correlation": 241688581 + } + }, + { + "ph": "s", "id": 241688581, "pid": 5717, "tid": 6759, "ts": 6302685408846.089, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685498110.401, "dur": 139.201, + "args": { + "External id": 130008, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688594, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688594, "pid": 3, "tid": 7, "ts": 6302685498110.401, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408891.219, "dur": 6.420, + "args": { + "External id": 130008, "cbid": 307, "correlation": 241688594 + } + }, + { + "ph": "s", "id": 241688594, "pid": 5717, "tid": 6759, "ts": 6302685408891.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685498250.242, "dur": 75.457, + "args": { + "External id": 130009, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688602, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241688602, "pid": 3, "tid": 7, "ts": 6302685498250.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408930.079, "dur": 6.530, + "args": { + "External id": 130009, "cbid": 307, "correlation": 241688602 + } + }, + { + "ph": "s", "id": 241688602, "pid": 5717, "tid": 6759, "ts": 6302685408930.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685498326.307, "dur": 114.753, + "args": { + "External id": 130010, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688610, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688610, "pid": 3, "tid": 7, "ts": 6302685498326.307, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685408969.599, "dur": 7.490, + "args": { + "External id": 130010, "cbid": 307, "correlation": 241688610 + } + }, + { + "ph": "s", "id": 241688610, "pid": 5717, "tid": 6759, "ts": 6302685408969.599, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409173.738, "dur": 0.480, + "args": { + "External id": 130029, "cbid": 200, "correlation": 241688656 + } + }, + { + "ph": "f", "id": 241688656, "pid": 5717, "tid": 6759, "ts": 6302685409173.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685498442.340, "dur": 1.184, + "args": { + "External id": 130029, "device": 3, "context": 1, "stream": 7, 
"correlation": 241688659, "bytes": 576, "memory bandwidth (GB/s)": 0.4864864864864865 + } + }, + { + "ph": "f", "id": 241688659, "pid": 3, "tid": 7, "ts": 6302685498442.340, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685409175.818, "dur": 7.350, + "args": { + "External id": 130029, "cbid": 51, "correlation": 241688659 + } + }, + { + "ph": "s", "id": 241688659, "pid": 5717, "tid": 6759, "ts": 6302685409175.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685498445.092, "dur": 142.049, + "args": { + "External id": 130029, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688660, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688660, "pid": 3, "tid": 7, "ts": 6302685498445.092, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409183.378, "dur": 8.580, + "args": { + "External id": 130029, "cbid": 307, "correlation": 241688660 + } + }, + { + "ph": "s", "id": 241688660, "pid": 5717, "tid": 6759, "ts": 6302685409183.378, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685498587.813, "dur": 139.105, + "args": { + "External id": 130030, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688682, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688682, "pid": 3, "tid": 7, "ts": 6302685498587.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409228.868, "dur": 6.010, + "args": { + "External id": 130030, "cbid": 211, "correlation": 241688682 + } + }, + { + "ph": "s", "id": 241688682, "pid": 5717, "tid": 6759, "ts": 6302685409228.868, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409311.328, "dur": 0.400, + "args": { + "External id": 130031, "cbid": 200, "correlation": 241688700 + } + }, + { + "ph": "f", "id": 241688700, "pid": 5717, "tid": 6759, "ts": 6302685409311.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409311.848, "dur": 0.200, + "args": { + "External id": 130031, "cbid": 200, "correlation": 241688701 + } + }, + { + "ph": "f", "id": 241688701, "pid": 5717, "tid": 6759, "ts": 6302685409311.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409331.168, "dur": 0.220, + "args": { + "External id": 130031, "cbid": 200, "correlation": 241688719 + } + }, + { + "ph": "f", "id": 241688719, "pid": 5717, "tid": 6759, "ts": 6302685409331.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685498727.526, "dur": 91.457, + "args": { + "External id": 130031, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688720, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688720, "pid": 3, "tid": 7, "ts": 6302685498727.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409332.618, "dur": 9.760, + "args": { + "External id": 130031, "cbid": 211, "correlation": 241688720 + } + }, + { + "ph": "s", "id": 241688720, "pid": 5717, "tid": 6759, "ts": 6302685409332.618, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409343.128, "dur": 0.980, + "args": { + "External id": 130031, "cbid": 273, "correlation": 241688722 + } + }, + { + "ph": "f", "id": 241688722, "pid": 5717, "tid": 6759, "ts": 6302685409343.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685498819.591, "dur": 1219.721, + "args": { + "External id": 130031, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688723, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241688723, "pid": 3, "tid": 7, "ts": 6302685498819.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409344.428, "dur": 4.260, + "args": { + "External id": 130031, "cbid": 211, "correlation": 241688723 + } + }, + { + "ph": "s", "id": 241688723, "pid": 5717, "tid": 6759, "ts": 6302685409344.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685500039.984, "dur": 73.440, + "args": { + "External id": 130031, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688725, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241688725, "pid": 3, "tid": 7, "ts": 6302685500039.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409349.208, "dur": 3.730, + "args": { + "External id": 130031, "cbid": 211, "correlation": 241688725 + } + }, + { + "ph": "s", "id": 241688725, "pid": 5717, "tid": 6759, "ts": 6302685409349.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685500114.128, "dur": 48.193, + "args": { + "External id": 130042, "device": 3, "context": 1, "stream": 7, "correlation": 241688747, "bytes": 25165824, "memory bandwidth (GB/s)": 522.1883676052539 + } + }, + { + "ph": "f", "id": 241688747, "pid": 3, "tid": 7, "ts": 6302685500114.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685409482.007, "dur": 17.331, + "args": { + "External id": 130042, "cbid": 41, "correlation": 241688747 + } + }, + { + "ph": "s", "id": 241688747, "pid": 5717, "tid": 6759, "ts": 6302685409482.007, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685500162.929, "dur": 31.616, + "args": { + "External id": 130039, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688765, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688765, "pid": 3, "tid": 7, "ts": 6302685500162.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409593.707, "dur": 8.340, + "args": { + "External id": 130039, "cbid": 307, "correlation": 241688765 + } + }, + { + "ph": "s", "id": 241688765, "pid": 5717, "tid": 6759, "ts": 6302685409593.707, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685500195.185, "dur": 40.544, + "args": { + "External id": 130049, "device": 3, "context": 1, "stream": 7, "correlation": 241688780, "bytes": 25165824, "memory bandwidth (GB/s)": 620.7040252565115 + } + }, + { + "ph": "f", "id": 241688780, "pid": 3, "tid": 7, "ts": 6302685500195.185, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685409667.617, "dur": 13.880, + "args": { + "External id": 130049, "cbid": 41, "correlation": 241688780 + } + }, + { + "ph": "s", "id": 241688780, "pid": 5717, "tid": 6759, "ts": 6302685409667.617, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685500236.433, "dur": 29.889, + "args": { + "External id": 130046, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688798, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688798, "pid": 3, "tid": 7, "ts": 6302685500236.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409765.687, "dur": 8.030, + "args": { + "External id": 130046, "cbid": 307, "correlation": 241688798 + } + }, + { + "ph": "s", "id": 241688798, "pid": 5717, "tid": 6759, "ts": 6302685409765.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409884.206, "dur": 0.531, + "args": { + "External id": 130054, "cbid": 200, "correlation": 241688828 + } + }, + { + "ph": "f", "id": 241688828, "pid": 5717, "tid": 6759, "ts": 6302685409884.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685500267.570, "dur": 1.216, + "args": { + "External id": 130054, "device": 3, "context": 1, "stream": 7, "correlation": 241688831, "bytes": 576, "memory bandwidth (GB/s)": 0.47368421052631576 + } + }, + { + "ph": "f", "id": 241688831, "pid": 3, "tid": 7, "ts": 6302685500267.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685409886.357, "dur": 7.120, + "args": { + "External id": 130054, "cbid": 51, "correlation": 241688831 + } + }, + { + "ph": "s", "id": 241688831, "pid": 5717, "tid": 6759, "ts": 6302685409886.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685500270.418, "dur": 147.233, + "args": { + "External id": 130054, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688832, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688832, "pid": 3, "tid": 7, "ts": 6302685500270.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409893.706, "dur": 7.551, + "args": { + "External id": 130054, "cbid": 307, "correlation": 241688832 + } + }, + { + "ph": "s", "id": 241688832, "pid": 5717, "tid": 6759, "ts": 6302685409893.706, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409926.946, "dur": 0.251, + "args": { + "External id": 130055, "cbid": 200, "correlation": 241688857 + } + }, + { + "ph": "f", "id": 241688857, "pid": 5717, "tid": 6759, "ts": 6302685409926.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685500419.027, "dur": 1.280, + "args": { + "External id": 130055, "device": 3, "context": 1, "stream": 7, "correlation": 241688860, "bytes": 576, "memory bandwidth (GB/s)": 0.45 + } + }, + { + "ph": "f", "id": 241688860, "pid": 3, "tid": 7, "ts": 6302685500419.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685409928.266, "dur": 4.331, + "args": { + "External id": 130055, "cbid": 51, "correlation": 241688860 + } + }, + { + "ph": "s", "id": 241688860, "pid": 5717, 
"tid": 6759, "ts": 6302685409928.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685500421.523, "dur": 142.433, + "args": { + "External id": 130055, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688861, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688861, "pid": 3, "tid": 7, "ts": 6302685500421.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409932.746, "dur": 5.051, + "args": { + "External id": 130055, "cbid": 307, "correlation": 241688861 + } + }, + { + "ph": "s", "id": 241688861, "pid": 5717, "tid": 6759, "ts": 6302685409932.746, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685409959.026, "dur": 0.271, + "args": { + "External id": 130056, "cbid": 200, "correlation": 241688886 + } + }, + { + "ph": "f", "id": 241688886, "pid": 5717, "tid": 6759, "ts": 6302685409959.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685500565.268, "dur": 1.056, + "args": { + "External id": 130056, "device": 3, "context": 1, "stream": 7, "correlation": 241688889, "bytes": 576, "memory bandwidth (GB/s)": 0.5454545454545454 + } + }, + { + "ph": "f", "id": 241688889, "pid": 3, "tid": 7, "ts": 6302685500565.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685409960.366, "dur": 4.000, + "args": { + "External id": 
130056, "cbid": 51, "correlation": 241688889 + } + }, + { + "ph": "s", "id": 241688889, "pid": 5717, "tid": 6759, "ts": 6302685409960.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685500567.860, "dur": 139.137, + "args": { + "External id": 130056, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688890, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688890, "pid": 3, "tid": 7, "ts": 6302685500567.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409964.486, "dur": 5.080, + "args": { + "External id": 130056, "cbid": 307, "correlation": 241688890 + } + }, + { + "ph": "s", "id": 241688890, "pid": 5717, "tid": 6759, "ts": 6302685409964.486, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685500707.669, "dur": 138.625, + "args": { + "External id": 130057, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688912, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688912, "pid": 3, "tid": 7, "ts": 6302685500707.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685409992.416, "dur": 5.550, + "args": { + "External id": 130057, "cbid": 211, "correlation": 241688912 + } + }, + { + "ph": "s", "id": 241688912, "pid": 5717, "tid": 6759, "ts": 6302685409992.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685500846.998, "dur": 298.370, + "args": { + "External id": 130058, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688935, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688935, "pid": 3, "tid": 7, "ts": 6302685500846.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410015.746, "dur": 4.850, + "args": { + "External id": 130058, "cbid": 211, "correlation": 241688935 + } + }, + { + "ph": "s", "id": 241688935, "pid": 5717, "tid": 6759, "ts": 6302685410015.746, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685501145.944, "dur": 565.285, + "args": { + "External id": 130059, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688958, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241688958, "pid": 3, "tid": 7, "ts": 6302685501145.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410037.786, "dur": 4.980, + "args": { + "External id": 130059, "cbid": 211, "correlation": 241688958 + } + }, + { + "ph": "s", "id": 241688958, "pid": 5717, "tid": 6759, "ts": 6302685410037.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685501711.837, "dur": 94.944, + "args": { + "External id": 130060, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688966, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688966, "pid": 3, "tid": 7, "ts": 6302685501711.837, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410074.816, "dur": 5.640, + "args": { + "External id": 130060, "cbid": 307, "correlation": 241688966 + } + }, + { + "ph": "s", "id": 241688966, "pid": 5717, "tid": 6759, "ts": 6302685410074.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685501807.485, "dur": 47.393, + "args": { + "External id": 130075, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241688995, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241688995, "pid": 3, "tid": 7, "ts": 6302685501807.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410227.886, "dur": 8.760, + "args": { + "External id": 130075, "cbid": 307, "correlation": 241688995 + } + }, + { + "ph": "s", "id": 241688995, "pid": 5717, "tid": 6759, "ts": 6302685410227.886, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685501855.518, "dur": 3.648, + "args": { + "External id": 130076, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689003, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241689003, "pid": 3, "tid": 7, "ts": 6302685501855.518, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410260.936, "dur": 5.220, + "args": { + "External id": 130076, "cbid": 307, "correlation": 241689003 + } + }, + { + "ph": "s", "id": 241689003, "pid": 5717, "tid": 6759, "ts": 6302685410260.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685501859.870, "dur": 45.408, + "args": { + "External id": 130077, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689014, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689014, "pid": 3, "tid": 7, "ts": 6302685501859.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410291.956, "dur": 12.530, + "args": { + "External id": 130077, "cbid": 307, "correlation": 241689014 + } + }, + { + "ph": "s", "id": 241689014, "pid": 5717, "tid": 6759, "ts": 6302685410291.956, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685501905.854, "dur": 47.232, + "args": { + "External id": 130078, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689019, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689019, "pid": 3, "tid": 7, "ts": 6302685501905.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410340.425, "dur": 7.491, + "args": { + "External id": 130078, "cbid": 211, "correlation": 241689019 + } + }, + { + "ph": "s", "id": 241689019, "pid": 5717, "tid": 6759, "ts": 6302685410340.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685410511.865, "dur": 2.830, + "args": { + "External id": 130084, "cbid": 147, "correlation": 241689036 + } + }, + { + "ph": "s", "id": 241689036, "pid": 5717, "tid": 6759, "ts": 6302685410511.865, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685410613.095, "dur": 2.310, + "args": { + "External id": 130092, "cbid": 138, "correlation": 241689051 + } + }, + { + "ph": "f", "id": 241689051, "pid": 5717, "tid": 6759, "ts": 6302685410613.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685504361.361, "dur": 3.552, + "args": { + "External id": 130096, "device": 3, "context": 1, "stream": 7, "correlation": 241689062, "bytes": 28112, "memory bandwidth (GB/s)": 7.914414414414415 + } + }, + { + "ph": "f", "id": 241689062, "pid": 3, "tid": 7, "ts": 6302685504361.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685410635.395, "dur": 11.690, + "args": { + "External id": 130096, "cbid": 41, "correlation": 241689062 + } + }, + { + "ph": "s", "id": 241689062, "pid": 5717, "tid": 6759, "ts": 6302685410635.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410651.065, "dur": 1.760, + "args": { + "External id": 130091, "cbid": 135, "correlation": 241689066 + } + }, + { + "ph": "f", "id": 241689066, "pid": 5717, "tid": 6759, "ts": 6302685410651.065, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685504366.897, "dur": 670.373, + "args": { + "External id": 130091, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689070, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689070, "pid": 3, "tid": 7, "ts": 6302685504366.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685410655.765, "dur": 9.750, + "args": { + "External id": 130091, "cbid": 211, "correlation": 241689070 + } + }, + { + "ph": "s", "id": 241689070, "pid": 5717, "tid": 6759, "ts": 6302685410655.765, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410703.235, "dur": 0.940, + "args": { + "External id": 130084, "cbid": 135, "correlation": 241689081 + } + }, + { + "ph": "f", "id": 241689081, "pid": 5717, "tid": 6759, "ts": 6302685410703.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685410706.095, "dur": 1.400, + "args": { + "External id": 130084, "cbid": 147, "correlation": 241689085 + } + }, + { + "ph": "s", "id": 241689085, "pid": 5717, "tid": 6759, "ts": 
6302685410706.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685410774.144, "dur": 0.971, + "args": { + "External id": 130100, "cbid": 317, "correlation": 241689105 + } + }, + { + "ph": "f", "id": 241689105, "pid": 5717, "tid": 6759, "ts": 6302685410774.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410776.924, "dur": 1.500, + "args": { + "External id": 130100, "cbid": 135, "correlation": 241689107 + } + }, + { + "ph": "f", "id": 241689107, "pid": 5717, "tid": 6759, "ts": 6302685410776.924, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685410779.684, "dur": 0.980, + "args": { + "External id": 130100, "cbid": 147, "correlation": 241689111 + } + }, + { + "ph": "s", "id": 241689111, "pid": 5717, "tid": 6759, "ts": 6302685410779.684, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685410794.175, "dur": 0.680, + "args": { + "External id": 130100, "cbid": 409, "correlation": 241689114 + } + }, + { + "ph": "f", "id": 241689114, "pid": 5717, "tid": 6759, "ts": 6302685410794.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410798.764, "dur": 0.800, + "args": { + "External id": 130100, "cbid": 135, "correlation": 241689117 + } + }, + { + "ph": "f", "id": 241689117, "pid": 5717, "tid": 6759, "ts": 6302685410798.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685410799.744, "dur": 0.811, + "args": { + "External id": 130100, 
"cbid": 147, "correlation": 241689118 + } + }, + { + "ph": "s", "id": 241689118, "pid": 5717, "tid": 6759, "ts": 6302685410799.744, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685509433.079, "dur": 10380.430, + "args": { + "External id": 130100, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241689120, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241689120, "pid": 3, "tid": 20, "ts": 6302685509433.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685410801.635, "dur": 9.780, + "args": { + "External id": 130100, "cbid": 430, "correlation": 241689120 + } + }, + { + "ph": "s", "id": 241689120, "pid": 5717, "tid": 6759, "ts": 6302685410801.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410812.424, "dur": 0.411, + "args": { + "External id": 130100, "cbid": 135, "correlation": 241689122 + } + }, + { + "ph": "f", "id": 241689122, "pid": 5717, "tid": 6759, "ts": 6302685410812.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685410812.944, "dur": 0.471, + "args": { + "External id": 130100, "cbid": 147, "correlation": 241689123 + } + }, 
+ { + "ph": "s", "id": 241689123, "pid": 5717, "tid": 6759, "ts": 6302685410812.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410814.975, "dur": 1.009, + "args": { + "External id": 130100, "cbid": 135, "correlation": 241689126 + } + }, + { + "ph": "f", "id": 241689126, "pid": 5717, "tid": 6759, "ts": 6302685410814.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410823.684, "dur": 0.420, + "args": { + "External id": 130100, "cbid": 135, "correlation": 241689133 + } + }, + { + "ph": "f", "id": 241689133, "pid": 5717, "tid": 6759, "ts": 6302685410823.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685410849.335, "dur": 1.009, + "args": { + "External id": 130102, "cbid": 147, "correlation": 241689138 + } + }, + { + "ph": "s", "id": 241689138, "pid": 5717, "tid": 6759, "ts": 6302685410849.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685410866.194, "dur": 0.850, + "args": { + "External id": 130084, "cbid": 135, "correlation": 241689153 + } + }, + { + "ph": "f", "id": 241689153, "pid": 5717, "tid": 6759, "ts": 6302685410866.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411107.174, "dur": 1.450, + "args": { + "External id": 130084, "cbid": 135, "correlation": 241689166 + } + }, + { + "ph": "f", "id": 241689166, "pid": 5717, "tid": 6759, "ts": 6302685411107.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411211.903, 
"dur": 3.031, + "args": { + "External id": 130112, "cbid": 147, "correlation": 241689177 + } + }, + { + "ph": "s", "id": 241689177, "pid": 5717, "tid": 6759, "ts": 6302685411211.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685411332.953, "dur": 1.230, + "args": { + "External id": 130126, "cbid": 317, "correlation": 241689218 + } + }, + { + "ph": "f", "id": 241689218, "pid": 5717, "tid": 6759, "ts": 6302685411332.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685411341.603, "dur": 2.320, + "args": { + "External id": 130127, "cbid": 138, "correlation": 241689221 + } + }, + { + "ph": "f", "id": 241689221, "pid": 5717, "tid": 6759, "ts": 6302685411341.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685509434.167, "dur": 1.952, + "args": { + "External id": 130131, "device": 3, "context": 1, "stream": 7, "correlation": 241689232, "bytes": 7224, "memory bandwidth (GB/s)": 3.7008196721311477 + } + }, + { + "ph": "f", "id": 241689232, "pid": 3, "tid": 7, "ts": 6302685509434.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685411364.163, "dur": 11.880, + "args": { + "External id": 130131, "cbid": 41, "correlation": 241689232 + } + }, + { + "ph": "s", "id": 241689232, "pid": 5717, "tid": 6759, "ts": 6302685411364.163, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411380.533, "dur": 1.860, + "args": { + "External id": 130126, "cbid": 135, "correlation": 241689236 + } + }, + { + "ph": "f", "id": 241689236, "pid": 5717, "tid": 6759, "ts": 
6302685411380.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685509438.039, "dur": 241.506, + "args": { + "External id": 130126, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689240, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689240, "pid": 3, "tid": 7, "ts": 6302685509438.039, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685411384.763, "dur": 10.010, + "args": { + "External id": 130126, "cbid": 211, "correlation": 241689240 + } + }, + { + "ph": "s", "id": 241689240, "pid": 5717, "tid": 6759, "ts": 6302685411384.763, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411485.633, "dur": 1.220, + "args": { + "External id": 130112, "cbid": 135, "correlation": 241689251 + } + }, + { + "ph": "f", "id": 241689251, "pid": 5717, "tid": 6759, "ts": 6302685411485.633, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411489.603, "dur": 1.160, + "args": { + "External id": 130112, "cbid": 147, "correlation": 241689255 + } + }, + { + "ph": "s", "id": 241689255, "pid": 5717, "tid": 6759, "ts": 6302685411489.603, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411492.393, "dur": 0.820, + "args": { + "External id": 130112, "cbid": 147, "correlation": 241689259 + } + }, + { + "ph": 
"s", "id": 241689259, "pid": 5717, "tid": 6759, "ts": 6302685411492.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685510284.797, "dur": 659.557, + "args": { + "External id": 130145, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241689283, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241689283, "pid": 3, "tid": 17, "ts": 6302685510284.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685411654.362, "dur": 11.960, + "args": { + "External id": 130145, "cbid": 211, "correlation": 241689283 + } + }, + { + "ph": "s", "id": 241689283, "pid": 5717, "tid": 6759, "ts": 6302685411654.362, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 3, "tid": 17, + "ts": 6302685511023.203, "dur": 15.488, + "args": { + "External id": 130161, "queued": 0, "device": 3, "context": 1, "stream": 17, "correlation": 241689296, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241689296, "pid": 3, "tid": 17, "ts": 6302685511023.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685411763.652, "dur": 9.370, + "args": { + "External id": 130161, "cbid": 211, "correlation": 241689296 + } + }, + { + "ph": "s", "id": 241689296, "pid": 5717, "tid": 6759, "ts": 6302685411763.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411796.782, "dur": 3.230, + "args": { + "External id": 130112, "cbid": 135, "correlation": 241689306 + } + }, + { + "ph": "f", "id": 241689306, "pid": 5717, "tid": 6759, "ts": 6302685411796.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411802.882, "dur": 1.370, + "args": { + "External id": 130112, "cbid": 147, "correlation": 241689310 + } + }, + { + "ph": "s", "id": 241689310, "pid": 5717, "tid": 6759, "ts": 6302685411802.882, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685411855.972, "dur": 0.910, + "args": { + "External id": 130163, "cbid": 317, "correlation": 241689323 + } + }, + { + "ph": "f", "id": 241689323, "pid": 5717, "tid": 6759, "ts": 6302685411855.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411858.752, "dur": 1.060, + "args": { + "External id": 130163, "cbid": 135, "correlation": 241689325 + } + }, + { + "ph": "f", "id": 241689325, "pid": 5717, "tid": 6759, "ts": 6302685411858.752, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685411861.132, "dur": 1.030, + "args": { + "External id": 130163, "cbid": 147, "correlation": 241689329 + } + }, + { + "ph": "s", "id": 241689329, "pid": 5717, "tid": 6759, "ts": 6302685411861.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685411875.832, "dur": 0.700, + "args": { + "External id": 130163, "cbid": 409, "correlation": 241689332 + } + }, + { + "ph": "f", "id": 241689332, "pid": 5717, "tid": 6759, "ts": 6302685411875.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411880.372, "dur": 0.680, + "args": { + "External id": 130163, "cbid": 135, "correlation": 241689335 + } + }, + { + "ph": "f", "id": 241689335, "pid": 5717, "tid": 6759, "ts": 6302685411880.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411881.232, "dur": 0.720, + "args": { + "External id": 130163, "cbid": 147, "correlation": 241689336 + } + }, + { + "ph": "s", "id": 241689336, "pid": 5717, "tid": 6759, "ts": 6302685411881.232, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685519814.181, "dur": 4975.749, + "args": { + "External id": 130163, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241689338, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241689338, "pid": 3, "tid": 20, "ts": 6302685519814.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685411883.052, "dur": 9.760, + "args": { + "External id": 130163, "cbid": 430, "correlation": 241689338 + } + }, + { + "ph": "s", "id": 241689338, "pid": 5717, "tid": 6759, "ts": 6302685411883.052, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411893.802, "dur": 0.390, + "args": { + "External id": 130163, "cbid": 135, "correlation": 241689340 + } + }, + { + "ph": "f", "id": 241689340, "pid": 5717, "tid": 6759, "ts": 6302685411893.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411894.312, "dur": 0.580, + "args": { + "External id": 130163, "cbid": 147, "correlation": 241689341 + } + }, + { + "ph": "s", "id": 241689341, "pid": 5717, "tid": 6759, "ts": 6302685411894.312, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411896.412, "dur": 2.430, + "args": { + "External id": 130163, "cbid": 135, "correlation": 241689344 + } + }, + { + "ph": "f", "id": 241689344, "pid": 5717, "tid": 6759, "ts": 6302685411896.412, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411909.742, "dur": 0.410, + "args": { + "External id": 
130163, "cbid": 135, "correlation": 241689351 + } + }, + { + "ph": "f", "id": 241689351, "pid": 5717, "tid": 6759, "ts": 6302685411909.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685411938.082, "dur": 1.060, + "args": { + "External id": 130165, "cbid": 147, "correlation": 241689356 + } + }, + { + "ph": "s", "id": 241689356, "pid": 5717, "tid": 6759, "ts": 6302685411938.082, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685411954.632, "dur": 0.960, + "args": { + "External id": 130112, "cbid": 135, "correlation": 241689371 + } + }, + { + "ph": "f", "id": 241689371, "pid": 5717, "tid": 6759, "ts": 6302685411954.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685509692.345, "dur": 1590.348, + "args": { + "External id": 130167, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689396, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689396, "pid": 3, "tid": 7, "ts": 6302685509692.345, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412109.921, "dur": 11.271, + "args": { + "External id": 130167, "cbid": 211, "correlation": 241689396 + } + }, + { + "ph": "s", "id": 241689396, "pid": 5717, "tid": 6759, "ts": 6302685412109.921, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685511283.365, "dur": 429.027, + "args": { + "External id": 130168, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689419, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241689419, "pid": 3, "tid": 7, "ts": 6302685511283.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412175.492, "dur": 7.440, + "args": { + "External id": 130168, "cbid": 307, "correlation": 241689419 + } + }, + { + "ph": "s", "id": 241689419, "pid": 5717, "tid": 6759, "ts": 6302685412175.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412226.511, "dur": 0.650, + "args": { + "External id": 130169, "cbid": 200, "correlation": 241689442 + } + }, + { + "ph": "f", "id": 241689442, "pid": 5717, "tid": 6759, "ts": 6302685412226.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685511713.672, "dur": 1.152, + "args": { + "External id": 130169, "device": 3, "context": 1, "stream": 7, 
"correlation": 241689445, "bytes": 1536, "memory bandwidth (GB/s)": 1.3333333333333333 + } + }, + { + "ph": "f", "id": 241689445, "pid": 3, "tid": 7, "ts": 6302685511713.672, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685412229.971, "dur": 8.270, + "args": { + "External id": 130169, "cbid": 51, "correlation": 241689445 + } + }, + { + "ph": "s", "id": 241689445, "pid": 5717, "tid": 6759, "ts": 6302685412229.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685511716.360, "dur": 410.627, + "args": { + "External id": 130169, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689446, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689446, "pid": 3, "tid": 7, "ts": 6302685511716.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412238.541, "dur": 7.780, + "args": { + "External id": 130169, "cbid": 307, "correlation": 241689446 + } + }, + { + "ph": "s", "id": 241689446, "pid": 5717, "tid": 6759, "ts": 6302685412238.541, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412281.591, "dur": 0.340, + "args": { + "External id": 130170, "cbid": 200, "correlation": 241689471 + } + }, + { + "ph": "f", "id": 241689471, "pid": 5717, "tid": 6759, "ts": 6302685412281.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685512157.163, "dur": 3.520, + "args": { + "External id": 130170, "device": 3, "context": 1, "stream": 7, "correlation": 241689474, "bytes": 1536, "memory bandwidth (GB/s)": 0.43636363636363634 + } + }, + { + "ph": "f", "id": 241689474, "pid": 3, "tid": 7, "ts": 6302685512157.163, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685412283.351, "dur": 5.610, + "args": { + "External id": 130170, "cbid": 51, "correlation": 241689474 + } + }, + { + "ph": "s", "id": 241689474, "pid": 5717, "tid": 6759, "ts": 6302685412283.351, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685512194.988, "dur": 435.267, + "args": { + "External id": 130170, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689475, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689475, "pid": 3, "tid": 7, "ts": 6302685512194.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412289.201, "dur": 6.700, + "args": { + "External id": 130170, "cbid": 307, "correlation": 241689475 + } + }, + { + "ph": "s", "id": 241689475, "pid": 5717, "tid": 6759, "ts": 6302685412289.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412336.751, "dur": 0.320, + "args": { + "External id": 130171, "cbid": 200, "correlation": 241689500 + } + }, + { + "ph": "f", "id": 241689500, "pid": 5717, "tid": 6759, "ts": 6302685412336.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685512630.959, "dur": 350.946, + "args": { + "External id": 130171, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689503, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689503, "pid": 3, "tid": 7, "ts": 6302685512630.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412339.571, "dur": 8.740, + "args": { + "External id": 130171, "cbid": 307, "correlation": 241689503 + } + }, + { + "ph": "s", "id": 241689503, "pid": 5717, "tid": 6759, "ts": 6302685412339.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412387.811, "dur": 0.270, + "args": { + "External id": 130172, "cbid": 200, "correlation": 241689528 + } + }, + { + "ph": "f", "id": 241689528, "pid": 5717, "tid": 6759, "ts": 6302685412387.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685512983.121, "dur": 1.184, + "args": { + "External id": 130172, "device": 3, "context": 1, "stream": 7, "correlation": 241689531, "bytes": 1536, "memory bandwidth (GB/s)": 1.2972972972972974 + } + }, + { + "ph": "f", "id": 241689531, "pid": 3, "tid": 7, "ts": 6302685512983.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685412389.301, "dur": 8.510, + "args": { + "External id": 130172, "cbid": 51, "correlation": 241689531 + } + }, + { + "ph": "s", "id": 241689531, "pid": 5717, "tid": 6759, "ts": 6302685412389.301, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685512985.649, "dur": 365.411, + "args": { + "External id": 130172, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689532, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689532, "pid": 3, "tid": 7, "ts": 6302685512985.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412398.011, "dur": 7.630, + "args": { + "External id": 130172, "cbid": 307, "correlation": 241689532 + } + }, + { + "ph": "s", "id": 241689532, "pid": 5717, "tid": 6759, "ts": 6302685412398.011, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412446.351, "dur": 0.290, + "args": { + "External id": 130173, "cbid": 200, "correlation": 241689557 + } + }, + { + "ph": "f", "id": 241689557, "pid": 5717, "tid": 6759, "ts": 6302685412446.351, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685513351.668, "dur": 473.828, + "args": { + "External id": 130173, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689560, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689560, "pid": 3, "tid": 7, "ts": 6302685513351.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412447.901, "dur": 5.890, + "args": { + "External id": 130173, "cbid": 307, "correlation": 241689560 + } + }, + { + "ph": "s", "id": 241689560, "pid": 5717, "tid": 6759, "ts": 6302685412447.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685513826.104, "dur": 158.337, + "args": { + "External id": 130174, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689573, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689573, "pid": 3, "tid": 7, "ts": 6302685513826.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412495.001, "dur": 6.310, + "args": { + "External id": 130174, "cbid": 307, "correlation": 241689573 + } + }, + { + "ph": "s", "id": 241689573, "pid": 5717, "tid": 6759, "ts": 6302685412495.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685513985.081, "dur": 37.568, + "args": { + "External id": 130175, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689581, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241689581, "pid": 3, "tid": 7, "ts": 6302685513985.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412527.851, "dur": 5.460, + "args": { + "External id": 130175, "cbid": 307, "correlation": 241689581 + } + }, + { + "ph": "s", "id": 241689581, "pid": 5717, "tid": 6759, "ts": 6302685412527.851, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685514023.353, "dur": 114.913, + "args": { + "External id": 130176, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689589, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689589, "pid": 3, "tid": 7, "ts": 6302685514023.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412560.791, "dur": 5.440, + "args": { + "External id": 130176, "cbid": 307, "correlation": 241689589 + } + }, + { + "ph": "s", "id": 241689589, "pid": 5717, "tid": 6759, "ts": 6302685412560.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412756.230, "dur": 0.500, + "args": { + "External id": 130195, "cbid": 200, "correlation": 241689635 + } + }, + { + "ph": "f", "id": 241689635, "pid": 5717, "tid": 6759, "ts": 6302685412756.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685514139.450, "dur": 1.088, + "args": { + "External id": 130195, "device": 3, "context": 1, "stream": 7, 
"correlation": 241689638, "bytes": 576, "memory bandwidth (GB/s)": 0.5294117647058824 + } + }, + { + "ph": "f", "id": 241689638, "pid": 3, "tid": 7, "ts": 6302685514139.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685412758.530, "dur": 8.070, + "args": { + "External id": 130195, "cbid": 51, "correlation": 241689638 + } + }, + { + "ph": "s", "id": 241689638, "pid": 5717, "tid": 6759, "ts": 6302685412758.530, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685514142.010, "dur": 140.865, + "args": { + "External id": 130195, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689639, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689639, "pid": 3, "tid": 7, "ts": 6302685514142.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412766.830, "dur": 8.870, + "args": { + "External id": 130195, "cbid": 307, "correlation": 241689639 + } + }, + { + "ph": "s", "id": 241689639, "pid": 5717, "tid": 6759, "ts": 6302685412766.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685514283.611, "dur": 139.521, + "args": { + "External id": 130196, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689661, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689661, "pid": 3, "tid": 7, "ts": 6302685514283.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412805.990, "dur": 6.460, + "args": { + "External id": 130196, "cbid": 211, "correlation": 241689661 + } + }, + { + "ph": "s", "id": 241689661, "pid": 5717, "tid": 6759, "ts": 6302685412805.990, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412888.250, "dur": 0.450, + "args": { + "External id": 130197, "cbid": 200, "correlation": 241689679 + } + }, + { + "ph": "f", "id": 241689679, "pid": 5717, "tid": 6759, "ts": 6302685412888.250, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412888.840, "dur": 0.260, + "args": { + "External id": 130197, "cbid": 200, "correlation": 241689680 + } + }, + { + "ph": "f", "id": 241689680, "pid": 5717, "tid": 6759, "ts": 6302685412888.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412908.860, "dur": 0.260, + "args": { + "External id": 130197, "cbid": 200, "correlation": 241689698 + } + }, + { + "ph": "f", "id": 241689698, "pid": 5717, "tid": 6759, "ts": 6302685412908.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685514423.900, "dur": 138.241, + "args": { + "External id": 130197, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689699, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689699, "pid": 3, "tid": 7, "ts": 6302685514423.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412910.460, "dur": 10.120, + "args": { + "External id": 130197, "cbid": 211, "correlation": 241689699 + } + }, + { + "ph": "s", "id": 241689699, "pid": 5717, "tid": 6759, "ts": 6302685412910.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685412921.390, "dur": 1.030, + "args": { + "External id": 130197, "cbid": 273, "correlation": 241689701 + } + }, + { + "ph": "f", "id": 241689701, "pid": 5717, "tid": 6759, "ts": 6302685412921.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685514562.749, "dur": 1394.763, + "args": { + "External id": 130197, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689702, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241689702, "pid": 3, "tid": 7, "ts": 6302685514562.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412922.760, "dur": 4.600, + "args": { + "External id": 130197, "cbid": 211, "correlation": 241689702 + } + }, + { + "ph": "s", "id": 241689702, "pid": 5717, "tid": 6759, "ts": 6302685412922.760, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685515958.120, "dur": 72.320, + "args": { + "External id": 130197, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689704, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241689704, "pid": 3, "tid": 7, "ts": 6302685515958.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685412927.940, "dur": 3.790, + "args": { + "External id": 130197, "cbid": 211, "correlation": 241689704 + } + }, + { + "ph": "s", "id": 241689704, "pid": 5717, "tid": 6759, "ts": 6302685412927.940, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685516031.144, "dur": 48.417, + "args": { + "External id": 130208, "device": 3, "context": 1, "stream": 7, "correlation": 241689726, "bytes": 25165824, "memory bandwidth (GB/s)": 519.7724766094553 + } + }, + { + "ph": "f", "id": 241689726, "pid": 3, "tid": 7, "ts": 6302685516031.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685413069.889, "dur": 18.450, + "args": { + "External id": 130208, "cbid": 41, "correlation": 241689726 + } + }, + { + "ph": "s", "id": 241689726, "pid": 5717, "tid": 6759, "ts": 6302685413069.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685516080.201, "dur": 33.440, + "args": { + "External id": 130205, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689744, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689744, "pid": 3, "tid": 7, "ts": 6302685516080.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413191.629, "dur": 12.620, + "args": { + "External id": 130205, "cbid": 307, "correlation": 241689744 + } + }, + { + "ph": "s", "id": 241689744, "pid": 5717, "tid": 6759, "ts": 6302685413191.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685516114.249, "dur": 39.680, + "args": { + "External id": 130215, "device": 3, "context": 1, "stream": 7, "correlation": 241689759, "bytes": 25165824, "memory bandwidth (GB/s)": 634.2193548387097 + } + }, + { + "ph": "f", "id": 241689759, "pid": 3, "tid": 7, "ts": 6302685516114.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685413270.559, "dur": 15.610, + "args": { + "External id": 130215, "cbid": 41, "correlation": 241689759 + } + }, + { + "ph": "s", "id": 241689759, "pid": 5717, "tid": 6759, "ts": 6302685413270.559, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685516154.569, "dur": 28.321, + "args": { + "External id": 130212, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689777, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689777, "pid": 3, "tid": 7, "ts": 6302685516154.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413407.979, "dur": 14.679, + "args": { + "External id": 130212, "cbid": 307, "correlation": 241689777 + } + }, + { + "ph": "s", "id": 241689777, "pid": 5717, "tid": 6759, "ts": 6302685413407.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685413570.178, "dur": 0.620, + "args": { + "External id": 130220, "cbid": 200, "correlation": 241689807 + } + }, + { + "ph": "f", "id": 241689807, "pid": 5717, "tid": 6759, "ts": 6302685413570.178, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685516184.106, "dur": 0.800, + "args": { + "External id": 130220, "device": 3, "context": 1, "stream": 7, "correlation": 241689810, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 241689810, "pid": 3, "tid": 7, "ts": 6302685516184.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685413572.668, "dur": 8.370, + "args": { + "External id": 130220, "cbid": 51, "correlation": 241689810 + } + }, + { + "ph": "s", "id": 241689810, "pid": 5717, "tid": 6759, "ts": 6302685413572.668, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685516186.090, "dur": 146.433, + "args": { + "External id": 130220, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689811, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689811, "pid": 3, "tid": 7, "ts": 6302685516186.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413581.328, "dur": 8.940, + "args": { + "External id": 130220, "cbid": 307, "correlation": 241689811 + } + }, + { + "ph": "s", "id": 241689811, "pid": 5717, "tid": 6759, "ts": 6302685413581.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685413620.398, "dur": 0.310, + "args": { + "External id": 130221, "cbid": 200, "correlation": 241689836 + } + }, + { + "ph": "f", "id": 241689836, "pid": 5717, "tid": 6759, "ts": 6302685413620.398, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685516333.803, "dur": 1.280, + "args": { + "External id": 130221, "device": 3, "context": 1, "stream": 7, "correlation": 241689839, "bytes": 576, "memory bandwidth (GB/s)": 0.45 + } + }, + { + "ph": "f", "id": 241689839, "pid": 3, "tid": 7, "ts": 6302685516333.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685413621.928, "dur": 4.910, + "args": { + "External id": 130221, "cbid": 51, "correlation": 241689839 + } + }, + { + "ph": "s", "id": 241689839, "pid": 5717, "tid": 6759, 
"ts": 6302685413621.928, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685516336.427, "dur": 176.449, + "args": { + "External id": 130221, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689840, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689840, "pid": 3, "tid": 7, "ts": 6302685516336.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413626.998, "dur": 5.710, + "args": { + "External id": 130221, "cbid": 307, "correlation": 241689840 + } + }, + { + "ph": "s", "id": 241689840, "pid": 5717, "tid": 6759, "ts": 6302685413626.998, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685413657.498, "dur": 0.290, + "args": { + "External id": 130222, "cbid": 200, "correlation": 241689865 + } + }, + { + "ph": "f", "id": 241689865, "pid": 5717, "tid": 6759, "ts": 6302685413657.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685516514.092, "dur": 1.152, + "args": { + "External id": 130222, "device": 3, "context": 1, "stream": 7, "correlation": 241689868, "bytes": 576, "memory bandwidth (GB/s)": 0.5 + } + }, + { + "ph": "f", "id": 241689868, "pid": 3, "tid": 7, "ts": 6302685516514.092, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685413658.988, "dur": 4.670, + "args": { + "External id": 130222, "cbid": 51, 
"correlation": 241689868 + } + }, + { + "ph": "s", "id": 241689868, "pid": 5717, "tid": 6759, "ts": 6302685413658.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685516516.652, "dur": 138.817, + "args": { + "External id": 130222, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689869, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689869, "pid": 3, "tid": 7, "ts": 6302685516516.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413663.858, "dur": 5.300, + "args": { + "External id": 130222, "cbid": 307, "correlation": 241689869 + } + }, + { + "ph": "s", "id": 241689869, "pid": 5717, "tid": 6759, "ts": 6302685413663.858, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685516656.173, "dur": 220.802, + "args": { + "External id": 130223, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689891, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689891, "pid": 3, "tid": 7, "ts": 6302685516656.173, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413695.278, "dur": 6.090, + "args": { + "External id": 130223, "cbid": 211, "correlation": 241689891 + } + }, + { + "ph": "s", "id": 241689891, "pid": 5717, "tid": 6759, "ts": 6302685413695.278, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685516877.583, "dur": 531.428, + "args": { + "External id": 130224, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689914, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689914, "pid": 3, "tid": 7, "ts": 6302685516877.583, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413721.768, "dur": 5.430, + "args": { + "External id": 130224, "cbid": 211, "correlation": 241689914 + } + }, + { + "ph": "s", "id": 241689914, "pid": 5717, "tid": 6759, "ts": 6302685413721.768, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685517409.683, "dur": 185.409, + "args": { + "External id": 130225, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689937, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241689937, "pid": 3, "tid": 7, "ts": 6302685517409.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413758.248, "dur": 9.040, + "args": { + "External id": 130225, "cbid": 211, "correlation": 241689937 + } + }, + { + "ph": "s", "id": 241689937, "pid": 5717, "tid": 6759, "ts": 6302685413758.248, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685517595.732, "dur": 81.761, + "args": { + "External id": 130226, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689945, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689945, "pid": 3, "tid": 7, "ts": 6302685517595.732, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685413817.398, "dur": 6.110, + "args": { + "External id": 130226, "cbid": 307, "correlation": 241689945 + } + }, + { + "ph": "s", "id": 241689945, "pid": 5717, "tid": 6759, "ts": 6302685413817.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685517678.165, "dur": 45.440, + "args": { + "External id": 130241, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689974, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689974, "pid": 3, "tid": 7, "ts": 6302685517678.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685414040.717, "dur": 10.810, + "args": { + "External id": 130241, "cbid": 307, "correlation": 241689974 + } + }, + { + "ph": "s", "id": 241689974, "pid": 5717, "tid": 6759, "ts": 6302685414040.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685517724.277, "dur": 3.648, + "args": { + "External id": 130242, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689982, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241689982, "pid": 3, "tid": 7, "ts": 6302685517724.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685414078.917, "dur": 5.900, + "args": { + "External id": 130242, "cbid": 307, "correlation": 241689982 + } + }, + { + "ph": "s", "id": 241689982, "pid": 5717, "tid": 6759, "ts": 6302685414078.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685517728.565, "dur": 48.993, + "args": { + "External id": 130243, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689993, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689993, "pid": 3, "tid": 7, "ts": 6302685517728.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685414120.537, "dur": 7.040, + "args": { + "External id": 130243, "cbid": 307, "correlation": 241689993 + } + }, + { + "ph": "s", "id": 241689993, "pid": 5717, "tid": 6759, "ts": 6302685414120.537, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685517778.262, "dur": 47.328, + "args": { + "External id": 130244, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241689998, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241689998, "pid": 3, "tid": 7, "ts": 6302685517778.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685414175.407, "dur": 9.440, + "args": { + "External id": 130244, "cbid": 211, "correlation": 241689998 + } + }, + { + "ph": "s", "id": 241689998, "pid": 5717, "tid": 6759, "ts": 6302685414175.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685414462.736, "dur": 3.530, + "args": { + "External id": 130250, "cbid": 147, "correlation": 241690015 + } + }, + { + "ph": "s", "id": 241690015, "pid": 5717, "tid": 6759, "ts": 6302685414462.736, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685414601.656, "dur": 3.140, + "args": { + "External id": 130258, "cbid": 138, "correlation": 241690030 + } + }, + { + "ph": "f", "id": 241690030, "pid": 5717, "tid": 6759, "ts": 6302685414601.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685519820.677, "dur": 3.936, + "args": { + "External id": 130262, "device": 3, "context": 1, "stream": 7, "correlation": 241690041, "bytes": 28112, "memory bandwidth (GB/s)": 7.142276422764228 + } + }, + { + "ph": "f", "id": 241690041, "pid": 3, "tid": 7, "ts": 6302685519820.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685414630.376, "dur": 14.640, + "args": { + "External id": 130262, "cbid": 41, "correlation": 241690041 + } + }, + { + "ph": "s", "id": 241690041, "pid": 5717, "tid": 6759, "ts": 6302685414630.376, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414650.086, "dur": 2.330, + "args": { + "External id": 130257, "cbid": 135, "correlation": 241690045 + } + }, + { + "ph": "f", "id": 241690045, "pid": 5717, "tid": 6759, "ts": 6302685414650.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685519918.246, "dur": 443.907, + "args": { + "External id": 130257, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690049, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690049, "pid": 3, "tid": 7, "ts": 6302685519918.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685414656.246, "dur": 12.220, + "args": { + "External id": 130257, "cbid": 211, "correlation": 241690049 + } + }, + { + "ph": "s", "id": 241690049, "pid": 5717, "tid": 6759, "ts": 6302685414656.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414716.556, "dur": 1.260, + "args": { + "External id": 130250, "cbid": 135, "correlation": 241690060 + } + }, + { + "ph": "f", "id": 241690060, "pid": 5717, "tid": 6759, "ts": 6302685414716.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685414720.256, "dur": 1.560, + "args": { + "External id": 130250, "cbid": 147, "correlation": 241690064 + } + }, + { + "ph": "s", "id": 241690064, "pid": 5717, "tid": 6759, "ts": 
6302685414720.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685414807.646, "dur": 1.229, + "args": { + "External id": 130266, "cbid": 317, "correlation": 241690084 + } + }, + { + "ph": "f", "id": 241690084, "pid": 5717, "tid": 6759, "ts": 6302685414807.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414811.275, "dur": 3.260, + "args": { + "External id": 130266, "cbid": 135, "correlation": 241690086 + } + }, + { + "ph": "f", "id": 241690086, "pid": 5717, "tid": 6759, "ts": 6302685414811.275, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685414817.466, "dur": 1.340, + "args": { + "External id": 130266, "cbid": 147, "correlation": 241690090 + } + }, + { + "ph": "s", "id": 241690090, "pid": 5717, "tid": 6759, "ts": 6302685414817.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685414845.655, "dur": 2.200, + "args": { + "External id": 130266, "cbid": 409, "correlation": 241690093 + } + }, + { + "ph": "f", "id": 241690093, "pid": 5717, "tid": 6759, "ts": 6302685414845.655, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414854.025, "dur": 2.210, + "args": { + "External id": 130266, "cbid": 135, "correlation": 241690096 + } + }, + { + "ph": "f", "id": 241690096, "pid": 5717, "tid": 6759, "ts": 6302685414854.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685414856.465, "dur": 1.000, + "args": { + "External id": 130266, 
"cbid": 147, "correlation": 241690097 + } + }, + { + "ph": "s", "id": 241690097, "pid": 5717, "tid": 6759, "ts": 6302685414856.465, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685524792.458, "dur": 9382.599, + "args": { + "External id": 130266, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241690099, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241690099, "pid": 3, "tid": 20, "ts": 6302685524792.458, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685414860.235, "dur": 18.800, + "args": { + "External id": 130266, "cbid": 430, "correlation": 241690099 + } + }, + { + "ph": "s", "id": 241690099, "pid": 5717, "tid": 6759, "ts": 6302685414860.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414880.355, "dur": 1.770, + "args": { + "External id": 130266, "cbid": 135, "correlation": 241690101 + } + }, + { + "ph": "f", "id": 241690101, "pid": 5717, "tid": 6759, "ts": 6302685414880.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685414882.295, "dur": 0.700, + "args": { + "External id": 130266, "cbid": 147, "correlation": 241690102 + } + }, 
+ { + "ph": "s", "id": 241690102, "pid": 5717, "tid": 6759, "ts": 6302685414882.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414886.435, "dur": 0.980, + "args": { + "External id": 130266, "cbid": 135, "correlation": 241690105 + } + }, + { + "ph": "f", "id": 241690105, "pid": 5717, "tid": 6759, "ts": 6302685414886.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414896.995, "dur": 0.520, + "args": { + "External id": 130266, "cbid": 135, "correlation": 241690112 + } + }, + { + "ph": "f", "id": 241690112, "pid": 5717, "tid": 6759, "ts": 6302685414896.995, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685414930.115, "dur": 1.270, + "args": { + "External id": 130268, "cbid": 147, "correlation": 241690117 + } + }, + { + "ph": "s", "id": 241690117, "pid": 5717, "tid": 6759, "ts": 6302685414930.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685414955.775, "dur": 1.190, + "args": { + "External id": 130250, "cbid": 135, "correlation": 241690132 + } + }, + { + "ph": "f", "id": 241690132, "pid": 5717, "tid": 6759, "ts": 6302685414955.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685415328.074, "dur": 1.970, + "args": { + "External id": 130250, "cbid": 135, "correlation": 241690145 + } + }, + { + "ph": "f", "id": 241690145, "pid": 5717, "tid": 6759, "ts": 6302685415328.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685415476.584, 
"dur": 4.360, + "args": { + "External id": 130278, "cbid": 147, "correlation": 241690156 + } + }, + { + "ph": "s", "id": 241690156, "pid": 5717, "tid": 6759, "ts": 6302685415476.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685415683.764, "dur": 1.640, + "args": { + "External id": 130292, "cbid": 317, "correlation": 241690197 + } + }, + { + "ph": "f", "id": 241690197, "pid": 5717, "tid": 6759, "ts": 6302685415683.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685415695.853, "dur": 3.400, + "args": { + "External id": 130293, "cbid": 138, "correlation": 241690200 + } + }, + { + "ph": "f", "id": 241690200, "pid": 5717, "tid": 6759, "ts": 6302685415695.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685524791.882, "dur": 2.720, + "args": { + "External id": 130297, "device": 3, "context": 1, "stream": 7, "correlation": 241690211, "bytes": 7224, "memory bandwidth (GB/s)": 2.6558823529411764 + } + }, + { + "ph": "f", "id": 241690211, "pid": 3, "tid": 7, "ts": 6302685524791.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685415726.983, "dur": 16.670, + "args": { + "External id": 130297, "cbid": 41, "correlation": 241690211 + } + }, + { + "ph": "s", "id": 241690211, "pid": 5717, "tid": 6759, "ts": 6302685415726.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685415749.863, "dur": 2.440, + "args": { + "External id": 130292, "cbid": 135, "correlation": 241690215 + } + }, + { + "ph": "f", "id": 241690215, "pid": 5717, "tid": 6759, "ts": 
6302685415749.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 3, "tid": 7, + "ts": 6302685524796.490, "dur": 126.401, + "args": { + "External id": 130292, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690219, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690219, "pid": 3, "tid": 7, "ts": 6302685524796.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685415755.833, "dur": 13.930, + "args": { + "External id": 130292, "cbid": 211, "correlation": 241690219 + } + }, + { + "ph": "s", "id": 241690219, "pid": 5717, "tid": 6759, "ts": 6302685415755.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685415894.853, "dur": 1.670, + "args": { + "External id": 130278, "cbid": 135, "correlation": 241690230 + } + }, + { + "ph": "f", "id": 241690230, "pid": 5717, "tid": 6759, "ts": 6302685415894.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685415900.803, "dur": 1.610, + "args": { + "External id": 130278, "cbid": 147, "correlation": 241690234 + } + }, + { + "ph": "s", "id": 241690234, "pid": 5717, "tid": 6759, "ts": 6302685415900.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685415904.803, "dur": 1.120, + "args": { + "External id": 130278, "cbid": 147, "correlation": 241690238 + } + }, + { + "ph": 
"s", "id": 241690238, "pid": 5717, "tid": 6759, "ts": 6302685415904.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 3, "tid": 7, + "ts": 6302685524967.852, "dur": 1682.188, + "args": { + "External id": 130299, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690270, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690270, "pid": 3, "tid": 7, "ts": 6302685524967.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416058.633, "dur": 14.430, + "args": { + "External id": 130299, "cbid": 211, "correlation": 241690270 + } + }, + { + "ph": "s", "id": 241690270, "pid": 5717, "tid": 6759, "ts": 6302685416058.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 3, "tid": 7, + "ts": 6302685526650.680, "dur": 431.620, + "args": { + "External id": 130300, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690293, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241690293, "pid": 3, "tid": 7, "ts": 6302685526650.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416138.443, "dur": 10.089, + "args": { + "External id": 130300, "cbid": 307, "correlation": 241690293 + } + }, + { + "ph": "s", "id": 241690293, "pid": 5717, "tid": 6759, "ts": 6302685416138.443, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685416205.282, "dur": 0.970, + "args": { + "External id": 130301, "cbid": 200, "correlation": 241690316 + } + }, + { + "ph": "f", "id": 241690316, "pid": 5717, "tid": 6759, "ts": 6302685416205.282, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685527083.548, "dur": 1.088, + "args": { + "External id": 130301, "device": 3, "context": 1, "stream": 7, "correlation": 241690319, "bytes": 1536, "memory bandwidth (GB/s)": 1.411764705882353 + } + }, + { + "ph": "f", "id": 241690319, "pid": 3, "tid": 7, "ts": 6302685527083.548, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685416209.292, "dur": 11.860, + "args": { + "External id": 130301, "cbid": 51, "correlation": 241690319 + } + }, + { + "ph": "s", "id": 241690319, "pid": 5717, "tid": 6759, "ts": 6302685416209.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685527086.108, "dur": 788.710, + "args": { + "External id": 130301, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690320, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690320, "pid": 3, "tid": 7, "ts": 6302685527086.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416221.572, "dur": 10.010, + "args": { + "External id": 130301, "cbid": 307, "correlation": 241690320 + } + }, + { + "ph": "s", "id": 241690320, "pid": 5717, "tid": 6759, "ts": 6302685416221.572, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685416279.502, "dur": 0.550, + "args": { + "External id": 130302, "cbid": 200, "correlation": 241690345 + } + }, + { + "ph": "f", "id": 241690345, "pid": 5717, "tid": 6759, "ts": 6302685416279.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685527955.202, "dur": 28.193, + "args": { + "External id": 130302, "device": 3, "context": 1, "stream": 7, "correlation": 241690348, "bytes": 1536, "memory bandwidth (GB/s)": 0.05448160891001312 + } + }, + { + "ph": "f", "id": 241690348, "pid": 3, "tid": 7, "ts": 6302685527955.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685416282.012, "dur": 7.600, + "args": { + "External id": 130302, "cbid": 51, "correlation": 241690348 + } + }, + { + "ph": "s", "id": 241690348, "pid": 5717, "tid": 6759, "ts": 6302685416282.012, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685528002.211, "dur": 435.203, + "args": { + "External id": 130302, "queued": 0, "device": 3, 
"context": 1, "stream": 7, "correlation": 241690349, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690349, "pid": 3, "tid": 7, "ts": 6302685528002.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416289.902, "dur": 22.250, + "args": { + "External id": 130302, "cbid": 307, "correlation": 241690349 + } + }, + { + "ph": "s", "id": 241690349, "pid": 5717, "tid": 6759, "ts": 6302685416289.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685416357.302, "dur": 0.450, + "args": { + "External id": 130303, "cbid": 200, "correlation": 241690374 + } + }, + { + "ph": "f", "id": 241690374, "pid": 5717, "tid": 6759, "ts": 6302685416357.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685528438.118, "dur": 776.166, + "args": { + "External id": 130303, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690377, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690377, "pid": 3, "tid": 7, "ts": 6302685528438.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416359.662, "dur": 8.550, + "args": { + "External id": 130303, "cbid": 307, "correlation": 241690377 + } + }, + { + "ph": "s", "id": 241690377, "pid": 5717, "tid": 6759, "ts": 6302685416359.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685416401.942, "dur": 0.390, + "args": { + "External id": 130304, "cbid": 200, "correlation": 241690402 + } + }, + { + "ph": "f", "id": 241690402, "pid": 5717, "tid": 6759, "ts": 6302685416401.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685529215.500, "dur": 0.832, + "args": { + "External id": 130304, "device": 3, "context": 1, "stream": 7, "correlation": 241690405, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 241690405, "pid": 3, "tid": 7, "ts": 6302685529215.500, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685416404.162, "dur": 6.940, + "args": { + "External id": 130304, "cbid": 51, "correlation": 241690405 + } + }, + { + "ph": "s", "id": 241690405, "pid": 5717, "tid": 6759, "ts": 6302685416404.162, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685529217.548, "dur": 351.971, + "args": { + "External id": 130304, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690406, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690406, "pid": 3, "tid": 7, "ts": 6302685529217.548, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416411.352, "dur": 7.300, + "args": { + "External id": 130304, "cbid": 307, "correlation": 241690406 + } + }, + { + "ph": "s", "id": 241690406, "pid": 5717, "tid": 6759, "ts": 6302685416411.352, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685416452.682, "dur": 0.520, + "args": { + "External id": 130305, "cbid": 200, "correlation": 241690431 + } + }, + { + "ph": "f", "id": 241690431, "pid": 5717, "tid": 6759, "ts": 6302685416452.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685529570.127, "dur": 428.035, + "args": { + "External id": 130305, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690434, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690434, "pid": 3, "tid": 7, "ts": 6302685529570.127, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416454.902, "dur": 7.660, + "args": { + "External id": 130305, "cbid": 307, "correlation": 241690434 + } + }, + { + "ph": "s", "id": 241690434, "pid": 5717, "tid": 6759, "ts": 6302685416454.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685529998.866, "dur": 184.897, + "args": { + "External id": 130306, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690447, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690447, "pid": 3, "tid": 7, "ts": 6302685529998.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416517.631, "dur": 8.491, + "args": { + "External id": 130306, "cbid": 307, "correlation": 241690447 + } + }, + { + "ph": "s", "id": 241690447, "pid": 5717, "tid": 6759, "ts": 6302685416517.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 3, "tid": 7, + "ts": 6302685530184.435, "dur": 96.385, + "args": { + "External id": 130307, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690455, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241690455, "pid": 3, "tid": 7, "ts": 6302685530184.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416561.502, "dur": 7.529, + "args": { + "External id": 130307, "cbid": 307, "correlation": 241690455 + } + }, + { + "ph": "s", "id": 241690455, "pid": 5717, "tid": 6759, "ts": 6302685416561.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 3, "tid": 7, + "ts": 6302685530281.428, "dur": 199.553, + "args": { + "External id": 130308, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690463, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690463, "pid": 3, "tid": 7, "ts": 6302685530281.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416614.831, "dur": 13.920, + "args": { + "External id": 130308, "cbid": 307, "correlation": 241690463 + } + }, + { + "ph": "s", "id": 241690463, "pid": 5717, "tid": 6759, "ts": 6302685416614.831, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685416903.721, "dur": 0.750, + "args": { + "External id": 130327, "cbid": 200, "correlation": 241690509 + } + }, + { + "ph": "f", "id": 241690509, "pid": 5717, "tid": 6759, "ts": 6302685416903.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685530529.686, "dur": 45.440, + "args": { + "External id": 130327, "device": 3, "context": 1, "stream": 7, 
"correlation": 241690512, "bytes": 576, "memory bandwidth (GB/s)": 0.01267605633802817 + } + }, + { + "ph": "f", "id": 241690512, "pid": 3, "tid": 7, "ts": 6302685530529.686, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685416909.161, "dur": 17.570, + "args": { + "External id": 130327, "cbid": 51, "correlation": 241690512 + } + }, + { + "ph": "s", "id": 241690512, "pid": 5717, "tid": 6759, "ts": 6302685416909.161, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685530596.246, "dur": 142.785, + "args": { + "External id": 130327, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690513, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690513, "pid": 3, "tid": 7, "ts": 6302685530596.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685416927.141, "dur": 18.360, + "args": { + "External id": 130327, "cbid": 307, "correlation": 241690513 + } + }, + { + "ph": "s", "id": 241690513, "pid": 5717, "tid": 6759, "ts": 6302685416927.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685530739.671, "dur": 139.457, + "args": { + "External id": 130328, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690535, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690535, "pid": 3, "tid": 7, "ts": 6302685530739.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685417008.370, "dur": 13.331, + "args": { + "External id": 130328, "cbid": 211, "correlation": 241690535 + } + }, + { + "ph": "s", "id": 241690535, "pid": 5717, "tid": 6759, "ts": 6302685417008.370, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685417155.460, "dur": 0.710, + "args": { + "External id": 130329, "cbid": 200, "correlation": 241690553 + } + }, + { + "ph": "f", "id": 241690553, "pid": 5717, "tid": 6759, "ts": 6302685417155.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685417156.380, "dur": 0.370, + "args": { + "External id": 130329, "cbid": 200, "correlation": 241690554 + } + }, + { + "ph": "f", "id": 241690554, "pid": 5717, "tid": 6759, "ts": 6302685417156.380, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685417198.570, "dur": 0.500, + "args": { + "External id": 130329, "cbid": 200, "correlation": 241690572 + } + }, + { + "ph": "f", "id": 241690572, "pid": 5717, "tid": 6759, "ts": 6302685417198.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685530879.864, "dur": 91.105, + "args": { + "External id": 130329, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690573, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690573, "pid": 3, "tid": 7, "ts": 6302685530879.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685417201.890, "dur": 22.800, + "args": { + "External id": 130329, "cbid": 211, "correlation": 241690573 + } + }, + { + "ph": "s", "id": 241690573, "pid": 5717, "tid": 6759, "ts": 6302685417201.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685417226.280, "dur": 2.060, + "args": { + "External id": 130329, "cbid": 273, "correlation": 241690575 + } + }, + { + "ph": "f", "id": 241690575, "pid": 5717, "tid": 6759, "ts": 6302685417226.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 3, "tid": 7, + "ts": 6302685530971.673, "dur": 1107.016, + "args": { + "External id": 130329, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690576, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241690576, "pid": 3, "tid": 7, "ts": 6302685530971.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685417229.020, "dur": 8.370, + "args": { + "External id": 130329, "cbid": 211, "correlation": 241690576 + } + }, + { + "ph": "s", "id": 241690576, "pid": 5717, "tid": 6759, "ts": 6302685417229.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 3, "tid": 7, + "ts": 6302685532079.393, "dur": 73.057, + "args": { + "External id": 130329, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690578, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241690578, "pid": 3, "tid": 7, "ts": 6302685532079.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685417238.540, "dur": 8.190, + "args": { + "External id": 130329, "cbid": 211, "correlation": 241690578 + } + }, + { + "ph": "s", "id": 241690578, "pid": 5717, "tid": 6759, "ts": 6302685417238.540, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685532153.122, "dur": 50.144, + "args": { + "External id": 130340, "device": 3, "context": 1, "stream": 7, "correlation": 241690600, "bytes": 25165824, "memory bandwidth (GB/s)": 501.87109125717933 + } + }, + { + "ph": "f", "id": 241690600, "pid": 3, "tid": 7, "ts": 6302685532153.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + 
"ts": 6302685417505.089, "dur": 30.050, + "args": { + "External id": 130340, "cbid": 41, "correlation": 241690600 + } + }, + { + "ph": "s", "id": 241690600, "pid": 5717, "tid": 6759, "ts": 6302685417505.089, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685532203.938, "dur": 33.504, + "args": { + "External id": 130337, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690618, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690618, "pid": 3, "tid": 7, "ts": 6302685532203.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685417708.419, "dur": 15.530, + "args": { + "External id": 130337, "cbid": 307, "correlation": 241690618 + } + }, + { + "ph": "s", "id": 241690618, "pid": 5717, "tid": 6759, "ts": 6302685417708.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685532238.082, "dur": 40.513, + "args": { + "External id": 130347, "device": 3, "context": 1, "stream": 7, "correlation": 241690633, "bytes": 25165824, "memory bandwidth (GB/s)": 621.1789795867993 + } + }, + { + "ph": "f", "id": 241690633, "pid": 3, "tid": 7, "ts": 6302685532238.082, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685417829.988, "dur": 23.851, + "args": { + "External id": 130347, "cbid": 41, "correlation": 241690633 + } + }, + { + "ph": "s", "id": 241690633, "pid": 5717, "tid": 6759, "ts": 6302685417829.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 3, "tid": 7, + "ts": 6302685532279.267, "dur": 29.920, + "args": { + "External id": 130344, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690651, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690651, "pid": 3, "tid": 7, "ts": 6302685532279.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418009.938, "dur": 14.460, + "args": { + "External id": 130344, "cbid": 307, "correlation": 241690651 + } + }, + { + "ph": "s", "id": 241690651, "pid": 5717, "tid": 6759, "ts": 6302685418009.938, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685418255.328, "dur": 1.540, + "args": { + "External id": 130352, "cbid": 200, "correlation": 241690681 + } + }, + { + "ph": "f", "id": 241690681, "pid": 5717, "tid": 6759, "ts": 6302685418255.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685532310.659, "dur": 1.248, + "args": { + "External id": 130352, "device": 3, "context": 1, "stream": 7, "correlation": 241690684, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 241690684, "pid": 3, "tid": 7, "ts": 6302685532310.659, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685418261.588, "dur": 19.230, + "args": { + "External id": 130352, "cbid": 51, "correlation": 241690684 + } + }, + { + "ph": "s", "id": 241690684, "pid": 5717, "tid": 6759, "ts": 6302685418261.588, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685532313.091, "dur": 143.617, + "args": { + "External id": 130352, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690685, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690685, "pid": 3, "tid": 7, "ts": 6302685532313.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418281.458, "dur": 37.989, + "args": { + "External id": 130352, "cbid": 307, "correlation": 241690685 + } + }, + { + "ph": "s", "id": 241690685, "pid": 5717, "tid": 6759, "ts": 6302685418281.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685418386.867, "dur": 0.670, + "args": { + "External id": 130353, "cbid": 200, "correlation": 241690710 + } + }, + { + "ph": "f", "id": 241690710, "pid": 5717, "tid": 6759, "ts": 6302685418386.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685532457.956, "dur": 1.440, + "args": { + "External id": 130353, "device": 3, "context": 1, "stream": 7, "correlation": 241690713, "bytes": 576, "memory bandwidth (GB/s)": 0.4 + } + }, + { + "ph": "f", "id": 241690713, "pid": 3, "tid": 7, "ts": 6302685532457.956, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685418390.247, "dur": 10.010, + "args": { + "External id": 130353, "cbid": 51, "correlation": 241690713 + } + }, + { + "ph": "s", "id": 241690713, "pid": 5717, 
"tid": 6759, "ts": 6302685418390.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685532460.612, "dur": 140.897, + "args": { + "External id": 130353, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690714, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690714, "pid": 3, "tid": 7, "ts": 6302685532460.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418400.597, "dur": 11.700, + "args": { + "External id": 130353, "cbid": 307, "correlation": 241690714 + } + }, + { + "ph": "s", "id": 241690714, "pid": 5717, "tid": 6759, "ts": 6302685418400.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5717, "tid": 6759, + "ts": 6302685418463.927, "dur": 0.660, + "args": { + "External id": 130354, "cbid": 200, "correlation": 241690739 + } + }, + { + "ph": "f", "id": 241690739, "pid": 5717, "tid": 6759, "ts": 6302685418463.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 3, "tid": 7, + "ts": 6302685532602.725, "dur": 1.152, + "args": { + "External id": 130354, "device": 3, "context": 1, "stream": 7, "correlation": 241690742, "bytes": 576, "memory bandwidth (GB/s)": 0.5 + } + }, + { + "ph": "f", "id": 241690742, "pid": 3, "tid": 7, "ts": 6302685532602.725, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5717, "tid": 6759, + "ts": 6302685418467.087, "dur": 8.850, + "args": { + "External id": 130354, "cbid": 
51, "correlation": 241690742 + } + }, + { + "ph": "s", "id": 241690742, "pid": 5717, "tid": 6759, "ts": 6302685418467.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 3, "tid": 7, + "ts": 6302685532605.253, "dur": 142.081, + "args": { + "External id": 130354, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690743, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690743, "pid": 3, "tid": 7, "ts": 6302685532605.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418476.267, "dur": 10.770, + "args": { + "External id": 130354, "cbid": 307, "correlation": 241690743 + } + }, + { + "ph": "s", "id": 241690743, "pid": 5717, "tid": 6759, "ts": 6302685418476.267, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685532747.974, "dur": 146.977, + "args": { + "External id": 130355, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690765, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690765, "pid": 3, "tid": 7, "ts": 6302685532747.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418541.797, "dur": 12.410, + "args": { + "External id": 130355, "cbid": 211, "correlation": 241690765 + } + }, + { + "ph": "s", "id": 241690765, "pid": 5717, "tid": 6759, "ts": 6302685418541.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685532895.559, "dur": 139.585, + "args": { + "External id": 130356, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690788, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690788, "pid": 3, "tid": 7, "ts": 6302685532895.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418597.227, "dur": 10.100, + "args": { + "External id": 130356, "cbid": 211, "correlation": 241690788 + } + }, + { + "ph": "s", "id": 241690788, "pid": 5717, "tid": 6759, "ts": 6302685418597.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 3, "tid": 7, + "ts": 6302685533035.816, "dur": 140.354, + "args": { + "External id": 130357, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690811, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 241690811, "pid": 3, "tid": 7, "ts": 6302685533035.816, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418647.267, "dur": 10.030, + "args": { + "External id": 130357, "cbid": 211, "correlation": 241690811 + } + }, + { + "ph": "s", "id": 241690811, "pid": 5717, "tid": 6759, "ts": 6302685418647.267, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 3, "tid": 7, + "ts": 6302685533176.810, "dur": 81.792, + "args": { + "External id": 130358, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690819, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690819, "pid": 3, "tid": 7, "ts": 6302685533176.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685418732.506, "dur": 12.511, + "args": { + "External id": 130358, "cbid": 307, "correlation": 241690819 + } + }, + { + "ph": "s", "id": 241690819, "pid": 5717, "tid": 6759, "ts": 6302685418732.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 3, "tid": 7, + "ts": 6302685533259.274, "dur": 47.617, + "args": { + "External id": 130373, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690848, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690848, "pid": 3, "tid": 7, "ts": 6302685533259.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685419090.356, "dur": 20.770, + "args": { + "External id": 130373, "cbid": 307, "correlation": 241690848 + } + }, + { + "ph": "s", "id": 241690848, "pid": 5717, "tid": 6759, "ts": 6302685419090.356, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 3, "tid": 7, + "ts": 6302685533307.499, "dur": 3.744, + "args": { + "External id": 130374, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690856, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 241690856, "pid": 3, "tid": 7, "ts": 6302685533307.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685419168.816, "dur": 11.820, + "args": { + "External id": 130374, "cbid": 307, "correlation": 241690856 + } + }, + { + "ph": "s", "id": 241690856, "pid": 5717, "tid": 6759, "ts": 6302685419168.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 3, "tid": 7, + "ts": 6302685533311.883, "dur": 52.160, + "args": { + "External id": 130375, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690867, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690867, "pid": 3, "tid": 7, "ts": 6302685533311.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685419243.596, "dur": 12.309, + "args": { + "External id": 130375, "cbid": 307, "correlation": 241690867 + } + }, + { + "ph": "s", "id": 241690867, "pid": 5717, "tid": 6759, "ts": 6302685419243.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685533364.651, "dur": 45.696, + "args": { + "External id": 130376, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690872, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690872, "pid": 3, "tid": 7, "ts": 6302685533364.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685419363.045, "dur": 26.640, + "args": { + "External id": 130376, "cbid": 211, "correlation": 241690872 + } + }, + { + "ph": "s", "id": 241690872, "pid": 5717, "tid": 6759, "ts": 6302685419363.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685419794.584, "dur": 6.930, + "args": { + "External id": 130382, "cbid": 147, "correlation": 241690889 + } + }, + { + "ph": "s", "id": 241690889, "pid": 5717, "tid": 6759, "ts": 6302685419794.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685420037.883, "dur": 6.340, + "args": { + "External id": 130390, "cbid": 138, "correlation": 241690904 + } + }, + { + "ph": "f", "id": 241690904, "pid": 5717, "tid": 6759, "ts": 6302685420037.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685420045.514, "dur": 1.669, + "args": { + "External id": 130390, "cbid": 138, "correlation": 241690905 + } + }, + { + "ph": "f", "id": 241690905, "pid": 5717, "tid": 6759, "ts": 6302685420045.514, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685534182.417, "dur": 3.808, + "args": { + "External id": 130394, "device": 3, "context": 1, "stream": 7, "correlation": 241690916, "bytes": 28112, "memory bandwidth (GB/s)": 7.382352941176471 + } + }, + { + "ph": "f", "id": 241690916, "pid": 3, "tid": 7, "ts": 6302685534182.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + 
"ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685420096.503, "dur": 28.800, + "args": { + "External id": 130394, "cbid": 41, "correlation": 241690916 + } + }, + { + "ph": "s", "id": 241690916, "pid": 5717, "tid": 6759, "ts": 6302685420096.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420135.263, "dur": 4.220, + "args": { + "External id": 130389, "cbid": 135, "correlation": 241690920 + } + }, + { + "ph": "f", "id": 241690920, "pid": 5717, "tid": 6759, "ts": 6302685420135.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685534187.793, "dur": 42.624, + "args": { + "External id": 130389, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241690924, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241690924, "pid": 3, "tid": 7, "ts": 6302685534187.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685420146.503, "dur": 22.860, + "args": { + "External id": 130389, "cbid": 211, "correlation": 241690924 + } + }, + { + "ph": "s", "id": 241690924, "pid": 5717, "tid": 6759, "ts": 6302685420146.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420262.473, "dur": 2.620, + "args": { + "External id": 130382, "cbid": 135, "correlation": 241690935 + } + }, + { + "ph": "f", "id": 241690935, "pid": 5717, "tid": 6759, "ts": 6302685420262.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685420270.083, "dur": 3.280, + "args": { + "External id": 130382, "cbid": 147, "correlation": 241690939 + } + }, + { + "ph": "s", "id": 241690939, "pid": 5717, "tid": 6759, "ts": 6302685420270.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685420456.203, "dur": 2.540, + "args": { + "External id": 130398, "cbid": 317, "correlation": 241690959 + } + }, + { + "ph": "f", "id": 241690959, "pid": 5717, "tid": 6759, "ts": 6302685420456.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420463.663, "dur": 3.620, + "args": { + "External id": 130398, "cbid": 135, "correlation": 241690961 + } + }, + { + "ph": "f", "id": 241690961, "pid": 5717, "tid": 6759, "ts": 6302685420463.663, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 
6759, + "ts": 6302685420470.393, "dur": 2.630, + "args": { + "External id": 130398, "cbid": 147, "correlation": 241690965 + } + }, + { + "ph": "s", "id": 241690965, "pid": 5717, "tid": 6759, "ts": 6302685420470.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685420507.002, "dur": 1.820, + "args": { + "External id": 130398, "cbid": 409, "correlation": 241690968 + } + }, + { + "ph": "f", "id": 241690968, "pid": 5717, "tid": 6759, "ts": 6302685420507.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420518.362, "dur": 1.911, + "args": { + "External id": 130398, "cbid": 135, "correlation": 241690971 + } + }, + { + "ph": "f", "id": 241690971, "pid": 5717, "tid": 6759, "ts": 6302685420518.362, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685420520.682, "dur": 1.960, + "args": { + "External id": 130398, "cbid": 147, "correlation": 241690972 + } + }, + { + "ph": "s", "id": 241690972, "pid": 5717, "tid": 6759, "ts": 6302685420520.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685534322.002, "dur": 11244.021, + "args": { + "External id": 130398, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241690974, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241690974, "pid": 3, "tid": 20, "ts": 6302685534322.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685420525.242, "dur": 23.951, + "args": { + "External id": 130398, "cbid": 430, "correlation": 241690974 + } + }, + { + "ph": "s", "id": 241690974, "pid": 5717, "tid": 6759, "ts": 6302685420525.242, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420551.713, "dur": 0.980, + "args": { + "External id": 130398, "cbid": 135, "correlation": 241690976 + } + }, + { + "ph": "f", "id": 241690976, "pid": 5717, "tid": 6759, "ts": 6302685420551.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685420552.993, "dur": 1.389, + "args": { + "External id": 130398, "cbid": 147, "correlation": 241690977 + } + }, + { + "ph": "s", "id": 241690977, "pid": 5717, "tid": 6759, "ts": 6302685420552.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420558.182, "dur": 1.611, + "args": { + "External id": 130398, "cbid": 135, "correlation": 241690980 + } + }, + { + "ph": "f", "id": 241690980, "pid": 5717, "tid": 6759, "ts": 6302685420558.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420578.652, "dur": 1.090, + "args": { + "External 
id": 130398, "cbid": 135, "correlation": 241690987 + } + }, + { + "ph": "f", "id": 241690987, "pid": 5717, "tid": 6759, "ts": 6302685420578.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685420641.862, "dur": 2.190, + "args": { + "External id": 130400, "cbid": 147, "correlation": 241690992 + } + }, + { + "ph": "s", "id": 241690992, "pid": 5717, "tid": 6759, "ts": 6302685420641.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685420683.822, "dur": 2.290, + "args": { + "External id": 130382, "cbid": 135, "correlation": 241691007 + } + }, + { + "ph": "f", "id": 241691007, "pid": 5717, "tid": 6759, "ts": 6302685420683.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685421243.801, "dur": 4.070, + "args": { + "External id": 130382, "cbid": 135, "correlation": 241691020 + } + }, + { + "ph": "f", "id": 241691020, "pid": 5717, "tid": 6759, "ts": 6302685421243.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_0", "pid": 3, "tid": 7, + "ts": 6302685534231.057, "dur": 99.681, + "args": { + "External id": 130412, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691036, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 3000.000000, "grid": [48000, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241691036, "pid": 3, "tid": 7, "ts": 6302685534231.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685421770.950, "dur": 49.420, + "args": { + "External id": 130412, "cbid": 307, "correlation": 241691036 + } + }, + { + "ph": "s", "id": 241691036, "pid": 5717, "tid": 6759, "ts": 6302685421770.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_1", "pid": 3, "tid": 7, + "ts": 6302685534331.378, "dur": 689.798, + "args": { + "External id": 130413, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691041, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241691041, "pid": 3, "tid": 7, "ts": 6302685534331.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685421917.009, "dur": 22.590, + "args": { + "External id": 130413, "cbid": 307, "correlation": 241691041 + } + }, + { + "ph": "s", "id": 241691041, "pid": 5717, "tid": 6759, "ts": 6302685421917.009, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_2", "pid": 3, "tid": 7, + "ts": 6302685535028.664, "dur": 498.979, + "args": { + "External id": 130414, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691049, "registers per thread": 20, "shared memory": 0, "blocks per SM": 187.500000, "warps per SM": 750.000000, "grid": [24000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241691049, "pid": 3, "tid": 7, "ts": 6302685535028.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685422049.729, "dur": 21.890, + "args": { + "External id": 130414, "cbid": 307, "correlation": 241691049 + } + }, + { + "ph": "s", "id": 241691049, "pid": 5717, "tid": 6759, "ts": 6302685422049.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685535528.315, "dur": 285.314, + "args": { + "External id": 130415, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691054, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241691054, "pid": 3, "tid": 7, "ts": 6302685535528.315, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685422212.409, "dur": 27.950, + "args": { + "External id": 130415, "cbid": 211, "correlation": 241691054 + } + }, + { + "ph": "s", "id": 241691054, "pid": 5717, "tid": 6759, "ts": 6302685422212.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685423741.945, "dur": 14.470, + "args": { + "cbid": 147, "correlation": 241691069 + } + }, + { + "ph": "s", "id": 241691069, "pid": 5717, "tid": 6759, "ts": 6302685423741.945, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685424554.683, "dur": 13.450, + "args": { + "External id": 130428, "cbid": 138, "correlation": 241691084 + } + }, + { + "ph": "f", "id": 241691084, "pid": 5717, "tid": 6759, "ts": 6302685424554.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 6759, + "ts": 6302685424569.783, "dur": 3.980, + "args": { + "External id": 130428, "cbid": 138, "correlation": 241691085 + } + }, + { + "ph": "f", "id": 241691085, "pid": 5717, "tid": 6759, "ts": 6302685424569.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 3, "tid": 7, + "ts": 6302685545573.415, "dur": 12.224, + "args": { + "External id": 130432, "device": 3, "context": 1, "stream": 7, "correlation": 241691096, "bytes": 208504, "memory bandwidth (GB/s)": 17.05693717277487 + } + }, + { + "ph": "f", "id": 241691096, "pid": 3, "tid": 7, "ts": 6302685545573.415, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 6759, + "ts": 6302685424708.223, "dur": 59.360, + "args": { + "External id": 130432, "cbid": 41, "correlation": 241691096 + } + }, + { + "ph": "s", "id": 241691096, "pid": 5717, "tid": 6759, "ts": 6302685424708.223, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685424787.763, "dur": 8.710, + "args": { + "External id": 130427, "cbid": 135, "correlation": 241691100 + } + }, + { + "ph": "f", "id": 241691100, "pid": 5717, "tid": 6759, "ts": 6302685424787.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 3, "tid": 7, + "ts": 6302685545587.271, "dur": 335.363, + "args": { + "External id": 130427, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691104, "registers per thread": 32, "shared memory": 0, "blocks per SM": 807.281250, "warps per SM": 3229.125000, "grid": [25833, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 241691104, "pid": 3, "tid": 7, "ts": 6302685545587.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 6759, + "ts": 6302685424811.563, "dur": 49.450, + "args": { + "External id": 130427, "cbid": 211, "correlation": 241691104 + } + }, + { + "ph": "s", "id": 241691104, "pid": 5717, "tid": 6759, "ts": 6302685424811.563, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685425153.402, "dur": 5.210, + "args": { + "cbid": 135, "correlation": 241691115 + } + }, + { + "ph": "f", "id": 241691115, "pid": 5717, "tid": 6759, "ts": 6302685425153.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685425168.942, "dur": 6.490, + "args": { + "cbid": 147, "correlation": 241691119 + } + }, + { + "ph": "s", "id": 241691119, "pid": 5717, "tid": 6759, "ts": 6302685425168.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 6759, + "ts": 6302685425642.961, "dur": 7.270, + "args": { + "External id": 130436, "cbid": 317, "correlation": 241691139 + } + }, + { + "ph": "f", "id": 241691139, "pid": 5717, "tid": 6759, "ts": 6302685425642.961, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685425664.101, "dur": 10.530, + "args": { + "External id": 130436, "cbid": 135, "correlation": 241691141 + } + }, + { + "ph": "f", "id": 241691141, "pid": 5717, "tid": 6759, "ts": 6302685425664.101, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685425684.091, "dur": 
7.650, + "args": { + "External id": 130436, "cbid": 147, "correlation": 241691145 + } + }, + { + "ph": "s", "id": 241691145, "pid": 5717, "tid": 6759, "ts": 6302685425684.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 6759, + "ts": 6302685425788.721, "dur": 4.869, + "args": { + "External id": 130436, "cbid": 409, "correlation": 241691148 + } + }, + { + "ph": "f", "id": 241691148, "pid": 5717, "tid": 6759, "ts": 6302685425788.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685425822.121, "dur": 5.680, + "args": { + "External id": 130436, "cbid": 135, "correlation": 241691151 + } + }, + { + "ph": "f", "id": 241691151, "pid": 5717, "tid": 6759, "ts": 6302685425822.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685425829.021, "dur": 5.969, + "args": { + "External id": 130436, "cbid": 147, "correlation": 241691152 + } + }, + { + "ph": "s", "id": 241691152, "pid": 5717, "tid": 6759, "ts": 6302685425829.021, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685545925.866, "dur": 64975.401, + "args": { + "External id": 130436, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241691154, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 52894464, "Out msg nelems": 13223616, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241691154, "pid": 3, "tid": 20, "ts": 6302685545925.866, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 6759, + "ts": 6302685425842.570, "dur": 71.500, + "args": { + "External id": 130436, "cbid": 430, "correlation": 241691154 + } + }, + { + "ph": "s", "id": 241691154, "pid": 5717, "tid": 6759, "ts": 6302685425842.570, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685425921.010, "dur": 3.000, + "args": { + "External id": 130436, "cbid": 135, "correlation": 241691156 + } + }, + { + "ph": "f", "id": 241691156, "pid": 5717, "tid": 6759, "ts": 6302685425921.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685425924.910, "dur": 3.710, + "args": { + "External id": 130436, "cbid": 147, "correlation": 241691157 + } + }, + { + "ph": "s", "id": 241691157, "pid": 5717, "tid": 6759, "ts": 6302685425924.910, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685425939.080, "dur": 4.070, + "args": { + "External id": 130436, "cbid": 135, "correlation": 241691160 + } + }, + { + "ph": "f", "id": 241691160, "pid": 5717, "tid": 6759, "ts": 6302685425939.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685425997.300, "dur": 3.090, + "args": { + "External 
id": 130436, "cbid": 135, "correlation": 241691167 + } + }, + { + "ph": "f", "id": 241691167, "pid": 5717, "tid": 6759, "ts": 6302685425997.300, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685426195.650, "dur": 8.140, + "args": { + "External id": 130438, "cbid": 147, "correlation": 241691172 + } + }, + { + "ph": "s", "id": 241691172, "pid": 5717, "tid": 6759, "ts": 6302685426195.650, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685426417.579, "dur": 9.130, + "args": { + "cbid": 135, "correlation": 241691187 + } + }, + { + "ph": "f", "id": 241691187, "pid": 5717, "tid": 6759, "ts": 6302685426417.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 6759, + "ts": 6302685431744.857, "dur": 13.140, + "args": { + "cbid": 135, "correlation": 241691200 + } + }, + { + "ph": "f", "id": 241691200, "pid": 5717, "tid": 6759, "ts": 6302685431744.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432198.526, "dur": 20.500, + "args": { + "cbid": 147, "correlation": 241691207 + } + }, + { + "ph": "s", "id": 241691207, "pid": 5717, "tid": 6759, "ts": 6302685432198.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432404.526, "dur": 12.789, + "args": { + "cbid": 147, "correlation": 241691217 + } + }, + { + "ph": "s", "id": 241691217, "pid": 5717, "tid": 6759, "ts": 6302685432404.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432516.975, "dur": 8.010, + "args": { + "cbid": 
147, "correlation": 241691227 + } + }, + { + "ph": "s", "id": 241691227, "pid": 5717, "tid": 6759, "ts": 6302685432516.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432607.785, "dur": 6.940, + "args": { + "cbid": 147, "correlation": 241691237 + } + }, + { + "ph": "s", "id": 241691237, "pid": 5717, "tid": 6759, "ts": 6302685432607.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432696.305, "dur": 7.240, + "args": { + "cbid": 147, "correlation": 241691247 + } + }, + { + "ph": "s", "id": 241691247, "pid": 5717, "tid": 6759, "ts": 6302685432696.305, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432780.475, "dur": 7.340, + "args": { + "cbid": 147, "correlation": 241691257 + } + }, + { + "ph": "s", "id": 241691257, "pid": 5717, "tid": 6759, "ts": 6302685432780.475, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432864.165, "dur": 6.449, + "args": { + "cbid": 147, "correlation": 241691267 + } + }, + { + "ph": "s", "id": 241691267, "pid": 5717, "tid": 6759, "ts": 6302685432864.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685432949.944, "dur": 6.380, + "args": { + "cbid": 147, "correlation": 241691277 + } + }, + { + "ph": "s", "id": 241691277, "pid": 5717, "tid": 6759, "ts": 6302685432949.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685433034.454, "dur": 6.810, + "args": { + "cbid": 147, "correlation": 241691287 + } + }, + { + "ph": "s", "id": 241691287, 
"pid": 5717, "tid": 6759, "ts": 6302685433034.454, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685433118.494, "dur": 6.850, + "args": { + "cbid": 147, "correlation": 241691297 + } + }, + { + "ph": "s", "id": 241691297, "pid": 5717, "tid": 6759, "ts": 6302685433118.494, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685433206.964, "dur": 6.330, + "args": { + "cbid": 147, "correlation": 241691307 + } + }, + { + "ph": "s", "id": 241691307, "pid": 5717, "tid": 6759, "ts": 6302685433206.964, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 6759, + "ts": 6302685433288.813, "dur": 49.331, + "args": { + "cbid": 147, "correlation": 241691317 + } + }, + { + "ph": "s", "id": 241691317, "pid": 5717, "tid": 6759, "ts": 6302685433288.813, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685610902.867, "dur": 1.440, + "args": { + "External id": 126586, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691330, "pid": 3, "tid": 7, "ts": 6302685610902.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685434295.251, "dur": 193.320, + "args": { + "External id": 126586, "cbid": 211, "correlation": 241691330 + } + }, + { + "ph": "s", "id": 241691330, "pid": 5717, "tid": 5717, "ts": 6302685434295.251, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685610905.011, "dur": 1.312, + "args": { + "External id": 126587, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691340, "pid": 3, "tid": 7, "ts": 6302685610905.011, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685434698.141, "dur": 47.209, + "args": { + "External id": 126587, "cbid": 211, "correlation": 241691340 + } + }, + { + "ph": "s", "id": 241691340, "pid": 5717, "tid": 5717, "ts": 6302685434698.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685610907.027, "dur": 1.056, + "args": { + "External id": 126588, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691350, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691350, "pid": 3, "tid": 7, "ts": 6302685610907.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685434892.540, "dur": 204.920, + "args": { + "External id": 126588, "cbid": 211, "correlation": 241691350 + } + }, + { + "ph": "s", "id": 241691350, "pid": 5717, "tid": 5717, "ts": 6302685434892.540, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685610908.723, "dur": 1.024, + "args": { + "External id": 126589, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691360, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691360, "pid": 3, "tid": 7, "ts": 6302685610908.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685437134.155, "dur": 67.690, + "args": { + "External id": 126589, "cbid": 211, "correlation": 241691360 + } + }, + { + "ph": "s", "id": 241691360, "pid": 5717, "tid": 5717, "ts": 6302685437134.155, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685610910.419, "dur": 1.056, + "args": { + "External id": 126590, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691370, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691370, "pid": 3, "tid": 7, "ts": 6302685610910.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685437378.364, "dur": 37.780, + "args": { + "External id": 126590, "cbid": 211, "correlation": 241691370 + } + }, + { + "ph": "s", "id": 241691370, "pid": 5717, "tid": 5717, "ts": 6302685437378.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442440.733, "dur": 27.830, + "args": { + "cbid": 138, "correlation": 241691374 + } + }, + { + "ph": "f", "id": 241691374, "pid": 5717, "tid": 423623104, "ts": 6302685442440.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442470.873, "dur": 4.850, + "args": { + "cbid": 138, "correlation": 241691375 + } + }, + { + "ph": "f", "id": 241691375, "pid": 5717, "tid": 423623104, "ts": 6302685442470.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442491.153, "dur": 3.160, + "args": { + "cbid": 138, "correlation": 241691376 + } + }, + { + "ph": "f", "id": 241691376, "pid": 5717, "tid": 423623104, "ts": 6302685442491.153, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442511.613, "dur": 4.860, + "args": { + "cbid": 138, "correlation": 241691377 + } + }, + { + "ph": "f", "id": 241691377, "pid": 5717, "tid": 423623104, "ts": 6302685442511.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442517.273, "dur": 2.540, + "args": 
{ + "cbid": 138, "correlation": 241691378 + } + }, + { + "ph": "f", "id": 241691378, "pid": 5717, "tid": 423623104, "ts": 6302685442517.273, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442524.463, "dur": 2.530, + "args": { + "cbid": 138, "correlation": 241691379 + } + }, + { + "ph": "f", "id": 241691379, "pid": 5717, "tid": 423623104, "ts": 6302685442524.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442534.213, "dur": 4.750, + "args": { + "cbid": 138, "correlation": 241691380 + } + }, + { + "ph": "f", "id": 241691380, "pid": 5717, "tid": 423623104, "ts": 6302685442534.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442539.743, "dur": 2.520, + "args": { + "cbid": 138, "correlation": 241691381 + } + }, + { + "ph": "f", "id": 241691381, "pid": 5717, "tid": 423623104, "ts": 6302685442539.743, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442546.143, "dur": 2.640, + "args": { + "cbid": 138, "correlation": 241691382 + } + }, + { + "ph": "f", "id": 241691382, "pid": 5717, "tid": 423623104, "ts": 6302685442546.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442556.283, "dur": 4.080, + "args": { + "cbid": 138, "correlation": 241691383 + } + }, + { + "ph": "f", "id": 241691383, "pid": 5717, "tid": 423623104, "ts": 6302685442556.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442561.292, 
"dur": 2.340, + "args": { + "cbid": 138, "correlation": 241691384 + } + }, + { + "ph": "f", "id": 241691384, "pid": 5717, "tid": 423623104, "ts": 6302685442561.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442567.563, "dur": 2.460, + "args": { + "cbid": 138, "correlation": 241691385 + } + }, + { + "ph": "f", "id": 241691385, "pid": 5717, "tid": 423623104, "ts": 6302685442567.563, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442578.092, "dur": 5.700, + "args": { + "cbid": 138, "correlation": 241691386 + } + }, + { + "ph": "f", "id": 241691386, "pid": 5717, "tid": 423623104, "ts": 6302685442578.092, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442584.572, "dur": 2.300, + "args": { + "cbid": 138, "correlation": 241691387 + } + }, + { + "ph": "f", "id": 241691387, "pid": 5717, "tid": 423623104, "ts": 6302685442584.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442589.883, "dur": 2.649, + "args": { + "cbid": 138, "correlation": 241691388 + } + }, + { + "ph": "f", "id": 241691388, "pid": 5717, "tid": 423623104, "ts": 6302685442589.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442600.132, "dur": 5.440, + "args": { + "cbid": 138, "correlation": 241691389 + } + }, + { + "ph": "f", "id": 241691389, "pid": 5717, "tid": 423623104, "ts": 6302685442600.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + 
"ts": 6302685442616.783, "dur": 4.140, + "args": { + "cbid": 138, "correlation": 241691391 + } + }, + { + "ph": "f", "id": 241691391, "pid": 5717, "tid": 423623104, "ts": 6302685442616.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442626.692, "dur": 3.480, + "args": { + "cbid": 138, "correlation": 241691393 + } + }, + { + "ph": "f", "id": 241691393, "pid": 5717, "tid": 423623104, "ts": 6302685442626.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442635.563, "dur": 4.069, + "args": { + "cbid": 138, "correlation": 241691395 + } + }, + { + "ph": "f", "id": 241691395, "pid": 5717, "tid": 423623104, "ts": 6302685442635.563, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442645.692, "dur": 3.411, + "args": { + "cbid": 138, "correlation": 241691397 + } + }, + { + "ph": "f", "id": 241691397, "pid": 5717, "tid": 423623104, "ts": 6302685442645.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442654.423, "dur": 2.989, + "args": { + "cbid": 138, "correlation": 241691399 + } + }, + { + "ph": "f", "id": 241691399, "pid": 5717, "tid": 423623104, "ts": 6302685442654.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442662.802, "dur": 3.940, + "args": { + "cbid": 138, "correlation": 241691401 + } + }, + { + "ph": "f", "id": 241691401, "pid": 5717, "tid": 423623104, "ts": 6302685442662.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 
5717, "tid": 423623104, + "ts": 6302685442672.502, "dur": 3.060, + "args": { + "cbid": 138, "correlation": 241691403 + } + }, + { + "ph": "f", "id": 241691403, "pid": 5717, "tid": 423623104, "ts": 6302685442672.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442680.932, "dur": 3.520, + "args": { + "cbid": 138, "correlation": 241691405 + } + }, + { + "ph": "f", "id": 241691405, "pid": 5717, "tid": 423623104, "ts": 6302685442680.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442690.182, "dur": 2.990, + "args": { + "cbid": 138, "correlation": 241691407 + } + }, + { + "ph": "f", "id": 241691407, "pid": 5717, "tid": 423623104, "ts": 6302685442690.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442698.542, "dur": 3.640, + "args": { + "cbid": 138, "correlation": 241691409 + } + }, + { + "ph": "f", "id": 241691409, "pid": 5717, "tid": 423623104, "ts": 6302685442698.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442708.052, "dur": 3.130, + "args": { + "cbid": 138, "correlation": 241691411 + } + }, + { + "ph": "f", "id": 241691411, "pid": 5717, "tid": 423623104, "ts": 6302685442708.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442717.002, "dur": 3.960, + "args": { + "cbid": 138, "correlation": 241691413 + } + }, + { + "ph": "f", "id": 241691413, "pid": 5717, "tid": 423623104, "ts": 6302685442717.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442726.582, "dur": 3.130, + "args": { + "cbid": 138, "correlation": 241691415 + } + }, + { + "ph": "f", "id": 241691415, "pid": 5717, "tid": 423623104, "ts": 6302685442726.582, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685442734.872, "dur": 3.230, + "args": { + "cbid": 138, "correlation": 241691417 + } + }, + { + "ph": "f", "id": 241691417, "pid": 5717, "tid": 423623104, "ts": 6302685442734.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685610912.147, "dur": 0.832, + "args": { + "External id": 126596, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691430, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.187500, "warps per SM": 0.750000, "grid": [24, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 241691430, "pid": 3, "tid": 7, "ts": 6302685610912.147, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685444478.288, "dur": 94.810, + "args": { + "External id": 126596, "cbid": 211, "correlation": 241691430 + } + }, + { + "ph": "s", "id": 241691430, "pid": 5717, "tid": 5717, "ts": 6302685444478.288, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::LpNormFunctor, float*, int>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::LpNormFunctor, float*, int)", "pid": 3, "tid": 7, + "ts": 6302685610913.683, "dur": 89.344, + "args": { + "External id": 126592, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691949, "registers per thread": 28, "shared memory": 2048, "blocks per SM": 2.500000, "warps per SM": 40.000000, "grid": [320, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241691949, "pid": 3, "tid": 7, "ts": 6302685610913.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685447244.202, "dur": 52.530, + "args": { + "External id": 126592, "cbid": 211, "correlation": 241691949 + } + }, + { + "ph": "s", "id": 241691949, "pid": 5717, "tid": 5717, "ts": 6302685447244.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::LpNormFunctor, float*, int>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::LpNormFunctor, float*, int)", "pid": 3, "tid": 7, + "ts": 6302685611003.667, "dur": 60.673, + "args": { + "External id": 126592, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691952, "registers per thread": 28, "shared memory": 2048, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 241691952, "pid": 3, "tid": 7, "ts": 6302685611003.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685447339.812, "dur": 33.640, + "args": { + "External id": 126592, "cbid": 211, "correlation": 241691952 + } + }, + { + "ph": "s", "id": 241691952, "pid": 5717, "tid": 5717, "ts": 6302685447339.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::lpnorm_cleanup(float const*, at::native::TensorListAddresses, int)", "pid": 3, "tid": 7, + "ts": 6302685611064.980, "dur": 1.664, + "args": { + "External id": 126592, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691957, "registers per thread": 16, "shared memory": 2048, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 241691957, "pid": 3, "tid": 7, "ts": 6302685611064.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685447469.612, "dur": 40.360, + "args": { + "External id": 126592, "cbid": 211, "correlation": 241691957 + } + }, + { + "ph": "s", "id": 241691957, "pid": 5717, "tid": 5717, "ts": 6302685447469.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy_aligned16_contig, unsigned int, 1, 128, 1>(at::native::(anonymous namespace)::OpaqueType<4u>*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, unsigned int, 128, 1>, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 3, "tid": 7, + "ts": 6302685611067.284, "dur": 1.600, + "args": { + "External id": 130567, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 
241691970, "registers per thread": 30, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 4.000000, "grid": [1, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 241691970, "pid": 3, "tid": 7, "ts": 6302685611067.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685460143.533, "dur": 100.110, + "args": { + "External id": 130567, "cbid": 211, "correlation": 241691970 + } + }, + { + "ph": "s", "id": 241691970, "pid": 5717, "tid": 5717, "ts": 6302685460143.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 3, "tid": 7, + "ts": 6302685611069.588, "dur": 2.208, + "args": { + "External id": 130569, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691984, "registers per thread": 32, "shared memory": 528, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691984, "pid": 3, "tid": 7, "ts": 6302685611069.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685461418.400, "dur": 81.350, + "args": { + "External id": 130569, "cbid": 211, "correlation": 241691984 + } + }, + { + "ph": "s", "id": 241691984, "pid": 5717, "tid": 5717, "ts": 6302685461418.400, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611072.404, "dur": 1.120, + "args": { + "External id": 130572, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241691994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241691994, "pid": 3, "tid": 7, "ts": 6302685611072.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685463399.436, "dur": 111.759, + "args": { + "External id": 130572, "cbid": 211, "correlation": 241691994 + } + }, + { + "ph": "s", "id": 241691994, "pid": 5717, "tid": 5717, "ts": 6302685463399.436, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 3, "tid": 7, + "ts": 6302685611074.100, "dur": 0.960, + "args": { + "External id": 130579, "device": 3, "context": 1, "stream": 7, "correlation": 241692006, "bytes": 4, "memory bandwidth (GB/s)": 0.004166666666666667 + } + }, + { + "ph": "f", "id": 241692006, "pid": 3, "tid": 7, "ts": 6302685611074.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685463919.584, "dur": 106.740, + "args": { + "External id": 130579, "cbid": 41, "correlation": 241692006 + } + }, + { + "ph": "s", "id": 241692006, "pid": 5717, "tid": 5717, "ts": 6302685463919.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685464264.903, "dur": 8.100, + "args": { + "External id": 130581, "cbid": 317, "correlation": 241692012 + } + }, + { + "ph": "f", "id": 241692012, "pid": 5717, "tid": 5717, "ts": 6302685464264.903, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685464292.974, "dur": 58.629, + "args": { + "External id": 130581, "cbid": 135, "correlation": 241692014 + } + }, + { + "ph": "f", "id": 241692014, "pid": 5717, "tid": 5717, "ts": 6302685464292.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + 
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685464364.123, "dur": 11.840, + "args": { + "External id": 130581, "cbid": 147, "correlation": 241692018 + } + }, + { + "ph": "s", "id": 241692018, "pid": 5717, "tid": 5717, "ts": 6302685464364.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5717, "tid": 5717, + "ts": 6302685464528.383, "dur": 7.130, + "args": { + "External id": 130581, "cbid": 409, "correlation": 241692021 + } + }, + { + "ph": "f", "id": 241692021, "pid": 5717, "tid": 5717, "ts": 6302685464528.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685464573.183, "dur": 10.110, + "args": { + "External id": 130581, "cbid": 135, "correlation": 241692024 + } + }, + { + "ph": "f", "id": 241692024, "pid": 5717, "tid": 5717, "ts": 6302685464573.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685464584.763, "dur": 8.680, + "args": { + "External id": 130581, "cbid": 147, "correlation": 241692025 + } + }, + { + "ph": "s", "id": 241692025, "pid": 5717, "tid": 5717, "ts": 6302685464584.763, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllReduce_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 3, "tid": 20, + "ts": 6302685611077.588, "dur": 45.441, + "args": { + "External id": 130581, "queued": 0, "device": 3, "context": 1, "stream": 20, "correlation": 241692027, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.007812, "warps per SM": 0.023438, "grid": [1, 1, 1], "block": [96, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "allreduce", "In msg nelems": 1, "Out msg nelems": 1, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 241692027, "pid": 3, "tid": 20, "ts": 6302685611077.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5717, "tid": 5717, + "ts": 6302685464603.503, "dur": 73.599, + "args": { + "External id": 130581, "cbid": 430, "correlation": 241692027 + } + }, + { + "ph": "s", "id": 241692027, "pid": 5717, "tid": 5717, "ts": 6302685464603.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685464686.393, "dur": 3.769, + "args": { + "External id": 130581, "cbid": 135, "correlation": 241692029 + } + }, + { + "ph": "f", "id": 241692029, "pid": 5717, "tid": 5717, "ts": 6302685464686.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685464691.093, "dur": 4.549, + "args": { + "External id": 130581, "cbid": 147, "correlation": 241692030 + } + }, + { + "ph": "s", "id": 241692030, "pid": 5717, "tid": 5717, "ts": 6302685464691.093, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685464709.782, "dur": 5.820, + "args": { + "External id": 130581, "cbid": 135, "correlation": 241692033 + } + }, + { + "ph": "f", "id": 241692033, "pid": 5717, "tid": 5717, "ts": 6302685464709.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5717, "tid": 5717, + "ts": 6302685464785.472, "dur": 3.850, + "args": { + "External id": 130581, "cbid": 
135, "correlation": 241692040 + } + }, + { + "ph": "f", "id": 241692040, "pid": 5717, "tid": 5717, "ts": 6302685464785.472, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5717, "tid": 5717, + "ts": 6302685465777.600, "dur": 13.290, + "args": { + "External id": 130585, "cbid": 147, "correlation": 241692045 + } + }, + { + "ph": "s", "id": 241692045, "pid": 5717, "tid": 5717, "ts": 6302685465777.600, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611123.733, "dur": 1.056, + "args": { + "External id": 130586, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692061, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692061, "pid": 3, "tid": 7, "ts": 6302685611123.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685466101.649, "dur": 62.550, + "args": { + "External id": 130586, "cbid": 211, "correlation": 241692061 + } + }, + { + "ph": "s", "id": 241692061, "pid": 5717, "tid": 5717, "ts": 6302685466101.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611125.429, "dur": 0.992, + "args": { + "External id": 130592, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692071, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692071, "pid": 3, "tid": 7, "ts": 6302685611125.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685467418.786, "dur": 56.460, + "args": { + "External id": 130592, "cbid": 211, "correlation": 241692071 + } + }, + { + "ph": "s", "id": 241692071, "pid": 5717, "tid": 5717, "ts": 6302685467418.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611126.997, "dur": 1.088, + "args": { + "External id": 130593, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692081, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692081, "pid": 3, "tid": 7, "ts": 6302685611126.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685467652.906, "dur": 29.290, + "args": { + "External id": 130593, "cbid": 211, "correlation": 241692081 + } + }, + { + "ph": "s", "id": 241692081, "pid": 5717, "tid": 5717, "ts": 6302685467652.906, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611128.725, "dur": 0.960, + "args": { + "External id": 130594, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692091, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692091, "pid": 3, "tid": 7, "ts": 6302685611128.725, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685467759.506, "dur": 28.729, + "args": { + "External id": 130594, "cbid": 211, "correlation": 241692091 + } + }, + { + "ph": "s", "id": 241692091, "pid": 5717, "tid": 5717, "ts": 6302685467759.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611130.293, "dur": 1.024, + "args": { + "External id": 130595, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692101, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692101, "pid": 3, "tid": 7, "ts": 6302685611130.293, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685467897.205, "dur": 28.260, + "args": { + "External id": 130595, "cbid": 211, "correlation": 241692101 + } + }, + { + "ph": "s", "id": 241692101, "pid": 5717, "tid": 5717, "ts": 6302685467897.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float)", "pid": 3, "tid": 7, + "ts": 6302685611131.989, "dur": 116.289, + "args": { + "External id": 130599, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692107, "registers per thread": 28, "shared memory": 0, "blocks per SM": 2.500000, "warps per SM": 40.000000, "grid": [320, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 241692107, "pid": 3, "tid": 7, "ts": 6302685611131.989, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685470326.420, "dur": 117.489, + "args": { + "External id": 130599, "cbid": 211, "correlation": 241692107 + } + }, + { + "ph": "s", "id": 241692107, "pid": 5717, "tid": 5717, "ts": 6302685470326.420, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float)", "pid": 3, "tid": 7, + "ts": 6302685611248.854, "dur": 111.168, + "args": { + "External id": 130599, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692110, "registers per thread": 28, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 241692110, "pid": 3, "tid": 7, "ts": 6302685611248.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685470470.680, "dur": 161.669, + "args": { + "External id": 130599, "cbid": 211, "correlation": 241692110 + } + }, + { + "ph": "s", "id": 241692110, "pid": 5717, "tid": 5717, "ts": 6302685470470.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611360.662, "dur": 1.312, + "args": { + "External id": 130601, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692120, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692120, "pid": 3, "tid": 7, "ts": 6302685611360.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685471217.208, "dur": 61.090, + "args": { + "External id": 130601, "cbid": 211, "correlation": 241692120 + } + }, + { + "ph": "s", "id": 241692120, "pid": 5717, "tid": 5717, "ts": 6302685471217.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685471401.347, "dur": 21.370, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692125 + } + }, + { + "ph": "f", "id": 241692125, "pid": 5717, "tid": 5717, "ts": 6302685471401.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685471427.177, "dur": 5.210, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692126 + } + }, + { + "ph": "f", "id": 241692126, "pid": 5717, "tid": 5717, "ts": 6302685471427.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685471435.307, "dur": 4.970, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692127 + } + }, + { + "ph": "f", "id": 241692127, "pid": 5717, "tid": 5717, "ts": 6302685471435.307, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685471442.917, "dur": 4.430, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692128 + } + }, + { + "ph": "f", "id": 241692128, "pid": 5717, "tid": 5717, "ts": 6302685471442.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + 
"ts": 6302685471449.387, "dur": 3.870, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692129 + } + }, + { + "ph": "f", "id": 241692129, "pid": 5717, "tid": 5717, "ts": 6302685471449.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685471456.037, "dur": 4.120, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692130 + } + }, + { + "ph": "f", "id": 241692130, "pid": 5717, "tid": 5717, "ts": 6302685471456.037, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685471462.057, "dur": 4.920, + "args": { + "External id": 130604, "cbid": 138, "correlation": 241692131 + } + }, + { + "ph": "f", "id": 241692131, "pid": 5717, "tid": 5717, "ts": 6302685471462.057, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685611366.966, "dur": 0.832, + "args": { + "External id": 130604, "device": 3, "context": 1, "stream": 7, "correlation": 241692134, "bytes": 1, "memory bandwidth (GB/s)": 0.001201923076923077 + } + }, + { + "ph": "f", "id": 241692134, "pid": 3, "tid": 7, "ts": 6302685611366.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685471483.477, "dur": 79.810, + "args": { + "External id": 130604, "cbid": 41, "correlation": 241692134 + } + }, + { + "ph": "s", "id": 241692134, "pid": 5717, "tid": 5717, "ts": 6302685471483.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542835.105, "dur": 12.600, + "args": { + "cbid": 138, "correlation": 241692136 + } + }, + { + "ph": "f", "id": 241692136, "pid": 5717, "tid": 
423623104, "ts": 6302685542835.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542848.845, "dur": 2.311, + "args": { + "cbid": 138, "correlation": 241692137 + } + }, + { + "ph": "f", "id": 241692137, "pid": 5717, "tid": 423623104, "ts": 6302685542848.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542859.576, "dur": 1.620, + "args": { + "cbid": 138, "correlation": 241692138 + } + }, + { + "ph": "f", "id": 241692138, "pid": 5717, "tid": 423623104, "ts": 6302685542859.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542868.685, "dur": 1.560, + "args": { + "cbid": 138, "correlation": 241692139 + } + }, + { + "ph": "f", "id": 241692139, "pid": 5717, "tid": 423623104, "ts": 6302685542868.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542870.685, "dur": 1.160, + "args": { + "cbid": 138, "correlation": 241692140 + } + }, + { + "ph": "f", "id": 241692140, "pid": 5717, "tid": 423623104, "ts": 6302685542870.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542873.596, "dur": 1.220, + "args": { + "cbid": 138, "correlation": 241692141 + } + }, + { + "ph": "f", "id": 241692141, "pid": 5717, "tid": 423623104, "ts": 6302685542873.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542878.205, "dur": 1.631, + "args": { + "cbid": 138, "correlation": 241692142 + } + }, + { + "ph": "f", "id": 
241692142, "pid": 5717, "tid": 423623104, "ts": 6302685542878.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542880.216, "dur": 1.100, + "args": { + "cbid": 138, "correlation": 241692143 + } + }, + { + "ph": "f", "id": 241692143, "pid": 5717, "tid": 423623104, "ts": 6302685542880.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542883.025, "dur": 1.140, + "args": { + "cbid": 138, "correlation": 241692144 + } + }, + { + "ph": "f", "id": 241692144, "pid": 5717, "tid": 423623104, "ts": 6302685542883.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542887.276, "dur": 1.580, + "args": { + "cbid": 138, "correlation": 241692145 + } + }, + { + "ph": "f", "id": 241692145, "pid": 5717, "tid": 423623104, "ts": 6302685542887.276, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542894.905, "dur": 2.540, + "args": { + "cbid": 138, "correlation": 241692146 + } + }, + { + "ph": "f", "id": 241692146, "pid": 5717, "tid": 423623104, "ts": 6302685542894.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542900.145, "dur": 1.330, + "args": { + "cbid": 138, "correlation": 241692147 + } + }, + { + "ph": "f", "id": 241692147, "pid": 5717, "tid": 423623104, "ts": 6302685542900.145, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542905.165, "dur": 2.140, + "args": { + "cbid": 138, "correlation": 241692148 + } + }, 
+ { + "ph": "f", "id": 241692148, "pid": 5717, "tid": 423623104, "ts": 6302685542905.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542907.685, "dur": 1.200, + "args": { + "cbid": 138, "correlation": 241692149 + } + }, + { + "ph": "f", "id": 241692149, "pid": 5717, "tid": 423623104, "ts": 6302685542907.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542910.815, "dur": 1.150, + "args": { + "cbid": 138, "correlation": 241692150 + } + }, + { + "ph": "f", "id": 241692150, "pid": 5717, "tid": 423623104, "ts": 6302685542910.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542915.235, "dur": 1.810, + "args": { + "cbid": 138, "correlation": 241692151 + } + }, + { + "ph": "f", "id": 241692151, "pid": 5717, "tid": 423623104, "ts": 6302685542915.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542917.405, "dur": 1.050, + "args": { + "cbid": 138, "correlation": 241692152 + } + }, + { + "ph": "f", "id": 241692152, "pid": 5717, "tid": 423623104, "ts": 6302685542917.405, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542919.955, "dur": 1.170, + "args": { + "cbid": 138, "correlation": 241692153 + } + }, + { + "ph": "f", "id": 241692153, "pid": 5717, "tid": 423623104, "ts": 6302685542919.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542924.135, "dur": 1.870, + "args": { + "cbid": 138, 
"correlation": 241692154 + } + }, + { + "ph": "f", "id": 241692154, "pid": 5717, "tid": 423623104, "ts": 6302685542924.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542926.365, "dur": 1.070, + "args": { + "cbid": 138, "correlation": 241692155 + } + }, + { + "ph": "f", "id": 241692155, "pid": 5717, "tid": 423623104, "ts": 6302685542926.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542928.775, "dur": 1.130, + "args": { + "cbid": 138, "correlation": 241692156 + } + }, + { + "ph": "f", "id": 241692156, "pid": 5717, "tid": 423623104, "ts": 6302685542928.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542933.005, "dur": 1.530, + "args": { + "cbid": 138, "correlation": 241692157 + } + }, + { + "ph": "f", "id": 241692157, "pid": 5717, "tid": 423623104, "ts": 6302685542933.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542934.915, "dur": 1.030, + "args": { + "cbid": 138, "correlation": 241692158 + } + }, + { + "ph": "f", "id": 241692158, "pid": 5717, "tid": 423623104, "ts": 6302685542934.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542937.105, "dur": 1.160, + "args": { + "cbid": 138, "correlation": 241692159 + } + }, + { + "ph": "f", "id": 241692159, "pid": 5717, "tid": 423623104, "ts": 6302685542937.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542941.355, "dur": 1.530, + 
"args": { + "cbid": 138, "correlation": 241692160 + } + }, + { + "ph": "f", "id": 241692160, "pid": 5717, "tid": 423623104, "ts": 6302685542941.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542943.245, "dur": 1.040, + "args": { + "cbid": 138, "correlation": 241692161 + } + }, + { + "ph": "f", "id": 241692161, "pid": 5717, "tid": 423623104, "ts": 6302685542943.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542945.905, "dur": 1.150, + "args": { + "cbid": 138, "correlation": 241692162 + } + }, + { + "ph": "f", "id": 241692162, "pid": 5717, "tid": 423623104, "ts": 6302685542945.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542950.015, "dur": 1.380, + "args": { + "cbid": 138, "correlation": 241692163 + } + }, + { + "ph": "f", "id": 241692163, "pid": 5717, "tid": 423623104, "ts": 6302685542950.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542951.805, "dur": 1.140, + "args": { + "cbid": 138, "correlation": 241692164 + } + }, + { + "ph": "f", "id": 241692164, "pid": 5717, "tid": 423623104, "ts": 6302685542951.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542954.555, "dur": 1.190, + "args": { + "cbid": 138, "correlation": 241692165 + } + }, + { + "ph": "f", "id": 241692165, "pid": 5717, "tid": 423623104, "ts": 6302685542954.555, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 
6302685542958.385, "dur": 1.440, + "args": { + "cbid": 138, "correlation": 241692166 + } + }, + { + "ph": "f", "id": 241692166, "pid": 5717, "tid": 423623104, "ts": 6302685542958.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542960.185, "dur": 1.080, + "args": { + "cbid": 138, "correlation": 241692167 + } + }, + { + "ph": "f", "id": 241692167, "pid": 5717, "tid": 423623104, "ts": 6302685542960.185, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542962.945, "dur": 1.180, + "args": { + "cbid": 138, "correlation": 241692168 + } + }, + { + "ph": "f", "id": 241692168, "pid": 5717, "tid": 423623104, "ts": 6302685542962.945, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542967.285, "dur": 4.800, + "args": { + "cbid": 138, "correlation": 241692169 + } + }, + { + "ph": "f", "id": 241692169, "pid": 5717, "tid": 423623104, "ts": 6302685542967.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542972.575, "dur": 1.140, + "args": { + "cbid": 138, "correlation": 241692170 + } + }, + { + "ph": "f", "id": 241692170, "pid": 5717, "tid": 423623104, "ts": 6302685542972.575, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542975.585, "dur": 1.130, + "args": { + "cbid": 138, "correlation": 241692171 + } + }, + { + "ph": "f", "id": 241692171, "pid": 5717, "tid": 423623104, "ts": 6302685542975.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, 
"tid": 423623104, + "ts": 6302685542982.825, "dur": 1.880, + "args": { + "cbid": 138, "correlation": 241692172 + } + }, + { + "ph": "f", "id": 241692172, "pid": 5717, "tid": 423623104, "ts": 6302685542982.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542985.155, "dur": 1.110, + "args": { + "cbid": 138, "correlation": 241692173 + } + }, + { + "ph": "f", "id": 241692173, "pid": 5717, "tid": 423623104, "ts": 6302685542985.155, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542988.065, "dur": 4.100, + "args": { + "cbid": 138, "correlation": 241692174 + } + }, + { + "ph": "f", "id": 241692174, "pid": 5717, "tid": 423623104, "ts": 6302685542988.065, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685542995.245, "dur": 1.750, + "args": { + "cbid": 138, "correlation": 241692175 + } + }, + { + "ph": "f", "id": 241692175, "pid": 5717, "tid": 423623104, "ts": 6302685542995.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685543003.755, "dur": 1.400, + "args": { + "cbid": 138, "correlation": 241692177 + } + }, + { + "ph": "f", "id": 241692177, "pid": 5717, "tid": 423623104, "ts": 6302685543003.755, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685543011.005, "dur": 2.300, + "args": { + "cbid": 138, "correlation": 241692179 + } + }, + { + "ph": "f", "id": 241692179, "pid": 5717, "tid": 423623104, "ts": 6302685543011.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685471566.097, "dur": 139822.313, + "args": { + "External id": 130604, "cbid": 131, "correlation": 241692135 + } + }, + { + "ph": "s", "id": 241692135, "pid": 5717, "tid": 5717, "ts": 6302685471566.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611717.593, "dur": 0.960, + "args": { + "External id": 130608, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692196, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692196, "pid": 3, "tid": 7, "ts": 6302685611717.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685611656.070, "dur": 71.939, + "args": { + "External id": 130608, "cbid": 211, "correlation": 241692196 + } + }, + { + "ph": "s", "id": 241692196, "pid": 5717, "tid": 5717, "ts": 6302685611656.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 3, "tid": 7, + "ts": 6302685611860.346, "dur": 0.992, + "args": { + "External id": 130610, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 241692206, "pid": 3, "tid": 7, "ts": 6302685611860.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685611829.849, "dur": 38.120, + "args": { + "External id": 130610, "cbid": 211, "correlation": 241692206 + } + }, + { + "ph": "s", "id": 241692206, "pid": 5717, "tid": 5717, "ts": 6302685611829.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685611945.359, "dur": 12.080, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692211 + } + }, + { + "ph": "f", "id": 241692211, "pid": 5717, "tid": 5717, "ts": 6302685611945.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685611961.449, "dur": 5.040, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692212 + } + }, + { + "ph": "f", "id": 241692212, "pid": 5717, "tid": 5717, "ts": 6302685611961.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685611973.669, "dur": 3.950, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692213 + } + }, + { + "ph": "f", "id": 241692213, "pid": 5717, "tid": 5717, "ts": 6302685611973.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685611979.519, "dur": 5.400, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692214 + } + }, + { + "ph": "f", "id": 241692214, "pid": 5717, "tid": 5717, "ts": 6302685611979.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + 
"ts": 6302685611988.839, "dur": 3.850, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692215 + } + }, + { + "ph": "f", "id": 241692215, "pid": 5717, "tid": 5717, "ts": 6302685611988.839, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685611995.129, "dur": 5.080, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692216 + } + }, + { + "ph": "f", "id": 241692216, "pid": 5717, "tid": 5717, "ts": 6302685611995.129, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685612003.829, "dur": 5.220, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692217 + } + }, + { + "ph": "f", "id": 241692217, "pid": 5717, "tid": 5717, "ts": 6302685612003.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685612011.419, "dur": 3.470, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692218 + } + }, + { + "ph": "f", "id": 241692218, "pid": 5717, "tid": 5717, "ts": 6302685612011.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685612017.179, "dur": 3.880, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692219 + } + }, + { + "ph": "f", "id": 241692219, "pid": 5717, "tid": 5717, "ts": 6302685612017.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 5717, + "ts": 6302685612023.079, "dur": 4.520, + "args": { + "External id": 130613, "cbid": 138, "correlation": 241692220 + } + }, + { + "ph": "f", "id": 241692220, "pid": 5717, "tid": 5717, "ts": 6302685612023.079, + "cat": "ac2g", "name": "ac2g", "bp": 
"e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 3, "tid": 7, + "ts": 6302685612097.724, "dur": 1.024, + "args": { + "External id": 130613, "device": 3, "context": 1, "stream": 7, "correlation": 241692222, "bytes": 1, "memory bandwidth (GB/s)": 0.0009765625 + } + }, + { + "ph": "f", "id": 241692222, "pid": 3, "tid": 7, "ts": 6302685612097.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5717, "tid": 5717, + "ts": 6302685612038.579, "dur": 65.490, + "args": { + "External id": 130613, "cbid": 41, "correlation": 241692222 + } + }, + { + "ph": "s", "id": 241692222, "pid": 5717, "tid": 5717, "ts": 6302685612038.579, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685612106.459, "dur": 13.530, + "args": { + "External id": 130613, "cbid": 131, "correlation": 241692223 + } + }, + { + "ph": "s", "id": 241692223, "pid": 5717, "tid": 5717, "ts": 6302685612106.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5717, "tid": 5717, + "ts": 6302685612793.077, "dur": 8.290, + "args": { + "cbid": 317, "correlation": 241692229 + } + }, + { + "ph": "f", "id": 241692229, "pid": 5717, "tid": 5717, "ts": 6302685612793.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float)", "pid": 3, "tid": 7, + "ts": 6302685615332.084, "dur": 1.728, + "args": { + "External id": 130616, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692232, "registers per thread": 30, "shared 
memory": 0, "blocks per SM": 0.859375, "warps per SM": 13.750000, "grid": [110, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 29 + } + }, + { + "ph": "f", "id": 241692232, "pid": 3, "tid": 7, "ts": 6302685615332.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685615286.432, "dur": 52.139, + "args": { + "External id": 130616, "cbid": 211, "correlation": 241692232 + } + }, + { + "ph": "s", "id": 241692232, "pid": 5717, "tid": 5717, "ts": 6302685615286.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float)", "pid": 3, "tid": 7, + "ts": 6302685615355.796, "dur": 1.536, + "args": { + "External id": 130616, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692235, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.140625, "warps per SM": 2.250000, "grid": [18, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 5 + } + }, + { + "ph": "f", "id": 241692235, "pid": 3, "tid": 7, "ts": 6302685615355.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685615344.511, "dur": 12.800, + "args": { + "External id": 130616, "cbid": 211, "correlation": 241692235 + } + }, + { + "ph": "s", "id": 241692235, "pid": 5717, "tid": 5717, "ts": 6302685615344.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302685619290.674, "dur": 503.812, + "args": { + "External id": 130746, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692241, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.687500, "warps per SM": 27.000000, "grid": [216, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 56 + } + }, + { + "ph": "f", "id": 241692241, "pid": 3, "tid": 7, "ts": 6302685619290.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685619272.523, "dur": 21.119, + "args": { + "External id": 130746, "cbid": 211, "correlation": 241692241 + } + }, + { + "ph": "s", "id": 241692241, "pid": 5717, "tid": 5717, "ts": 6302685619272.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302685619795.158, "dur": 255.266, + "args": { + "External id": 130746, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692244, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 241692244, "pid": 3, "tid": 7, "ts": 6302685619795.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685619309.322, "dur": 7.470, + "args": { + "External id": 130746, "cbid": 211, "correlation": 241692244 + } + }, + { + "ph": "s", "id": 241692244, "pid": 5717, "tid": 5717, "ts": 6302685619309.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302685620051.096, "dur": 254.562, + "args": { + "External id": 130746, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692247, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 241692247, "pid": 3, "tid": 7, "ts": 6302685620051.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685619322.562, "dur": 5.400, + "args": { + "External id": 130746, "cbid": 211, "correlation": 241692247 + } + }, + { + "ph": "s", "id": 241692247, "pid": 5717, "tid": 5717, "ts": 6302685619322.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 3, "tid": 7, + "ts": 6302685620306.298, "dur": 253.378, + "args": { + "External id": 130746, "queued": 0, "device": 3, "context": 1, "stream": 7, "correlation": 241692250, "registers per thread": 64, "shared memory": 0, "blocks per SM": 0.554688, "warps per SM": 8.875000, "grid": [71, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 18 + } + }, + { + "ph": "f", "id": 241692250, "pid": 3, "tid": 7, "ts": 6302685620306.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5717, "tid": 5717, + "ts": 6302685619331.932, "dur": 4.940, + "args": { + "External id": 130746, "cbid": 211, "correlation": 241692250 + } + }, + { + "ph": "s", "id": 241692250, "pid": 5717, "tid": 5717, "ts": 6302685619331.932, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 5717, "tid": 5717, + "ts": 6302685619630.982, "dur": 931.127, + "args": { + "cbid": 165, "correlation": 241692256 + } + }, + { + "ph": "s", "id": 241692256, "pid": 5717, "tid": 5717, "ts": 6302685619630.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643079.978, "dur": 9.051, + "args": { + "cbid": 138, "correlation": 241692258 + } + }, + { + "ph": "f", "id": 241692258, "pid": 5717, "tid": 423623104, "ts": 6302685643079.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643089.498, "dur": 0.720, + "args": { + "cbid": 138, "correlation": 241692259 + } + }, + { + "ph": "f", "id": 241692259, "pid": 5717, "tid": 423623104, "ts": 6302685643089.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643095.958, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 241692260 + } + }, + { + "ph": "f", "id": 241692260, "pid": 5717, "tid": 423623104, "ts": 6302685643095.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643100.449, "dur": 1.100, + "args": { + "cbid": 
138, "correlation": 241692261 + } + }, + { + "ph": "f", "id": 241692261, "pid": 5717, "tid": 423623104, "ts": 6302685643100.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643101.678, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 241692262 + } + }, + { + "ph": "f", "id": 241692262, "pid": 5717, "tid": 423623104, "ts": 6302685643101.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643103.118, "dur": 0.400, + "args": { + "cbid": 138, "correlation": 241692263 + } + }, + { + "ph": "f", "id": 241692263, "pid": 5717, "tid": 423623104, "ts": 6302685643103.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643107.798, "dur": 1.160, + "args": { + "cbid": 138, "correlation": 241692264 + } + }, + { + "ph": "f", "id": 241692264, "pid": 5717, "tid": 423623104, "ts": 6302685643107.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643109.089, "dur": 0.420, + "args": { + "cbid": 138, "correlation": 241692265 + } + }, + { + "ph": "f", "id": 241692265, "pid": 5717, "tid": 423623104, "ts": 6302685643109.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5717, "tid": 423623104, + "ts": 6302685643110.549, "dur": 0.420, + "args": { + "cbid": 138, "correlation": 241692266 + } + }, + { + "ph": "f", "id": 241692266, "pid": 5717, "tid": 423623104, "ts": 6302685643110.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "Optimizer.step#AdamW.step", "pid": 3, "tid": 7, + "ts": 6302685615332.083, "dur": 
5227.594, + "args": { + "External id": 130615 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce", "pid": 3, "tid": 7, + "ts": 6302685545573.414, "dur": 349.221, + "args": { + "External id": 130423 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.0)", "pid": 3, "tid": 7, + "ts": 6302685534182.416, "dur": 48.002, + "args": { + "External id": 130385 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 3, "tid": 7, + "ts": 6302685524791.881, "dur": 131.011, + "args": { + "External id": 130280 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 3, "tid": 7, + "ts": 6302685255558.078, "dur": 360.101, + "args": { + "External id": 124098 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 3, "tid": 7, + "ts": 6302685478450.636, "dur": 214.020, + "args": { + "External id": 129782 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 3, "tid": 7, + "ts": 6302685250404.247, "dur": 350.341, + "args": { + "External id": 123975 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 3, "tid": 7, + "ts": 6302685246207.864, "dur": 30.594, + "args": { + "External id": 123852 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.2)", "pid": 3, "tid": 7, + "ts": 6302685504361.360, "dur": 675.911, + "args": { + "External id": 130087 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 3, "tid": 7, + "ts": 6302685241866.071, "dur": 27.522, + "args": { + "External id": 123729 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": 
"FSDP::all_gather_copy_out (model.layers.4)", "pid": 3, "tid": 7, + "ts": 6302685463027.576, "dur": 231.620, + "args": { + "External id": 129616 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 3, "tid": 7, + "ts": 6302685237557.046, "dur": 17.731, + "args": { + "External id": 123606 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "ProfilerStep#8191", "pid": 3, "tid": 7, + "ts": 6302685181822.259, "dur": 430276.490, + "args": { + "External id": 122881 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 3, "tid": 7, + "ts": 6302685220110.035, "dur": 27.330, + "args": { + "External id": 123360 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 3, "tid": 7, + "ts": 6302685449228.656, "dur": 549.030, + "args": { + "External id": 129450 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 3, "tid": 7, + "ts": 6302685215970.964, "dur": 568.230, + "args": { + "External id": 123237 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.9)", "pid": 3, "tid": 7, + "ts": 6302685403545.624, "dur": 38.979, + "args": { + "External id": 128925 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 3, "tid": 7, + "ts": 6302685509434.166, "dur": 245.380, + "args": { + "External id": 130114 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.1)", "pid": 3, "tid": 7, + "ts": 6302685519820.676, "dur": 541.478, + "args": { + "External id": 130253 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 3, "tid": 7, + "ts": 6302685211818.613, "dur": 338.948, + "args": { + "External id": 
123114 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.4)", "pid": 3, "tid": 7, + "ts": 6302685473285.477, "dur": 601.063, + "args": { + "External id": 129755 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 3, "tid": 7, + "ts": 6302685225775.806, "dur": 134.659, + "args": { + "External id": 123483 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.3)", "pid": 3, "tid": 7, + "ts": 6302685488963.484, "dur": 505.957, + "args": { + "External id": 129921 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out", "pid": 3, "tid": 7, + "ts": 6302685207752.278, "dur": 714.759, + "args": { + "External id": 123028 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.9)", "pid": 3, "tid": 7, + "ts": 6302685260766.181, "dur": 20.962, + "args": { + "External id": 124221 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 3, "tid": 7, + "ts": 6302685420971.163, "dur": 241.957, + "args": { + "External id": 129118 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 3, "tid": 7, + "ts": 6302685403587.737, "dur": 265.604, + "args": { + "External id": 128952 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.8)", "pid": 3, "tid": 7, + "ts": 6302685417239.680, "dur": 378.756, + "args": { + "External id": 129091 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.7)", "pid": 3, "tid": 7, + "ts": 6302685430472.931, "dur": 392.453, + "args": { + "External id": 129257 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 3, 
"tid": 7, + "ts": 6302685434909.956, "dur": 580.647, + "args": { + "External id": 129284 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.6)", "pid": 3, "tid": 7, + "ts": 6302685444276.555, "dur": 374.597, + "args": { + "External id": 129423 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.5)", "pid": 3, "tid": 7, + "ts": 6302685458485.686, "dur": 466.373, + "args": { + "External id": 129589 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 3, "tid": 7, + "ts": 6302685493796.672, "dur": 232.260, + "args": { + "External id": 129948 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 3, "tid": 17, + "ts": 6302685450348.857, "dur": 330.276, + "args": { + "External id": 129469 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 3, "tid": 17, + "ts": 6302685436082.573, "dur": 274.884, + "args": { + "External id": 129303 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 3, "tid": 17, + "ts": 6302685421722.625, "dur": 467.398, + "args": { + "External id": 129137 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather", "pid": 3, "tid": 17, + "ts": 6302685182791.802, "dur": 295.652, + "args": { + "External id": 122937 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 3, "tid": 17, + "ts": 6302685212178.711, "dur": 393.702, + "args": { + "External id": 123327 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 3, "tid": 17, + "ts": 6302685463865.375, "dur": 606.246, + "args": { + "External id": 129635 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather 
(model.layers.0)", "pid": 3, "tid": 17, + "ts": 6302685184814.377, "dur": 263.748, + "args": { + "External id": 123081 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 3, "tid": 17, + "ts": 6302685208470.043, "dur": 74.979, + "args": { + "External id": 123204 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 3, "tid": 17, + "ts": 6302685479368.275, "dur": 795.656, + "args": { + "External id": 129801 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 3, "tid": 17, + "ts": 6302685220730.552, "dur": 1379.564, + "args": { + "External id": 123450 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 3, "tid": 17, + "ts": 6302685234679.713, "dur": 207.875, + "args": { + "External id": 123573 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 3, "tid": 17, + "ts": 6302685238595.742, "dur": 196.900, + "args": { + "External id": 123696 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 3, "tid": 17, + "ts": 6302685381129.199, "dur": 163.748, + "args": { + "External id": 128805 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 3, "tid": 17, + "ts": 6302685494654.150, "dur": 597.575, + "args": { + "External id": 129967 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 3, "tid": 17, + "ts": 6302685242033.368, "dur": 183.268, + "args": { + "External id": 123819 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 3, "tid": 17, + "ts": 6302685245868.501, "dur": 197.220, + "args": { + "External id": 123942 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": 
"FSDP::all_gather (model.layers.8)", "pid": 3, "tid": 17, + "ts": 6302685249810.899, "dur": 178.755, + "args": { + "External id": 124065 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 3, "tid": 17, + "ts": 6302685510284.796, "dur": 753.896, + "args": { + "External id": 130133 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.9)", "pid": 3, "tid": 17, + "ts": 6302685253769.520, "dur": 168.196, + "args": { + "External id": 124188 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 3, "tid": 17, + "ts": 6302685404400.383, "dur": 419.365, + "args": { + "External id": 128971 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:all_reduce", "pid": 3, "tid": 20, + "ts": 6302685611077.587, "dur": 45.443, + "args": { + "External id": 130582 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685545925.865, "dur": 64975.403, + "args": { + "External id": 130437 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685519814.180, "dur": 4975.751, + "args": { + "External id": 130164 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685250402.263, "dur": 5151.753, + "args": { + "External id": 124096 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685255554.718, "dur": 5208.873, + "args": { + "External id": 124219 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685403586.233, "dur": 13650.409, + "args": { + "External id": 128939 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 
6302685246274.424, "dur": 4127.137, + "args": { + "External id": 123973 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685534322.001, "dur": 11244.023, + "args": { + "External id": 130399 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685242396.059, "dur": 3804.319, + "args": { + "External id": 123850 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685504356.688, "dur": 5074.600, + "args": { + "External id": 129998 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685434903.204, "dur": 9371.849, + "args": { + "External id": 129271 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685239020.833, "dur": 2841.080, + "args": { + "External id": 123727 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685235095.684, "dur": 2455.156, + "args": { + "External id": 123604 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685215964.532, "dur": 4140.481, + "args": { + "External id": 123358 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685207748.982, "dur": 4064.577, + "args": { + "External id": 123112 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685211814.805, "dur": 4149.057, + "args": { + "External id": 123235 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685473285.765, "dur": 5162.569, + "args": { + "External id": 129666 + } + }, + { + 
"ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685223121.098, "dur": 2490.037, + "args": { + "External id": 123481 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685488961.084, "dur": 4832.646, + "args": { + "External id": 129832 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685183267.998, "dur": 24479.482, + "args": { + "External id": 123026 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685449223.408, "dur": 9259.336, + "args": { + "External id": 129437 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685420970.171, "dur": 9501.098, + "args": { + "External id": 129105 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685381617.587, "dur": 6992.855, + "args": { + "External id": 128836 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685463027.128, "dur": 10256.047, + "args": { + "External id": 129603 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685509433.078, "dur": 10380.432, + "args": { + "External id": 130101 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685444276.555, "dur": 4945.191, + "args": { + "External id": 129334 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685417237.696, "dur": 3731.229, + "args": { + "External id": 129002 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", 
"pid": 3, "tid": 20, + "ts": 6302685478451.180, "dur": 10507.442, + "args": { + "External id": 129769 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685524792.457, "dur": 9382.601, + "args": { + "External id": 130267 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685458483.958, "dur": 4540.324, + "args": { + "External id": 129500 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 3, "tid": 20, + "ts": 6302685430472.259, "dur": 4429.219, + "args": { + "External id": 129168 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 3, "tid": 20, + "ts": 6302685493795.424, "dur": 10559.762, + "args": { + "External id": 129935 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 0, + "args": { + "labels": "CPU" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 0, + "args": { + "sort_index": 5717 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 0, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 0, "tid": 0, + "args": { + "labels": "GPU 0" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 0, "tid": 0, + "args": { + "sort_index": 5000000 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 1, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 1, "tid": 0, + "args": { + "labels": "GPU 1" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, 
"pid": 1, "tid": 0, + "args": { + "sort_index": 5000001 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 2, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 2, "tid": 0, + "args": { + "labels": "GPU 2" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 2, "tid": 0, + "args": { + "sort_index": 5000002 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 0, + "args": { + "labels": "GPU 3" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 0, + "args": { + "sort_index": 5000003 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 4, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 4, "tid": 0, + "args": { + "labels": "GPU 4" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 4, "tid": 0, + "args": { + "sort_index": 5000004 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 5, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 5, "tid": 0, + "args": { + "labels": "GPU 5" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 5, "tid": 0, + "args": { + "sort_index": 5000005 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 6, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 6, "tid": 0, + "args": { + "labels": "GPU 6" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 6, "tid": 0, + "args": { + 
"sort_index": 5000006 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 7, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 7, "tid": 0, + "args": { + "labels": "GPU 7" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 7, "tid": 0, + "args": { + "sort_index": 5000007 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 8, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 8, "tid": 0, + "args": { + "labels": "GPU 8" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 8, "tid": 0, + "args": { + "sort_index": 5000008 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 9, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 9, "tid": 0, + "args": { + "labels": "GPU 9" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 9, "tid": 0, + "args": { + "sort_index": 5000009 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 10, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 10, "tid": 0, + "args": { + "labels": "GPU 10" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 10, "tid": 0, + "args": { + "sort_index": 5000010 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 11, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 11, "tid": 0, + "args": { + "labels": "GPU 11" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 11, "tid": 0, + "args": { + "sort_index": 5000011 + } 
+ }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 12, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 12, "tid": 0, + "args": { + "labels": "GPU 12" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 12, "tid": 0, + "args": { + "sort_index": 5000012 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 13, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 13, "tid": 0, + "args": { + "labels": "GPU 13" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 13, "tid": 0, + "args": { + "sort_index": 5000013 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 14, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 14, "tid": 0, + "args": { + "labels": "GPU 14" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 14, "tid": 0, + "args": { + "sort_index": 5000014 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6302684941819.180, "pid": 15, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6302684941819.180, "pid": 15, "tid": 0, + "args": { + "labels": "GPU 15" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 15, "tid": 0, + "args": { + "sort_index": 5000015 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 7, + "args": { + "name": "stream 7 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 7, + "args": { + "sort_index": 7 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 17, + "args": { + "name": "stream 17 " + } + }, + { + "name": 
"thread_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 17, + "args": { + "sort_index": 17 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 20, + "args": { + "name": "stream 20 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 3, "tid": 20, + "args": { + "sort_index": 20 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 6759, + "args": { + "name": "thread 6759 (pt_autograd_3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 6759, + "args": { + "sort_index": 6759 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 6759, + "args": { + "name": "thread 6759 (python3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 6759, + "args": { + "sort_index": 6759 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 5717, + "args": { + "name": "thread 5717 (python3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6302684941819.180, "pid": 5717, "tid": 5717, + "args": { + "sort_index": 5717 + } + }, + { + "ph": "X", "cat": "Trace", "ts": 6302684941741.280, "dur": 678832.234, + "pid": "Spans", "tid": "PyTorch Profiler", + "name": "PyTorch Profiler (0)", + "args": { + "Op count": 0 + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6302684941741.280, + "pid": "Spans", "tid": 0, + "args": { + "sort_index": 536870912 + } + }, + { + "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", + "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6302684941741.280 + }, + { + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": 6302685646941.744 + } + ], + "traceName": 
"exp/mtp.120M.batch8.seqlen2048.context2048.warmup1000.update1.steps15000.nft4.lr5e-4.cosine/profile_trace/iteration_8192/rank3_trace.json", + "displayTimeUnit": "ms", + "baseTimeNanoseconds": 1743521598000000000 +} \ No newline at end of file diff --git a/profile_trace/iteration_9728/rank0_trace.json b/profile_trace/iteration_9728/rank0_trace.json new file mode 100644 index 0000000000000000000000000000000000000000..2583d0f4e1fec71d9c65a024010d3c7f2a51266b --- /dev/null +++ b/profile_trace/iteration_9728/rank0_trace.json @@ -0,0 +1,109173 @@ + +{ + "schemaVersion": 1, + "deviceProperties": [ + { + "id": 0, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 1, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 2, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + "sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + }, + { + "id": 3, "name": "NVIDIA GeForce RTX 4090", "totalGlobalMem": 25386352640, + "computeMajor": 8, "computeMinor": 9, + "maxThreadsPerBlock": 1024, "maxThreadsPerMultiprocessor": 1536, + "regsPerBlock": 65536, "warpSize": 32, + 
"sharedMemPerBlock": 49152, "numSms": 128 + , "regsPerMultiprocessor": 65536, "sharedMemPerBlockOptin": 101376, "sharedMemPerMultiprocessor": 102400 + } + ], + "cupti_version": 22, + "cuda_runtime_version": 12040, + "cuda_driver_version": 12040, + "distributedInfo": {"backend": "nccl", "rank": 0, "world_size": 4, "pg_count": 1, "pg_config": [{"pg_name": "0", "pg_desc": "default_pg", "backend_config": "cuda:nccl", "pg_size": 4, "ranks": [0, 1, 2, 3]}], "nccl_version": "2.21.5"}, + "record_shapes": 1, + "trace_id": "C092A424BA0A4E2889C4CF39284A3272", + "traceEvents": [ + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: DivBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771871327.961, "dur": 70.229, + "args": { + "External id": 151553,"Record function id": 0, "Sequence number": 3058984, "Fwd thread id": 1, "Ev Idx": 0 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "DivBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771871341.910, "dur": 49.460, + "args": { + "External id": 151554,"Sequence number": 3058984, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 1 + } + }, + { + "ph": "f", "id": 1, "pid": 5714, "tid": 6744, "ts": 6303771871341.910, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771871347.820, "dur": 41.160, + "args": { + "External id": 151555,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 2 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771871409.740, "dur": 174.790, + "args": { + "External id": 151556,"Record function id": 0, "Ev Idx": 3 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward", "pid": 
5714, "tid": 6744, + "ts": 6303771871441.710, "dur": 78.210, + "args": { + "External id": 151557,"Record function id": 0, "Ev Idx": 4 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.9", "pid": 5714, "tid": 6744, + "ts": 6303771871468.100, "dur": 36.020, + "args": { + "External id": 151558,"Record function id": 0, "Ev Idx": 5 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771871525.730, "dur": 2.290, + "args": { + "External id": 151559,"Sequence number": 3058983, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 6 + } + }, + { + "ph": "f", "id": 2, "pid": 5714, "tid": 6744, "ts": 6303771871525.730, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771871531.450, "dur": 48.310, + "args": { + "External id": 151560,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771871538.220, "dur": 40.830, + "args": { + "External id": 151561,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 8 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771871550.420, "dur": 2.630, + "args": { + "External id": 151562,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 9 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771871598.400, "dur": 15755.885, + "args": { + "External id": 151563,"Record function id": 0, "Sequence number": 3058981, "Fwd thread id": 1, "Ev Idx": 10 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771871601.490, "dur": 15743.355, + "args": { + "External id": 151564,"Sequence number": 3058981, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 11 + } + }, + { + "ph": "f", "id": 3, "pid": 5714, "tid": 6744, "ts": 6303771871601.490, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771871641.180, "dur": 4.610, + "args": { + "External id": 151565,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 12 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771871650.410, "dur": 15596.545, + "args": { + "External id": 151566,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 13 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771871651.780, "dur": 15594.855, + "args": { + "External id": 151567,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], 
[], [], []], "Ev Idx": 14 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771871654.310, "dur": 7.880, + "args": { + "External id": 151568,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 15 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771871663.650, "dur": 15582.255, + "args": { + "External id": 151569,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 16 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6303771887250.915, "dur": 0.610, + "args": { + "External id": 151570,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 17 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6303771887253.225, "dur": 2.130, + "args": { + "External id": 151571,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 18 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6303771887254.325, "dur": 0.850, + "args": { + "External id": 151572,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 19 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6303771887260.425, "dur": 30.430, + "args": { + "External id": 151573,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], 
"Input Dims": [[], []], "Ev Idx": 20 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6303771887307.605, "dur": 27.170, + "args": { + "External id": 151574,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 21 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6303771887309.305, "dur": 25.150, + "args": { + "External id": 151575,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 22 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6303771887310.805, "dur": 23.110, + "args": { + "External id": 151576,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 23 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887369.685, "dur": 17.040, + "args": { + "External id": 151577,"Record function id": 0, "Sequence number": 3058980, "Fwd thread id": 1, "Ev Idx": 24 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887372.385, "dur": 11.600, + "args": { + "External id": 151578,"Sequence number": 3058980, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 25 + } + }, + { + "ph": "f", "id": 4, "pid": 5714, "tid": 6744, "ts": 6303771887372.385, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771887376.525, "dur": 7.150, + "args": { + "External id": 151579,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 26 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771887378.665, "dur": 4.550, + "args": { + "External id": 151580,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 27 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887392.525, "dur": 79.049, + "args": { + "External id": 151581,"Record function id": 0, "Sequence number": 3058979, "Fwd thread id": 1, "Ev Idx": 28 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887394.085, "dur": 70.460, + "args": { + "External id": 151582,"Sequence number": 3058979, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 29 + } + }, + { + "ph": "f", "id": 5, "pid": 5714, "tid": 6744, "ts": 6303771887394.085, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771887397.535, "dur": 66.279, + "args": { + "External id": 151583,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 30 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771887402.885, "dur": 30.329, + "args": { + "External id": 151584,"Record function id": 0, "Concrete 
Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 31 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771887404.925, "dur": 6.670, + "args": { + "External id": 151585,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 32 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771887412.705, "dur": 20.080, + "args": { + "External id": 151586,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 33 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771887414.905, "dur": 16.929, + "args": { + "External id": 151587,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 34 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771887436.665, "dur": 4.080, + "args": { + "External id": 151588,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 35 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771887439.085, "dur": 1.149, + "args": { + "External id": 151589,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 
768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 36 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771887441.725, "dur": 21.000, + "args": { + "External id": 151590,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 37 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887480.334, "dur": 60.170, + "args": { + "External id": 151591,"Record function id": 0, "Sequence number": 3058978, "Fwd thread id": 1, "Ev Idx": 38 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887482.225, "dur": 53.759, + "args": { + "External id": 151592,"Sequence number": 3058978, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 39 + } + }, + { + "ph": "f", "id": 6, "pid": 5714, "tid": 6744, "ts": 6303771887482.225, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6303771887485.465, "dur": 50.059, + "args": { + "External id": 151593,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "3"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 40 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771887489.054, "dur": 23.451, + 
"args": { + "External id": 151594,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 41 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771887490.385, "dur": 6.109, + "args": { + "External id": 151595,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 42 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771887497.445, "dur": 14.669, + "args": { + "External id": 151596,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 43 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771887498.674, "dur": 12.531, + "args": { + "External id": 151597,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 44 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771887515.205, "dur": 5.200, + "args": { + "External id": 151598,"Record function id": 0, "Concrete Inputs": ["", "2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 45 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771887518.545, "dur": 1.169, + "args": { + "External id": 151599,"Record function id": 0, "Concrete 
Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 46 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771887521.245, "dur": 13.509, + "args": { + "External id": 151600,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 47 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887548.294, "dur": 67.950, + "args": { + "External id": 151601,"Record function id": 0, "Sequence number": 3058977, "Fwd thread id": 1, "Ev Idx": 48 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887549.954, "dur": 53.870, + "args": { + "External id": 151602,"Sequence number": 3058977, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 49 + } + }, + { + "ph": "f", "id": 7, "pid": 5714, "tid": 6744, "ts": 6303771887549.954, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771887552.034, "dur": 51.330, + "args": { + "External id": 151603,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 50 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771887554.784, "dur": 20.190, + "args": { + "External id": 151604,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 51 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771887555.874, "dur": 5.150, + "args": { + "External id": 151605,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 52 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771887561.904, "dur": 12.740, + "args": { + "External id": 151606,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 53 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771887563.344, "dur": 10.410, + "args": { + "External id": 151607,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 54 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771887577.394, "dur": 2.720, + "args": { + "External id": 151608,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 55 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771887578.904, "dur": 0.840, + "args": { + "External id": 151609,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 56 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771887580.914, "dur": 21.670, + "args": { + "External id": 151610,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 57 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887628.944, "dur": 52.630, + "args": { + "External id": 151611,"Record function id": 0, "Sequence number": 3058976, "Fwd thread id": 1, "Ev Idx": 58 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887630.674, "dur": 46.650, + "args": { + "External id": 151612,"Sequence number": 3058976, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 59 + } + }, + { + "ph": "f", "id": 8, "pid": 5714, "tid": 6744, "ts": 6303771887630.674, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771887632.594, "dur": 44.370, + "args": { + "External id": 151613,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], 
"Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 60 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771887634.274, "dur": 20.160, + "args": { + "External id": 151614,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 61 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771887635.434, "dur": 5.140, + "args": { + "External id": 151615,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 62 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771887641.484, "dur": 12.670, + "args": { + "External id": 151616,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 63 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771887642.544, "dur": 10.820, + "args": { + "External id": 151617,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 64 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771887656.694, "dur": 4.970, + "args": { + "External id": 151618,"Record function id": 0, "Concrete Inputs": ["", "0", "0", 
"9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 65 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771887659.464, "dur": 1.840, + "args": { + "External id": 151619,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 66 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771887662.374, "dur": 13.800, + "args": { + "External id": 151620,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 67 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887689.334, "dur": 33.170, + "args": { + "External id": 151621,"Record function id": 0, "Sequence number": 3058975, "Fwd thread id": 1, "Ev Idx": 68 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771887691.014, "dur": 0.910, + "args": { + "External id": 151622,"Sequence number": 3058975, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 69 + } + }, + { + "ph": "f", "id": 9, "pid": 5714, "tid": 6744, "ts": 6303771887691.014, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 
6303771887693.844, "dur": 25.810, + "args": { + "External id": 151623,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 70 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771887695.564, "dur": 23.590, + "args": { + "External id": 151624,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 71 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771887702.534, "dur": 0.720, + "args": { + "External id": 151625,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 72 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771887729.864, "dur": 680.299, + "args": { + "External id": 151626,"Record function id": 0, "Sequence number": 3058973, "Fwd thread id": 1, "Ev Idx": 73 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771887731.874, "dur": 647.618, + "args": { + "External id": 151627,"Sequence number": 3058973, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 74 + } + }, + { + "ph": "f", "id": 10, "pid": 5714, "tid": 6744, "ts": 6303771887731.874, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771887759.254, "dur": 2.680, + "args": { + "External id": 151628,"Record function id": 0, 
"Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 75 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771887763.944, "dur": 551.299, + "args": { + "External id": 151629,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 76 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771887766.054, "dur": 548.899, + "args": { + "External id": 151630,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 77 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771887768.174, "dur": 7.130, + "args": { + "External id": 151631,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 78 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771887776.404, "dur": 537.889, + "args": { + "External id": 151632,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 79 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6303771888318.043, "dur": 0.240, + "args": { + "External id": 
151633,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 80 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6303771888319.493, "dur": 1.980, + "args": { + "External id": 151634,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 81 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6303771888320.543, "dur": 0.790, + "args": { + "External id": 151635,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 82 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6303771888324.903, "dur": 22.169, + "args": { + "External id": 151636,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 83 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6303771888352.363, "dur": 20.109, + "args": { + "External id": 151637,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 84 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6303771888353.323, "dur": 18.849, + "args": { + "External id": 151638,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 85 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6303771888354.363, "dur": 17.400, + "args": { + "External id": 151639,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 86 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771888388.812, "dur": 17.680, + "args": { + "External id": 151640,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 87 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888422.732, "dur": 12.240, + "args": { + "External id": 151641,"Record function id": 0, "Sequence number": 3058972, "Fwd thread id": 1, "Ev Idx": 88 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888424.952, "dur": 7.510, + "args": { + "External id": 151642,"Sequence number": 3058972, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 89 + } + }, + { + "ph": "f", "id": 11, "pid": 5714, "tid": 6744, "ts": 6303771888424.952, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771888427.622, "dur": 4.550, + "args": { + "External id": 151643,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 90 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771888428.662, "dur": 3.230, + "args": { + "External id": 151644,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 91 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", 
"pid": 5714, "tid": 6744, + "ts": 6303771888439.312, "dur": 61.000, + "args": { + "External id": 151645,"Record function id": 0, "Sequence number": 3058971, "Fwd thread id": 1, "Ev Idx": 92 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888440.582, "dur": 53.120, + "args": { + "External id": 151646,"Sequence number": 3058971, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 93 + } + }, + { + "ph": "f", "id": 12, "pid": 5714, "tid": 6744, "ts": 6303771888440.582, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771888442.392, "dur": 50.730, + "args": { + "External id": 151647,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 94 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771888444.932, "dur": 24.390, + "args": { + "External id": 151648,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 95 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771888446.492, "dur": 5.850, + "args": { + "External id": 151649,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], 
[]], "Ev Idx": 96 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771888453.372, "dur": 15.590, + "args": { + "External id": 151650,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 97 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771888454.822, "dur": 13.300, + "args": { + "External id": 151651,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 98 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771888472.012, "dur": 3.760, + "args": { + "External id": 151652,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 99 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771888474.382, "dur": 0.940, + "args": { + "External id": 151653,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771888476.692, "dur": 15.540, + "args": { + "External id": 151654,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 
768], []], "Ev Idx": 101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888508.562, "dur": 55.740, + "args": { + "External id": 151655,"Record function id": 0, "Sequence number": 3058970, "Fwd thread id": 1, "Ev Idx": 102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888510.342, "dur": 48.720, + "args": { + "External id": 151656,"Sequence number": 3058970, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 103 + } + }, + { + "ph": "f", "id": 13, "pid": 5714, "tid": 6744, "ts": 6303771888510.342, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6303771888512.542, "dur": 46.040, + "args": { + "External id": 151657,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "2"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771888514.322, "dur": 22.470, + "args": { + "External id": 151658,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771888515.512, "dur": 6.450, + "args": { + "External id": 151659,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771888522.882, "dur": 13.610, + "args": { + "External id": 151660,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771888523.962, "dur": 11.700, + "args": { + "External id": 151661,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771888538.972, "dur": 5.240, + "args": { + "External id": 151662,"Record function id": 0, "Concrete Inputs": ["", "2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771888542.172, "dur": 1.340, + "args": { + "External id": 151663,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771888544.952, "dur": 12.810, + "args": { + "External id": 151664,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], 
[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888573.222, "dur": 65.980, + "args": { + "External id": 151665,"Record function id": 0, "Sequence number": 3058969, "Fwd thread id": 1, "Ev Idx": 112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888575.192, "dur": 59.050, + "args": { + "External id": 151666,"Sequence number": 3058969, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 113 + } + }, + { + "ph": "f", "id": 14, "pid": 5714, "tid": 6744, "ts": 6303771888575.192, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771888577.562, "dur": 56.140, + "args": { + "External id": 151667,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771888579.562, "dur": 22.920, + "args": { + "External id": 151668,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771888580.732, "dur": 5.920, + "args": { + "External id": 
151669,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771888587.542, "dur": 14.580, + "args": { + "External id": 151670,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771888589.052, "dur": 12.000, + "args": { + "External id": 151671,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771888605.632, "dur": 4.920, + "args": { + "External id": 151672,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771888609.022, "dur": 1.030, + "args": { + "External id": 151673,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 
6303771888611.462, "dur": 21.240, + "args": { + "External id": 151674,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888648.582, "dur": 89.440, + "args": { + "External id": 151675,"Record function id": 0, "Sequence number": 3058968, "Fwd thread id": 1, "Ev Idx": 122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888650.552, "dur": 64.240, + "args": { + "External id": 151676,"Sequence number": 3058968, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 123 + } + }, + { + "ph": "f", "id": 15, "pid": 5714, "tid": 6744, "ts": 6303771888650.552, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771888652.832, "dur": 61.450, + "args": { + "External id": 151677,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771888654.682, "dur": 36.250, + "args": { + "External id": 151678,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input 
Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771888656.062, "dur": 5.670, + "args": { + "External id": 151679,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771888662.782, "dur": 27.760, + "args": { + "External id": 151680,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771888664.082, "dur": 25.440, + "args": { + "External id": 151681,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771888692.402, "dur": 3.420, + "args": { + "External id": 151682,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771888694.382, "dur": 0.970, + "args": { + "External id": 151683,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771888697.672, "dur": 15.650, + "args": { + "External id": 151684,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771888721.032, "dur": 13.820, + "args": { + "External id": 151685,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888748.062, "dur": 37.729, + "args": { + "External id": 151686,"Record function id": 0, "Sequence number": 3058967, "Fwd thread id": 1, "Ev Idx": 133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771888750.382, "dur": 1.110, + "args": { + "External id": 151687,"Sequence number": 3058967, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 134 + } + }, + { + "ph": "f", "id": 16, "pid": 5714, "tid": 6744, "ts": 6303771888750.382, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771888753.762, "dur": 28.840, + "args": { + "External id": 151688,"Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771888755.732, "dur": 26.370, + "args": { + "External id": 151689,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771888764.072, "dur": 0.880, + "args": { + "External id": 151690,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771888793.331, "dur": 1050.508, + "args": { + "External id": 151691,"Record function id": 0, "Sequence number": 3058965, "Fwd thread id": 1, "Ev Idx": 138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771888796.431, "dur": 1023.338, + "args": { + "External id": 151692,"Sequence number": 3058965, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 139 + } + }, + { + "ph": "f", "id": 17, "pid": 5714, "tid": 6744, "ts": 6303771888796.431, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771888823.471, "dur": 2.820, + "args": { + "External id": 151693,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", 
"", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771888828.442, "dur": 934.187, + "args": { + "External id": 151694,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771888829.602, "dur": 932.767, + "args": { + "External id": 151695,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771888832.031, "dur": 7.011, + "args": { + "External id": 151696,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771888839.922, "dur": 921.927, + "args": { + "External id": 151697,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6303771889765.109, "dur": 0.171, + "args": { + "External id": 151698,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": 
[[]], "Input Dims": [[]], "Ev Idx": 145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6303771889766.369, "dur": 3.211, + "args": { + "External id": 151699,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6303771889768.689, "dur": 0.691, + "args": { + "External id": 151700,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6303771889772.580, "dur": 18.649, + "args": { + "External id": 151701,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6303771889795.739, "dur": 17.860, + "args": { + "External id": 151702,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6303771889796.579, "dur": 16.800, + "args": { + "External id": 151703,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6303771889797.529, "dur": 15.510, + "args": { + "External id": 151704,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771889827.029, 
"dur": 13.450, + "args": { + "External id": 151705,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889855.159, "dur": 11.100, + "args": { + "External id": 151706,"Record function id": 0, "Sequence number": 3058964, "Fwd thread id": 1, "Ev Idx": 153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889857.049, "dur": 6.870, + "args": { + "External id": 151707,"Sequence number": 3058964, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 154 + } + }, + { + "ph": "f", "id": 18, "pid": 5714, "tid": 6744, "ts": 6303771889857.049, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771889859.589, "dur": 4.080, + "args": { + "External id": 151708,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771889860.599, "dur": 2.850, + "args": { + "External id": 151709,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889870.159, "dur": 54.610, + "args": { + 
"External id": 151710,"Record function id": 0, "Sequence number": 3058963, "Fwd thread id": 1, "Ev Idx": 157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889872.439, "dur": 46.610, + "args": { + "External id": 151711,"Sequence number": 3058963, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 158 + } + }, + { + "ph": "f", "id": 19, "pid": 5714, "tid": 6744, "ts": 6303771889872.439, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771889874.049, "dur": 44.560, + "args": { + "External id": 151712,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771889876.259, "dur": 22.050, + "args": { + "External id": 151713,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771889877.669, "dur": 5.260, + "args": { + "External id": 151714,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771889883.869, "dur": 14.110, + "args": { + "External id": 151715,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771889885.179, "dur": 12.040, + "args": { + "External id": 151716,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771889899.689, "dur": 3.290, + "args": { + "External id": 151717,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771889901.709, "dur": 0.870, + "args": { + "External id": 151718,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771889903.749, "dur": 14.060, + "args": { + "External id": 151719,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 166 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889932.069, "dur": 46.370, + "args": { + "External id": 151720,"Record function id": 0, "Sequence number": 3058962, "Fwd thread id": 1, "Ev Idx": 167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889933.659, "dur": 41.030, + "args": { + "External id": 151721,"Sequence number": 3058962, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 168 + } + }, + { + "ph": "f", "id": 20, "pid": 5714, "tid": 6744, "ts": 6303771889933.659, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6303771889935.519, "dur": 38.830, + "args": { + "External id": 151722,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771889937.819, "dur": 18.930, + "args": { + "External id": 151723,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771889938.909, "dur": 5.010, + "args": { + "External id": 151724,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input 
Dims": [[], [], [], [], [], []], "Ev Idx": 171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771889944.729, "dur": 11.770, + "args": { + "External id": 151725,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771889945.729, "dur": 9.990, + "args": { + "External id": 151726,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771889957.819, "dur": 4.410, + "args": { + "External id": 151727,"Record function id": 0, "Concrete Inputs": ["", "2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771889960.559, "dur": 1.060, + "args": { + "External id": 151728,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771889962.869, "dur": 10.910, + "args": { + "External id": 151729,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], 
[8, 2048, 768], []], "Ev Idx": 176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889984.769, "dur": 45.650, + "args": { + "External id": 151730,"Record function id": 0, "Sequence number": 3058961, "Fwd thread id": 1, "Ev Idx": 177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771889986.239, "dur": 40.330, + "args": { + "External id": 151731,"Sequence number": 3058961, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 178 + } + }, + { + "ph": "f", "id": 21, "pid": 5714, "tid": 6744, "ts": 6303771889986.239, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771889987.899, "dur": 38.300, + "args": { + "External id": 151732,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771889990.359, "dur": 17.050, + "args": { + "External id": 151733,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771889991.259, "dur": 4.300, + "args": { + "External id": 151734,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 
768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771889996.349, "dur": 10.810, + "args": { + "External id": 151735,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771889997.299, "dur": 9.060, + "args": { + "External id": 151736,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771890008.529, "dur": 2.880, + "args": { + "External id": 151737,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771890010.209, "dur": 0.820, + "args": { + "External id": 151738,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771890011.999, "dur": 13.450, + "args": { + "External id": 
151739,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771890036.979, "dur": 59.340, + "args": { + "External id": 151740,"Record function id": 0, "Sequence number": 3058960, "Fwd thread id": 1, "Ev Idx": 187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771890038.499, "dur": 41.360, + "args": { + "External id": 151741,"Sequence number": 3058960, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 188 + } + }, + { + "ph": "f", "id": 22, "pid": 5714, "tid": 6744, "ts": 6303771890038.499, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771890041.459, "dur": 38.080, + "args": { + "External id": 151742,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771890043.849, "dur": 18.840, + "args": { + "External id": 151743,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev 
Idx": 190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771890044.809, "dur": 5.390, + "args": { + "External id": 151744,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771890050.939, "dur": 11.460, + "args": { + "External id": 151745,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771890052.099, "dur": 9.550, + "args": { + "External id": 151746,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771890063.809, "dur": 2.490, + "args": { + "External id": 151747,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771890065.289, "dur": 0.710, + "args": { + "External id": 151748,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], 
[]], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771890066.939, "dur": 11.980, + "args": { + "External id": 151749,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771890084.379, "dur": 9.640, + "args": { + "External id": 151750,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771890103.549, "dur": 31.519, + "args": { + "External id": 151751,"Record function id": 0, "Sequence number": 3058959, "Fwd thread id": 1, "Ev Idx": 198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771890105.189, "dur": 0.860, + "args": { + "External id": 151752,"Sequence number": 3058959, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 199 + } + }, + { + "ph": "f", "id": 23, "pid": 5714, "tid": 6744, "ts": 6303771890105.189, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771890107.369, "dur": 21.979, + "args": { + "External id": 151753,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input 
Dims": [[1], []], "Ev Idx": 200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771890108.839, "dur": 20.060, + "args": { + "External id": 151754,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[1], [], [], []], "Ev Idx": 201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771890114.959, "dur": 0.680, + "args": { + "External id": 151755,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771890141.079, "dur": 1135.017, + "args": { + "External id": 151756,"Record function id": 0, "Sequence number": 3058958, "Fwd thread id": 1, "Ev Idx": 203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771890148.528, "dur": 1103.148, + "args": { + "External id": 151757,"Sequence number": 3058958, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 204 + } + }, + { + "ph": "f", "id": 24, "pid": 5714, "tid": 6744, "ts": 6303771890148.528, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771890170.119, "dur": 2.560, + "args": { + "External id": 151758,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], []], "Ev Idx": 205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771890174.499, "dur": 1020.657, + "args": { + "External id": 151759,"Record function id": 0, "Concrete Inputs": ["", "", "6", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771890175.459, "dur": 1019.387, + "args": { + "External id": 151760,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771890177.219, "dur": 5.429, + "args": { + "External id": 151761,"Record function id": 0, "Concrete Inputs": ["[]", "[]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771890183.588, "dur": 1010.688, + "args": { + "External id": 151762,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::lift_fresh", "pid": 5714, "tid": 6744, + "ts": 6303771891197.606, "dur": 0.170, + "args": { + "External id": 151763,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 210 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::detach_", "pid": 5714, "tid": 6744, + "ts": 6303771891198.786, "dur": 1.780, + "args": { + "External id": 151764,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach_", "pid": 5714, "tid": 6744, + "ts": 6303771891199.706, "dur": 0.670, + "args": { + "External id": 151765,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 6744, + "ts": 6303771891203.536, "dur": 18.470, + "args": { + "External id": 151766,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 6744, + "ts": 6303771891226.886, "dur": 17.910, + "args": { + "External id": 151767,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 6744, + "ts": 6303771891227.696, "dur": 16.910, + "args": { + "External id": 151768,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 6744, + "ts": 6303771891228.696, "dur": 15.580, + "args": { + "External id": 151769,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771891258.996, "dur": 13.440, + "args": { + "External id": 151770,"Record function id": 0, 
"Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891286.156, "dur": 17.960, + "args": { + "External id": 151771,"Record function id": 0, "Sequence number": 3058957, "Fwd thread id": 1, "Ev Idx": 218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891288.356, "dur": 7.050, + "args": { + "External id": 151772,"Sequence number": 3058957, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 219 + } + }, + { + "ph": "f", "id": 25, "pid": 5714, "tid": 6744, "ts": 6303771891288.356, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771891290.976, "dur": 4.190, + "args": { + "External id": 151773,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771891291.996, "dur": 2.920, + "args": { + "External id": 151774,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891308.916, "dur": 56.160, + "args": { + "External id": 151775,"Record function id": 0, "Sequence number": 3058956, "Fwd 
thread id": 1, "Ev Idx": 222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891310.056, "dur": 48.930, + "args": { + "External id": 151776,"Sequence number": 3058956, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 223 + } + }, + { + "ph": "f", "id": 26, "pid": 5714, "tid": 6744, "ts": 6303771891310.056, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771891311.646, "dur": 46.880, + "args": { + "External id": 151777,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], []], "Ev Idx": 224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771891313.986, "dur": 23.330, + "args": { + "External id": 151778,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771891315.556, "dur": 5.260, + "args": { + "External id": 151779,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771891321.696, "dur": 15.290, + 
"args": { + "External id": 151780,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771891323.786, "dur": 12.420, + "args": { + "External id": 151781,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771891338.736, "dur": 3.270, + "args": { + "External id": 151782,"Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891340.776, "dur": 0.870, + "args": { + "External id": 151783,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[1572864, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771891342.796, "dur": 14.810, + "args": { + "External id": 151784,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SelectBackward0", "pid": 5714, 
"tid": 6744, + "ts": 6303771891373.306, "dur": 48.150, + "args": { + "External id": 151785,"Record function id": 0, "Sequence number": 3058955, "Fwd thread id": 1, "Ev Idx": 232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SelectBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891374.866, "dur": 42.620, + "args": { + "External id": 151786,"Sequence number": 3058955, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 233 + } + }, + { + "ph": "f", "id": 27, "pid": 5714, "tid": 6744, "ts": 6303771891374.866, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select_backward", "pid": 5714, "tid": 6744, + "ts": 6303771891376.626, "dur": 40.450, + "args": { + "External id": 151787,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "2", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771891378.126, "dur": 21.540, + "args": { + "External id": 151788,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771891379.936, "dur": 5.770, + "args": { + "External id": 151789,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 236 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771891386.466, "dur": 12.920, + "args": { + "External id": 151790,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771891388.496, "dur": 10.140, + "args": { + "External id": 151791,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771891400.686, "dur": 4.510, + "args": { + "External id": 151792,"Record function id": 0, "Concrete Inputs": ["", "2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891403.566, "dur": 1.020, + "args": { + "External id": 151793,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771891405.836, "dur": 10.670, + "args": { + "External id": 151794,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891428.116, "dur": 52.109, + "args": { + "External id": 151795,"Record function id": 0, "Sequence number": 3058954, "Fwd thread id": 1, "Ev Idx": 242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891429.846, "dur": 46.459, + "args": { + "External id": 151796,"Sequence number": 3058954, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 243 + } + }, + { + "ph": "f", "id": 28, "pid": 5714, "tid": 6744, "ts": 6303771891429.846, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771891431.436, "dur": 44.489, + "args": { + "External id": 151797,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771891433.096, "dur": 18.360, + "args": { + "External id": 151798,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771891434.066, "dur": 4.330, + "args": { + "External id": 151799,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771891439.136, "dur": 12.060, + "args": { + "External id": 151800,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771891441.156, "dur": 9.340, + "args": { + "External id": 151801,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771891452.606, "dur": 2.610, + "args": { + "External id": 151802,"Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891454.116, "dur": 0.810, + "args": { + "External id": 151803,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771891455.876, "dur": 19.310, + "args": { + "External id": 151804,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891486.736, "dur": 61.349, + "args": { + "External id": 151805,"Record function id": 0, "Sequence number": 3058953, "Fwd thread id": 1, "Ev Idx": 252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SliceBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891488.205, "dur": 42.220, + "args": { + "External id": 151806,"Sequence number": 3058953, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 253 + } + }, + { + "ph": "f", "id": 29, "pid": 5714, "tid": 6744, "ts": 6303771891488.205, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice_backward", "pid": 5714, "tid": 6744, + "ts": 6303771891489.856, "dur": 40.229, + "args": { + "External id": 151807,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], [], []], "Ev Idx": 254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 6744, + "ts": 6303771891491.256, "dur": 20.889, + "args": { + "External id": 151808,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 
5714, "tid": 6744, + "ts": 6303771891493.396, "dur": 4.380, + "args": { + "External id": 151809,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 4, 768]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 6744, + "ts": 6303771891498.496, "dur": 13.380, + "args": { + "External id": 151810,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 6744, + "ts": 6303771891500.516, "dur": 10.620, + "args": { + "External id": 151811,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], []], "Ev Idx": 258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771891514.405, "dur": 2.580, + "args": { + "External id": 151812,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891515.945, "dur": 0.711, + "args": { + "External id": 151813,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 260 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771891517.656, "dur": 11.749, + "args": { + "External id": 151814,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771891535.256, "dur": 9.940, + "args": { + "External id": 151815,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], []], "Ev Idx": 262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771891556.416, "dur": 270.379, + "args": { + "External id": 151816,"Record function id": 0, "Sequence number": 3058952, "Fwd thread id": 1, "Ev Idx": 263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771891558.536, "dur": 260.539, + "args": { + "External id": 151817,"Sequence number": 3058952, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 264 + } + }, + { + "ph": "f", "id": 30, "pid": 5714, "tid": 6744, "ts": 6303771891558.536, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771891680.835, "dur": 29.270, + "args": { + "External id": 151818,"kernel_hash": "c6pwrjtaatk26ciodo5pmvyk7s5bgtuynny44q5qcq4cspn3x4h2", "grid": "grid(131328,)", "Record 
function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "8", "2048", "4", "131328", "384"], "kernel_file": "/tmp/torchinductor_root/6p/c6pwrjtaatk26ciodo5pmvyk7s5bgtuynny44q5qcq4cspn3x4h2.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [6291456, 3072, 768, 1], [8192, 4, 1, 1], [131328, 131328, 131328, 1, 768], [], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [8, 2048, 4, 768], [8, 2048, 4, 1], [1, 1, 1, 768, 171], [], [], [], [], []], "Ev Idx": 265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771891732.075, "dur": 18.070, + "args": { + "External id": 151819,"kernel_hash": "cvi2geo3kp7he4ronsudo7p4b3n4w3af22ohl5mnz4aa62cpmlkv", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "171"], "kernel_file": "/tmp/torchinductor_root/vi/cvi2geo3kp7he4ronsudo7p4b3n4w3af22ohl5mnz4aa62cpmlkv.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[131328, 131328, 131328, 1, 768], [768, 768, 768, 1], [], []], "Input Dims": [[1, 1, 1, 768, 171], [1, 1, 1, 768], [], []], "Ev Idx": 266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771891771.455, "dur": 21.140, + "args": { + "External id": 151820,"kernel_hash": "clabhfcwwwl4deuomn7k5h33jcp5cek5y5pevfi3r5njb7ep6ocn", "grid": "grid(65536,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "65536", "768"], "kernel_file": "/tmp/torchinductor_root/la/clabhfcwwwl4deuomn7k5h33jcp5cek5y5pevfi3r5njb7ep6ocn.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 
3072, 768, 1], [1], [6291456, 3072, 768, 1], [8192, 4, 1, 1], [6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [768], [8, 2048, 4, 768], [8, 2048, 4, 1], [8, 2048, 4, 768], [], []], "Ev Idx": 267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771891842.195, "dur": 13.180, + "args": { + "External id": 151821,"Record function id": 0, "Ev Idx": 268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771891845.375, "dur": 7.650, + "args": { + "External id": 151822,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771891848.755, "dur": 3.350, + "args": { + "External id": 151823,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771891849.555, "dur": 2.340, + "args": { + "External id": 151824,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: StackBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891860.815, "dur": 25.600, + "args": { + "External id": 151825,"Record function id": 0, "Sequence number": 3058951, "Fwd thread id": 1, "Ev Idx": 272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "StackBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891862.015, "dur": 19.200, + "args": { + "External id": 151826,"Sequence number": 3058951, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 768, 1]], "Input Dims": [[8, 2048, 4, 768]], "Ev Idx": 273 + } + }, + { + "ph": "f", "id": 31, "pid": 5714, "tid": 6744, "ts": 6303771891862.015, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771891863.815, "dur": 7.410, + "args": { + "External id": 151827,"Record function id": 0, "Concrete Inputs": ["", "-2", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891868.315, "dur": 1.400, + "args": { + "External id": 151828,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771891871.875, "dur": 3.750, + "args": { + "External id": 151829,"Record function id": 0, "Concrete Inputs": ["", "-2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891873.485, "dur": 1.580, + "args": { + "External id": 151830,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 277 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771891876.105, "dur": 2.000, + "args": { + "External id": 151831,"Record function id": 0, "Concrete Inputs": ["", "-2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891877.385, "dur": 0.250, + "args": { + "External id": 151832,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 6744, + "ts": 6303771891878.555, "dur": 2.150, + "args": { + "External id": 151833,"Record function id": 0, "Concrete Inputs": ["", "-2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891880.075, "dur": 0.210, + "args": { + "External id": 151834,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891890.495, "dur": 4.400, + "args": { + "External id": 151835,"Record function id": 0, "Sequence number": 3058950, "Fwd thread id": 1, "Ev Idx": 282 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771891891.745, "dur": 0.760, + "args": { + "External id": 151836,"Sequence number": 3058950, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 283 + } + }, + { + "ph": "f", "id": 32, "pid": 5714, "tid": 6744, "ts": 6303771891891.745, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771891898.875, "dur": 351.479, + "args": { + "External id": 151837,"Record function id": 0, "Sequence number": 3058949, "Fwd thread id": 1, "Ev Idx": 284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771891900.435, "dur": 340.649, + "args": { + "External id": 151838,"Sequence number": 3058949, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 285 + } + }, + { + "ph": "f", "id": 33, "pid": 5714, "tid": 6744, "ts": 6303771891900.435, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771891934.464, "dur": 10.400, + "args": { + "External id": 151839,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771891940.284, "dur": 4.120, + "args": { + "External id": 151840,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771891947.324, "dur": 5.000, + "args": { + "External id": 151841,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771891948.664, "dur": 2.771, + "args": { + "External id": 151842,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891950.484, "dur": 0.680, + "args": { + "External id": 151843,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6303771891955.435, "dur": 46.940, + "args": { + "External id": 151844,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771891956.015, "dur": 2.740, + "args": { + "External id": 151845,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 292 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771891956.584, "dur": 1.651, + "args": { + "External id": 151846,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771891957.724, "dur": 0.351, + "args": { + "External id": 151847,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6303771891959.584, "dur": 41.951, + "args": { + "External id": 151848,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771891960.475, "dur": 40.100, + "args": { + "External id": 151849,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6303771892007.784, "dur": 5.100, + "args": { + "External id": 151850,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892010.495, 
"dur": 2.200, + "args": { + "External id": 151851,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771892042.694, "dur": 8.290, + "args": { + "External id": 151852,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771892052.304, "dur": 2.870, + "args": { + "External id": 151853,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771892056.084, "dur": 2.690, + "args": { + "External id": 151854,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892092.564, "dur": 3.640, + "args": { + "External id": 151855,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892093.764, "dur": 
2.130, + "args": { + "External id": 151856,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6303771892116.964, "dur": 106.630, + "args": { + "External id": 151857,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771892122.474, "dur": 6.590, + "args": { + "External id": 151858,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892126.804, "dur": 1.140, + "args": { + "External id": 151859,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771892130.984, "dur": 4.850, + "args": { + "External id": 151860,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892134.674, "dur": 0.390, + "args": { + "External id": 151861,"Record function id": 0, "Concrete 
Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771892137.574, "dur": 2.100, + "args": { + "External id": 151862,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892138.894, "dur": 0.410, + "args": { + "External id": 151863,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771892140.484, "dur": 2.030, + "args": { + "External id": 151864,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892141.844, "dur": 0.260, + "args": { + "External id": 151865,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771892147.524, "dur": 2.650, + "args": { + "External id": 151866,"Record function 
id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892148.754, "dur": 1.120, + "args": { + "External id": 151867,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892152.164, "dur": 4.940, + "args": { + "External id": 151868,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771892155.474, "dur": 1.420, + "args": { + "External id": 151869,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771892157.974, "dur": 1.910, + "args": { + "External id": 151870,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892159.274, "dur": 0.280, + "args": { + "External id": 151871,"Record function id": 0, "Concrete Inputs": 
["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892160.734, "dur": 2.090, + "args": { + "External id": 151872,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892161.484, "dur": 1.200, + "args": { + "External id": 151873,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771892164.294, "dur": 47.200, + "args": { + "External id": 151874,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892213.694, "dur": 1.810, + "args": { + "External id": 151875,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771892216.484, "dur": 3.040, + "args": { + "External id": 151876,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892218.564, "dur": 0.410, + "args": { + "External id": 151877,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892221.434, "dur": 0.630, + "args": { + "External id": 151878,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892262.644, "dur": 10.280, + "args": { + "External id": 151879,"Record function id": 0, "Ev Idx": 326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892265.584, "dur": 6.010, + "args": { + "External id": 151880,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771892267.654, "dur": 3.050, + "args": { + "External id": 151881,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771892268.464, "dur": 2.040, + 
"args": { + "External id": 151882,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892277.464, "dur": 8.010, + "args": { + "External id": 151883,"Record function id": 0, "Sequence number": 3058948, "Fwd thread id": 1, "Ev Idx": 330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892278.864, "dur": 4.390, + "args": { + "External id": 151884,"Sequence number": 3058948, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 331 + } + }, + { + "ph": "f", "id": 34, "pid": 5714, "tid": 6744, "ts": 6303771892278.864, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892280.534, "dur": 2.480, + "args": { + "External id": 151885,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892281.384, "dur": 1.410, + "args": { + "External id": 151886,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892289.774, "dur": 105.460, + "args": { + "External id": 151887,"Record function id": 
0, "Sequence number": 3058947, "Fwd thread id": 1, "Ev Idx": 334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892292.004, "dur": 94.759, + "args": { + "External id": 151888,"Sequence number": 3058947, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 335 + } + }, + { + "ph": "f", "id": 35, "pid": 5714, "tid": 6744, "ts": 6303771892292.004, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892294.464, "dur": 13.480, + "args": { + "External id": 151889,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892295.734, "dur": 11.440, + "args": { + "External id": 151890,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892305.684, "dur": 1.090, + "args": { + "External id": 151891,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771892309.114, "dur": 40.380, + "args": { + "External id": 151892,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], 
[768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892351.584, "dur": 6.410, + "args": { + "External id": 151893,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892352.644, "dur": 4.560, + "args": { + "External id": 151894,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892355.164, "dur": 1.780, + "args": { + "External id": 151895,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892359.804, "dur": 2.480, + "args": { + "External id": 151896,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892360.514, "dur": 1.390, + "args": { + "External id": 151897,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892361.504, 
"dur": 0.260, + "args": { + "External id": 151898,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771892362.964, "dur": 22.850, + "args": { + "External id": 151899,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892404.423, "dur": 9.400, + "args": { + "External id": 151900,"Record function id": 0, "Sequence number": 3058946, "Fwd thread id": 1, "Ev Idx": 347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892406.143, "dur": 5.651, + "args": { + "External id": 151901,"Sequence number": 3058946, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 348 + } + }, + { + "ph": "f", "id": 36, "pid": 5714, "tid": 6744, "ts": 6303771892406.143, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892408.114, "dur": 3.500, + "args": { + "External id": 151902,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892409.123, "dur": 2.320, + "args": { + "External id": 
151903,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892418.694, "dur": 8.640, + "args": { + "External id": 151904,"Record function id": 0, "Sequence number": 3058945, "Fwd thread id": 1, "Ev Idx": 351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892419.954, "dur": 5.349, + "args": { + "External id": 151905,"Sequence number": 3058945, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 352 + } + }, + { + "ph": "f", "id": 37, "pid": 5714, "tid": 6744, "ts": 6303771892419.954, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892421.114, "dur": 3.960, + "args": { + "External id": 151906,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892422.163, "dur": 2.380, + "args": { + "External id": 151907,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892423.614, "dur": 0.680, + "args": { + "External id": 151908,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892432.263, "dur": 7.431, + "args": { + "External id": 151909,"Record function id": 0, "Ev Idx": 356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892434.134, "dur": 4.480, + "args": { + "External id": 151910,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771892435.783, "dur": 2.480, + "args": { + "External id": 151911,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771892436.523, "dur": 1.580, + "args": { + "External id": 151912,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892443.314, "dur": 7.069, + "args": { + "External id": 151913,"Record function id": 0, "Sequence number": 3058944, "Fwd thread id": 1, "Ev Idx": 360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892444.703, "dur": 3.920, + "args": { + "External id": 151914,"Sequence number": 3058944, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 361 + } + }, + { + "ph": "f", "id": 38, "pid": 5714, "tid": 6744, "ts": 6303771892444.703, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892446.734, "dur": 1.769, + "args": { + "External id": 151915,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892447.434, "dur": 0.900, + "args": { + "External id": 151916,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892453.703, "dur": 84.740, + "args": { + "External id": 151917,"Record function id": 0, "Sequence number": 3058943, "Fwd thread id": 1, "Ev Idx": 364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892454.894, "dur": 74.939, + "args": { + "External id": 151918,"Sequence number": 3058943, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 365 + } + }, + { + "ph": "f", "id": 39, "pid": 5714, "tid": 6744, "ts": 6303771892454.894, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892456.574, "dur": 4.440, + "args": { + "External id": 151919,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892457.194, "dur": 3.400, + "args": { + "External id": 151920,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892459.663, "dur": 0.751, + "args": { + "External id": 151921,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771892461.914, "dur": 33.389, + "args": { + "External id": 151922,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892497.233, "dur": 4.900, + "args": { + "External id": 151923,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892498.103, "dur": 3.210, + "args": { + "External id": 151924,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 371 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892499.643, "dur": 1.440, + "args": { + "External id": 151925,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892503.573, "dur": 3.750, + "args": { + "External id": 151926,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892504.283, "dur": 2.660, + "args": { + "External id": 151927,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892506.523, "dur": 0.280, + "args": { + "External id": 151928,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771892507.963, "dur": 21.010, + "args": { + "External id": 151929,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 
6303771892547.813, "dur": 32.680, + "args": { + "External id": 151930,"Record function id": 0, "Sequence number": 3058942, "Fwd thread id": 1, "Ev Idx": 377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892549.473, "dur": 5.300, + "args": { + "External id": 151931,"Sequence number": 3058942, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 378 + } + }, + { + "ph": "f", "id": 40, "pid": 5714, "tid": 6744, "ts": 6303771892549.473, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771892551.133, "dur": 3.500, + "args": { + "External id": 151932,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892552.153, "dur": 2.290, + "args": { + "External id": 151933,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771892557.543, "dur": 18.850, + "args": { + "External id": 151934,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892586.463, "dur": 11.560, + "args": { + 
"External id": 151935,"Record function id": 0, "Sequence number": 3058941, "Fwd thread id": 1, "Ev Idx": 382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892588.243, "dur": 7.410, + "args": { + "External id": 151936,"Sequence number": 3058941, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 383 + } + }, + { + "ph": "f", "id": 41, "pid": 5714, "tid": 6744, "ts": 6303771892588.243, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771892589.263, "dur": 6.110, + "args": { + "External id": 151937,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771892591.493, "dur": 2.930, + "args": { + "External id": 151938,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892593.353, "dur": 0.800, + "args": { + "External id": 151939,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892603.033, "dur": 7.560, + "args": { + "External id": 151940,"Record function id": 0, "Ev Idx": 387 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892604.873, "dur": 4.650, + "args": { + "External id": 151941,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771892606.643, "dur": 2.420, + "args": { + "External id": 151942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771892607.343, "dur": 1.530, + "args": { + "External id": 151943,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892614.183, "dur": 84.510, + "args": { + "External id": 151944,"Record function id": 0, "Sequence number": 3058940, "Fwd thread id": 1, "Ev Idx": 391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892615.293, "dur": 39.080, + "args": { + "External id": 151945,"Sequence number": 3058940, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 392 + } + }, + { + "ph": "f", "id": 42, "pid": 5714, "tid": 6744, "ts": 6303771892615.293, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892617.493, "dur": 21.430, + "args": { + "External id": 151946,"Record function id": 0, "Concrete Inputs": 
["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892640.323, "dur": 13.580, + "args": { + "External id": 151947,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771892657.423, "dur": 30.350, + "args": { + "External id": 151948,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771892691.663, "dur": 2.190, + "args": { + "External id": 151949,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892707.213, "dur": 7.670, + "args": { + "External id": 151950,"Record function id": 0, "Ev Idx": 397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771892709.673, "dur": 4.130, + "args": { + "External id": 151951,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 
6744, + "ts": 6303771892711.213, "dur": 2.160, + "args": { + "External id": 151952,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771892711.833, "dur": 1.370, + "args": { + "External id": 151953,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892718.953, "dur": 36.760, + "args": { + "External id": 151954,"Record function id": 0, "Sequence number": 3058939, "Fwd thread id": 1, "Ev Idx": 401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892720.103, "dur": 32.070, + "args": { + "External id": 151955,"Sequence number": 3058939, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 402 + } + }, + { + "ph": "f", "id": 43, "pid": 5714, "tid": 6744, "ts": 6303771892720.103, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771892722.513, "dur": 29.310, + "args": { + "External id": 151956,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771892725.023, "dur": 26.520, + "args": { + "External id": 
151957,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892728.113, "dur": 5.340, + "args": { + "External id": 151958,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771892734.433, "dur": 16.560, + "args": { + "External id": 151959,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892761.693, "dur": 57.890, + "args": { + "External id": 151960,"Record function id": 0, "Sequence number": 3058938, "Fwd thread id": 1, "Ev Idx": 407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892763.063, "dur": 35.140, + "args": { + "External id": 151961,"Sequence number": 3058938, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 408 + } + }, + { + "ph": "f", "id": 44, "pid": 5714, "tid": 6744, "ts": 6303771892763.063, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892764.843, "dur": 17.730, + "args": { + "External id": 151962,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892783.723, "dur": 14.070, + "args": { + "External id": 151963,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771892801.103, "dur": 13.950, + "args": { + "External id": 151964,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892826.382, "dur": 69.680, + "args": { + "External id": 151965,"Record function id": 0, "Sequence number": 3058937, "Fwd thread id": 1, "Ev Idx": 412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892828.062, "dur": 62.820, + "args": { + "External id": 151966,"Sequence number": 3058937, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 413 + } + }, + { + "ph": "f", "id": 45, "pid": 5714, "tid": 6744, "ts": 6303771892828.062, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 
6744, + "ts": 6303771892832.153, "dur": 24.300, + "args": { + "External id": 151967,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771892835.402, "dur": 0.571, + "args": { + "External id": 151968,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771892836.902, "dur": 0.240, + "args": { + "External id": 151969,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892858.293, "dur": 20.609, + "args": { + "External id": 151970,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892862.173, "dur": 15.809, + "args": { + "External id": 151971,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892879.733, "dur": 9.560, + "args": { + "External id": 151972,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], 
[2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771892903.162, "dur": 4.160, + "args": { + "External id": 151973,"Record function id": 0, "Sequence number": 3058936, "Fwd thread id": 1, "Ev Idx": 420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771892904.682, "dur": 0.540, + "args": { + "External id": 151974,"Sequence number": 3058936, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 421 + } + }, + { + "ph": "f", "id": 46, "pid": 5714, "tid": 6744, "ts": 6303771892904.682, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771892910.893, "dur": 42.739, + "args": { + "External id": 151975,"Record function id": 0, "Sequence number": 3058935, "Fwd thread id": 1, "Ev Idx": 422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771892911.982, "dur": 37.660, + "args": { + "External id": 151976,"Sequence number": 3058935, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 423 + } + }, + { + "ph": "f", "id": 47, "pid": 5714, "tid": 6744, "ts": 6303771892911.982, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771892916.482, "dur": 7.780, + "args": { + "External id": 151977,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], 
[]], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771892921.333, "dur": 1.360, + "args": { + "External id": 151978,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771892925.872, "dur": 23.170, + "args": { + "External id": 151979,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771892929.742, "dur": 18.330, + "args": { + "External id": 151980,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892960.122, "dur": 79.970, + "args": { + "External id": 151981,"Record function id": 0, "Sequence number": 3058934, "Fwd thread id": 1, "Ev Idx": 428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771892961.662, "dur": 60.020, + "args": { + "External id": 151982,"Sequence number": 3058934, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 429 + } + }, + { + "ph": "f", "id": 48, "pid": 5714, "tid": 6744, "ts": 6303771892961.662, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771892963.412, "dur": 27.960, + "args": { + "External id": 151983,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771892964.592, "dur": 0.330, + "args": { + "External id": 151984,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771892965.762, "dur": 0.220, + "args": { + "External id": 151985,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771892971.222, "dur": 18.950, + "args": { + "External id": 151986,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892992.462, "dur": 16.680, + "args": { + "External id": 151987,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771892994.962, "dur": 13.170, + "args": { 
+ "External id": 151988,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771893010.042, "dur": 9.780, + "args": { + "External id": 151989,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771893027.172, "dur": 9.660, + "args": { + "External id": 151990,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893049.072, "dur": 51.140, + "args": { + "External id": 151991,"Record function id": 0, "Sequence number": 3058933, "Fwd thread id": 1, "Ev Idx": 438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893050.712, "dur": 24.350, + "args": { + "External id": 151992,"Sequence number": 3058933, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 439 + } + }, + { + "ph": "f", "id": 49, "pid": 5714, "tid": 6744, "ts": 6303771893050.712, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771893053.252, "dur": 21.440, + "args": { + "External id": 151993,"Record function id": 0, "Concrete Inputs": 
["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771893054.132, "dur": 20.310, + "args": { + "External id": 151994,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893057.412, "dur": 4.820, + "args": { + "External id": 151995,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771893063.082, "dur": 10.890, + "args": { + "External id": 151996,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771893079.552, "dur": 15.620, + "args": { + "External id": 151997,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev 
Idx": 444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893107.292, "dur": 5.080, + "args": { + "External id": 151998,"Record function id": 0, "Sequence number": 3058932, "Fwd thread id": 1, "Ev Idx": 445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893108.912, "dur": 0.960, + "args": { + "External id": 151999,"Sequence number": 3058932, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 446 + } + }, + { + "ph": "f", "id": 50, "pid": 5714, "tid": 6744, "ts": 6303771893108.912, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893115.982, "dur": 11.040, + "args": { + "External id": 152000,"Record function id": 0, "Sequence number": 3058931, "Fwd thread id": 1, "Ev Idx": 447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893119.392, "dur": 5.720, + "args": { + "External id": 152001,"Sequence number": 3058931, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 448 + } + }, + { + "ph": "f", "id": 51, "pid": 5714, "tid": 6744, "ts": 6303771893119.392, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893121.092, "dur": 3.850, + "args": { + "External id": 152002,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], 
[]], "Ev Idx": 449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893121.952, "dur": 2.780, + "args": { + "External id": 152003,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893131.512, "dur": 90.640, + "args": { + "External id": 152004,"Record function id": 0, "Sequence number": 3058930, "Fwd thread id": 1, "Ev Idx": 451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893132.742, "dur": 84.770, + "args": { + "External id": 152005,"Sequence number": 3058930, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 452 + } + }, + { + "ph": "f", "id": 52, "pid": 5714, "tid": 6744, "ts": 6303771893132.742, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771893134.852, "dur": 5.240, + "args": { + "External id": 152006,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771893136.132, "dur": 3.180, + "args": { + "External id": 152007,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771893138.042, "dur": 0.970, + "args": { + "External id": 152008,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771893140.992, "dur": 38.930, + "args": { + "External id": 152009,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771893181.832, "dur": 6.420, + "args": { + "External id": 152010,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771893184.082, "dur": 3.380, + "args": { + "External id": 152011,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893185.652, "dur": 1.610, + "args": { + "External id": 152012,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771893189.862, "dur": 2.410, + "args": { + "External id": 152013,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771893190.572, "dur": 1.340, + "args": { + "External id": 152014,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893191.522, "dur": 0.250, + "args": { + "External id": 152015,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771893192.922, "dur": 23.810, + "args": { + "External id": 152016,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893230.242, "dur": 9.960, + "args": { + "External id": 152017,"Record function id": 0, "Sequence number": 3058929, "Fwd thread id": 1, "Ev Idx": 464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893232.092, "dur": 6.220, + "args": { + "External id": 152018,"Sequence number": 3058929, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 465 + } + }, + { + "ph": "f", "id": 53, "pid": 5714, "tid": 6744, 
"ts": 6303771893232.092, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893234.922, "dur": 3.220, + "args": { + "External id": 152019,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893235.962, "dur": 1.980, + "args": { + "External id": 152020,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893243.862, "dur": 7.730, + "args": { + "External id": 152021,"Record function id": 0, "Sequence number": 3058928, "Fwd thread id": 1, "Ev Idx": 468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893245.102, "dur": 4.500, + "args": { + "External id": 152022,"Sequence number": 3058928, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 469 + } + }, + { + "ph": "f", "id": 54, "pid": 5714, "tid": 6744, "ts": 6303771893245.102, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771893245.732, "dur": 3.640, + "args": { + "External id": 152023,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 
6744, + "ts": 6303771893246.612, "dur": 2.190, + "args": { + "External id": 152024,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893248.022, "dur": 0.550, + "args": { + "External id": 152025,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771893256.672, "dur": 8.580, + "args": { + "External id": 152026,"Record function id": 0, "Ev Idx": 473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771893258.412, "dur": 5.740, + "args": { + "External id": 152027,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771893260.072, "dur": 3.700, + "args": { + "External id": 152028,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771893261.862, "dur": 1.720, + "args": { + "External id": 152029,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893268.652, "dur": 6.470, + "args": { + "External id": 152030,"Record function id": 0, "Sequence number": 3058927, "Fwd thread id": 1, "Ev Idx": 477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893269.862, "dur": 3.300, + "args": { + "External id": 152031,"Sequence number": 3058927, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 478 + } + }, + { + "ph": "f", "id": 55, "pid": 5714, "tid": 6744, "ts": 6303771893269.862, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893270.872, "dur": 2.150, + "args": { + "External id": 152032,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893271.562, "dur": 1.290, + "args": { + "External id": 152033,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771893279.752, "dur": 276.369, + "args": { + "External id": 152034,"Record function id": 0, "Sequence number": 3058926, "Fwd thread id": 1, "Ev Idx": 481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771893281.392, "dur": 260.319, + "args": { + "External id": 
152035,"Sequence number": 3058926, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 482 + } + }, + { + "ph": "f", "id": 56, "pid": 5714, "tid": 6744, "ts": 6303771893281.392, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771893294.412, "dur": 17.349, + "args": { + "External id": 152036,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893304.712, "dur": 6.429, + "args": { + "External id": 152037,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771893314.412, "dur": 5.620, + "args": { + "External id": 152038,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893315.752, "dur": 4.040, + "args": { + "External id": 152039,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], 
"Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771893321.252, "dur": 5.760, + "args": { + "External id": 152040,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893322.292, "dur": 4.489, + "args": { + "External id": 152041,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771893345.621, "dur": 168.960, + "args": { + "External id": 152042,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 
2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771893427.221, "dur": 5.730, + "args": { + "External id": 152043,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771893435.141, "dur": 3.030, + "args": { + "External id": 152044,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771893526.041, "dur": 5.020, + "args": { + "External id": 152045,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771893535.361, "dur": 0.610, + "args": { + "External id": 152046,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771893538.551, "dur": 0.580, + "args": { + "External id": 152047,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 
64]], "Ev Idx": 494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771893566.621, "dur": 188.319, + "args": { + "External id": 152048,"Record function id": 0, "Sequence number": 3058925, "Fwd thread id": 1, "Ev Idx": 495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771893569.011, "dur": 178.089, + "args": { + "External id": 152049,"Sequence number": 3058925, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 496 + } + }, + { + "ph": "f", "id": 57, "pid": 5714, "tid": 6744, "ts": 6303771893569.011, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771893586.471, "dur": 34.450, + "args": { + "External id": 152050,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893589.431, "dur": 6.040, + "args": { + "External id": 152051,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771893596.551, "dur": 23.650, + "args": { + "External id": 152052,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input 
Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771893629.841, "dur": 7.860, + "args": { + "External id": 152053,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893632.671, "dur": 4.610, + "args": { + "External id": 152054,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771893766.331, "dur": 152.319, + "args": { + "External id": 152055,"Record function id": 0, "Sequence number": 3058924, "Fwd thread id": 1, "Ev Idx": 502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771893769.000, "dur": 142.530, + "args": { + "External id": 152056,"Sequence number": 3058924, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 503 + } + }, + { + "ph": "f", "id": 58, "pid": 5714, "tid": 6744, "ts": 6303771893769.000, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 
6303771893782.511, "dur": 29.649, + "args": { + "External id": 152057,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893785.871, "dur": 5.740, + "args": { + "External id": 152058,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771893792.660, "dur": 18.840, + "args": { + "External id": 152059,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771893819.840, "dur": 6.480, + "args": { + "External id": 152060,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771893821.400, "dur": 4.540, + "args": { + "External id": 152061,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], 
[]], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893929.710, "dur": 22.130, + "args": { + "External id": 152062,"Record function id": 0, "Sequence number": 3058923, "Fwd thread id": 1, "Ev Idx": 509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893932.010, "dur": 16.960, + "args": { + "External id": 152063,"Sequence number": 3058923, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 510 + } + }, + { + "ph": "f", "id": 59, "pid": 5714, "tid": 6744, "ts": 6303771893932.010, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893934.630, "dur": 14.050, + "args": { + "External id": 152064,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893944.680, "dur": 3.700, + "args": { + "External id": 152065,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893956.020, "dur": 7.280, + "args": { + "External id": 152066,"Record function id": 0, "Sequence number": 3058922, "Fwd thread id": 1, "Ev Idx": 513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893958.240, "dur": 3.130, + "args": { + "External id": 152067,"Sequence number": 3058922, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 514 + } + }, + { + "ph": "f", "id": 60, "pid": 5714, "tid": 6744, "ts": 6303771893958.240, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893959.410, "dur": 1.840, + "args": { + "External id": 152068,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893960.120, "dur": 0.960, + "args": { + "External id": 152069,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893966.670, "dur": 5.650, + "args": { + "External id": 152070,"Record function id": 0, "Sequence number": 3058921, "Fwd thread id": 1, "Ev Idx": 517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893967.820, "dur": 2.720, + "args": { + "External id": 152071,"Sequence number": 3058921, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 518 + } + }, + { + "ph": "f", "id": 61, "pid": 5714, "tid": 6744, "ts": 
6303771893967.820, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893968.830, "dur": 1.550, + "args": { + "External id": 152072,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893969.520, "dur": 0.700, + "args": { + "External id": 152073,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893975.740, "dur": 7.230, + "args": { + "External id": 152074,"Record function id": 0, "Sequence number": 3058920, "Fwd thread id": 1, "Ev Idx": 521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893977.160, "dur": 3.760, + "args": { + "External id": 152075,"Sequence number": 3058920, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 522 + } + }, + { + "ph": "f", "id": 62, "pid": 5714, "tid": 6744, "ts": 6303771893977.160, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771893977.930, "dur": 2.860, + "args": { + "External id": 152076,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 
2048, 768], []], "Ev Idx": 523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771893979.560, "dur": 1.070, + "args": { + "External id": 152077,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893986.400, "dur": 94.780, + "args": { + "External id": 152078,"Record function id": 0, "Sequence number": 3058919, "Fwd thread id": 1, "Ev Idx": 525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771893987.510, "dur": 85.880, + "args": { + "External id": 152079,"Sequence number": 3058919, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 526 + } + }, + { + "ph": "f", "id": 63, "pid": 5714, "tid": 6744, "ts": 6303771893987.510, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771893990.050, "dur": 6.070, + "args": { + "External id": 152080,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771893991.300, "dur": 4.240, + "args": { + "External id": 152081,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + 
"ts": 6303771893992.970, "dur": 2.230, + "args": { + "External id": 152082,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771893997.160, "dur": 41.600, + "args": { + "External id": 152083,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894041.740, "dur": 4.350, + "args": { + "External id": 152084,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894042.720, "dur": 2.490, + "args": { + "External id": 152085,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894044.210, "dur": 0.780, + "args": { + "External id": 152086,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894047.740, "dur": 2.570, + "args": { + "External id": 152087,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894048.570, "dur": 1.400, + "args": { + "External id": 152088,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894049.550, "dur": 0.280, + "args": { + "External id": 152089,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771894052.060, "dur": 20.480, + "args": { + "External id": 152090,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894089.590, "dur": 9.220, + "args": { + "External id": 152091,"Record function id": 0, "Sequence number": 3058918, "Fwd thread id": 1, "Ev Idx": 538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894091.440, "dur": 5.340, + "args": { + "External id": 152092,"Sequence number": 3058918, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 539 + } + }, + { + "ph": "f", "id": 64, "pid": 5714, "tid": 
6744, "ts": 6303771894091.440, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771894093.310, "dur": 3.320, + "args": { + "External id": 152093,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771894094.260, "dur": 2.180, + "args": { + "External id": 152094,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894102.390, "dur": 8.930, + "args": { + "External id": 152095,"Record function id": 0, "Sequence number": 3058917, "Fwd thread id": 1, "Ev Idx": 542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894103.490, "dur": 5.720, + "args": { + "External id": 152096,"Sequence number": 3058917, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 543 + } + }, + { + "ph": "f", "id": 65, "pid": 5714, "tid": 6744, "ts": 6303771894103.490, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894104.250, "dur": 4.760, + "args": { + "External id": 152097,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, 
"tid": 6744, + "ts": 6303771894105.190, "dur": 3.200, + "args": { + "External id": 152098,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894107.550, "dur": 0.600, + "args": { + "External id": 152099,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894116.370, "dur": 8.080, + "args": { + "External id": 152100,"Record function id": 0, "Ev Idx": 547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894118.440, "dur": 4.760, + "args": { + "External id": 152101,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771894120.190, "dur": 2.600, + "args": { + "External id": 152102,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771894120.870, "dur": 1.730, + "args": { + "External id": 152103,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 550 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894129.520, "dur": 6.350, + "args": { + "External id": 152104,"Record function id": 0, "Sequence number": 3058916, "Fwd thread id": 1, "Ev Idx": 551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894130.960, "dur": 2.950, + "args": { + "External id": 152105,"Sequence number": 3058916, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 552 + } + }, + { + "ph": "f", "id": 66, "pid": 5714, "tid": 6744, "ts": 6303771894130.960, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771894131.870, "dur": 1.900, + "args": { + "External id": 152106,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771894132.540, "dur": 1.060, + "args": { + "External id": 152107,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894139.750, "dur": 81.120, + "args": { + "External id": 152108,"Record function id": 0, "Sequence number": 3058915, "Fwd thread id": 1, "Ev Idx": 555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894141.850, "dur": 71.469, + "args": { + "External id": 
152109,"Sequence number": 3058915, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 556 + } + }, + { + "ph": "f", "id": 67, "pid": 5714, "tid": 6744, "ts": 6303771894141.850, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894143.620, "dur": 3.800, + "args": { + "External id": 152110,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894144.170, "dur": 2.870, + "args": { + "External id": 152111,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894145.360, "dur": 1.470, + "args": { + "External id": 152112,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771894148.310, "dur": 34.000, + "args": { + "External id": 152113,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894183.939, "dur": 5.140, + "args": { + "External id": 152114,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894184.779, "dur": 3.531, + "args": { + "External id": 152115,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894187.459, "dur": 0.660, + "args": { + "External id": 152116,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894190.490, "dur": 2.700, + "args": { + "External id": 152117,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894191.390, "dur": 1.360, + "args": { + "External id": 152118,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894192.350, "dur": 0.260, + "args": { + "External id": 152119,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": 
[[768, 768], [], [], []], "Ev Idx": 566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771894193.870, "dur": 18.729, + "args": { + "External id": 152120,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894228.830, "dur": 31.860, + "args": { + "External id": 152121,"Record function id": 0, "Sequence number": 3058914, "Fwd thread id": 1, "Ev Idx": 568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894230.419, "dur": 5.220, + "args": { + "External id": 152122,"Sequence number": 3058914, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 569 + } + }, + { + "ph": "f", "id": 68, "pid": 5714, "tid": 6744, "ts": 6303771894230.419, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771894232.179, "dur": 3.271, + "args": { + "External id": 152123,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771894233.190, "dur": 2.080, + "args": { + "External id": 152124,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771894238.210, "dur": 18.080, + "args": { + "External id": 152125,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894266.730, "dur": 10.329, + "args": { + "External id": 152126,"Record function id": 0, "Sequence number": 3058913, "Fwd thread id": 1, "Ev Idx": 573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894268.390, "dur": 5.980, + "args": { + "External id": 152127,"Sequence number": 3058913, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 574 + } + }, + { + "ph": "f", "id": 69, "pid": 5714, "tid": 6744, "ts": 6303771894268.390, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894269.299, "dur": 4.831, + "args": { + "External id": 152128,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894270.450, "dur": 2.789, + "args": { + "External id": 152129,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894272.270, "dur": 0.709, + "args": { + 
"External id": 152130,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894282.079, "dur": 9.680, + "args": { + "External id": 152131,"Record function id": 0, "Ev Idx": 578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894283.849, "dur": 6.790, + "args": { + "External id": 152132,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771894285.559, "dur": 4.650, + "args": { + "External id": 152133,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771894287.319, "dur": 2.690, + "args": { + "External id": 152134,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894295.429, "dur": 15.390, + "args": { + "External id": 152135,"Record function id": 0, "Sequence number": 3058912, "Fwd thread id": 1, "Ev Idx": 582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894304.299, "dur": 4.000, + "args": { + "External id": 
152136,"Sequence number": 3058912, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 583 + } + }, + { + "ph": "f", "id": 70, "pid": 5714, "tid": 6744, "ts": 6303771894304.299, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771894305.499, "dur": 2.620, + "args": { + "External id": 152137,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771894306.309, "dur": 1.580, + "args": { + "External id": 152138,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894314.729, "dur": 86.800, + "args": { + "External id": 152139,"Record function id": 0, "Sequence number": 3058911, "Fwd thread id": 1, "Ev Idx": 586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894316.219, "dur": 76.880, + "args": { + "External id": 152140,"Sequence number": 3058911, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 587 + } + }, + { + "ph": "f", "id": 71, "pid": 5714, "tid": 6744, "ts": 6303771894316.219, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 
6303771894317.959, "dur": 5.140, + "args": { + "External id": 152141,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894318.509, "dur": 4.180, + "args": { + "External id": 152142,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894321.789, "dur": 0.690, + "args": { + "External id": 152143,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771894323.859, "dur": 37.820, + "args": { + "External id": 152144,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894363.579, "dur": 4.130, + "args": { + "External id": 152145,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894364.589, "dur": 2.290, + "args": { + "External id": 152146,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input 
Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894365.979, "dur": 0.700, + "args": { + "External id": 152147,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894369.119, "dur": 2.930, + "args": { + "External id": 152148,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894369.889, "dur": 1.730, + "args": { + "External id": 152149,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894371.129, "dur": 0.340, + "args": { + "External id": 152150,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771894372.639, "dur": 19.690, + "args": { + "External id": 152151,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 598 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894409.879, "dur": 27.680, + "args": { + "External id": 152152,"Record function id": 0, "Sequence number": 3058910, "Fwd thread id": 1, "Ev Idx": 599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894411.529, "dur": 5.360, + "args": { + "External id": 152153,"Sequence number": 3058910, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 600 + } + }, + { + "ph": "f", "id": 72, "pid": 5714, "tid": 6744, "ts": 6303771894411.529, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771894413.319, "dur": 3.360, + "args": { + "External id": 152154,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771894414.269, "dur": 2.230, + "args": { + "External id": 152155,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771894419.409, "dur": 14.890, + "args": { + "External id": 152156,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894442.929, "dur": 10.830, + "args": { + "External id": 152157,"Record function id": 0, "Sequence number": 3058909, "Fwd thread id": 1, "Ev Idx": 604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894445.599, "dur": 5.950, + "args": { + "External id": 152158,"Sequence number": 3058909, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 605 + } + }, + { + "ph": "f", "id": 73, "pid": 5714, "tid": 6744, "ts": 6303771894445.599, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894446.399, "dur": 4.900, + "args": { + "External id": 152159,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894447.539, "dur": 2.860, + "args": { + "External id": 152160,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894449.279, "dur": 0.850, + "args": { + "External id": 152161,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + 
"ts": 6303771894458.759, "dur": 8.600, + "args": { + "External id": 152162,"Record function id": 0, "Ev Idx": 609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894460.549, "dur": 5.670, + "args": { + "External id": 152163,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771894462.249, "dur": 3.580, + "args": { + "External id": 152164,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771894462.939, "dur": 2.710, + "args": { + "External id": 152165,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894470.859, "dur": 69.020, + "args": { + "External id": 152166,"Record function id": 0, "Sequence number": 3058908, "Fwd thread id": 1, "Ev Idx": 613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894472.149, "dur": 31.550, + "args": { + "External id": 152167,"Sequence number": 3058908, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 614 + } + }, + { + "ph": "f", "id": 74, "pid": 5714, "tid": 6744, "ts": 6303771894472.149, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, 
+ "ts": 6303771894473.799, "dur": 17.070, + "args": { + "External id": 152168,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894492.229, "dur": 11.040, + "args": { + "External id": 152169,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771894506.519, "dur": 23.880, + "args": { + "External id": 152170,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771894533.169, "dur": 2.240, + "args": { + "External id": 152171,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894548.099, "dur": 9.070, + "args": { + "External id": 152172,"Record function id": 0, "Ev Idx": 619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771894550.369, "dur": 5.660, + "args": { + "External id": 152173,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], 
"Input Dims": [[768]], "Ev Idx": 620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771894551.849, "dur": 3.750, + "args": { + "External id": 152174,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771894553.709, "dur": 1.670, + "args": { + "External id": 152175,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894561.849, "dur": 31.770, + "args": { + "External id": 152176,"Record function id": 0, "Sequence number": 3058907, "Fwd thread id": 1, "Ev Idx": 623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894563.009, "dur": 27.130, + "args": { + "External id": 152177,"Sequence number": 3058907, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 624 + } + }, + { + "ph": "f", "id": 75, "pid": 5714, "tid": 6744, "ts": 6303771894563.009, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771894564.359, "dur": 25.400, + "args": { + "External id": 152178,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 625 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771894565.479, "dur": 23.960, + "args": { + "External id": 152179,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894568.669, "dur": 5.110, + "args": { + "External id": 152180,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771894574.929, "dur": 13.910, + "args": { + "External id": 152181,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894599.439, "dur": 53.790, + "args": { + "External id": 152182,"Record function id": 0, "Sequence number": 3058906, "Fwd thread id": 1, "Ev Idx": 629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894600.839, "dur": 28.650, + "args": { + "External id": 152183,"Sequence number": 3058906, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 630 + } + }, + { + "ph": "f", 
"id": 76, "pid": 5714, "tid": 6744, "ts": 6303771894600.839, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894602.799, "dur": 13.870, + "args": { + "External id": 152184,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894617.919, "dur": 11.170, + "args": { + "External id": 152185,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771894632.489, "dur": 15.960, + "args": { + "External id": 152186,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894660.029, "dur": 56.800, + "args": { + "External id": 152187,"Record function id": 0, "Sequence number": 3058905, "Fwd thread id": 1, "Ev Idx": 634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894661.629, "dur": 50.780, + "args": { + "External id": 152188,"Sequence number": 3058905, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 635 + } + }, + { + "ph": "f", "id": 77, "pid": 5714, "tid": 6744, "ts": 
6303771894661.629, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771894664.189, "dur": 18.169, + "args": { + "External id": 152189,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771894665.729, "dur": 0.460, + "args": { + "External id": 152190,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771894667.258, "dur": 0.211, + "args": { + "External id": 152191,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894683.698, "dur": 16.231, + "args": { + "External id": 152192,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894686.838, "dur": 12.171, + "args": { + "External id": 152193,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894700.758, "dur": 9.780, + "args": { + 
"External id": 152194,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771894723.618, "dur": 3.780, + "args": { + "External id": 152195,"Record function id": 0, "Sequence number": 3058904, "Fwd thread id": 1, "Ev Idx": 642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771894725.188, "dur": 0.400, + "args": { + "External id": 152196,"Sequence number": 3058904, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 643 + } + }, + { + "ph": "f", "id": 78, "pid": 5714, "tid": 6744, "ts": 6303771894725.188, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771894730.648, "dur": 32.970, + "args": { + "External id": 152197,"Record function id": 0, "Sequence number": 3058903, "Fwd thread id": 1, "Ev Idx": 644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771894731.758, "dur": 28.370, + "args": { + "External id": 152198,"Sequence number": 3058903, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 645 + } + }, + { + "ph": "f", "id": 79, "pid": 5714, "tid": 6744, "ts": 6303771894731.758, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771894734.038, "dur": 5.320, + "args": { + "External id": 152199,"Record function id": 
0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894736.648, "dur": 1.310, + "args": { + "External id": 152200,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771894740.238, "dur": 19.330, + "args": { + "External id": 152201,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771894742.538, "dur": 16.190, + "args": { + "External id": 152202,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894769.728, "dur": 81.190, + "args": { + "External id": 152203,"Record function id": 0, "Sequence number": 3058902, "Fwd thread id": 1, "Ev Idx": 650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894771.418, "dur": 61.580, + "args": { + "External id": 152204,"Sequence number": 3058902, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 651 + } + }, + { + 
"ph": "f", "id": 80, "pid": 5714, "tid": 6744, "ts": 6303771894771.418, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771894772.908, "dur": 27.970, + "args": { + "External id": 152205,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771894774.048, "dur": 0.330, + "args": { + "External id": 152206,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771894775.218, "dur": 0.190, + "args": { + "External id": 152207,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771894781.858, "dur": 17.760, + "args": { + "External id": 152208,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894801.898, "dur": 17.320, + "args": { + "External id": 152209,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 656 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894805.438, "dur": 12.850, + "args": { + "External id": 152210,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771894821.258, "dur": 10.010, + "args": { + "External id": 152211,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771894838.378, "dur": 9.370, + "args": { + "External id": 152212,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894858.698, "dur": 40.100, + "args": { + "External id": 152213,"Record function id": 0, "Sequence number": 3058901, "Fwd thread id": 1, "Ev Idx": 660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894860.458, "dur": 23.610, + "args": { + "External id": 152214,"Sequence number": 3058901, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 661 + } + }, + { + "ph": "f", "id": 81, "pid": 5714, "tid": 6744, "ts": 6303771894860.458, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 
5714, "tid": 6744, + "ts": 6303771894861.948, "dur": 21.800, + "args": { + "External id": 152215,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771894862.878, "dur": 20.650, + "args": { + "External id": 152216,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894866.098, "dur": 4.950, + "args": { + "External id": 152217,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771894871.928, "dur": 11.070, + "args": { + "External id": 152218,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771894888.158, "dur": 8.450, + "args": { + "External id": 152219,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", 
"c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894906.248, "dur": 4.830, + "args": { + "External id": 152220,"Record function id": 0, "Sequence number": 3058900, "Fwd thread id": 1, "Ev Idx": 667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771894907.748, "dur": 1.000, + "args": { + "External id": 152221,"Sequence number": 3058900, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 668 + } + }, + { + "ph": "f", "id": 82, "pid": 5714, "tid": 6744, "ts": 6303771894907.748, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771894915.548, "dur": 290.979, + "args": { + "External id": 152222,"Record function id": 0, "Sequence number": 3058899, "Fwd thread id": 1, "Ev Idx": 669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771894917.418, "dur": 279.919, + "args": { + "External id": 152223,"Sequence number": 3058899, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 670 + } + }, + { + "ph": "f", "id": 83, "pid": 5714, "tid": 6744, "ts": 6303771894917.418, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771894942.828, "dur": 7.300, + "args": { + "External id": 152224,"Record function 
id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771894946.458, "dur": 3.210, + "args": { + "External id": 152225,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894952.428, "dur": 4.960, + "args": { + "External id": 152226,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894953.708, "dur": 3.070, + "args": { + "External id": 152227,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894955.728, "dur": 0.790, + "args": { + "External id": 152228,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6303771894959.478, "dur": 39.130, + "args": { + "External id": 152229,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", 
""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771894961.128, "dur": 2.720, + "args": { + "External id": 152230,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771894961.618, "dur": 1.870, + "args": { + "External id": 152231,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771894962.658, "dur": 0.680, + "args": { + "External id": 152232,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6303771894964.648, "dur": 33.200, + "args": { + "External id": 152233,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771894965.528, "dur": 31.670, + "args": { + "External id": 152234,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 681 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6303771895003.028, "dur": 3.650, + "args": { + "External id": 152235,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895004.368, "dur": 2.140, + "args": { + "External id": 152236,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771895028.928, "dur": 5.600, + "args": { + "External id": 152237,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771895035.788, "dur": 4.030, + "args": { + "External id": 152238,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771895040.758, "dur": 3.770, + "args": { + "External id": 152239,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 686 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895071.368, "dur": 3.380, + "args": { + "External id": 152240,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895072.488, "dur": 1.990, + "args": { + "External id": 152241,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6303771895086.037, "dur": 96.320, + "args": { + "External id": 152242,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771895090.657, "dur": 4.711, + "args": { + "External id": 152243,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895093.348, "dur": 1.129, + "args": { + "External id": 152244,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 
6303771895096.688, "dur": 3.740, + "args": { + "External id": 152245,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895099.308, "dur": 0.520, + "args": { + "External id": 152246,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771895101.917, "dur": 2.320, + "args": { + "External id": 152247,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895103.257, "dur": 0.620, + "args": { + "External id": 152248,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771895105.208, "dur": 2.449, + "args": { + "External id": 152249,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895106.948, "dur": 0.360, + "args": 
{ + "External id": 152250,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771895110.877, "dur": 2.691, + "args": { + "External id": 152251,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895112.868, "dur": 0.380, + "args": { + "External id": 152252,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895114.528, "dur": 4.549, + "args": { + "External id": 152253,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771895117.528, "dur": 1.309, + "args": { + "External id": 152254,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 
6303771895120.108, "dur": 3.360, + "args": { + "External id": 152255,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895121.828, "dur": 1.309, + "args": { + "External id": 152256,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895124.388, "dur": 2.360, + "args": { + "External id": 152257,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895125.208, "dur": 1.369, + "args": { + "External id": 152258,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771895127.817, "dur": 41.571, + "args": { + "External id": 152259,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895172.017, "dur": 2.071, + "args": { + 
"External id": 152260,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771895175.067, "dur": 3.620, + "args": { + "External id": 152261,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895177.527, "dur": 0.540, + "args": { + "External id": 152262,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895180.777, "dur": 0.740, + "args": { + "External id": 152263,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895218.757, "dur": 10.530, + "args": { + "External id": 152264,"Record function id": 0, "Ev Idx": 711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895221.627, "dur": 6.360, + "args": { + "External id": 152265,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 
2048]], "Ev Idx": 712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771895223.817, "dur": 3.280, + "args": { + "External id": 152266,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771895224.757, "dur": 2.140, + "args": { + "External id": 152267,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895233.257, "dur": 8.040, + "args": { + "External id": 152268,"Record function id": 0, "Sequence number": 3058898, "Fwd thread id": 1, "Ev Idx": 715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895234.747, "dur": 4.380, + "args": { + "External id": 152269,"Sequence number": 3058898, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 716 + } + }, + { + "ph": "f", "id": 84, "pid": 5714, "tid": 6744, "ts": 6303771895234.747, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895236.377, "dur": 2.490, + "args": { + "External id": 152270,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895237.227, "dur": 
1.400, + "args": { + "External id": 152271,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895244.907, "dur": 99.930, + "args": { + "External id": 152272,"Record function id": 0, "Sequence number": 3058897, "Fwd thread id": 1, "Ev Idx": 719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895246.037, "dur": 90.840, + "args": { + "External id": 152273,"Sequence number": 3058897, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 720 + } + }, + { + "ph": "f", "id": 85, "pid": 5714, "tid": 6744, "ts": 6303771895246.037, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895248.347, "dur": 5.010, + "args": { + "External id": 152274,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895249.847, "dur": 2.870, + "args": { + "External id": 152275,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895251.357, "dur": 1.040, + "args": { + "External id": 152276,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 
2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771895254.337, "dur": 37.500, + "args": { + "External id": 152277,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895293.657, "dur": 11.980, + "args": { + "External id": 152278,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895294.777, "dur": 9.950, + "args": { + "External id": 152279,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895303.617, "dur": 0.810, + "args": { + "External id": 152280,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895307.287, "dur": 3.050, + "args": { + "External id": 152281,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 728 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895308.067, "dur": 1.750, + "args": { + "External id": 152282,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895309.257, "dur": 0.420, + "args": { + "External id": 152283,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771895310.987, "dur": 24.920, + "args": { + "External id": 152284,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895353.487, "dur": 9.330, + "args": { + "External id": 152285,"Record function id": 0, "Sequence number": 3058896, "Fwd thread id": 1, "Ev Idx": 732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895355.247, "dur": 5.560, + "args": { + "External id": 152286,"Sequence number": 3058896, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 733 + } + }, + { + "ph": "f", "id": 86, "pid": 5714, "tid": 6744, "ts": 6303771895355.247, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895357.097, "dur": 3.530, + "args": { + "External id": 152287,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895357.977, "dur": 2.410, + "args": { + "External id": 152288,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895366.337, "dur": 8.420, + "args": { + "External id": 152289,"Record function id": 0, "Sequence number": 3058895, "Fwd thread id": 1, "Ev Idx": 736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895367.467, "dur": 5.300, + "args": { + "External id": 152290,"Sequence number": 3058895, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 737 + } + }, + { + "ph": "f", "id": 87, "pid": 5714, "tid": 6744, "ts": 6303771895367.467, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895368.387, "dur": 4.170, + "args": { + "External id": 152291,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895369.427, "dur": 2.530, + "args": { + "External id": 152292,"Record function id": 0, 
"Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895371.027, "dur": 0.670, + "args": { + "External id": 152293,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895379.627, "dur": 7.330, + "args": { + "External id": 152294,"Record function id": 0, "Ev Idx": 741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895381.447, "dur": 4.450, + "args": { + "External id": 152295,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771895383.077, "dur": 2.470, + "args": { + "External id": 152296,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771895383.767, "dur": 1.600, + "args": { + "External id": 152297,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 
6303771895390.607, "dur": 5.960, + "args": { + "External id": 152298,"Record function id": 0, "Sequence number": 3058894, "Fwd thread id": 1, "Ev Idx": 745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895392.007, "dur": 2.720, + "args": { + "External id": 152299,"Sequence number": 3058894, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 746 + } + }, + { + "ph": "f", "id": 88, "pid": 5714, "tid": 6744, "ts": 6303771895392.007, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895393.007, "dur": 1.580, + "args": { + "External id": 152300,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895393.537, "dur": 0.880, + "args": { + "External id": 152301,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895400.087, "dur": 80.590, + "args": { + "External id": 152302,"Record function id": 0, "Sequence number": 3058893, "Fwd thread id": 1, "Ev Idx": 749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895401.287, "dur": 71.680, + "args": { + "External id": 152303,"Sequence number": 3058893, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 750 + } + }, + { + "ph": "f", "id": 89, "pid": 5714, "tid": 6744, "ts": 6303771895401.287, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895403.027, "dur": 2.770, + "args": { + "External id": 152304,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895403.647, "dur": 1.740, + "args": { + "External id": 152305,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895404.577, "dur": 0.600, + "args": { + "External id": 152306,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771895406.657, "dur": 33.170, + "args": { + "External id": 152307,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895441.567, "dur": 4.200, + "args": { + "External id": 152308,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895442.837, "dur": 2.130, + "args": { + "External id": 152309,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895444.127, "dur": 0.640, + "args": { + "External id": 152310,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895447.557, "dur": 3.410, + "args": { + "External id": 152311,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895448.297, "dur": 2.210, + "args": { + "External id": 152312,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895449.057, "dur": 1.300, + "args": { + "External id": 152313,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 760 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771895451.547, "dur": 20.600, + "args": { + "External id": 152314,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895488.807, "dur": 31.640, + "args": { + "External id": 152315,"Record function id": 0, "Sequence number": 3058892, "Fwd thread id": 1, "Ev Idx": 762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895490.607, "dur": 5.190, + "args": { + "External id": 152316,"Sequence number": 3058892, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 763 + } + }, + { + "ph": "f", "id": 90, "pid": 5714, "tid": 6744, "ts": 6303771895490.607, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771895492.277, "dur": 3.340, + "args": { + "External id": 152317,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895493.097, "dur": 2.330, + "args": { + "External id": 152318,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771895498.337, "dur": 18.100, + "args": 
{ + "External id": 152319,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895526.276, "dur": 10.171, + "args": { + "External id": 152320,"Record function id": 0, "Sequence number": 3058891, "Fwd thread id": 1, "Ev Idx": 767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895528.047, "dur": 6.249, + "args": { + "External id": 152321,"Sequence number": 3058891, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 768 + } + }, + { + "ph": "f", "id": 91, "pid": 5714, "tid": 6744, "ts": 6303771895528.047, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771895529.176, "dur": 4.871, + "args": { + "External id": 152322,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771895530.407, "dur": 2.709, + "args": { + "External id": 152323,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895532.076, "dur": 0.780, + "args": { + "External id": 152324,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", 
"[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895541.407, "dur": 7.189, + "args": { + "External id": 152325,"Record function id": 0, "Ev Idx": 772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895543.047, "dur": 4.529, + "args": { + "External id": 152326,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771895544.716, "dur": 2.431, + "args": { + "External id": 152327,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771895545.396, "dur": 1.571, + "args": { + "External id": 152328,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895553.256, "dur": 72.060, + "args": { + "External id": 152329,"Record function id": 0, "Sequence number": 3058890, "Fwd thread id": 1, "Ev Idx": 776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895554.496, "dur": 31.211, + "args": { + "External id": 152330,"Sequence number": 3058890, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 777 + } + }, + { + "ph": "f", "id": 92, "pid": 5714, "tid": 6744, "ts": 6303771895554.496, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895556.236, "dur": 16.560, + "args": { + "External id": 152331,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895574.176, "dur": 11.160, + "args": { + "External id": 152332,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771895588.467, "dur": 25.980, + "args": { + "External id": 152333,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771895617.147, "dur": 3.429, + "args": { + "External id": 152334,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895633.596, "dur": 7.820, + "args": { + "External id": 
152335,"Record function id": 0, "Ev Idx": 782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771895635.686, "dur": 4.620, + "args": { + "External id": 152336,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771895637.076, "dur": 2.780, + "args": { + "External id": 152337,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771895638.086, "dur": 1.580, + "args": { + "External id": 152338,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895644.986, "dur": 31.530, + "args": { + "External id": 152339,"Record function id": 0, "Sequence number": 3058889, "Fwd thread id": 1, "Ev Idx": 786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895646.086, "dur": 27.140, + "args": { + "External id": 152340,"Sequence number": 3058889, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 787 + } + }, + { + "ph": "f", "id": 93, "pid": 5714, "tid": 6744, "ts": 6303771895646.086, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771895647.556, "dur": 25.290, + "args": { + "External id": 152341,"Record 
function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771895648.716, "dur": 23.880, + "args": { + "External id": 152342,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895651.806, "dur": 5.120, + "args": { + "External id": 152343,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771895657.956, "dur": 14.080, + "args": { + "External id": 152344,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895682.476, "dur": 51.760, + "args": { + "External id": 152345,"Record function id": 0, "Sequence number": 3058888, "Fwd thread id": 1, "Ev Idx": 792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895683.966, "dur": 29.040, + "args": { + "External id": 152346,"Sequence number": 3058888, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 793 + } + }, + { + "ph": "f", "id": 94, "pid": 5714, "tid": 6744, "ts": 6303771895683.966, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895685.846, "dur": 14.070, + "args": { + "External id": 152347,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895701.366, "dur": 11.210, + "args": { + "External id": 152348,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771895716.126, "dur": 13.740, + "args": { + "External id": 152349,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895740.876, "dur": 55.450, + "args": { + "External id": 152350,"Record function id": 0, "Sequence number": 3058887, "Fwd thread id": 1, "Ev Idx": 797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 
6303771895742.456, "dur": 49.550, + "args": { + "External id": 152351,"Sequence number": 3058887, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 798 + } + }, + { + "ph": "f", "id": 95, "pid": 5714, "tid": 6744, "ts": 6303771895742.456, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771895744.616, "dur": 17.500, + "args": { + "External id": 152352,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771895746.136, "dur": 0.450, + "args": { + "External id": 152353,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771895747.526, "dur": 0.230, + "args": { + "External id": 152354,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895763.326, "dur": 16.770, + "args": { + "External id": 152355,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895766.256, "dur": 12.910, + "args": { + 
"External id": 152356,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895781.096, "dur": 9.340, + "args": { + "External id": 152357,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771895803.106, "dur": 3.760, + "args": { + "External id": 152358,"Record function id": 0, "Sequence number": 3058886, "Fwd thread id": 1, "Ev Idx": 805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771895804.616, "dur": 0.390, + "args": { + "External id": 152359,"Sequence number": 3058886, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 806 + } + }, + { + "ph": "f", "id": 96, "pid": 5714, "tid": 6744, "ts": 6303771895804.616, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771895810.006, "dur": 33.600, + "args": { + "External id": 152360,"Record function id": 0, "Sequence number": 3058885, "Fwd thread id": 1, "Ev Idx": 807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771895811.226, "dur": 28.890, + "args": { + "External id": 152361,"Sequence number": 3058885, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 
1]], "Ev Idx": 808 + } + }, + { + "ph": "f", "id": 97, "pid": 5714, "tid": 6744, "ts": 6303771895811.226, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771895813.456, "dur": 5.300, + "args": { + "External id": 152362,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895816.036, "dur": 1.340, + "args": { + "External id": 152363,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771895819.606, "dur": 19.970, + "args": { + "External id": 152364,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771895822.146, "dur": 16.550, + "args": { + "External id": 152365,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895849.746, "dur": 84.700, + "args": { + "External id": 152366,"Record function id": 0, "Sequence number": 3058884, "Fwd thread id": 1, "Ev Idx": 813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895851.326, "dur": 65.640, + "args": { + "External id": 152367,"Sequence number": 3058884, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 814 + } + }, + { + "ph": "f", "id": 98, "pid": 5714, "tid": 6744, "ts": 6303771895851.326, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771895852.996, "dur": 29.080, + "args": { + "External id": 152368,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771895854.136, "dur": 0.300, + "args": { + "External id": 152369,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771895855.226, "dur": 0.200, + "args": { + "External id": 152370,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771895861.546, "dur": 19.330, + "args": { + "External id": 152371,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 818 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895883.156, "dur": 16.270, + "args": { + "External id": 152372,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895885.536, "dur": 12.880, + "args": { + "External id": 152373,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771895900.226, "dur": 15.020, + "args": { + "External id": 152374,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771895922.236, "dur": 9.180, + "args": { + "External id": 152375,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895942.456, "dur": 48.190, + "args": { + "External id": 152376,"Record function id": 0, "Sequence number": 3058883, "Fwd thread id": 1, "Ev Idx": 823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895944.166, "dur": 23.550, + "args": { + "External id": 152377,"Sequence number": 3058883, "Fwd thread id": 1, "Record function 
id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 824 + } + }, + { + "ph": "f", "id": 99, "pid": 5714, "tid": 6744, "ts": 6303771895944.166, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771895945.646, "dur": 21.720, + "args": { + "External id": 152378,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771895946.526, "dur": 20.640, + "args": { + "External id": 152379,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771895949.976, "dur": 5.010, + "args": { + "External id": 152380,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771895955.776, "dur": 10.870, + "args": { + "External id": 152381,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 
1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771895972.396, "dur": 13.319, + "args": { + "External id": 152382,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771895998.835, "dur": 17.440, + "args": { + "External id": 152383,"Record function id": 0, "Sequence number": 3058882, "Fwd thread id": 1, "Ev Idx": 830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896000.566, "dur": 0.960, + "args": { + "External id": 152384,"Sequence number": 3058882, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 831 + } + }, + { + "ph": "f", "id": 100, "pid": 5714, "tid": 6744, "ts": 6303771896000.566, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771896004.106, "dur": 10.029, + "args": { + "External id": 152385,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896020.926, "dur": 10.320, + "args": { + "External id": 152386,"Record function id": 0, "Sequence number": 3058881, "Fwd thread 
id": 1, "Ev Idx": 833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896022.355, "dur": 6.911, + "args": { + "External id": 152387,"Sequence number": 3058881, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 834 + } + }, + { + "ph": "f", "id": 101, "pid": 5714, "tid": 6744, "ts": 6303771896022.355, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896024.126, "dur": 4.900, + "args": { + "External id": 152388,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896026.126, "dur": 2.669, + "args": { + "External id": 152389,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896034.835, "dur": 88.150, + "args": { + "External id": 152390,"Record function id": 0, "Sequence number": 3058880, "Fwd thread id": 1, "Ev Idx": 837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896036.046, "dur": 81.729, + "args": { + "External id": 152391,"Sequence number": 3058880, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 838 + } + }, + { + "ph": "f", "id": 102, 
"pid": 5714, "tid": 6744, "ts": 6303771896036.046, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896038.586, "dur": 5.060, + "args": { + "External id": 152392,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896039.835, "dur": 3.031, + "args": { + "External id": 152393,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896041.466, "dur": 1.049, + "args": { + "External id": 152394,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771896045.455, "dur": 37.930, + "args": { + "External id": 152395,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896085.135, "dur": 5.370, + "args": { + "External id": 152396,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 
6303771896085.955, "dur": 3.680, + "args": { + "External id": 152397,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896087.815, "dur": 1.600, + "args": { + "External id": 152398,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896092.165, "dur": 3.900, + "args": { + "External id": 152399,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896092.945, "dur": 2.720, + "args": { + "External id": 152400,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896095.275, "dur": 0.250, + "args": { + "External id": 152401,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771896096.755, "dur": 20.230, + "args": { + "External id": 152402,"Record function id": 0, "Concrete Inputs": 
["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896131.065, "dur": 8.850, + "args": { + "External id": 152403,"Record function id": 0, "Sequence number": 3058879, "Fwd thread id": 1, "Ev Idx": 850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896132.725, "dur": 5.410, + "args": { + "External id": 152404,"Sequence number": 3058879, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 851 + } + }, + { + "ph": "f", "id": 103, "pid": 5714, "tid": 6744, "ts": 6303771896132.725, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896134.645, "dur": 3.340, + "args": { + "External id": 152405,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896135.675, "dur": 2.100, + "args": { + "External id": 152406,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896143.455, "dur": 9.080, + "args": { + "External id": 152407,"Record function id": 0, "Sequence number": 3058878, "Fwd thread id": 1, "Ev Idx": 854 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896144.675, "dur": 6.030, + "args": { + "External id": 152408,"Sequence number": 3058878, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 855 + } + }, + { + "ph": "f", "id": 104, "pid": 5714, "tid": 6744, "ts": 6303771896144.675, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896145.305, "dur": 5.200, + "args": { + "External id": 152409,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896147.315, "dur": 2.570, + "args": { + "External id": 152410,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896148.875, "dur": 0.790, + "args": { + "External id": 152411,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771896157.775, "dur": 7.600, + "args": { + "External id": 152412,"Record function id": 0, "Ev Idx": 859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771896159.465, "dur": 
4.750, + "args": { + "External id": 152413,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771896161.205, "dur": 2.620, + "args": { + "External id": 152414,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771896161.885, "dur": 1.760, + "args": { + "External id": 152415,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896168.875, "dur": 6.480, + "args": { + "External id": 152416,"Record function id": 0, "Sequence number": 3058877, "Fwd thread id": 1, "Ev Idx": 863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896170.065, "dur": 3.200, + "args": { + "External id": 152417,"Sequence number": 3058877, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 864 + } + }, + { + "ph": "f", "id": 105, "pid": 5714, "tid": 6744, "ts": 6303771896170.065, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896170.905, "dur": 2.230, + "args": { + "External id": 152418,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": 
[[8, 2048, 768], []], "Ev Idx": 865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896171.655, "dur": 1.310, + "args": { + "External id": 152419,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771896180.035, "dur": 219.140, + "args": { + "External id": 152420,"Record function id": 0, "Sequence number": 3058876, "Fwd thread id": 1, "Ev Idx": 867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771896181.525, "dur": 203.340, + "args": { + "External id": 152421,"Sequence number": 3058876, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 868 + } + }, + { + "ph": "f", "id": 106, "pid": 5714, "tid": 6744, "ts": 6303771896181.525, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771896192.475, "dur": 7.900, + "args": { + "External id": 152422,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896194.455, "dur": 5.440, + "args": { + "External id": 152423,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771896202.605, "dur": 5.910, + "args": { + "External id": 152424,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896204.025, "dur": 4.250, + "args": { + "External id": 152425,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771896211.255, "dur": 5.080, + "args": { + "External id": 152426,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896212.065, "dur": 4.000, + "args": { + "External id": 152427,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 874 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771896230.485, "dur": 128.920, + "args": { + "External id": 152428,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771896280.045, "dur": 5.610, + "args": { + "External id": 152429,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771896287.125, "dur": 3.520, + "args": { + "External id": 152430,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771896371.115, 
"dur": 3.290, + "args": { + "External id": 152431,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771896378.485, "dur": 0.580, + "args": { + "External id": 152432,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771896381.695, "dur": 0.490, + "args": { + "External id": 152433,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771896409.375, "dur": 165.549, + "args": { + "External id": 152434,"Record function id": 0, "Sequence number": 3058875, "Fwd thread id": 1, "Ev Idx": 881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771896411.565, "dur": 155.729, + "args": { + "External id": 152435,"Sequence number": 3058875, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 882 + } + }, + { + "ph": "f", "id": 107, "pid": 5714, "tid": 6744, "ts": 6303771896411.565, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771896426.125, "dur": 33.909, + "args": { + "External id": 152436,"Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896429.265, "dur": 6.160, + "args": { + "External id": 152437,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771896436.565, "dur": 22.789, + "args": { + "External id": 152438,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771896468.225, "dur": 6.480, + "args": { + "External id": 152439,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896470.034, "dur": 4.220, + "args": { + "External id": 152440,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771896586.184, "dur": 152.130, + "args": { + "External id": 152441,"Record function id": 0, "Sequence number": 3058874, "Fwd thread id": 1, "Ev Idx": 888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771896588.694, "dur": 142.000, + "args": { + "External id": 152442,"Sequence number": 3058874, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 889 + } + }, + { + "ph": "f", "id": 108, "pid": 5714, "tid": 6744, "ts": 6303771896588.694, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771896601.754, "dur": 31.300, + "args": { + "External id": 152443,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896605.204, "dur": 5.840, + "args": { + "External id": 152444,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771896612.114, "dur": 20.380, + "args": { + "External id": 152445,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input 
Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771896640.624, "dur": 6.830, + "args": { + "External id": 152446,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896642.524, "dur": 4.550, + "args": { + "External id": 152447,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896749.114, "dur": 12.600, + "args": { + "External id": 152448,"Record function id": 0, "Sequence number": 3058873, "Fwd thread id": 1, "Ev Idx": 895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896751.414, "dur": 7.750, + "args": { + "External id": 152449,"Sequence number": 3058873, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 896 + } + }, + { + "ph": "f", "id": 109, "pid": 5714, "tid": 6744, "ts": 6303771896751.414, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896754.094, "dur": 4.810, + "args": { + "External id": 152450,"Record function id": 0, "Concrete 
Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896755.214, "dur": 3.460, + "args": { + "External id": 152451,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896765.694, "dur": 7.340, + "args": { + "External id": 152452,"Record function id": 0, "Sequence number": 3058872, "Fwd thread id": 1, "Ev Idx": 899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896766.834, "dur": 4.320, + "args": { + "External id": 152453,"Sequence number": 3058872, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 900 + } + }, + { + "ph": "f", "id": 110, "pid": 5714, "tid": 6744, "ts": 6303771896766.834, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896767.934, "dur": 3.040, + "args": { + "External id": 152454,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896769.874, "dur": 0.910, + "args": { + "External id": 152455,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 
768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896776.464, "dur": 6.810, + "args": { + "External id": 152456,"Record function id": 0, "Sequence number": 3058871, "Fwd thread id": 1, "Ev Idx": 903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896778.634, "dur": 2.920, + "args": { + "External id": 152457,"Sequence number": 3058871, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 904 + } + }, + { + "ph": "f", "id": 111, "pid": 5714, "tid": 6744, "ts": 6303771896778.634, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896779.734, "dur": 1.670, + "args": { + "External id": 152458,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896780.494, "dur": 0.760, + "args": { + "External id": 152459,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896787.724, "dur": 5.770, + "args": { + "External id": 152460,"Record function id": 0, "Sequence number": 
3058870, "Fwd thread id": 1, "Ev Idx": 907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896789.224, "dur": 2.370, + "args": { + "External id": 152461,"Sequence number": 3058870, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 908 + } + }, + { + "ph": "f", "id": 112, "pid": 5714, "tid": 6744, "ts": 6303771896789.224, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896790.004, "dur": 1.460, + "args": { + "External id": 152462,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896790.504, "dur": 0.800, + "args": { + "External id": 152463,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896796.924, "dur": 93.449, + "args": { + "External id": 152464,"Record function id": 0, "Sequence number": 3058869, "Fwd thread id": 1, "Ev Idx": 911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896798.094, "dur": 84.699, + "args": { + "External id": 152465,"Sequence number": 3058869, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 912 + } + }, + { + 
"ph": "f", "id": 113, "pid": 5714, "tid": 6744, "ts": 6303771896798.094, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896801.674, "dur": 6.200, + "args": { + "External id": 152466,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896802.994, "dur": 4.260, + "args": { + "External id": 152467,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896804.694, "dur": 2.240, + "args": { + "External id": 152468,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771896808.784, "dur": 41.660, + "args": { + "External id": 152469,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896852.314, "dur": 5.510, + "args": { + "External id": 152470,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, 
"tid": 6744, + "ts": 6303771896853.194, "dur": 3.720, + "args": { + "External id": 152471,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896855.954, "dur": 0.740, + "args": { + "External id": 152472,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896859.334, "dur": 2.390, + "args": { + "External id": 152473,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896860.184, "dur": 1.140, + "args": { + "External id": 152474,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896860.884, "dur": 0.300, + "args": { + "External id": 152475,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771896862.324, "dur": 19.729, + "args": { + "External id": 152476,"Record function id": 
0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896898.613, "dur": 8.760, + "args": { + "External id": 152477,"Record function id": 0, "Sequence number": 3058868, "Fwd thread id": 1, "Ev Idx": 924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896900.413, "dur": 5.300, + "args": { + "External id": 152478,"Sequence number": 3058868, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 925 + } + }, + { + "ph": "f", "id": 114, "pid": 5714, "tid": 6744, "ts": 6303771896900.413, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896902.273, "dur": 3.291, + "args": { + "External id": 152479,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896903.124, "dur": 2.249, + "args": { + "External id": 152480,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896912.104, "dur": 8.200, + "args": { + "External id": 152481,"Record function id": 0, "Sequence number": 3058867, "Fwd thread id": 1, "Ev Idx": 
928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896913.393, "dur": 4.760, + "args": { + "External id": 152482,"Sequence number": 3058867, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 929 + } + }, + { + "ph": "f", "id": 115, "pid": 5714, "tid": 6744, "ts": 6303771896913.393, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896914.084, "dur": 3.869, + "args": { + "External id": 152483,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896915.053, "dur": 2.271, + "args": { + "External id": 152484,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896916.393, "dur": 0.691, + "args": { + "External id": 152485,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771896925.504, "dur": 8.960, + "args": { + "External id": 152486,"Record function id": 0, "Ev Idx": 933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 
6303771896927.253, "dur": 5.991, + "args": { + "External id": 152487,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771896929.013, "dur": 3.800, + "args": { + "External id": 152488,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771896929.773, "dur": 2.860, + "args": { + "External id": 152489,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896938.024, "dur": 7.809, + "args": { + "External id": 152490,"Record function id": 0, "Sequence number": 3058866, "Fwd thread id": 1, "Ev Idx": 937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896939.353, "dur": 4.211, + "args": { + "External id": 152491,"Sequence number": 3058866, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 938 + } + }, + { + "ph": "f", "id": 116, "pid": 5714, "tid": 6744, "ts": 6303771896939.353, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771896941.393, "dur": 2.040, + "args": { + "External id": 152492,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771896942.144, "dur": 1.120, + "args": { + "External id": 152493,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896949.213, "dur": 81.750, + "args": { + "External id": 152494,"Record function id": 0, "Sequence number": 3058865, "Fwd thread id": 1, "Ev Idx": 941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771896950.264, "dur": 73.369, + "args": { + "External id": 152495,"Sequence number": 3058865, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 942 + } + }, + { + "ph": "f", "id": 117, "pid": 5714, "tid": 6744, "ts": 6303771896950.264, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896951.953, "dur": 3.920, + "args": { + "External id": 152496,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896952.564, "dur": 2.920, + "args": { + "External id": 152497,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896954.664, "dur": 0.609, + "args": { + "External id": 152498,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771896956.764, "dur": 34.199, + "args": { + "External id": 152499,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896992.673, "dur": 4.090, + "args": { + "External id": 152500,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896993.603, "dur": 2.380, + "args": { + "External id": 152501,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771896995.113, "dur": 0.680, + "args": { + "External id": 152502,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771896998.333, "dur": 3.530, + "args": { + "External id": 
152503,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771896999.003, "dur": 2.540, + "args": { + "External id": 152504,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897001.123, "dur": 0.280, + "args": { + "External id": 152505,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771897002.493, "dur": 20.320, + "args": { + "External id": 152506,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897039.353, "dur": 31.240, + "args": { + "External id": 152507,"Record function id": 0, "Sequence number": 3058864, "Fwd thread id": 1, "Ev Idx": 954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897041.223, "dur": 5.280, + "args": { + "External id": 152508,"Sequence number": 3058864, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 955 + } + }, + { + 
"ph": "f", "id": 118, "pid": 5714, "tid": 6744, "ts": 6303771897041.223, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897043.053, "dur": 3.250, + "args": { + "External id": 152509,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897044.073, "dur": 2.070, + "args": { + "External id": 152510,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771897049.063, "dur": 18.130, + "args": { + "External id": 152511,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897077.673, "dur": 13.330, + "args": { + "External id": 152512,"Record function id": 0, "Sequence number": 3058863, "Fwd thread id": 1, "Ev Idx": 959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897079.393, "dur": 9.230, + "args": { + "External id": 152513,"Sequence number": 3058863, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 960 + } + }, + { + "ph": "f", "id": 119, "pid": 5714, "tid": 6744, "ts": 
6303771897079.393, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897080.253, "dur": 8.120, + "args": { + "External id": 152514,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771897081.403, "dur": 5.980, + "args": { + "External id": 152515,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897085.393, "dur": 1.720, + "args": { + "External id": 152516,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771897096.063, "dur": 8.950, + "args": { + "External id": 152517,"Record function id": 0, "Ev Idx": 964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771897097.883, "dur": 6.040, + "args": { + "External id": 152518,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771897099.613, "dur": 3.880, + "args": { + "External id": 152519,"Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771897101.703, "dur": 1.550, + "args": { + "External id": 152520,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897108.543, "dur": 6.530, + "args": { + "External id": 152521,"Record function id": 0, "Sequence number": 3058862, "Fwd thread id": 1, "Ev Idx": 968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897109.843, "dur": 3.500, + "args": { + "External id": 152522,"Sequence number": 3058862, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 969 + } + }, + { + "ph": "f", "id": 120, "pid": 5714, "tid": 6744, "ts": 6303771897109.843, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897110.973, "dur": 2.220, + "args": { + "External id": 152523,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897111.713, "dur": 1.270, + "args": { + "External id": 152524,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 971 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897118.483, "dur": 82.120, + "args": { + "External id": 152525,"Record function id": 0, "Sequence number": 3058861, "Fwd thread id": 1, "Ev Idx": 972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897119.633, "dur": 72.480, + "args": { + "External id": 152526,"Sequence number": 3058861, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 973 + } + }, + { + "ph": "f", "id": 121, "pid": 5714, "tid": 6744, "ts": 6303771897119.633, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897121.323, "dur": 3.970, + "args": { + "External id": 152527,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771897121.863, "dur": 3.050, + "args": { + "External id": 152528,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897123.933, "dur": 0.730, + "args": { + "External id": 152529,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + 
"ts": 6303771897126.113, "dur": 34.350, + "args": { + "External id": 152530,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897162.093, "dur": 4.240, + "args": { + "External id": 152531,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771897163.023, "dur": 2.500, + "args": { + "External id": 152532,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897164.553, "dur": 0.760, + "args": { + "External id": 152533,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897167.893, "dur": 3.510, + "args": { + "External id": 152534,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771897169.663, "dur": 1.380, + "args": { + "External id": 152535,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input 
Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897170.673, "dur": 0.220, + "args": { + "External id": 152536,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771897172.103, "dur": 19.250, + "args": { + "External id": 152537,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897208.973, "dur": 27.410, + "args": { + "External id": 152538,"Record function id": 0, "Sequence number": 3058860, "Fwd thread id": 1, "Ev Idx": 985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897210.663, "dur": 5.430, + "args": { + "External id": 152539,"Sequence number": 3058860, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 986 + } + }, + { + "ph": "f", "id": 122, "pid": 5714, "tid": 6744, "ts": 6303771897210.663, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897212.493, "dur": 3.420, + "args": { + "External id": 152540,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": 
[[16384, 768], []], "Ev Idx": 987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897213.543, "dur": 2.200, + "args": { + "External id": 152541,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771897218.523, "dur": 14.550, + "args": { + "External id": 152542,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897241.923, "dur": 10.920, + "args": { + "External id": 152543,"Record function id": 0, "Sequence number": 3058859, "Fwd thread id": 1, "Ev Idx": 990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897243.673, "dur": 6.990, + "args": { + "External id": 152544,"Sequence number": 3058859, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 991 + } + }, + { + "ph": "f", "id": 123, "pid": 5714, "tid": 6744, "ts": 6303771897243.673, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897245.503, "dur": 4.910, + "args": { + "External id": 152545,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", 
"pid": 5714, "tid": 6744, + "ts": 6303771897246.613, "dur": 2.890, + "args": { + "External id": 152546,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897248.413, "dur": 0.840, + "args": { + "External id": 152547,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771897257.933, "dur": 8.230, + "args": { + "External id": 152548,"Record function id": 0, "Ev Idx": 995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771897259.623, "dur": 5.450, + "args": { + "External id": 152549,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771897261.293, "dur": 3.340, + "args": { + "External id": 152550,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771897261.933, "dur": 2.520, + "args": { + "External id": 152551,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 998 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897269.843, "dur": 83.929, + "args": { + "External id": 152552,"Record function id": 0, "Sequence number": 3058858, "Fwd thread id": 1, "Ev Idx": 999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897270.983, "dur": 41.090, + "args": { + "External id": 152553,"Sequence number": 3058858, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1000 + } + }, + { + "ph": "f", "id": 124, "pid": 5714, "tid": 6744, "ts": 6303771897270.983, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897272.643, "dur": 17.720, + "args": { + "External id": 152554,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897291.713, "dur": 19.830, + "args": { + "External id": 152555,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771897315.363, "dur": 26.980, + "args": { + "External id": 152556,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1003 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897346.463, "dur": 2.229, + "args": { + "External id": 152557,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771897362.832, "dur": 7.660, + "args": { + "External id": 152558,"Record function id": 0, "Ev Idx": 1005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771897365.112, "dur": 4.300, + "args": { + "External id": 152559,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771897366.703, "dur": 2.260, + "args": { + "External id": 152560,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771897367.383, "dur": 1.409, + "args": { + "External id": 152561,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897374.363, "dur": 32.549, + "args": { + "External id": 152562,"Record function id": 0, "Sequence number": 3058857, "Fwd thread id": 1, "Ev Idx": 1009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897375.623, "dur": 28.129, + "args": { + 
"External id": 152563,"Sequence number": 3058857, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1010 + } + }, + { + "ph": "f", "id": 125, "pid": 5714, "tid": 6744, "ts": 6303771897375.623, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771897377.072, "dur": 26.291, + "args": { + "External id": 152564,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771897378.952, "dur": 24.120, + "args": { + "External id": 152565,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897382.463, "dur": 5.060, + "args": { + "External id": 152566,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771897388.472, "dur": 14.111, + "args": { + "External id": 152567,"Record function id": 0, "Concrete Inputs": ["", "", 
"False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897413.132, "dur": 52.860, + "args": { + "External id": 152568,"Record function id": 0, "Sequence number": 3058856, "Fwd thread id": 1, "Ev Idx": 1015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897414.503, "dur": 28.449, + "args": { + "External id": 152569,"Sequence number": 3058856, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1016 + } + }, + { + "ph": "f", "id": 126, "pid": 5714, "tid": 6744, "ts": 6303771897414.503, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897416.472, "dur": 14.050, + "args": { + "External id": 152570,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897431.702, "dur": 10.820, + "args": { + "External id": 152571,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771897446.342, "dur": 14.790, + "args": { + "External id": 152572,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": 
["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897472.672, "dur": 66.820, + "args": { + "External id": 152573,"Record function id": 0, "Sequence number": 3058855, "Fwd thread id": 1, "Ev Idx": 1020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897474.222, "dur": 60.250, + "args": { + "External id": 152574,"Sequence number": 3058855, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1021 + } + }, + { + "ph": "f", "id": 127, "pid": 5714, "tid": 6744, "ts": 6303771897474.222, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771897476.672, "dur": 29.090, + "args": { + "External id": 152575,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771897478.152, "dur": 0.450, + "args": { + "External id": 152576,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771897489.422, "dur": 0.250, + "args": { + "External id": 152577,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], 
"Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897507.082, "dur": 15.640, + "args": { + "External id": 152578,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897510.012, "dur": 11.760, + "args": { + "External id": 152579,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897523.472, "dur": 9.270, + "args": { + "External id": 152580,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771897546.702, "dur": 3.680, + "args": { + "External id": 152581,"Record function id": 0, "Sequence number": 3058854, "Fwd thread id": 1, "Ev Idx": 1028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771897548.182, "dur": 0.420, + "args": { + "External id": 152582,"Sequence number": 3058854, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1029 + } + }, + { + "ph": "f", "id": 128, "pid": 5714, "tid": 6744, "ts": 6303771897548.182, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771897553.842, "dur": 34.990, + "args": { + "External id": 152583,"Record function id": 0, "Sequence number": 3058853, "Fwd thread id": 1, "Ev Idx": 1030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771897555.062, "dur": 29.660, + "args": { + "External id": 152584,"Sequence number": 3058853, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1031 + } + }, + { + "ph": "f", "id": 129, "pid": 5714, "tid": 6744, "ts": 6303771897555.062, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771897558.332, "dur": 5.300, + "args": { + "External id": 152585,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897560.942, "dur": 1.340, + "args": { + "External id": 152586,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771897564.512, "dur": 19.660, + "args": { + "External id": 152587,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771897566.642, "dur": 
16.680, + "args": { + "External id": 152588,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897595.082, "dur": 76.930, + "args": { + "External id": 152589,"Record function id": 0, "Sequence number": 3058852, "Fwd thread id": 1, "Ev Idx": 1036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897596.612, "dur": 58.130, + "args": { + "External id": 152590,"Sequence number": 3058852, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1037 + } + }, + { + "ph": "f", "id": 130, "pid": 5714, "tid": 6744, "ts": 6303771897596.612, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771897598.252, "dur": 26.440, + "args": { + "External id": 152591,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771897599.242, "dur": 0.330, + "args": { + "External id": 152592,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771897600.482, "dur": 0.210, + "args": { + "External id": 152593,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": 
["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771897606.012, "dur": 17.390, + "args": { + "External id": 152594,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897625.772, "dur": 16.150, + "args": { + "External id": 152595,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897628.032, "dur": 12.940, + "args": { + "External id": 152596,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771897643.792, "dur": 9.220, + "args": { + "External id": 152597,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771897659.542, "dur": 9.370, + "args": { + "External id": 152598,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 
2048, 768], [8, 2048, 768], []], "Ev Idx": 1045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897680.472, "dur": 40.210, + "args": { + "External id": 152599,"Record function id": 0, "Sequence number": 3058851, "Fwd thread id": 1, "Ev Idx": 1046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897682.362, "dur": 23.530, + "args": { + "External id": 152600,"Sequence number": 3058851, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1047 + } + }, + { + "ph": "f", "id": 131, "pid": 5714, "tid": 6744, "ts": 6303771897682.362, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771897683.832, "dur": 21.740, + "args": { + "External id": 152601,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771897684.752, "dur": 20.570, + "args": { + "External id": 152602,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897688.352, "dur": 4.670, + "args": { + "External id": 152603,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771897693.932, "dur": 10.850, + "args": { + "External id": 152604,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771897709.922, "dur": 8.740, + "args": { + "External id": 152605,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897727.122, "dur": 5.080, + "args": { + "External id": 152606,"Record function id": 0, "Sequence number": 3058850, "Fwd thread id": 1, "Ev Idx": 1053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771897728.692, "dur": 0.920, + "args": { + "External id": 152607,"Sequence number": 3058850, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1054 + } + }, + { + "ph": "f", "id": 132, "pid": 5714, "tid": 6744, "ts": 6303771897728.692, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: 
SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771897736.642, "dur": 288.819, + "args": { + "External id": 152608,"Record function id": 0, "Sequence number": 3058849, "Fwd thread id": 1, "Ev Idx": 1055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771897738.092, "dur": 277.879, + "args": { + "External id": 152609,"Sequence number": 3058849, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1056 + } + }, + { + "ph": "f", "id": 133, "pid": 5714, "tid": 6744, "ts": 6303771897738.092, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897763.272, "dur": 7.230, + "args": { + "External id": 152610,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771897766.872, "dur": 3.170, + "args": { + "External id": 152611,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 1058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897772.762, "dur": 5.520, + "args": { + "External id": 152612,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771897773.991, 
"dur": 3.691, + "args": { + "External id": 152613,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897775.711, "dur": 1.700, + "args": { + "External id": 152614,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 6744, + "ts": 6303771897780.371, "dur": 39.231, + "args": { + "External id": 152615,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 1062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771897782.122, "dur": 2.300, + "args": { + "External id": 152616,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 1063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771897782.622, "dur": 1.449, + "args": { + "External id": 152617,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897783.602, "dur": 0.309, + "args": { + "External id": 152618,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", 
"[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6303771897785.311, "dur": 33.551, + "args": { + "External id": 152619,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771897786.322, "dur": 31.800, + "args": { + "External id": 152620,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6303771897823.991, "dur": 3.560, + "args": { + "External id": 152621,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 1068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897825.282, "dur": 2.089, + "args": { + "External id": 152622,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771897848.702, "dur": 5.449, + "args": { + "External id": 152623,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771897855.522, "dur": 3.929, + "args": { + "External id": 152624,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771897860.462, "dur": 2.460, + "args": { + "External id": 152625,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897888.911, "dur": 4.440, + "args": { + "External id": 152626,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897891.001, "dur": 2.040, + "args": { + "External id": 152627,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6303771897905.741, "dur": 94.280, + "args": { + "External id": 152628,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], 
[]], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 1075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771897908.901, "dur": 5.550, + "args": { + "External id": 152629,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897911.531, "dur": 2.030, + "args": { + "External id": 152630,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771897915.711, "dur": 3.130, + "args": { + "External id": 152631,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 1078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897917.881, "dur": 0.430, + "args": { + "External id": 152632,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 1079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771897919.981, "dur": 2.030, + "args": { + "External id": 152633,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], 
"Ev Idx": 1080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897921.381, "dur": 0.290, + "args": { + "External id": 152634,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771897922.851, "dur": 2.120, + "args": { + "External id": 152635,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897924.311, "dur": 0.340, + "args": { + "External id": 152636,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 1083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771897928.121, "dur": 2.000, + "args": { + "External id": 152637,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 1084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897929.401, "dur": 0.380, + "args": { + "External id": 152638,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input 
Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 1085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897931.141, "dur": 4.730, + "args": { + "External id": 152639,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 1086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771897934.191, "dur": 1.470, + "args": { + "External id": 152640,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 1087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771897940.161, "dur": 1.990, + "args": { + "External id": 152641,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 1088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897941.591, "dur": 0.250, + "args": { + "External id": 152642,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 1089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771897944.311, "dur": 1.970, + "args": { + "External id": 152643,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": 
[[16384, 2048, 1], []], "Ev Idx": 1090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897945.051, "dur": 1.100, + "args": { + "External id": 152644,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771897947.191, "dur": 40.880, + "args": { + "External id": 152645,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 1092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897990.541, "dur": 1.760, + "args": { + "External id": 152646,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 1093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771897993.201, "dur": 3.320, + "args": { + "External id": 152647,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 1094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771897995.431, "dur": 0.480, + "args": { + "External id": 152648,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 1095 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771897998.531, "dur": 0.710, + "args": { + "External id": 152649,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 1096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898037.221, "dur": 9.740, + "args": { + "External id": 152650,"Record function id": 0, "Ev Idx": 1097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898039.711, "dur": 6.000, + "args": { + "External id": 152651,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771898041.821, "dur": 3.030, + "args": { + "External id": 152652,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771898042.631, "dur": 2.010, + "args": { + "External id": 152653,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898051.151, "dur": 8.250, + "args": { + "External id": 152654,"Record function id": 0, "Sequence number": 3058848, "Fwd thread id": 1, "Ev Idx": 1101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898052.851, "dur": 4.460, + "args": { + "External id": 152655,"Sequence number": 3058848, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1102 + } + }, + { + "ph": "f", "id": 134, "pid": 5714, "tid": 6744, "ts": 6303771898052.851, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771898054.591, "dur": 2.490, + "args": { + "External id": 152656,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771898055.461, "dur": 1.390, + "args": { + "External id": 152657,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898063.181, "dur": 198.340, + "args": { + "External id": 152658,"Record function id": 0, "Sequence number": 3058847, "Fwd thread id": 1, "Ev Idx": 1105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898064.401, "dur": 190.009, + "args": { + "External id": 152659,"Sequence number": 3058847, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1106 + } + }, + { + "ph": "f", "id": 135, "pid": 5714, "tid": 6744, "ts": 6303771898064.401, + 
"cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898066.781, "dur": 5.740, + "args": { + "External id": 152660,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898069.161, "dur": 2.710, + "args": { + "External id": 152661,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898070.701, "dur": 0.850, + "args": { + "External id": 152662,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771898073.371, "dur": 37.520, + "args": { + "External id": 152663,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898112.901, "dur": 4.730, + "args": { + "External id": 152664,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898113.891, "dur": 2.930, + 
"args": { + "External id": 152665,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898115.781, "dur": 0.790, + "args": { + "External id": 152666,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898120.421, "dur": 2.670, + "args": { + "External id": 152667,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898121.151, "dur": 1.440, + "args": { + "External id": 152668,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898122.151, "dur": 0.300, + "args": { + "External id": 152669,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771898123.751, "dur": 129.739, + "args": { + "External id": 152670,"Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898269.970, "dur": 9.671, + "args": { + "External id": 152671,"Record function id": 0, "Sequence number": 3058846, "Fwd thread id": 1, "Ev Idx": 1118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898271.721, "dur": 6.109, + "args": { + "External id": 152672,"Sequence number": 3058846, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1119 + } + }, + { + "ph": "f", "id": 136, "pid": 5714, "tid": 6744, "ts": 6303771898271.721, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771898273.481, "dur": 4.160, + "args": { + "External id": 152673,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771898274.450, "dur": 2.991, + "args": { + "External id": 152674,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898283.390, "dur": 8.700, + "args": { + "External id": 152675,"Record function id": 0, "Sequence number": 3058845, "Fwd thread id": 1, "Ev Idx": 1122 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898285.381, "dur": 4.660, + "args": { + "External id": 152676,"Sequence number": 3058845, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1123 + } + }, + { + "ph": "f", "id": 137, "pid": 5714, "tid": 6744, "ts": 6303771898285.381, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898286.101, "dur": 3.729, + "args": { + "External id": 152677,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898286.981, "dur": 2.229, + "args": { + "External id": 152678,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898288.270, "dur": 0.691, + "args": { + "External id": 152679,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898304.541, "dur": 7.680, + "args": { + "External id": 152680,"Record function id": 0, "Ev Idx": 1127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898306.501, 
"dur": 4.600, + "args": { + "External id": 152681,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771898308.250, "dur": 2.500, + "args": { + "External id": 152682,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771898308.910, "dur": 1.620, + "args": { + "External id": 152683,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898316.221, "dur": 7.559, + "args": { + "External id": 152684,"Record function id": 0, "Sequence number": 3058844, "Fwd thread id": 1, "Ev Idx": 1131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898317.510, "dur": 4.340, + "args": { + "External id": 152685,"Sequence number": 3058844, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1132 + } + }, + { + "ph": "f", "id": 138, "pid": 5714, "tid": 6744, "ts": 6303771898317.510, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771898318.710, "dur": 2.971, + "args": { + "External id": 152686,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 
1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771898320.370, "dur": 1.140, + "args": { + "External id": 152687,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898327.300, "dur": 84.430, + "args": { + "External id": 152688,"Record function id": 0, "Sequence number": 3058843, "Fwd thread id": 1, "Ev Idx": 1135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898328.460, "dur": 75.010, + "args": { + "External id": 152689,"Sequence number": 3058843, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1136 + } + }, + { + "ph": "f", "id": 139, "pid": 5714, "tid": 6744, "ts": 6303771898328.460, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898330.220, "dur": 3.230, + "args": { + "External id": 152690,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898330.890, "dur": 2.070, + "args": { + "External id": 152691,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898332.050, "dur": 0.710, + "args": { + "External id": 152692,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771898335.070, "dur": 34.970, + "args": { + "External id": 152693,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898371.870, "dur": 4.270, + "args": { + "External id": 152694,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898372.710, "dur": 2.590, + "args": { + "External id": 152695,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898374.360, "dur": 0.690, + "args": { + "External id": 152696,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898377.720, "dur": 3.220, + "args": { + 
"External id": 152697,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898378.330, "dur": 2.270, + "args": { + "External id": 152698,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898380.200, "dur": 0.260, + "args": { + "External id": 152699,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771898381.580, "dur": 21.080, + "args": { + "External id": 152700,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898419.890, "dur": 31.110, + "args": { + "External id": 152701,"Record function id": 0, "Sequence number": 3058842, "Fwd thread id": 1, "Ev Idx": 1148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898421.560, "dur": 5.020, + "args": { + "External id": 152702,"Sequence number": 3058842, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], 
"Ev Idx": 1149 + } + }, + { + "ph": "f", "id": 140, "pid": 5714, "tid": 6744, "ts": 6303771898421.560, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771898423.180, "dur": 3.260, + "args": { + "External id": 152703,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771898424.050, "dur": 2.200, + "args": { + "External id": 152704,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771898429.370, "dur": 17.940, + "args": { + "External id": 152705,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898456.860, "dur": 10.800, + "args": { + "External id": 152706,"Record function id": 0, "Sequence number": 3058841, "Fwd thread id": 1, "Ev Idx": 1153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898458.650, "dur": 6.760, + "args": { + "External id": 152707,"Sequence number": 3058841, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1154 + } + }, + { + "ph": "f", "id": 141, 
"pid": 5714, "tid": 6744, "ts": 6303771898458.650, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898460.370, "dur": 4.800, + "args": { + "External id": 152708,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898461.640, "dur": 2.700, + "args": { + "External id": 152709,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898463.250, "dur": 0.840, + "args": { + "External id": 152710,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898472.870, "dur": 7.110, + "args": { + "External id": 152711,"Record function id": 0, "Ev Idx": 1158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898474.480, "dur": 4.440, + "args": { + "External id": 152712,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771898476.180, "dur": 2.310, + "args": { + "External id": 152713,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771898476.850, "dur": 1.460, + "args": { + "External id": 152714,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898483.540, "dur": 70.090, + "args": { + "External id": 152715,"Record function id": 0, "Sequence number": 3058840, "Fwd thread id": 1, "Ev Idx": 1162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898484.870, "dur": 31.020, + "args": { + "External id": 152716,"Sequence number": 3058840, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1163 + } + }, + { + "ph": "f", "id": 142, "pid": 5714, "tid": 6744, "ts": 6303771898484.870, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898486.430, "dur": 16.390, + "args": { + "External id": 152717,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898504.130, "dur": 11.310, + "args": { + "External id": 152718,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input 
Dims": [[8, 2048, 768], [768]], "Ev Idx": 1165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771898518.680, "dur": 23.930, + "args": { + "External id": 152719,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771898546.170, "dur": 2.970, + "args": { + "External id": 152720,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898562.060, "dur": 7.520, + "args": { + "External id": 152721,"Record function id": 0, "Ev Idx": 1168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771898564.210, "dur": 4.240, + "args": { + "External id": 152722,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771898565.730, "dur": 2.270, + "args": { + "External id": 152723,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771898566.400, "dur": 1.420, + "args": { + "External id": 152724,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[1]], "Input Dims": [[768]], "Ev Idx": 1171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898573.310, "dur": 32.230, + "args": { + "External id": 152725,"Record function id": 0, "Sequence number": 3058839, "Fwd thread id": 1, "Ev Idx": 1172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898574.530, "dur": 27.640, + "args": { + "External id": 152726,"Sequence number": 3058839, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1173 + } + }, + { + "ph": "f", "id": 143, "pid": 5714, "tid": 6744, "ts": 6303771898574.530, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771898575.920, "dur": 25.870, + "args": { + "External id": 152727,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771898577.830, "dur": 23.670, + "args": { + "External id": 152728,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898580.400, "dur": 5.340, + "args": { + "External id": 152729,"Record 
function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771898586.700, "dur": 14.250, + "args": { + "External id": 152730,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898611.540, "dur": 51.600, + "args": { + "External id": 152731,"Record function id": 0, "Sequence number": 3058838, "Fwd thread id": 1, "Ev Idx": 1178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898612.910, "dur": 28.610, + "args": { + "External id": 152732,"Sequence number": 3058838, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1179 + } + }, + { + "ph": "f", "id": 144, "pid": 5714, "tid": 6744, "ts": 6303771898612.910, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898614.790, "dur": 14.140, + "args": { + "External id": 152733,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898630.190, "dur": 
10.910, + "args": { + "External id": 152734,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771898644.500, "dur": 13.960, + "args": { + "External id": 152735,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898669.810, "dur": 57.510, + "args": { + "External id": 152736,"Record function id": 0, "Sequence number": 3058837, "Fwd thread id": 1, "Ev Idx": 1183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898671.560, "dur": 51.300, + "args": { + "External id": 152737,"Sequence number": 3058837, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1184 + } + }, + { + "ph": "f", "id": 145, "pid": 5714, "tid": 6744, "ts": 6303771898671.560, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771898674.480, "dur": 17.529, + "args": { + "External id": 152738,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771898676.100, "dur": 0.449, + "args": { + "External id": 152739,"Record function id": 0, 
"Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771898677.589, "dur": 0.220, + "args": { + "External id": 152740,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898693.229, "dur": 17.591, + "args": { + "External id": 152741,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898696.400, "dur": 13.400, + "args": { + "External id": 152742,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898711.649, "dur": 9.431, + "args": { + "External id": 152743,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771898734.349, "dur": 4.591, + "args": { + "External id": 152744,"Record function id": 0, "Sequence number": 3058836, "Fwd thread id": 1, "Ev Idx": 1191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 
6744, + "ts": 6303771898736.729, "dur": 0.431, + "args": { + "External id": 152745,"Sequence number": 3058836, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1192 + } + }, + { + "ph": "f", "id": 146, "pid": 5714, "tid": 6744, "ts": 6303771898736.729, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771898742.249, "dur": 33.210, + "args": { + "External id": 152746,"Record function id": 0, "Sequence number": 3058835, "Fwd thread id": 1, "Ev Idx": 1193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771898743.309, "dur": 28.660, + "args": { + "External id": 152747,"Sequence number": 3058835, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1194 + } + }, + { + "ph": "f", "id": 147, "pid": 5714, "tid": 6744, "ts": 6303771898743.309, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771898745.420, "dur": 4.980, + "args": { + "External id": 152748,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898747.620, "dur": 1.369, + "args": { + "External id": 152749,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], 
[], [], []], "Ev Idx": 1196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771898751.300, "dur": 20.060, + "args": { + "External id": 152750,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771898753.500, "dur": 16.929, + "args": { + "External id": 152751,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898781.609, "dur": 80.110, + "args": { + "External id": 152752,"Record function id": 0, "Sequence number": 3058834, "Fwd thread id": 1, "Ev Idx": 1199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898783.139, "dur": 60.870, + "args": { + "External id": 152753,"Sequence number": 3058834, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1200 + } + }, + { + "ph": "f", "id": 148, "pid": 5714, "tid": 6744, "ts": 6303771898783.139, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771898784.659, "dur": 29.190, + "args": { + "External id": 152754,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 
6303771898785.739, "dur": 0.320, + "args": { + "External id": 152755,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771898787.069, "dur": 0.170, + "args": { + "External id": 152756,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771898792.889, "dur": 19.730, + "args": { + "External id": 152757,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898815.869, "dur": 16.040, + "args": { + "External id": 152758,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898818.119, "dur": 12.710, + "args": { + "External id": 152759,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771898832.709, "dur": 9.530, + "args": { + "External id": 152760,"Record function id": 0, "Concrete Inputs": ["", ""], 
"Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771898849.239, "dur": 9.340, + "args": { + "External id": 152761,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898869.639, "dur": 47.290, + "args": { + "External id": 152762,"Record function id": 0, "Sequence number": 3058833, "Fwd thread id": 1, "Ev Idx": 1209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898871.489, "dur": 22.900, + "args": { + "External id": 152763,"Sequence number": 3058833, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1210 + } + }, + { + "ph": "f", "id": 149, "pid": 5714, "tid": 6744, "ts": 6303771898871.489, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771898872.939, "dur": 21.130, + "args": { + "External id": 152764,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771898873.839, "dur": 19.980, + "args": { + 
"External id": 152765,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771898876.819, "dur": 4.880, + "args": { + "External id": 152766,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771898882.589, "dur": 10.690, + "args": { + "External id": 152767,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771898898.829, "dur": 13.030, + "args": { + "External id": 152768,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898924.099, "dur": 17.910, + "args": { + "External id": 152769,"Record function id": 0, "Sequence number": 3058832, "Fwd thread id": 1, "Ev Idx": 1216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 
6303771898925.879, "dur": 0.960, + "args": { + "External id": 152770,"Sequence number": 3058832, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1217 + } + }, + { + "ph": "f", "id": 150, "pid": 5714, "tid": 6744, "ts": 6303771898925.879, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771898929.389, "dur": 10.390, + "args": { + "External id": 152771,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898946.839, "dur": 10.100, + "args": { + "External id": 152772,"Record function id": 0, "Sequence number": 3058831, "Fwd thread id": 1, "Ev Idx": 1219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898948.269, "dur": 6.480, + "args": { + "External id": 152773,"Sequence number": 3058831, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1220 + } + }, + { + "ph": "f", "id": 151, "pid": 5714, "tid": 6744, "ts": 6303771898948.269, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771898949.869, "dur": 4.640, + "args": { + "External id": 152774,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": 
[[8, 2048, 768], []], "Ev Idx": 1221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771898951.599, "dur": 2.690, + "args": { + "External id": 152775,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898960.569, "dur": 85.770, + "args": { + "External id": 152776,"Record function id": 0, "Sequence number": 3058830, "Fwd thread id": 1, "Ev Idx": 1223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771898961.759, "dur": 79.310, + "args": { + "External id": 152777,"Sequence number": 3058830, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1224 + } + }, + { + "ph": "f", "id": 152, "pid": 5714, "tid": 6744, "ts": 6303771898961.759, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771898963.919, "dur": 4.870, + "args": { + "External id": 152778,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771898965.229, "dur": 2.780, + "args": { + "External id": 152779,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 6744, + "ts": 6303771898966.599, "dur": 1.060, + "args": { + "External id": 152780,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771898970.659, "dur": 38.640, + "args": { + "External id": 152781,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899011.019, "dur": 4.030, + "args": { + "External id": 152782,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899011.879, "dur": 2.320, + "args": { + "External id": 152783,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899013.349, "dur": 0.660, + "args": { + "External id": 152784,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899016.639, "dur": 3.100, + "args": { + "External id": 152785,"Record function id": 
0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899017.329, "dur": 2.060, + "args": { + "External id": 152786,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899019.009, "dur": 0.230, + "args": { + "External id": 152787,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771899020.369, "dur": 20.010, + "args": { + "External id": 152788,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899054.179, "dur": 9.420, + "args": { + "External id": 152789,"Record function id": 0, "Sequence number": 3058829, "Fwd thread id": 1, "Ev Idx": 1236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899055.859, "dur": 5.670, + "args": { + "External id": 152790,"Sequence number": 3058829, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1237 + } + }, + { + "ph": "f", "id": 153, 
"pid": 5714, "tid": 6744, "ts": 6303771899055.859, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899057.529, "dur": 3.800, + "args": { + "External id": 152791,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899059.159, "dur": 1.990, + "args": { + "External id": 152792,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899067.249, "dur": 8.430, + "args": { + "External id": 152793,"Record function id": 0, "Sequence number": 3058828, "Fwd thread id": 1, "Ev Idx": 1240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899068.439, "dur": 5.400, + "args": { + "External id": 152794,"Sequence number": 3058828, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1241 + } + }, + { + "ph": "f", "id": 154, "pid": 5714, "tid": 6744, "ts": 6303771899068.439, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899069.109, "dur": 4.530, + "args": { + "External id": 152795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899070.999, "dur": 2.050, + "args": { + "External id": 152796,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899072.229, "dur": 0.580, + "args": { + "External id": 152797,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771899080.749, "dur": 7.550, + "args": { + "External id": 152798,"Record function id": 0, "Ev Idx": 1245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771899082.449, "dur": 4.710, + "args": { + "External id": 152799,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771899084.229, "dur": 2.530, + "args": { + "External id": 152800,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771899084.909, "dur": 1.670, + "args": { + "External id": 152801,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1248 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899091.839, "dur": 6.180, + "args": { + "External id": 152802,"Record function id": 0, "Sequence number": 3058827, "Fwd thread id": 1, "Ev Idx": 1249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899092.989, "dur": 3.170, + "args": { + "External id": 152803,"Sequence number": 3058827, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1250 + } + }, + { + "ph": "f", "id": 155, "pid": 5714, "tid": 6744, "ts": 6303771899092.989, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899093.969, "dur": 2.070, + "args": { + "External id": 152804,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899094.589, "dur": 1.290, + "args": { + "External id": 152805,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771899102.309, "dur": 211.919, + "args": { + "External id": 152806,"Record function id": 0, "Sequence number": 3058826, "Fwd thread id": 1, "Ev Idx": 1253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 
6303771899103.709, "dur": 188.139, + "args": { + "External id": 152807,"Sequence number": 3058826, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1254 + } + }, + { + "ph": "f", "id": 156, "pid": 5714, "tid": 6744, "ts": 6303771899103.709, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771899113.889, "dur": 8.510, + "args": { + "External id": 152808,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899115.739, "dur": 6.140, + "args": { + "External id": 152809,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771899124.659, "dur": 4.309, + "args": { + "External id": 152810,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899125.539, "dur": 3.180, + "args": { + "External id": 152811,"Record function id": 0, "Concrete Inputs": 
["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771899130.128, "dur": 4.280, + "args": { + "External id": 152812,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899130.939, "dur": 3.149, + "args": { + "External id": 152813,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771899148.328, "dur": 118.580, + "args": { + "External id": 152814,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], 
"Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771899198.068, "dur": 5.571, + "args": { + "External id": 152815,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771899205.139, "dur": 2.980, + "args": { + "External id": 152816,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771899277.998, "dur": 3.270, + "args": { + "External id": 152817,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771899285.698, "dur": 0.530, + "args": { + "External id": 152818,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771899288.818, "dur": 0.450, + "args": { + "External id": 152819,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771899324.578, "dur": 162.620, + "args": { + "External id": 152820,"Record function id": 0, "Sequence number": 3058825, "Fwd thread id": 1, "Ev Idx": 1267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771899326.688, "dur": 152.820, + "args": { + "External id": 152821,"Sequence number": 3058825, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1268 + } + }, + { + "ph": "f", "id": 157, "pid": 5714, "tid": 6744, "ts": 6303771899326.688, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771899341.008, "dur": 32.440, + "args": { + "External id": 152822,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899344.168, "dur": 6.210, + "args": { + "External id": 152823,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771899351.578, "dur": 21.260, + "args": { + "External id": 152824,"Record function id": 0, "Concrete 
Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771899381.378, "dur": 6.380, + "args": { + "External id": 152825,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899383.018, "dur": 4.330, + "args": { + "External id": 152826,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771899498.398, "dur": 152.769, + "args": { + "External id": 152827,"Record function id": 0, "Sequence number": 3058824, "Fwd thread id": 1, "Ev Idx": 1274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771899500.998, "dur": 142.460, + "args": { + "External id": 152828,"Sequence number": 3058824, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1275 + } + }, + { + "ph": "f", "id": 158, "pid": 5714, "tid": 6744, "ts": 6303771899500.998, + "cat": "fwdbwd", "name": "fwdbwd", "bp": 
"e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771899515.588, "dur": 30.880, + "args": { + "External id": 152829,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899518.538, "dur": 5.940, + "args": { + "External id": 152830,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771899525.588, "dur": 20.270, + "args": { + "External id": 152831,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771899554.368, "dur": 6.250, + "args": { + "External id": 152832,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899555.868, "dur": 4.330, + "args": { + "External id": 152833,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input 
type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899662.218, "dur": 12.569, + "args": { + "External id": 152834,"Record function id": 0, "Sequence number": 3058823, "Fwd thread id": 1, "Ev Idx": 1281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899664.487, "dur": 7.580, + "args": { + "External id": 152835,"Sequence number": 3058823, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1282 + } + }, + { + "ph": "f", "id": 159, "pid": 5714, "tid": 6744, "ts": 6303771899664.487, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899667.018, "dur": 4.780, + "args": { + "External id": 152836,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899668.007, "dur": 3.531, + "args": { + "External id": 152837,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899678.827, "dur": 6.380, + "args": { + "External id": 152838,"Record function id": 0, 
"Sequence number": 3058822, "Fwd thread id": 1, "Ev Idx": 1285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899680.947, "dur": 2.530, + "args": { + "External id": 152839,"Sequence number": 3058822, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1286 + } + }, + { + "ph": "f", "id": 160, "pid": 5714, "tid": 6744, "ts": 6303771899680.947, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899681.817, "dur": 1.550, + "args": { + "External id": 152840,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899682.377, "dur": 0.820, + "args": { + "External id": 152841,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899688.707, "dur": 5.110, + "args": { + "External id": 152842,"Record function id": 0, "Sequence number": 3058821, "Fwd thread id": 1, "Ev Idx": 1289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899689.787, "dur": 2.310, + "args": { + "External id": 152843,"Sequence number": 3058821, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], 
"Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1290 + } + }, + { + "ph": "f", "id": 161, "pid": 5714, "tid": 6744, "ts": 6303771899689.787, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899690.617, "dur": 1.320, + "args": { + "External id": 152844,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899691.127, "dur": 0.650, + "args": { + "External id": 152845,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899697.157, "dur": 6.210, + "args": { + "External id": 152846,"Record function id": 0, "Sequence number": 3058820, "Fwd thread id": 1, "Ev Idx": 1293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899698.267, "dur": 3.120, + "args": { + "External id": 152847,"Sequence number": 3058820, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1294 + } + }, + { + "ph": "f", "id": 162, "pid": 5714, "tid": 6744, "ts": 6303771899698.267, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899699.017, "dur": 2.210, + "args": { + "External id": 152848,"Record function id": 0, "Concrete Inputs": ["", 
"[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899700.317, "dur": 0.740, + "args": { + "External id": 152849,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899706.797, "dur": 91.680, + "args": { + "External id": 152850,"Record function id": 0, "Sequence number": 3058819, "Fwd thread id": 1, "Ev Idx": 1297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899707.927, "dur": 83.120, + "args": { + "External id": 152851,"Sequence number": 3058819, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1298 + } + }, + { + "ph": "f", "id": 163, "pid": 5714, "tid": 6744, "ts": 6303771899707.927, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899710.807, "dur": 5.620, + "args": { + "External id": 152852,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899712.077, "dur": 3.800, + "args": { + "External id": 152853,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 
768], [], []], "Ev Idx": 1300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899713.617, "dur": 1.950, + "args": { + "External id": 152854,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771899717.417, "dur": 41.040, + "args": { + "External id": 152855,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899761.127, "dur": 3.980, + "args": { + "External id": 152856,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899762.067, "dur": 2.210, + "args": { + "External id": 152857,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899763.427, "dur": 0.650, + "args": { + "External id": 152858,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 
5714, "tid": 6744, + "ts": 6303771899766.567, "dur": 2.190, + "args": { + "External id": 152859,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899767.217, "dur": 1.200, + "args": { + "External id": 152860,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899768.017, "dur": 0.260, + "args": { + "External id": 152861,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771899770.157, "dur": 20.080, + "args": { + "External id": 152862,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899806.997, "dur": 8.840, + "args": { + "External id": 152863,"Record function id": 0, "Sequence number": 3058818, "Fwd thread id": 1, "Ev Idx": 1310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899808.757, "dur": 5.280, + "args": { + "External id": 152864,"Sequence number": 3058818, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1311 + } + }, + { + "ph": "f", "id": 164, "pid": 5714, "tid": 6744, "ts": 6303771899808.757, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899810.537, "dur": 3.340, + "args": { + "External id": 152865,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899811.467, "dur": 2.210, + "args": { + "External id": 152866,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899819.607, "dur": 8.520, + "args": { + "External id": 152867,"Record function id": 0, "Sequence number": 3058817, "Fwd thread id": 1, "Ev Idx": 1314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899820.727, "dur": 5.420, + "args": { + "External id": 152868,"Sequence number": 3058817, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1315 + } + }, + { + "ph": "f", "id": 165, "pid": 5714, "tid": 6744, "ts": 6303771899820.727, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899821.567, "dur": 4.340, + "args": { + "External id": 152869,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], 
"Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899822.457, "dur": 2.850, + "args": { + "External id": 152870,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899824.497, "dur": 0.550, + "args": { + "External id": 152871,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771899833.257, "dur": 8.900, + "args": { + "External id": 152872,"Record function id": 0, "Ev Idx": 1319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771899835.047, "dur": 5.780, + "args": { + "External id": 152873,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771899836.857, "dur": 3.530, + "args": { + "External id": 152874,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771899837.597, "dur": 2.630, + "args": { + "External id": 152875,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899845.917, "dur": 6.310, + "args": { + "External id": 152876,"Record function id": 0, "Sequence number": 3058816, "Fwd thread id": 1, "Ev Idx": 1323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899847.097, "dur": 3.230, + "args": { + "External id": 152877,"Sequence number": 3058816, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1324 + } + }, + { + "ph": "f", "id": 166, "pid": 5714, "tid": 6744, "ts": 6303771899847.097, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899848.137, "dur": 2.020, + "args": { + "External id": 152878,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899848.757, "dur": 1.230, + "args": { + "External id": 152879,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899855.847, "dur": 78.980, + "args": { + "External id": 152880,"Record function id": 0, "Sequence number": 3058815, "Fwd thread id": 1, "Ev Idx": 1327 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899856.987, "dur": 70.840, + "args": { + "External id": 152881,"Sequence number": 3058815, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1328 + } + }, + { + "ph": "f", "id": 167, "pid": 5714, "tid": 6744, "ts": 6303771899856.987, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899859.347, "dur": 2.730, + "args": { + "External id": 152882,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899859.907, "dur": 1.810, + "args": { + "External id": 152883,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899860.937, "dur": 0.580, + "args": { + "External id": 152884,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771899862.837, "dur": 33.610, + "args": { + "External id": 152885,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1332 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899898.217, "dur": 4.790, + "args": { + "External id": 152886,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899899.067, "dur": 3.160, + "args": { + "External id": 152887,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899901.367, "dur": 0.660, + "args": { + "External id": 152888,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899904.547, "dur": 2.350, + "args": { + "External id": 152889,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899905.317, "dur": 1.240, + "args": { + "External id": 152890,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899906.167, "dur": 0.240, + "args": { + "External id": 152891,"Record function id": 0, "Concrete 
Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771899907.647, "dur": 19.320, + "args": { + "External id": 152892,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899943.567, "dur": 31.960, + "args": { + "External id": 152893,"Record function id": 0, "Sequence number": 3058814, "Fwd thread id": 1, "Ev Idx": 1340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899945.407, "dur": 6.180, + "args": { + "External id": 152894,"Sequence number": 3058814, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1341 + } + }, + { + "ph": "f", "id": 168, "pid": 5714, "tid": 6744, "ts": 6303771899945.407, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771899947.207, "dur": 4.240, + "args": { + "External id": 152895,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771899949.027, "dur": 2.240, + "args": { + "External id": 152896,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771899954.257, "dur": 17.900, + "args": { + "External id": 152897,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899981.667, "dur": 11.500, + "args": { + "External id": 152898,"Record function id": 0, "Sequence number": 3058813, "Fwd thread id": 1, "Ev Idx": 1345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771899983.277, "dur": 7.610, + "args": { + "External id": 152899,"Sequence number": 3058813, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1346 + } + }, + { + "ph": "f", "id": 169, "pid": 5714, "tid": 6744, "ts": 6303771899983.277, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771899984.087, "dur": 6.550, + "args": { + "External id": 152900,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771899985.277, "dur": 4.540, + "args": { + "External id": 152901,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], 
[]], "Ev Idx": 1348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771899987.977, "dur": 1.570, + "args": { + "External id": 152902,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771899998.117, "dur": 7.290, + "args": { + "External id": 152903,"Record function id": 0, "Ev Idx": 1350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771899999.837, "dur": 4.500, + "args": { + "External id": 152904,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771900001.467, "dur": 2.420, + "args": { + "External id": 152905,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771900002.157, "dur": 1.560, + "args": { + "External id": 152906,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900009.117, "dur": 6.900, + "args": { + "External id": 152907,"Record function id": 0, "Sequence number": 3058812, "Fwd thread id": 1, "Ev Idx": 1354 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900010.437, "dur": 3.610, + "args": { + "External id": 152908,"Sequence number": 3058812, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1355 + } + }, + { + "ph": "f", "id": 170, "pid": 5714, "tid": 6744, "ts": 6303771900010.437, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900011.547, "dur": 2.350, + "args": { + "External id": 152909,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900012.317, "dur": 1.390, + "args": { + "External id": 152910,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900019.537, "dur": 87.900, + "args": { + "External id": 152911,"Record function id": 0, "Sequence number": 3058811, "Fwd thread id": 1, "Ev Idx": 1358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900020.657, "dur": 78.900, + "args": { + "External id": 152912,"Sequence number": 3058811, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1359 + } + }, + { + "ph": "f", "id": 171, "pid": 5714, "tid": 6744, 
"ts": 6303771900020.657, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900023.466, "dur": 2.911, + "args": { + "External id": 152913,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900024.086, "dur": 1.900, + "args": { + "External id": 152914,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900025.146, "dur": 0.640, + "args": { + "External id": 152915,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771900027.246, "dur": 34.571, + "args": { + "External id": 152916,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900063.566, "dur": 4.840, + "args": { + "External id": 152917,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900064.457, 
"dur": 3.109, + "args": { + "External id": 152918,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900066.686, "dur": 0.660, + "args": { + "External id": 152919,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900069.917, "dur": 3.489, + "args": { + "External id": 152920,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900070.666, "dur": 1.940, + "args": { + "External id": 152921,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900071.686, "dur": 0.780, + "args": { + "External id": 152922,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771900075.026, "dur": 23.691, + "args": { + "External id": 152923,"Record function id": 0, "Concrete Inputs": ["", ""], 
"Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900115.737, "dur": 27.099, + "args": { + "External id": 152924,"Record function id": 0, "Sequence number": 3058810, "Fwd thread id": 1, "Ev Idx": 1371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900117.566, "dur": 5.331, + "args": { + "External id": 152925,"Sequence number": 3058810, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1372 + } + }, + { + "ph": "f", "id": 172, "pid": 5714, "tid": 6744, "ts": 6303771900117.566, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900119.297, "dur": 3.429, + "args": { + "External id": 152926,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900120.266, "dur": 2.271, + "args": { + "External id": 152927,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771900125.356, "dur": 14.150, + "args": { + "External id": 152928,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 
768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900148.906, "dur": 9.670, + "args": { + "External id": 152929,"Record function id": 0, "Sequence number": 3058809, "Fwd thread id": 1, "Ev Idx": 1376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900150.516, "dur": 5.920, + "args": { + "External id": 152930,"Sequence number": 3058809, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1377 + } + }, + { + "ph": "f", "id": 173, "pid": 5714, "tid": 6744, "ts": 6303771900150.516, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900151.326, "dur": 4.870, + "args": { + "External id": 152931,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900152.376, "dur": 2.940, + "args": { + "External id": 152932,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900154.196, "dur": 0.860, + "args": { + "External id": 152933,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 
1380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771900163.446, "dur": 8.050, + "args": { + "External id": 152934,"Record function id": 0, "Ev Idx": 1381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771900165.106, "dur": 5.300, + "args": { + "External id": 152935,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771900166.736, "dur": 3.200, + "args": { + "External id": 152936,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771900167.426, "dur": 2.350, + "args": { + "External id": 152937,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900175.076, "dur": 71.410, + "args": { + "External id": 152938,"Record function id": 0, "Sequence number": 3058808, "Fwd thread id": 1, "Ev Idx": 1385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900177.296, "dur": 32.160, + "args": { + "External id": 152939,"Sequence number": 3058808, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1386 + } + }, + { + "ph": "f", "id": 174, "pid": 5714, "tid": 
6744, "ts": 6303771900177.296, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900178.796, "dur": 17.600, + "args": { + "External id": 152940,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900197.776, "dur": 11.220, + "args": { + "External id": 152941,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771900212.466, "dur": 24.700, + "args": { + "External id": 152942,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900239.946, "dur": 2.190, + "args": { + "External id": 152943,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771900254.836, "dur": 7.810, + "args": { + "External id": 152944,"Record function id": 0, "Ev Idx": 1391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 
6303771900257.066, "dur": 4.450, + "args": { + "External id": 152945,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771900258.456, "dur": 2.550, + "args": { + "External id": 152946,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771900259.306, "dur": 1.520, + "args": { + "External id": 152947,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900266.396, "dur": 39.240, + "args": { + "External id": 152948,"Record function id": 0, "Sequence number": 3058807, "Fwd thread id": 1, "Ev Idx": 1395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900267.566, "dur": 26.930, + "args": { + "External id": 152949,"Sequence number": 3058807, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1396 + } + }, + { + "ph": "f", "id": 175, "pid": 5714, "tid": 6744, "ts": 6303771900267.566, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771900269.046, "dur": 25.080, + "args": { + "External id": 152950,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], 
"Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771900270.096, "dur": 23.740, + "args": { + "External id": 152951,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900273.006, "dur": 5.200, + "args": { + "External id": 152952,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771900279.306, "dur": 13.990, + "args": { + "External id": 152953,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900312.286, "dur": 55.030, + "args": { + "External id": 152954,"Record function id": 0, "Sequence number": 3058806, "Fwd thread id": 1, "Ev Idx": 1401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900313.916, "dur": 31.360, + "args": { + "External id": 152955,"Sequence number": 3058806, "Fwd thread id": 1, "Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1402 + } + }, + { + "ph": "f", "id": 176, "pid": 5714, "tid": 6744, "ts": 6303771900313.916, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900315.936, "dur": 15.450, + "args": { + "External id": 152956,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900332.926, "dur": 11.870, + "args": { + "External id": 152957,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771900348.346, "dur": 14.190, + "args": { + "External id": 152958,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900374.086, "dur": 57.700, + "args": { + "External id": 152959,"Record function id": 0, "Sequence number": 3058805, "Fwd thread id": 1, "Ev Idx": 1406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900375.666, "dur": 51.330, + "args": { + "External id": 152960,"Sequence number": 3058805, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1407 + } + }, + { + "ph": "f", "id": 177, "pid": 5714, "tid": 6744, "ts": 6303771900375.666, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771900378.976, "dur": 18.380, + "args": { + "External id": 152961,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771900380.576, "dur": 0.400, + "args": { + "External id": 152962,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771900382.096, "dur": 0.220, + "args": { + "External id": 152963,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900398.506, "dur": 16.400, + "args": { + "External id": 152964,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900401.656, "dur": 12.220, + "args": { + "External id": 152965,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": 
[[8, 2048, 1], []], "Ev Idx": 1412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900415.716, "dur": 9.490, + "args": { + "External id": 152966,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771900438.556, "dur": 3.630, + "args": { + "External id": 152967,"Record function id": 0, "Sequence number": 3058804, "Fwd thread id": 1, "Ev Idx": 1414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771900440.066, "dur": 0.400, + "args": { + "External id": 152968,"Sequence number": 3058804, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1415 + } + }, + { + "ph": "f", "id": 178, "pid": 5714, "tid": 6744, "ts": 6303771900440.066, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771900445.526, "dur": 32.950, + "args": { + "External id": 152969,"Record function id": 0, "Sequence number": 3058803, "Fwd thread id": 1, "Ev Idx": 1416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771900446.616, "dur": 28.209, + "args": { + "External id": 152970,"Sequence number": 3058803, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1417 + } + }, + { + "ph": "f", "id": 179, "pid": 5714, "tid": 6744, "ts": 6303771900446.616, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771900448.886, "dur": 5.230, + "args": { + "External id": 152971,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900451.256, "dur": 1.380, + "args": { + "External id": 152972,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771900455.006, "dur": 19.150, + "args": { + "External id": 152973,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771900457.336, "dur": 15.949, + "args": { + "External id": 152974,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900484.685, "dur": 78.480, + "args": { + "External id": 152975,"Record function id": 0, "Sequence number": 3058802, "Fwd thread id": 1, "Ev Idx": 1422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900486.196, "dur": 59.500, + "args": { + "External id": 152976,"Sequence number": 3058802, "Fwd 
thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1423 + } + }, + { + "ph": "f", "id": 180, "pid": 5714, "tid": 6744, "ts": 6303771900486.196, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771900487.665, "dur": 27.460, + "args": { + "External id": 152977,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771900488.805, "dur": 0.340, + "args": { + "External id": 152978,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771900490.956, "dur": 0.180, + "args": { + "External id": 152979,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771900496.265, "dur": 17.620, + "args": { + "External id": 152980,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900516.176, "dur": 16.200, + "args": { + "External 
id": 152981,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900518.665, "dur": 12.700, + "args": { + "External id": 152982,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771900533.176, "dur": 10.789, + "args": { + "External id": 152983,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771900551.056, "dur": 9.260, + "args": { + "External id": 152984,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900571.225, "dur": 40.000, + "args": { + "External id": 152985,"Record function id": 0, "Sequence number": 3058801, "Fwd thread id": 1, "Ev Idx": 1432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900573.065, "dur": 23.210, + "args": { + "External id": 152986,"Sequence number": 3058801, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 
1433 + } + }, + { + "ph": "f", "id": 181, "pid": 5714, "tid": 6744, "ts": 6303771900573.065, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771900574.485, "dur": 21.430, + "args": { + "External id": 152987,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771900575.385, "dur": 20.270, + "args": { + "External id": 152988,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900578.485, "dur": 4.920, + "args": { + "External id": 152989,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771900584.305, "dur": 10.860, + "args": { + "External id": 152990,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771900600.435, "dur": 8.590, + "args": { + "External id": 152991,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900617.595, "dur": 5.940, + "args": { + "External id": 152992,"Record function id": 0, "Sequence number": 3058800, "Fwd thread id": 1, "Ev Idx": 1439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900619.865, "dur": 0.930, + "args": { + "External id": 152993,"Sequence number": 3058800, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1440 + } + }, + { + "ph": "f", "id": 182, "pid": 5714, "tid": 6744, "ts": 6303771900619.865, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771900627.775, "dur": 277.650, + "args": { + "External id": 152994,"Record function id": 0, "Sequence number": 3058799, "Fwd thread id": 1, "Ev Idx": 1441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771900629.175, "dur": 267.900, + "args": { + "External id": 152995,"Sequence number": 3058799, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[6291456, 3072, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1442 + } + }, + { + "ph": "f", "id": 183, "pid": 5714, "tid": 6744, "ts": 6303771900629.175, + "cat": 
"fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900652.595, "dur": 7.120, + "args": { + "External id": 152996,"Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771900656.095, "dur": 3.160, + "args": { + "External id": 152997,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]", "[3072, 1]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[6291456, 3072, 1], [], []], "Input Dims": [[8, 2048, 768], [], []], "Ev Idx": 1444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900661.975, "dur": 5.220, + "args": { + "External id": 152998,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900663.215, "dur": 3.390, + "args": { + "External id": 152999,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900665.525, "dur": 0.790, + "args": { + "External id": 153000,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::linear", "pid": 5714, "tid": 6744, + "ts": 6303771900669.305, "dur": 37.640, + "args": { + "External id": 153001,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[3072, 1], [1, 2048], []], "Input Dims": [[16384, 768], [2048, 768], []], "Ev Idx": 1448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900669.835, "dur": 2.160, + "args": { + "External id": 153002,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 2048]], "Input Dims": [[2048, 768]], "Ev Idx": 1449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900670.325, "dur": 1.310, + "args": { + "External id": 153003,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 2048], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900671.165, "dur": 0.290, + "args": { + "External id": 153004,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[2048, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 2048], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 6744, + "ts": 6303771900672.775, "dur": 33.450, + "args": { + "External id": 153005,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771900673.665, "dur": 31.800, + "args": { + "External id": 
153006,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[3072, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 1453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 6744, + "ts": 6303771900712.255, "dur": 3.520, + "args": { + "External id": 153007,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [4194304, 2048, 1]], "Input Dims": [[16384, 2048], [8, 2048, 2048]], "Ev Idx": 1454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900713.455, "dur": 2.130, + "args": { + "External id": 153008,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771900736.035, "dur": 5.640, + "args": { + "External id": 153009,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771900742.965, "dur": 3.850, + "args": { + "External id": 153010,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771900747.705, "dur": 3.210, + "args": { + "External id": 153011,"Record function id": 0, 
"Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900777.135, "dur": 3.360, + "args": { + "External id": 153012,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900778.015, "dur": 2.210, + "args": { + "External id": 153013,"Record function id": 0, "Concrete Inputs": ["", "[-1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::einsum", "pid": 5714, "tid": 6744, + "ts": 6303771900791.825, "dur": 90.400, + "args": { + "External id": 153014,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["", "TensorList", ""], "Input Strides": [[], [[3072, 1], [2048, 1]], []], "Input Dims": [[], [[16384, 768], [16384, 2048]], []], "Ev Idx": 1461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771900795.875, "dur": 4.280, + "args": { + "External id": 153015,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900798.245, "dur": 1.040, + "args": { + "External id": 153016,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768, 1]", "[3072, 1, 1]", ""], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771900801.715, "dur": 3.280, + "args": { + "External id": 153017,"Record function id": 0, "Concrete Inputs": ["", "[1, 2, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[3072, 1, 1], []], "Input Dims": [[16384, 768, 1], []], "Ev Idx": 1464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900803.995, "dur": 0.410, + "args": { + "External id": 153018,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 16384]", "[1, 1, 3072]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[3072, 1, 1], [], [], []], "Input Dims": [[16384, 768, 1], [], [], []], "Ev Idx": 1465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 6744, + "ts": 6303771900806.435, "dur": 1.670, + "args": { + "External id": 153019,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 1466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900807.485, "dur": 0.290, + "args": { + "External id": 153020,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771900808.955, "dur": 2.090, + "args": { + "External id": 153021,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900810.385, "dur": 0.310, + "args": { + "External id": 153022,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048, 16384]", "[1, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[16384, 2048, 1], [], [], []], "Ev Idx": 1469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771900814.265, "dur": 2.070, + "args": { + "External id": 153023,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 3072], []], "Input Dims": [[768, 1, 16384], []], "Ev Idx": 1470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900815.585, "dur": 0.400, + "args": { + "External id": 153024,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384, 1]", "[1, 3072, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 3072], [], [], []], "Input Dims": [[768, 1, 16384], [], [], []], "Ev Idx": 1471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900817.255, "dur": 5.420, + "args": { + "External id": 153025,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 3072, 1], []], "Input Dims": [[768, 16384, 1], []], "Ev Idx": 1472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_reshape_alias", "pid": 5714, "tid": 6744, + "ts": 6303771900820.195, "dur": 2.250, + "args": { + "External id": 153026,"Record function id": 0, "Concrete Inputs": ["", "[1, 768, 16384]", "[768, 1, 
3072]"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList"], "Input Strides": [[1, 3072, 1], [], []], "Input Dims": [[768, 16384, 1], [], []], "Ev Idx": 1473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771900823.635, "dur": 2.100, + "args": { + "External id": 153027,"Record function id": 0, "Concrete Inputs": ["", "[2, 1, 0]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1, 1, 2048], []], "Input Dims": [[1, 2048, 16384], []], "Ev Idx": 1474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900825.135, "dur": 0.290, + "args": { + "External id": 153028,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048, 1]", "[2048, 1, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1, 2048], [], [], []], "Input Dims": [[1, 2048, 16384], [], [], []], "Ev Idx": 1475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900826.525, "dur": 2.090, + "args": { + "External id": 153029,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900827.215, "dur": 1.230, + "args": { + "External id": 153030,"Record function id": 0, "Concrete Inputs": ["", "[1, 16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[16384, 2048, 1], []], "Ev Idx": 1477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771900829.665, "dur": 40.670, + "args": { + "External id": 153031,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[768, 1, 3072], [33554432, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048]], "Ev Idx": 1478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900872.825, "dur": 1.900, + "args": { + "External id": 153032,"Record function id": 0, "Concrete Inputs": ["", "[768, 1, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 2048, 1], []], "Input Dims": [[1, 768, 2048], []], "Ev Idx": 1479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::permute", "pid": 5714, "tid": 6744, + "ts": 6303771900875.605, "dur": 3.010, + "args": { + "External id": 153033,"Record function id": 0, "Concrete Inputs": ["", "[0, 2, 1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 2048, 1], []], "Input Dims": [[768, 1, 2048], []], "Ev Idx": 1480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900877.665, "dur": 0.440, + "args": { + "External id": 153034,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 2048, 1], [], [], []], "Input Dims": [[768, 1, 2048], [], [], []], "Ev Idx": 1481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900880.745, "dur": 0.680, + "args": { + "External id": 153035,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1, 2048], []], "Input Dims": [[768, 2048, 1], []], "Ev Idx": 1482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771900917.385, "dur": 9.890, + "args": { + "External id": 153036,"Record function id": 0, "Ev Idx": 1483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771900919.735, "dur": 6.340, + "args": { + "External id": 153037,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771900921.885, "dur": 3.299, + "args": { + "External id": 153038,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771900922.825, "dur": 2.159, + "args": { + "External id": 153039,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900931.275, "dur": 9.700, + "args": { + "External id": 153040,"Record function id": 0, "Sequence number": 3058798, "Fwd thread id": 1, "Ev Idx": 1487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900932.555, "dur": 6.520, + "args": { + "External id": 153041,"Sequence number": 3058798, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1488 + } + }, + { + "ph": "f", "id": 184, "pid": 5714, "tid": 6744, "ts": 6303771900932.555, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771900936.484, "dur": 2.340, + "args": { + "External id": 153042,"Record function id": 0, "Concrete Inputs": ["", "[16384, 
2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771900937.255, "dur": 1.369, + "args": { + "External id": 153043,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900944.715, "dur": 89.329, + "args": { + "External id": 153044,"Record function id": 0, "Sequence number": 3058797, "Fwd thread id": 1, "Ev Idx": 1491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771900945.875, "dur": 80.819, + "args": { + "External id": 153045,"Sequence number": 3058797, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1492 + } + }, + { + "ph": "f", "id": 185, "pid": 5714, "tid": 6744, "ts": 6303771900945.875, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900949.015, "dur": 4.400, + "args": { + "External id": 153046,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900950.295, "dur": 2.480, + "args": { + "External id": 153047,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": 
[[16384, 2048], [], []], "Ev Idx": 1494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900951.724, "dur": 0.740, + "args": { + "External id": 153048,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771900954.304, "dur": 37.360, + "args": { + "External id": 153049,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768]], "Ev Idx": 1496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771900993.484, "dur": 5.351, + "args": { + "External id": 153050,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771900994.615, "dur": 3.409, + "args": { + "External id": 153051,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771900997.135, "dur": 0.640, + "args": { + "External id": 153052,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901000.464, "dur": 3.091, + "args": { + "External id": 153053,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901001.155, "dur": 2.029, + "args": { + "External id": 153054,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901002.135, "dur": 0.900, + "args": { + "External id": 153055,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771901004.195, "dur": 21.579, + "args": { + "External id": 153056,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901042.544, "dur": 9.050, + "args": { + "External id": 153057,"Record function id": 0, "Sequence number": 3058796, "Fwd thread id": 1, "Ev Idx": 1504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901044.324, "dur": 5.480, + "args": { + "External id": 153058,"Sequence number": 3058796, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1505 + } + }, + { + "ph": "f", "id": 186, "pid": 5714, "tid": 6744, "ts": 6303771901044.324, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771901046.134, "dur": 3.500, + "args": { + "External id": 153059,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901047.064, "dur": 2.360, + "args": { + "External id": 153060,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901055.154, "dur": 8.100, + "args": { + "External id": 153061,"Record function id": 0, "Sequence number": 3058795, "Fwd thread id": 1, "Ev Idx": 1508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901056.414, "dur": 4.920, + "args": { + "External id": 153062,"Sequence number": 3058795, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1509 + } + }, + { + "ph": "f", "id": 187, "pid": 5714, "tid": 6744, "ts": 6303771901056.414, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901057.314, "dur": 3.810, + "args": { + "External id": 153063,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901058.264, "dur": 2.250, + "args": { + "External id": 153064,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901059.574, "dur": 0.680, + "args": { + "External id": 153065,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901068.014, "dur": 7.210, + "args": { + "External id": 153066,"Record function id": 0, "Ev Idx": 1513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901069.784, "dur": 4.390, + "args": { + "External id": 153067,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771901071.514, "dur": 2.300, + "args": { + "External id": 153068,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771901072.154, "dur": 1.490, + "args": { + "External id": 153069,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901078.804, "dur": 6.550, + "args": { + "External id": 153070,"Record function id": 0, "Sequence number": 3058794, "Fwd thread id": 1, "Ev Idx": 1517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901080.054, "dur": 3.570, + "args": { + "External id": 153071,"Sequence number": 3058794, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[4194304, 2048, 1]], "Input Dims": [[8, 2048, 2048]], "Ev Idx": 1518 + } + }, + { + "ph": "f", "id": 188, "pid": 5714, "tid": 6744, "ts": 6303771901080.054, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771901081.824, "dur": 1.670, + "args": { + "External id": 153072,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901082.434, "dur": 0.880, + "args": { + "External id": 153073,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 1520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901088.684, "dur": 80.480, + "args": { + "External id": 153074,"Record function id": 0, "Sequence number": 3058793, "Fwd 
thread id": 1, "Ev Idx": 1521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901089.794, "dur": 71.560, + "args": { + "External id": 153075,"Sequence number": 3058793, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1522 + } + }, + { + "ph": "f", "id": 189, "pid": 5714, "tid": 6744, "ts": 6303771901089.794, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901091.524, "dur": 3.510, + "args": { + "External id": 153076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[16384, 2048]], "Ev Idx": 1523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901092.134, "dur": 2.480, + "args": { + "External id": 153077,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[16384, 2048], [], []], "Ev Idx": 1524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901093.854, "dur": 0.580, + "args": { + "External id": 153078,"Record function id": 0, "Concrete Inputs": ["", "[2048, 16384]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[16384, 2048], [], [], []], "Ev Idx": 1525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771901095.804, "dur": 33.290, + "args": { + "External id": 153079,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1]], "Input Dims": [[2048, 
16384], [16384, 768]], "Ev Idx": 1526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901130.834, "dur": 3.970, + "args": { + "External id": 153080,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901131.714, "dur": 2.340, + "args": { + "External id": 153081,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 1528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901133.194, "dur": 0.660, + "args": { + "External id": 153082,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 1529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901136.274, "dur": 3.130, + "args": { + "External id": 153083,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901136.864, "dur": 2.200, + "args": { + "External id": 153084,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901137.844, "dur": 1.060, + "args": { + 
"External id": 153085,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771901140.084, "dur": 20.370, + "args": { + "External id": 153086,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 1533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901177.644, "dur": 40.000, + "args": { + "External id": 153087,"Record function id": 0, "Sequence number": 3058792, "Fwd thread id": 1, "Ev Idx": 1534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901179.374, "dur": 13.810, + "args": { + "External id": 153088,"Sequence number": 3058792, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1535 + } + }, + { + "ph": "f", "id": 190, "pid": 5714, "tid": 6744, "ts": 6303771901179.374, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771901189.484, "dur": 3.530, + "args": { + "External id": 153089,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901190.534, "dur": 2.270, + "args": { + "External id": 153090,"Record function id": 
0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771901195.964, "dur": 17.740, + "args": { + "External id": 153091,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901223.864, "dur": 10.210, + "args": { + "External id": 153092,"Record function id": 0, "Sequence number": 3058791, "Fwd thread id": 1, "Ev Idx": 1539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901225.534, "dur": 6.290, + "args": { + "External id": 153093,"Sequence number": 3058791, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1540 + } + }, + { + "ph": "f", "id": 191, "pid": 5714, "tid": 6744, "ts": 6303771901225.534, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901226.444, "dur": 5.140, + "args": { + "External id": 153094,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 2048]], "Ev Idx": 1541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901227.774, "dur": 2.950, + "args": { + "External id": 153095,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], 
"Input Strides": [[1, 768], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 1542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901229.524, "dur": 0.930, + "args": { + "External id": 153096,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 1543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901239.044, "dur": 8.130, + "args": { + "External id": 153097,"Record function id": 0, "Ev Idx": 1544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901240.694, "dur": 5.400, + "args": { + "External id": 153098,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771901242.324, "dur": 3.350, + "args": { + "External id": 153099,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771901243.794, "dur": 1.650, + "args": { + "External id": 153100,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901250.664, "dur": 81.250, + "args": { + "External id": 153101,"Record function id": 0, 
"Sequence number": 3058790, "Fwd thread id": 1, "Ev Idx": 1548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901251.744, "dur": 31.220, + "args": { + "External id": 153102,"Sequence number": 3058790, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1549 + } + }, + { + "ph": "f", "id": 192, "pid": 5714, "tid": 6744, "ts": 6303771901251.744, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901253.264, "dur": 16.420, + "args": { + "External id": 153103,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901271.134, "dur": 11.380, + "args": { + "External id": 153104,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771901285.814, "dur": 34.810, + "args": { + "External id": 153105,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901323.854, "dur": 3.360, + "args": { + "External id": 153106,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901341.084, "dur": 8.300, + "args": { + "External id": 153107,"Record function id": 0, "Ev Idx": 1554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901343.384, "dur": 4.780, + "args": { + "External id": 153108,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771901344.984, "dur": 2.730, + "args": { + "External id": 153109,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771901345.914, "dur": 1.600, + "args": { + "External id": 153110,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901353.224, "dur": 33.170, + "args": { + "External id": 153111,"Record function id": 0, "Sequence number": 3058789, "Fwd thread id": 1, "Ev Idx": 1558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901354.384, "dur": 28.039, + "args": { + "External id": 153112,"Sequence number": 3058789, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": 
[[8, 2048, 768]], "Ev Idx": 1559 + } + }, + { + "ph": "f", "id": 193, "pid": 5714, "tid": 6744, "ts": 6303771901354.384, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771901355.814, "dur": 26.229, + "args": { + "External id": 153113,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771901356.854, "dur": 24.929, + "args": { + "External id": 153114,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901359.634, "dur": 5.340, + "args": { + "External id": 153115,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771901366.714, "dur": 14.469, + "args": { + "External id": 153116,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1563 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901392.343, "dur": 53.031, + "args": { + "External id": 153117,"Record function id": 0, "Sequence number": 3058788, "Fwd thread id": 1, "Ev Idx": 1564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901393.763, "dur": 29.400, + "args": { + "External id": 153118,"Sequence number": 3058788, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1565 + } + }, + { + "ph": "f", "id": 194, "pid": 5714, "tid": 6744, "ts": 6303771901393.763, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901395.654, "dur": 14.669, + "args": { + "External id": 153119,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901411.743, "dur": 10.980, + "args": { + "External id": 153120,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771901426.203, "dur": 14.851, + "args": { + "External id": 153121,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901452.614, "dur": 56.439, + "args": { + "External id": 153122,"Record function id": 0, "Sequence number": 3058787, "Fwd thread id": 1, "Ev Idx": 1569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901454.254, "dur": 50.519, + "args": { + "External id": 153123,"Sequence number": 3058787, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1570 + } + }, + { + "ph": "f", "id": 195, "pid": 5714, "tid": 6744, "ts": 6303771901454.254, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771901456.603, "dur": 17.490, + "args": { + "External id": 153124,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771901458.114, "dur": 0.429, + "args": { + "External id": 153125,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771901459.523, "dur": 0.200, + "args": { + "External id": 153126,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901475.303, "dur": 16.760, + 
"args": { + "External id": 153127,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901478.363, "dur": 12.740, + "args": { + "External id": 153128,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901492.843, "dur": 10.000, + "args": { + "External id": 153129,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771901515.953, "dur": 3.590, + "args": { + "External id": 153130,"Record function id": 0, "Sequence number": 3058786, "Fwd thread id": 1, "Ev Idx": 1577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771901517.533, "dur": 0.410, + "args": { + "External id": 153131,"Sequence number": 3058786, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1578 + } + }, + { + "ph": "f", "id": 196, "pid": 5714, "tid": 6744, "ts": 6303771901517.533, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771901522.723, "dur": 32.730, + "args": { + "External id": 153132,"Record function id": 0, "Sequence number": 3058785, "Fwd thread id": 1, 
"Ev Idx": 1579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771901523.803, "dur": 28.220, + "args": { + "External id": 153133,"Sequence number": 3058785, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1580 + } + }, + { + "ph": "f", "id": 197, "pid": 5714, "tid": 6744, "ts": 6303771901523.803, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771901525.983, "dur": 5.090, + "args": { + "External id": 153134,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901528.283, "dur": 1.400, + "args": { + "External id": 153135,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771901531.923, "dur": 19.520, + "args": { + "External id": 153136,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771901534.293, "dur": 16.260, + "args": { + "External id": 153137,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 
768], []], "Ev Idx": 1584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901561.553, "dur": 81.090, + "args": { + "External id": 153138,"Record function id": 0, "Sequence number": 3058784, "Fwd thread id": 1, "Ev Idx": 1585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901564.083, "dur": 61.260, + "args": { + "External id": 153139,"Sequence number": 3058784, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1586 + } + }, + { + "ph": "f", "id": 198, "pid": 5714, "tid": 6744, "ts": 6303771901564.083, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771901565.573, "dur": 29.070, + "args": { + "External id": 153140,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771901566.653, "dur": 0.330, + "args": { + "External id": 153141,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771901567.813, "dur": 0.160, + "args": { + "External id": 153142,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1589 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771901573.473, "dur": 19.890, + "args": { + "External id": 153143,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901595.793, "dur": 16.560, + "args": { + "External id": 153144,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901598.323, "dur": 13.090, + "args": { + "External id": 153145,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771901613.133, "dur": 10.380, + "args": { + "External id": 153146,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771901630.313, "dur": 9.070, + "args": { + "External id": 153147,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 
6303771901650.463, "dur": 49.530, + "args": { + "External id": 153148,"Record function id": 0, "Sequence number": 3058783, "Fwd thread id": 1, "Ev Idx": 1595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901652.213, "dur": 23.980, + "args": { + "External id": 153149,"Sequence number": 3058783, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1596 + } + }, + { + "ph": "f", "id": 199, "pid": 5714, "tid": 6744, "ts": 6303771901652.213, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771901653.653, "dur": 22.240, + "args": { + "External id": 153150,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771901655.473, "dur": 20.200, + "args": { + "External id": 153151,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901658.763, "dur": 4.800, + "args": { + "External id": 153152,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], 
[], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771901664.443, "dur": 10.690, + "args": { + "External id": 153153,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771901680.483, "dur": 13.330, + "args": { + "External id": 153154,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[6291456, 3072, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901707.323, "dur": 17.580, + "args": { + "External id": 153155,"Record function id": 0, "Sequence number": 3058782, "Fwd thread id": 1, "Ev Idx": 1602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "AddBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901708.993, "dur": 0.960, + "args": { + "External id": 153156,"Sequence number": 3058782, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1603 + } + }, + { + "ph": "f", "id": 200, "pid": 5714, "tid": 6744, "ts": 6303771901708.993, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771901712.393, "dur": 10.290, + "args": { + "External id": 153157,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], 
"Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901729.693, "dur": 9.080, + "args": { + "External id": 153158,"Record function id": 0, "Sequence number": 3058781, "Fwd thread id": 1, "Ev Idx": 1605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901731.053, "dur": 5.670, + "args": { + "External id": 153159,"Sequence number": 3058781, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1606 + } + }, + { + "ph": "f", "id": 201, "pid": 5714, "tid": 6744, "ts": 6303771901731.053, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771901732.783, "dur": 3.720, + "args": { + "External id": 153160,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901733.623, "dur": 2.660, + "args": { + "External id": 153161,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901742.483, "dur": 86.750, + "args": { + "External id": 153162,"Record function id": 0, "Sequence number": 3058780, "Fwd thread id": 1, "Ev Idx": 1609 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901743.563, "dur": 79.970, + "args": { + "External id": 153163,"Sequence number": 3058780, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1610 + } + }, + { + "ph": "f", "id": 202, "pid": 5714, "tid": 6744, "ts": 6303771901743.563, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901746.253, "dur": 5.890, + "args": { + "External id": 153164,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901748.513, "dur": 2.830, + "args": { + "External id": 153165,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901750.003, "dur": 1.020, + "args": { + "External id": 153166,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771901753.083, "dur": 38.370, + "args": { + "External id": 153167,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1614 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901793.313, "dur": 4.170, + "args": { + "External id": 153168,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901794.193, "dur": 2.470, + "args": { + "External id": 153169,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901795.633, "dur": 0.780, + "args": { + "External id": 153170,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901798.963, "dur": 3.370, + "args": { + "External id": 153171,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901800.643, "dur": 1.260, + "args": { + "External id": 153172,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901801.443, "dur": 0.320, + "args": { + "External id": 153173,"Record function id": 0, 
"Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771901802.973, "dur": 19.680, + "args": { + "External id": 153174,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901837.322, "dur": 8.771, + "args": { + "External id": 153175,"Record function id": 0, "Sequence number": 3058779, "Fwd thread id": 1, "Ev Idx": 1622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901839.053, "dur": 5.049, + "args": { + "External id": 153176,"Sequence number": 3058779, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1623 + } + }, + { + "ph": "f", "id": 203, "pid": 5714, "tid": 6744, "ts": 6303771901839.053, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771901840.762, "dur": 3.180, + "args": { + "External id": 153177,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901841.673, "dur": 2.089, + "args": { + "External id": 153178,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901849.613, "dur": 8.360, + "args": { + "External id": 153179,"Record function id": 0, "Sequence number": 3058778, "Fwd thread id": 1, "Ev Idx": 1626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901850.733, "dur": 5.489, + "args": { + "External id": 153180,"Sequence number": 3058778, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1627 + } + }, + { + "ph": "f", "id": 204, "pid": 5714, "tid": 6744, "ts": 6303771901850.733, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771901851.422, "dur": 4.580, + "args": { + "External id": 153181,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771901852.362, "dur": 3.080, + "args": { + "External id": 153182,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901854.642, "dur": 0.560, + "args": { + "External id": 153183,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], 
[], [], []], "Ev Idx": 1630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901863.142, "dur": 7.531, + "args": { + "External id": 153184,"Record function id": 0, "Ev Idx": 1631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771901864.742, "dur": 4.780, + "args": { + "External id": 153185,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771901866.453, "dur": 2.660, + "args": { + "External id": 153186,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771901867.153, "dur": 1.769, + "args": { + "External id": 153187,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901874.333, "dur": 6.049, + "args": { + "External id": 153188,"Record function id": 0, "Sequence number": 3058777, "Fwd thread id": 1, "Ev Idx": 1635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771901875.453, "dur": 3.169, + "args": { + "External id": 153189,"Sequence number": 3058777, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1636 + } + }, + { + "ph": "f", "id": 
205, "pid": 5714, "tid": 6744, "ts": 6303771901875.453, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771901876.373, "dur": 2.100, + "args": { + "External id": 153190,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771901877.033, "dur": 1.229, + "args": { + "External id": 153191,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771901884.822, "dur": 205.580, + "args": { + "External id": 153192,"Record function id": 0, "Sequence number": 3058776, "Fwd thread id": 1, "Ev Idx": 1639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFuncBackward", "pid": 5714, "tid": 6744, + "ts": 6303771901886.342, "dur": 190.080, + "args": { + "External id": 153193,"Sequence number": 3058776, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1640 + } + }, + { + "ph": "f", "id": 206, "pid": 5714, "tid": 6744, "ts": 6303771901886.342, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771901897.482, "dur": 8.651, + "args": { + "External id": 153194,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", 
"Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901899.422, "dur": 6.191, + "args": { + "External id": 153195,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771901908.133, "dur": 4.960, + "args": { + "External id": 153196,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901909.722, "dur": 3.131, + "args": { + "External id": 153197,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771901914.402, "dur": 4.400, + "args": { + "External id": 153198,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1645 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771901915.453, "dur": 3.149, + "args": { + "External id": 153199,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771901932.962, "dur": 118.880, + "args": { + "External id": 153200,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771901982.242, "dur": 5.770, + "args": { + "External id": 153201,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 
5714, "tid": 6744, + "ts": 6303771901989.792, "dur": 3.500, + "args": { + "External id": 153202,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771902062.722, "dur": 3.250, + "args": { + "External id": 153203,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771902070.272, "dur": 0.550, + "args": { + "External id": 153204,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 6744, + "ts": 6303771902073.492, "dur": 0.460, + "args": { + "External id": 153205,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771902100.402, "dur": 165.740, + "args": { + "External id": 153206,"Record function id": 0, "Sequence number": 3058775, "Fwd thread id": 1, "Ev Idx": 1653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771902102.452, "dur": 155.780, + "args": { + "External id": 153207,"Sequence number": 3058775, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1654 + } + }, + { + "ph": "f", "id": 207, "pid": 5714, "tid": 6744, "ts": 6303771902102.452, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771902116.642, "dur": 34.800, + "args": { + "External id": 153208,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902120.122, "dur": 6.040, + "args": { + "External id": 153209,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771902127.282, "dur": 23.600, + "args": { + "External id": 153210,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771902159.422, "dur": 7.230, + "args": { + "External id": 153211,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902161.452, "dur": 4.740, + "args": { + "External id": 153212,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771902277.132, "dur": 178.559, + "args": { + "External id": 153213,"Record function id": 0, "Sequence number": 3058774, "Fwd thread id": 1, "Ev Idx": 1660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771902279.701, "dur": 168.290, + "args": { + "External id": 153214,"Sequence number": 3058774, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1661 + } + }, + { + "ph": "f", "id": 208, "pid": 5714, "tid": 6744, "ts": 6303771902279.701, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771902292.761, "dur": 55.191, + "args": { + "External id": 153215,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902295.981, "dur": 30.380, + "args": { + "External id": 153216,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771902327.772, "dur": 19.540, + "args": { + "External id": 153217,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771902356.232, "dur": 7.300, + "args": { + "External id": 153218,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902358.232, "dur": 4.900, + "args": { + "External id": 153219,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902466.671, "dur": 12.400, + "args": { + "External id": 153220,"Record function id": 0, "Sequence number": 3058773, "Fwd thread id": 1, "Ev Idx": 1667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902468.901, "dur": 7.640, + "args": { + "External id": 153221,"Sequence number": 3058773, "Fwd thread id": 1, "Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1668 + } + }, + { + "ph": "f", "id": 209, "pid": 5714, "tid": 6744, "ts": 6303771902468.901, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902471.341, "dur": 4.950, + "args": { + "External id": 153222,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902472.371, "dur": 3.650, + "args": { + "External id": 153223,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902483.141, "dur": 5.830, + "args": { + "External id": 153224,"Record function id": 0, "Sequence number": 3058772, "Fwd thread id": 1, "Ev Idx": 1671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902484.331, "dur": 2.880, + "args": { + "External id": 153225,"Sequence number": 3058772, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1672 + } + }, + { + "ph": "f", "id": 210, "pid": 5714, "tid": 6744, "ts": 6303771902484.331, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902485.201, "dur": 1.890, + 
"args": { + "External id": 153226,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902486.051, "dur": 0.870, + "args": { + "External id": 153227,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902492.441, "dur": 5.170, + "args": { + "External id": 153228,"Record function id": 0, "Sequence number": 3058771, "Fwd thread id": 1, "Ev Idx": 1675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902493.541, "dur": 2.400, + "args": { + "External id": 153229,"Sequence number": 3058771, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1676 + } + }, + { + "ph": "f", "id": 211, "pid": 5714, "tid": 6744, "ts": 6303771902493.541, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902494.371, "dur": 1.410, + "args": { + "External id": 153230,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902494.951, "dur": 0.670, + "args": { + "External id": 
153231,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902501.131, "dur": 5.530, + "args": { + "External id": 153232,"Record function id": 0, "Sequence number": 3058770, "Fwd thread id": 1, "Ev Idx": 1679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902502.241, "dur": 2.540, + "args": { + "External id": 153233,"Sequence number": 3058770, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1680 + } + }, + { + "ph": "f", "id": 212, "pid": 5714, "tid": 6744, "ts": 6303771902502.241, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902503.041, "dur": 1.610, + "args": { + "External id": 153234,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902503.681, "dur": 0.820, + "args": { + "External id": 153235,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902510.151, "dur": 93.000, + "args": { + "External id": 
153236,"Record function id": 0, "Sequence number": 3058769, "Fwd thread id": 1, "Ev Idx": 1683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902511.221, "dur": 84.460, + "args": { + "External id": 153237,"Sequence number": 3058769, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1684 + } + }, + { + "ph": "f", "id": 213, "pid": 5714, "tid": 6744, "ts": 6303771902511.221, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902513.591, "dur": 5.660, + "args": { + "External id": 153238,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902514.871, "dur": 3.820, + "args": { + "External id": 153239,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902516.381, "dur": 1.980, + "args": { + "External id": 153240,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771902520.411, "dur": 41.000, + "args": { + "External id": 153241,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902563.401, "dur": 4.970, + "args": { + "External id": 153242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902564.401, "dur": 3.150, + "args": { + "External id": 153243,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902566.611, "dur": 0.720, + "args": { + "External id": 153244,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902571.351, "dur": 2.730, + "args": { + "External id": 153245,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902572.361, "dur": 1.280, + "args": { + "External id": 153246,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771902573.261, "dur": 0.230, + "args": { + "External id": 153247,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771902574.731, "dur": 20.150, + "args": { + "External id": 153248,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902611.401, "dur": 8.870, + "args": { + "External id": 153249,"Record function id": 0, "Sequence number": 3058768, "Fwd thread id": 1, "Ev Idx": 1696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902613.191, "dur": 5.140, + "args": { + "External id": 153250,"Sequence number": 3058768, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1697 + } + }, + { + "ph": "f", "id": 214, "pid": 5714, "tid": 6744, "ts": 6303771902613.191, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902614.921, "dur": 3.230, + "args": { + "External id": 153251,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902615.831, "dur": 2.130, + "args": { + 
"External id": 153252,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902623.941, "dur": 8.360, + "args": { + "External id": 153253,"Record function id": 0, "Sequence number": 3058767, "Fwd thread id": 1, "Ev Idx": 1700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902625.781, "dur": 4.490, + "args": { + "External id": 153254,"Sequence number": 3058767, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1701 + } + }, + { + "ph": "f", "id": 215, "pid": 5714, "tid": 6744, "ts": 6303771902625.781, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902626.491, "dur": 3.580, + "args": { + "External id": 153255,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902627.461, "dur": 2.060, + "args": { + "External id": 153256,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902628.721, "dur": 0.580, + "args": { + "External id": 153257,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771902637.421, "dur": 8.720, + "args": { + "External id": 153258,"Record function id": 0, "Ev Idx": 1705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771902639.161, "dur": 5.850, + "args": { + "External id": 153259,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771902640.941, "dur": 3.610, + "args": { + "External id": 153260,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771902641.701, "dur": 2.680, + "args": { + "External id": 153261,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902650.081, "dur": 6.950, + "args": { + "External id": 153262,"Record function id": 0, "Sequence number": 3058766, "Fwd thread id": 1, "Ev Idx": 1709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902651.321, "dur": 3.860, + "args": { + "External id": 153263,"Sequence number": 3058766, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1710 + } + }, + { + "ph": "f", "id": 216, "pid": 5714, "tid": 6744, "ts": 6303771902651.321, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902652.361, "dur": 2.670, + "args": { + "External id": 153264,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902653.741, "dur": 1.120, + "args": { + "External id": 153265,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902660.411, "dur": 77.329, + "args": { + "External id": 153266,"Record function id": 0, "Sequence number": 3058765, "Fwd thread id": 1, "Ev Idx": 1713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902661.491, "dur": 69.609, + "args": { + "External id": 153267,"Sequence number": 3058765, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1714 + } + }, + { + "ph": "f", "id": 217, "pid": 5714, "tid": 6744, "ts": 6303771902661.491, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902663.281, "dur": 2.690, + "args": { + "External id": 153268,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902663.911, "dur": 1.700, + "args": { + "External id": 153269,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902664.861, "dur": 0.540, + "args": { + "External id": 153270,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771902667.611, "dur": 33.310, + "args": { + "External id": 153271,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902702.771, "dur": 4.000, + "args": { + "External id": 153272,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902703.591, "dur": 2.410, + "args": { + "External id": 153273,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1720 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902705.191, "dur": 0.610, + "args": { + "External id": 153274,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902708.181, "dur": 2.390, + "args": { + "External id": 153275,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902708.881, "dur": 1.370, + "args": { + "External id": 153276,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902709.811, "dur": 0.300, + "args": { + "External id": 153277,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771902711.201, "dur": 19.079, + "args": { + "External id": 153278,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 
6303771902746.091, "dur": 32.669, + "args": { + "External id": 153279,"Record function id": 0, "Sequence number": 3058764, "Fwd thread id": 1, "Ev Idx": 1726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902748.680, "dur": 5.291, + "args": { + "External id": 153280,"Sequence number": 3058764, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1727 + } + }, + { + "ph": "f", "id": 218, "pid": 5714, "tid": 6744, "ts": 6303771902748.680, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902750.440, "dur": 3.351, + "args": { + "External id": 153281,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902751.340, "dur": 2.260, + "args": { + "External id": 153282,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 6744, + "ts": 6303771902756.580, "dur": 18.731, + "args": { + "External id": 153283,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902784.671, "dur": 11.049, + "args": { + 
"External id": 153284,"Record function id": 0, "Sequence number": 3058763, "Fwd thread id": 1, "Ev Idx": 1731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902786.420, "dur": 6.940, + "args": { + "External id": 153285,"Sequence number": 3058763, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1732 + } + }, + { + "ph": "f", "id": 219, "pid": 5714, "tid": 6744, "ts": 6303771902786.420, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902787.271, "dur": 5.820, + "args": { + "External id": 153286,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902788.500, "dur": 3.711, + "args": { + "External id": 153287,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902791.160, "dur": 0.771, + "args": { + "External id": 153288,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771902800.671, "dur": 7.209, + "args": { + "External id": 153289,"Record function id": 0, "Ev Idx": 1736 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771902802.291, "dur": 4.500, + "args": { + "External id": 153290,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771902803.880, "dur": 2.460, + "args": { + "External id": 153291,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771902804.560, "dur": 1.600, + "args": { + "External id": 153292,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902811.511, "dur": 6.689, + "args": { + "External id": 153293,"Record function id": 0, "Sequence number": 3058762, "Fwd thread id": 1, "Ev Idx": 1740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "UnsafeViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902812.780, "dur": 3.671, + "args": { + "External id": 153294,"Sequence number": 3058762, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1741 + } + }, + { + "ph": "f", "id": 220, "pid": 5714, "tid": 6744, "ts": 6303771902812.780, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902813.860, "dur": 2.411, + "args": { + "External id": 153295,"Record function id": 0, 
"Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902814.660, "dur": 1.380, + "args": { + "External id": 153296,"Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902821.920, "dur": 82.400, + "args": { + "External id": 153297,"Record function id": 0, "Sequence number": 3058761, "Fwd thread id": 1, "Ev Idx": 1744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MmBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902822.990, "dur": 73.180, + "args": { + "External id": 153298,"Sequence number": 3058761, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1745 + } + }, + { + "ph": "f", "id": 221, "pid": 5714, "tid": 6744, "ts": 6303771902822.990, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902824.730, "dur": 3.700, + "args": { + "External id": 153299,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902826.290, "dur": 1.740, + "args": { + "External id": 153300,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], 
[]], "Input Dims": [[16384, 768], [], []], "Ev Idx": 1747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902827.250, "dur": 0.570, + "args": { + "External id": 153301,"Record function id": 0, "Concrete Inputs": ["", "[768, 16384]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 1748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771902829.400, "dur": 34.770, + "args": { + "External id": 153302,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1]], "Input Dims": [[768, 16384], [16384, 768]], "Ev Idx": 1749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902865.870, "dur": 4.070, + "args": { + "External id": 153303,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902866.800, "dur": 2.310, + "args": { + "External id": 153304,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902868.200, "dur": 0.700, + "args": { + "External id": 153305,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1752 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902871.470, "dur": 3.260, + "args": { + "External id": 153306,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902873.130, "dur": 1.230, + "args": { + "External id": 153307,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902873.900, "dur": 0.310, + "args": { + "External id": 153308,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771902875.370, "dur": 20.120, + "args": { + "External id": 153309,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 1756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902912.770, "dur": 27.400, + "args": { + "External id": 153310,"Record function id": 0, "Sequence number": 3058760, "Fwd thread id": 1, "Ev Idx": 1757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ViewBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902914.490, "dur": 5.420, + "args": { + "External id": 153311,"Sequence number": 3058760, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 1758 + } + }, + { + "ph": "f", "id": 222, "pid": 5714, "tid": 6744, "ts": 6303771902914.490, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 6744, + "ts": 6303771902916.380, "dur": 3.370, + "args": { + "External id": 153312,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771902917.340, "dur": 2.230, + "args": { + "External id": 153313,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 1760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771902922.470, "dur": 14.370, + "args": { + "External id": 153314,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902945.630, "dur": 10.740, + "args": { + "External id": 153315,"Record function id": 0, "Sequence number": 3058759, "Fwd thread id": 1, "Ev Idx": 1762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902947.240, "dur": 6.830, + "args": { + "External id": 153316,"Sequence number": 3058759, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1763 + } + }, + { + "ph": "f", "id": 223, "pid": 5714, "tid": 6744, "ts": 6303771902947.240, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 6744, + "ts": 6303771902948.080, "dur": 5.750, + "args": { + "External id": 153317,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1, 768]], "Input Dims": [[768, 768]], "Ev Idx": 1764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 6744, + "ts": 6303771902949.240, "dur": 3.730, + "args": { + "External id": 153318,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1, 768], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 1765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771902951.850, "dur": 0.890, + "args": { + "External id": 153319,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[768, 1]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 768], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 1766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771902961.290, "dur": 7.390, + "args": { + "External id": 153320,"Record function id": 0, "Ev Idx": 1767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771902962.970, "dur": 4.650, + "args": { + "External id": 153321,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + 
"ts": 6303771902964.600, "dur": 2.620, + "args": { + "External id": 153322,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771902965.280, "dur": 1.750, + "args": { + "External id": 153323,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902972.260, "dur": 71.290, + "args": { + "External id": 153324,"Record function id": 0, "Sequence number": 3058758, "Fwd thread id": 1, "Ev Idx": 1771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771902973.390, "dur": 32.050, + "args": { + "External id": 153325,"Sequence number": 3058758, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1772 + } + }, + { + "ph": "f", "id": 224, "pid": 5714, "tid": 6744, "ts": 6303771902973.390, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771902975.000, "dur": 17.210, + "args": { + "External id": 153326,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771902993.680, "dur": 11.300, + "args": { + "External id": 153327,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 1774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771903008.390, "dur": 24.420, + "args": { + "External id": 153328,"Record function id": 0, "Concrete Inputs": ["", "[0, 1]", "True", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771903036.420, "dur": 2.230, + "args": { + "External id": 153329,"Record function id": 0, "Concrete Inputs": ["", "[768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 768, 1], []], "Input Dims": [[1, 1, 768], []], "Ev Idx": 1776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771903051.750, "dur": 8.170, + "args": { + "External id": 153330,"Record function id": 0, "Ev Idx": 1777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771903054.140, "dur": 4.730, + "args": { + "External id": 153331,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771903055.760, "dur": 2.640, + "args": { + "External id": 153332,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771903056.610, "dur": 1.560, + "args": { + "External id": 153333,"Record function 
id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903063.670, "dur": 31.530, + "args": { + "External id": 153334,"Record function id": 0, "Sequence number": 3058757, "Fwd thread id": 1, "Ev Idx": 1781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903064.750, "dur": 26.970, + "args": { + "External id": 153335,"Sequence number": 3058757, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1782 + } + }, + { + "ph": "f", "id": 225, "pid": 5714, "tid": 6744, "ts": 6303771903064.750, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771903066.120, "dur": 25.230, + "args": { + "External id": 153336,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771903067.180, "dur": 23.920, + "args": { + "External id": 153337,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + 
"ts": 6303771903070.080, "dur": 5.140, + "args": { + "External id": 153338,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771903076.340, "dur": 14.210, + "args": { + "External id": 153339,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903101.120, "dur": 51.940, + "args": { + "External id": 153340,"Record function id": 0, "Sequence number": 3058756, "Fwd thread id": 1, "Ev Idx": 1787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MulBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903102.530, "dur": 28.790, + "args": { + "External id": 153341,"Sequence number": 3058756, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1788 + } + }, + { + "ph": "f", "id": 226, "pid": 5714, "tid": 6744, "ts": 6303771903102.530, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903104.380, "dur": 14.310, + "args": { + "External id": 153342,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1789 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903120.040, "dur": 10.890, + "args": { + "External id": 153343,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 1790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 6744, + "ts": 6303771903134.560, "dur": 14.240, + "args": { + "External id": 153344,"Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 1791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903159.610, "dur": 55.449, + "args": { + "External id": 153345,"Record function id": 0, "Sequence number": 3058755, "Fwd thread id": 1, "Ev Idx": 1792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RsqrtBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903161.270, "dur": 49.380, + "args": { + "External id": 153346,"Sequence number": 3058755, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1793 + } + }, + { + "ph": "f", "id": 227, "pid": 5714, "tid": 6744, "ts": 6303771903161.270, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771903163.590, "dur": 18.160, + "args": { + "External id": 153347,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771903165.240, 
"dur": 0.430, + "args": { + "External id": 153348,"Record function id": 0, "Concrete Inputs": ["", "3"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771903166.730, "dur": 0.230, + "args": { + "External id": 153349,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 1], [], [], [], []], "Ev Idx": 1796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903182.979, "dur": 16.100, + "args": { + "External id": 153350,"Record function id": 0, "Concrete Inputs": ["", "-0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903186.090, "dur": 12.040, + "args": { + "External id": 153351,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[2048, 1, 1], []], "Input Dims": [[8, 2048, 1], []], "Ev Idx": 1798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903199.879, "dur": 9.160, + "args": { + "External id": 153352,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[2048, 1, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 1], [8, 2048, 1]], "Ev Idx": 1799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771903221.730, "dur": 3.860, + "args": { + "External id": 153353,"Record function id": 0, "Sequence number": 3058754, "Fwd thread id": 1, "Ev Idx": 1800 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "AddBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771903223.390, "dur": 0.400, + "args": { + "External id": 153354,"Sequence number": 3058754, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1801 + } + }, + { + "ph": "f", "id": 228, "pid": 5714, "tid": 6744, "ts": 6303771903223.390, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771903229.010, "dur": 32.540, + "args": { + "External id": 153355,"Record function id": 0, "Sequence number": 3058753, "Fwd thread id": 1, "Ev Idx": 1802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "MeanBackward1", "pid": 5714, "tid": 6744, + "ts": 6303771903230.099, "dur": 28.151, + "args": { + "External id": 153356,"Sequence number": 3058753, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 1803 + } + }, + { + "ph": "f", "id": 229, "pid": 5714, "tid": 6744, "ts": 6303771903230.099, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 6744, + "ts": 6303771903232.259, "dur": 5.140, + "args": { + "External id": 153357,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 1804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903234.559, "dur": 1.360, + "args": { + "External id": 153358,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[2048, 1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], 
"Input Strides": [[2048, 1, 1], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 1805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771903238.310, "dur": 19.340, + "args": { + "External id": 153359,"Record function id": 0, "Concrete Inputs": ["", "768"], "Input type": ["float", "Scalar"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 6744, + "ts": 6303771903240.630, "dur": 16.109, + "args": { + "External id": 153360,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[2048, 1, 0], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903267.690, "dur": 87.409, + "args": { + "External id": 153361,"Record function id": 0, "Sequence number": 3058752, "Fwd thread id": 1, "Ev Idx": 1808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "PowBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903270.030, "dur": 67.419, + "args": { + "External id": 153362,"Sequence number": 3058752, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1809 + } + }, + { + "ph": "f", "id": 230, "pid": 5714, "tid": 6744, "ts": 6303771903270.030, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 6744, + "ts": 6303771903271.499, "dur": 34.070, + "args": { + "External id": 153363,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1810 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::result_type", "pid": 5714, "tid": 6744, + "ts": 6303771903272.529, "dur": 0.330, + "args": { + "External id": 153364,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771903273.739, "dur": 0.140, + "args": { + "External id": 153365,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 1812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771903279.019, "dur": 25.230, + "args": { + "External id": 153366,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903306.839, "dur": 18.570, + "args": { + "External id": 153367,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903310.339, "dur": 13.950, + "args": { + "External id": 153368,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 1815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 6744, + "ts": 6303771903326.189, "dur": 9.810, + "args": { + "External 
id": 153369,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 1816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771903342.539, "dur": 9.430, + "args": { + "External id": 153370,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903363.529, "dur": 41.910, + "args": { + "External id": 153371,"Record function id": 0, "Sequence number": 3058751, "Fwd thread id": 1, "Ev Idx": 1818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "ToCopyBackward0", "pid": 5714, "tid": 6744, + "ts": 6303771903365.279, "dur": 24.550, + "args": { + "External id": 153372,"Sequence number": 3058751, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1819 + } + }, + { + "ph": "f", "id": 231, "pid": 5714, "tid": 6744, "ts": 6303771903365.279, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771903367.609, "dur": 21.840, + "args": { + "External id": 153373,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 1820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 
6744, + "ts": 6303771903368.549, "dur": 20.590, + "args": { + "External id": 153374,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 1821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903371.789, "dur": 5.130, + "args": { + "External id": 153375,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771903377.739, "dur": 10.910, + "args": { + "External id": 153376,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771903393.969, "dur": 8.590, + "args": { + "External id": 153377,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771903412.989, "dur": 1143.938, + "args": { + "External id": 153378,"Record function id": 0, "Ev Idx": 1825 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward 
(model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6303771903432.469, "dur": 592.469, + "args": { + "External id": 153379,"Record function id": 0, "Ev Idx": 1826 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.8", "pid": 5714, "tid": 6744, + "ts": 6303771903449.119, "dur": 568.099, + "args": { + "External id": 153380,"Record function id": 0, "Ev Idx": 1827 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6303771903461.199, "dur": 542.809, + "args": { + "External id": 153381,"Record function id": 0, "Ev Idx": 1828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771903541.859, "dur": 7.130, + "args": { + "External id": 153382,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771903560.729, "dur": 21.570, + "args": { + "External id": 153383,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903565.219, "dur": 1.010, + "args": { + "External id": 153384,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6303771903568.049, "dur": 0.290, + "args": { + "External id": 153385,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903569.639, "dur": 0.260, + "args": { + "External id": 153386,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903571.039, "dur": 0.260, + "args": { + "External id": 153387,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903572.519, "dur": 1.020, + "args": { + "External id": 153388,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903574.609, "dur": 0.230, + "args": { + "External id": 153389,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1836 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903576.059, "dur": 0.230, + "args": { + "External id": 153390,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903577.389, "dur": 0.240, + "args": { + "External id": 153391,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903578.869, "dur": 0.250, + "args": { + "External id": 153392,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771903590.769, "dur": 24.790, + "args": { + "External id": 153393,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 1840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771903651.598, "dur": 
95.900, + "args": { + "External id": 153394,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 1841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771903662.309, "dur": 6.720, + "args": { + "External id": 153395,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771903674.318, "dur": 9.120, + "args": { + "External id": 153396,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 1843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771903677.689, "dur": 5.360, + "args": { + "External id": 153397,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 1844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903680.609, "dur": 0.720, + "args": { + "External id": 153398,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 1845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771903690.669, "dur": 17.340, + "args": { + "External id": 153399,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903692.738, "dur": 0.291, + "args": { + "External id": 153400,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903694.469, "dur": 1.220, + "args": { + "External id": 153401,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903696.849, "dur": 0.249, + "args": { + "External id": 153402,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903698.218, "dur": 0.220, + "args": { + "External id": 
153403,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903699.578, "dur": 0.251, + "args": { + "External id": 153404,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903701.029, "dur": 0.240, + "args": { + "External id": 153405,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903702.398, "dur": 0.251, + "args": { + "External id": 153406,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771903703.818, "dur": 0.251, + "args": { + "External id": 153407,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + 
"ts": 6303771903705.189, "dur": 0.229, + "args": { + "External id": 153408,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771903721.178, "dur": 17.840, + "args": { + "External id": 153409,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 1856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771903810.548, "dur": 122.690, + "args": { + "External id": 153410,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 1857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771903839.138, "dur": 90.210, + "args": { + "External id": 153411,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank 
stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 1858, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771903855.408, "dur": 68.840, + "args": { + "External id": 153412,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 1859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771903948.978, "dur": 3.370, + "args": { + "External id": 153413,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 1860, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771904030.088, "dur": 516.868, + "args": { + "External id": 153414,"Sequence number": 3058750, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1861 + } + }, + { + "ph": "f", "id": 232, "pid": 5714, "tid": 6744, "ts": 6303771904030.088, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904112.597, "dur": 32.371, + "args": { 
+ "External id": 153415,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 1862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771904175.787, "dur": 26.940, + "args": { + "External id": 153416,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 1863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771904219.957, "dur": 34.980, + "args": { + "External id": 153417,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 1864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904269.207, "dur": 44.410, + "args": { + "External id": 153418,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 
768], [2048, 768]], "Ev Idx": 1865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904326.177, "dur": 20.260, + "args": { + "External id": 153419,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 1866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904356.687, "dur": 22.240, + "args": { + "External id": 153420,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 1867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904389.567, "dur": 17.860, + "args": { + "External id": 153421,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 1868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771904430.917, "dur": 22.550, + "args": { + "External id": 153422,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], 
[], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 1869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771904470.737, "dur": 15.540, + "args": { + "External id": 153423,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 1870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771904504.737, "dur": 19.639, + "args": { + "External id": 153424,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 1871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904569.927, "dur": 11.700, + "args": { + "External id": 153425,"Record function id": 0, "Ev Idx": 1872 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904572.996, "dur": 7.311, + "args": { + "External id": 153426,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771904575.467, "dur": 4.049, + "args": { + "External id": 153427,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771904576.736, "dur": 2.540, + "args": { + "External id": 153428,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904586.076, "dur": 5.691, + "args": { + "External id": 153429,"Record function id": 0, "Ev Idx": 1876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904587.847, "dur": 2.960, + "args": { + "External id": 153430,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771904588.576, "dur": 1.780, + "args": { + "External id": 153431,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771904589.196, 
"dur": 0.971, + "args": { + "External id": 153432,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904595.536, "dur": 4.980, + "args": { + "External id": 153433,"Record function id": 0, "Ev Idx": 1880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904597.067, "dur": 2.509, + "args": { + "External id": 153434,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771904597.696, "dur": 1.471, + "args": { + "External id": 153435,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771904598.196, "dur": 0.791, + "args": { + "External id": 153436,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 1883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904604.076, "dur": 4.791, + "args": { + "External id": 153437,"Record function id": 0, "Ev Idx": 1884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904605.696, "dur": 2.240, + "args": { + "External id": 153438,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771904606.316, "dur": 1.231, + "args": { + "External id": 153439,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771904606.667, "dur": 0.689, + "args": { + "External id": 153440,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 1887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771904612.687, "dur": 317.399, + "args": { + "External id": 153441,"Record function id": 0, "Sequence number": 3058749, "Fwd thread id": 1, "Ev Idx": 1888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771904614.007, "dur": 308.659, + "args": { + "External id": 153442,"Sequence number": 3058749, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1889 + } + }, + { + "ph": "f", "id": 233, "pid": 5714, "tid": 6744, "ts": 6303771904614.007, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904680.526, "dur": 37.770, + "args": { + "External id": 153443,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1890 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771904730.276, "dur": 17.650, + "args": { + "External id": 153444,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771904774.456, "dur": 125.550, + "args": { + "External id": 153445,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 1892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771904829.186, "dur": 7.140, + "args": { + "External id": 153446,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 
6303771904838.056, "dur": 3.300, + "args": { + "External id": 153447,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904943.186, "dur": 11.310, + "args": { + "External id": 153448,"Record function id": 0, "Ev Idx": 1895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771904946.146, "dur": 7.030, + "args": { + "External id": 153449,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771904948.446, "dur": 3.810, + "args": { + "External id": 153450,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771904949.556, "dur": 2.490, + "args": { + "External id": 153451,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771904958.686, "dur": 165.159, + "args": { + "External id": 153452,"Record function id": 0, "Sequence number": 3058748, "Fwd thread id": 1, "Ev Idx": 1899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 
6744, + "ts": 6303771904960.196, "dur": 157.589, + "args": { + "External id": 153453,"Sequence number": 3058748, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1900 + } + }, + { + "ph": "f", "id": 234, "pid": 5714, "tid": 6744, "ts": 6303771904960.196, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771904974.626, "dur": 33.929, + "args": { + "External id": 153454,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771904977.815, "dur": 6.171, + "args": { + "External id": 153455,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771904985.126, "dur": 22.800, + "args": { + "External id": 153456,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771905016.935, "dur": 6.760, + "args": { + "External id": 153457,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", 
""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771905018.586, "dur": 4.660, + "args": { + "External id": 153458,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905135.005, "dur": 149.380, + "args": { + "External id": 153459,"Record function id": 0, "Sequence number": 3058747, "Fwd thread id": 1, "Ev Idx": 1906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905137.635, "dur": 139.830, + "args": { + "External id": 153460,"Sequence number": 3058747, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 1907 + } + }, + { + "ph": "f", "id": 235, "pid": 5714, "tid": 6744, "ts": 6303771905137.635, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771905150.415, "dur": 29.540, + "args": { + "External id": 153461,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 1908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771905153.305, "dur": 6.320, + "args": { + "External id": 
153462,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771905160.795, "dur": 18.600, + "args": { + "External id": 153463,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 1910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771905187.725, "dur": 6.810, + "args": { + "External id": 153464,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 1911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771905189.505, "dur": 4.630, + "args": { + "External id": 153465,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905295.575, "dur": 334.889, + "args": { + "External id": 153466,"Record function id": 0, "Sequence number": 3058746, "Fwd thread id": 1, "Ev Idx": 1913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905307.165, "dur": 313.289, + "args": { + "External id": 153467,"Sequence number": 3058746, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 1914 + } + }, + { + "ph": "f", "id": 236, "pid": 5714, "tid": 6744, "ts": 6303771905307.165, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771905381.955, "dur": 39.520, + "args": { + "External id": 153468,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771905434.134, "dur": 22.980, + "args": { + "External id": 153469,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771905467.094, "dur": 22.071, + "args": { + "External id": 153470,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 1917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771905499.594, "dur": 17.451, + "args": { + 
"External id": 153471,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771905525.244, "dur": 14.220, + "args": { + "External id": 153472,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771905548.724, "dur": 13.480, + "args": { + "External id": 153473,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 1920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771905582.264, "dur": 21.830, + "args": { + "External id": 153474,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 1921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905642.814, "dur": 11.120, + 
"args": { + "External id": 153475,"Record function id": 0, "Ev Idx": 1922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905645.564, "dur": 7.000, + "args": { + "External id": 153476,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771905648.024, "dur": 3.870, + "args": { + "External id": 153477,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771905649.114, "dur": 2.540, + "args": { + "External id": 153478,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905658.234, "dur": 5.170, + "args": { + "External id": 153479,"Record function id": 0, "Ev Idx": 1926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905659.794, "dur": 2.670, + "args": { + "External id": 153480,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771905660.474, "dur": 1.540, + "args": { + "External id": 153481,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1928 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771905661.004, "dur": 0.840, + "args": { + "External id": 153482,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905667.044, "dur": 5.150, + "args": { + "External id": 153483,"Record function id": 0, "Ev Idx": 1930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905668.934, "dur": 2.280, + "args": { + "External id": 153484,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771905669.544, "dur": 1.240, + "args": { + "External id": 153485,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771905669.904, "dur": 0.710, + "args": { + "External id": 153486,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 1933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905676.134, "dur": 210.690, + "args": { + "External id": 153487,"Record function id": 0, "Sequence number": 3058745, "Fwd thread id": 1, "Ev Idx": 1934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 
6303771905677.484, "dur": 180.870, + "args": { + "External id": 153488,"Sequence number": 3058745, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1935 + } + }, + { + "ph": "f", "id": 237, "pid": 5714, "tid": 6744, "ts": 6303771905677.484, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771905748.634, "dur": 22.400, + "args": { + "External id": 153489,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 1936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771905789.914, "dur": 14.630, + "args": { + "External id": 153490,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 1937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771905823.264, "dur": 17.390, + "args": { + "External id": 153491,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 1938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771905865.774, "dur": 16.430, + "args": { + "External id": 153492,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 1939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905897.993, "dur": 10.171, + "args": { + "External id": 153493,"Record function id": 0, "Ev Idx": 1940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771905900.764, "dur": 6.129, + "args": { + "External id": 153494,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771905902.833, "dur": 3.411, + "args": { + "External id": 153495,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771905903.813, "dur": 2.251, + "args": { + "External id": 153496,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 1943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905912.324, "dur": 762.948, + "args": { + "External id": 153497,"Record function id": 0, "Sequence number": 3058744, "Fwd thread id": 1, "Ev Idx": 1944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771905913.924, "dur": 755.888, + "args": { + "External id": 153498,"Sequence number": 3058744, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 1945 + } + }, + { + "ph": "f", "id": 238, "pid": 5714, "tid": 6744, "ts": 6303771905913.924, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6303771905933.993, "dur": 26.220, + "args": { + "External id": 153499,"Record function id": 0, "Ev Idx": 1946 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6303771905968.404, "dur": 71.009, + "args": { + "External id": 153500,"Record function id": 0, "Ev Idx": 1947 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.9)", "pid": 5714, "tid": 6744, + "ts": 6303771906046.853, "dur": 617.649, + 
"args": { + "External id": 153501,"Record function id": 0, "Ev Idx": 1948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771906090.493, "dur": 8.810, + "args": { + "External id": 153502,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906107.193, "dur": 3.130, + "args": { + "External id": 153503,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 1950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771906123.673, "dur": 92.030, + "args": { + "External id": 153504,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 1951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771906134.903, "dur": 77.490, + "args": { + "External id": 153505,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 1952 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771906155.133, "dur": 2.090, + "args": { + "External id": 153506,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771906161.343, "dur": 30.100, + "args": { + "External id": 153507,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 1954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771906162.933, "dur": 28.190, + "args": { + "External id": 153508,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 1955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906165.223, "dur": 5.780, + "args": { + "External id": 153509,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771906172.223, "dur": 18.410, + "args": { + "External id": 153510,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], 
[1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 1957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771906280.013, "dur": 7.810, + "args": { + "External id": 153511,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 1958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771906281.583, "dur": 5.770, + "args": { + "External id": 153512,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771906320.673, "dur": 83.530, + "args": { + "External id": 153513,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 1960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771906336.663, "dur": 64.600, + "args": { + "External id": 153514,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", 
"In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 1961, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771906348.443, "dur": 48.609, + "args": { + "External id": 153515,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 1962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771906416.543, "dur": 3.440, + "args": { + "External id": 153516,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 1963, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906470.652, "dur": 4.870, + "args": { + "External id": 153517,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906513.192, "dur": 1.160, + "args": { + "External id": 153518,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906532.442, "dur": 0.770, + "args": { + "External id": 153519,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906547.682, "dur": 0.750, + "args": { + "External id": 153520,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906561.352, "dur": 0.680, + "args": { + "External id": 153521,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906574.902, "dur": 0.730, + "args": { + "External id": 153522,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906587.942, "dur": 0.760, + "args": { + "External id": 153523,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", 
"590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906601.402, "dur": 1.080, + "args": { + "External id": 153524,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906614.742, "dur": 0.790, + "args": { + "External id": 153525,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771906686.722, "dur": 1292.097, + "args": { + "External id": 153526,"Record function id": 0, "Ev Idx": 1973 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6303771906699.822, "dur": 811.968, + "args": { + "External id": 153527,"Record function id": 0, "Ev Idx": 1974 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6303771906711.302, "dur": 233.789, + "args": { + "External id": 153528,"Record function id": 0, "Ev Idx": 1975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906791.422, "dur": 3.480, + "args": { + "External id": 153529,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 1976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906798.902, "dur": 1.189, + "args": { + "External id": 153530,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 1977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906801.582, "dur": 0.829, + "args": { + "External id": 153531,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906803.742, "dur": 0.729, + "args": { + "External id": 153532,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906805.671, "dur": 0.611, + "args": { + "External id": 153533,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906807.691, "dur": 0.751, + "args": { + "External id": 153534,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 1981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906809.731, "dur": 0.951, + "args": { + 
"External id": 153535,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 1982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906812.002, "dur": 1.540, + "args": { + "External id": 153536,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906814.711, "dur": 0.771, + "args": { + "External id": 153537,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771906816.762, "dur": 0.629, + "args": { + "External id": 153538,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 1985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771906831.991, "dur": 87.000, + "args": { + "External id": 153539,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 1986 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771906844.422, "dur": 71.239, + "args": { + "External id": 153540,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 1987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771906856.202, "dur": 5.940, + "args": { + "External id": 153541,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771906864.331, "dur": 30.450, + "args": { + "External id": 153542,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 1989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771906865.702, "dur": 28.809, + "args": { + "External id": 153543,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 1990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771906868.151, "dur": 6.680, + "args": { + "External id": 153544,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771906875.971, "dur": 18.080, + "args": { + "External id": 153545,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 1992 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.7", "pid": 5714, "tid": 6744, + "ts": 6303771907024.471, "dur": 479.569, + "args": { + "External id": 153546,"Record function id": 0, "Ev Idx": 1993 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6303771907039.291, "dur": 451.609, + "args": { + "External id": 153547,"Record function id": 0, "Ev Idx": 1994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771907097.791, "dur": 6.960, + "args": { + "External id": 153548,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 1995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771907115.281, "dur": 20.170, + "args": { + "External id": 153549,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 1996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907118.281, "dur": 1.030, + "args": { + "External id": 153550,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907121.301, "dur": 0.280, + "args": { + "External id": 153551,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907122.661, "dur": 0.980, + "args": { + "External id": 153552,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 1999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907124.891, "dur": 0.270, + "args": { + "External id": 153553,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907126.351, "dur": 0.260, + "args": { + "External id": 153554,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", 
"442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907128.291, "dur": 0.260, + "args": { + "External id": 153555,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907129.681, "dur": 0.220, + "args": { + "External id": 153556,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907131.081, "dur": 0.250, + "args": { + "External id": 153557,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907132.441, "dur": 0.220, + "args": { + "External id": 153558,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771907143.351, "dur": 20.610, + "args": { + "External id": 
153559,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771907194.501, "dur": 89.129, + "args": { + "External id": 153560,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771907203.611, "dur": 6.430, + "args": { + "External id": 153561,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771907214.081, "dur": 10.300, + "args": { + "External id": 153562,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 
6744, + "ts": 6303771907216.271, "dur": 7.699, + "args": { + "External id": 153563,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907220.611, "dur": 1.610, + "args": { + "External id": 153564,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771907231.470, "dur": 16.911, + "args": { + "External id": 153565,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907233.470, "dur": 0.391, + "args": { + "External id": 153566,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907235.170, "dur": 0.171, + "args": { + "External id": 153567,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 
2014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907236.850, "dur": 0.240, + "args": { + "External id": 153568,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907238.250, "dur": 0.240, + "args": { + "External id": 153569,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907239.770, "dur": 0.240, + "args": { + "External id": 153570,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907241.130, "dur": 0.231, + "args": { + "External id": 153571,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907242.481, "dur": 0.280, + "args": { + "External id": 153572,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": 
[[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907243.830, "dur": 1.091, + "args": { + "External id": 153573,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771907246.101, "dur": 0.220, + "args": { + "External id": 153574,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771907259.250, "dur": 16.620, + "args": { + "External id": 153575,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771907342.000, "dur": 76.730, + "args": { + "External id": 153576,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2023 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771907356.390, "dur": 59.670, + "args": { + "External id": 153577,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2024, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771907367.200, "dur": 44.890, + "args": { + "External id": 153578,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771907431.840, "dur": 3.140, + "args": { + "External id": 153579,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2026, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, 
"tid": 6744, + "ts": 6303771907516.910, "dur": 451.919, + "args": { + "External id": 153580,"Sequence number": 3058743, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2027 + } + }, + { + "ph": "f", "id": 239, "pid": 5714, "tid": 6744, "ts": 6303771907516.910, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771907578.260, "dur": 31.090, + "args": { + "External id": 153581,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771907637.360, "dur": 22.940, + "args": { + "External id": 153582,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771907676.760, "dur": 33.520, + "args": { + "External id": 153583,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771907723.149, "dur": 25.040, + "args": { + "External id": 153584,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771907767.800, "dur": 19.749, + "args": { + "External id": 153585,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771907797.159, "dur": 22.040, + "args": { + "External id": 153586,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771907830.449, "dur": 17.810, + "args": { + "External id": 153587,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 
6303771907869.959, "dur": 19.570, + "args": { + "External id": 153588,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771907904.929, "dur": 12.970, + "args": { + "External id": 153589,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771907932.289, "dur": 15.670, + "args": { + "External id": 153590,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771907992.169, "dur": 12.850, + "args": { + "External id": 153591,"Record function id": 0, "Ev Idx": 2038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771907995.269, "dur": 8.400, + "args": { + "External id": 153592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771907997.749, "dur": 5.140, + "args": { + "External id": 153593,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771907998.819, "dur": 3.820, + "args": { + "External id": 153594,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908009.439, "dur": 5.660, + "args": { + "External id": 153595,"Record function id": 0, "Ev Idx": 2042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908011.189, "dur": 2.940, + "args": { + "External id": 
153596,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771908012.009, "dur": 1.680, + "args": { + "External id": 153597,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771908012.609, "dur": 0.900, + "args": { + "External id": 153598,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908018.939, "dur": 13.550, + "args": { + "External id": 153599,"Record function id": 0, "Ev Idx": 2046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908020.399, "dur": 11.030, + "args": { + "External id": 153600,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771908029.379, "dur": 1.600, + "args": { + "External id": 153601,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771908030.019, "dur": 0.760, + "args": { + "External id": 153602,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], 
"Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908036.349, "dur": 5.050, + "args": { + "External id": 153603,"Record function id": 0, "Ev Idx": 2050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908038.039, "dur": 2.430, + "args": { + "External id": 153604,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771908038.719, "dur": 1.290, + "args": { + "External id": 153605,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771908039.089, "dur": 0.750, + "args": { + "External id": 153606,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908045.119, "dur": 315.839, + "args": { + "External id": 153607,"Record function id": 0, "Sequence number": 3058742, "Fwd thread id": 1, "Ev Idx": 2054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908046.429, "dur": 306.989, + "args": { + "External id": 153608,"Sequence number": 3058742, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input 
Dims": [[8, 2048, 768]], "Ev Idx": 2055 + } + }, + { + "ph": "f", "id": 240, "pid": 5714, "tid": 6744, "ts": 6303771908046.429, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908105.939, "dur": 37.400, + "args": { + "External id": 153609,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908155.679, "dur": 18.009, + "args": { + "External id": 153610,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771908197.668, "dur": 131.700, + "args": { + "External id": 153611,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 
2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771908250.728, "dur": 6.260, + "args": { + "External id": 153612,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771908258.728, "dur": 4.090, + "args": { + "External id": 153613,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908374.278, "dur": 11.060, + "args": { + "External id": 153614,"Record function id": 0, "Ev Idx": 2061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771908377.088, "dur": 6.940, + "args": { + "External id": 153615,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771908379.528, "dur": 3.640, + "args": { + "External id": 153616,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771908380.658, "dur": 2.300, + "args": { + "External id": 
153617,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908389.468, "dur": 167.060, + "args": { + "External id": 153618,"Record function id": 0, "Sequence number": 3058741, "Fwd thread id": 1, "Ev Idx": 2065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908390.948, "dur": 159.380, + "args": { + "External id": 153619,"Sequence number": 3058741, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2066 + } + }, + { + "ph": "f", "id": 241, "pid": 5714, "tid": 6744, "ts": 6303771908390.948, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771908405.218, "dur": 35.680, + "args": { + "External id": 153620,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771908408.378, "dur": 6.490, + "args": { + "External id": 153621,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771908416.118, "dur": 24.150, + "args": { + "External 
id": 153622,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771908449.378, "dur": 7.030, + "args": { + "External id": 153623,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771908451.128, "dur": 4.860, + "args": { + "External id": 153624,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908567.858, "dur": 157.499, + "args": { + "External id": 153625,"Record function id": 0, "Sequence number": 3058740, "Fwd thread id": 1, "Ev Idx": 2072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908570.458, "dur": 147.879, + "args": { + "External id": 153626,"Sequence number": 3058740, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2073 + } + }, + { + "ph": "f", "id": 242, "pid": 5714, "tid": 6744, "ts": 
6303771908570.458, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771908583.618, "dur": 31.740, + "args": { + "External id": 153627,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771908586.998, "dur": 6.840, + "args": { + "External id": 153628,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771908594.927, "dur": 19.831, + "args": { + "External id": 153629,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771908624.098, "dur": 7.349, + "args": { + "External id": 153630,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771908626.427, "dur": 4.631, + "args": { + "External id": 153631,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", 
"[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908736.597, "dur": 320.429, + "args": { + "External id": 153632,"Record function id": 0, "Sequence number": 3058739, "Fwd thread id": 1, "Ev Idx": 2079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771908739.527, "dur": 306.959, + "args": { + "External id": 153633,"Sequence number": 3058739, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2080 + } + }, + { + "ph": "f", "id": 243, "pid": 5714, "tid": 6744, "ts": 6303771908739.527, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908803.187, "dur": 39.580, + "args": { + "External id": 153634,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908855.307, "dur": 23.820, + "args": { + "External id": 153635,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], 
[768, 768]], "Ev Idx": 2082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908891.297, "dur": 23.080, + "args": { + "External id": 153636,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908927.677, "dur": 18.150, + "args": { + "External id": 153637,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908954.607, "dur": 14.600, + "args": { + "External id": 153638,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771908976.987, "dur": 13.720, + "args": { + "External id": 153639,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771909010.927, "dur": 19.110, + "args": { + "External id": 153640,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, 
"Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909070.017, "dur": 12.540, + "args": { + "External id": 153641,"Record function id": 0, "Ev Idx": 2088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909072.866, "dur": 8.340, + "args": { + "External id": 153642,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771909075.417, "dur": 5.020, + "args": { + "External id": 153643,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771909076.766, "dur": 3.420, + "args": { + "External id": 153644,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909087.157, "dur": 5.529, + "args": { + "External id": 153645,"Record function id": 0, "Ev Idx": 2092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", 
"pid": 5714, "tid": 6744, + "ts": 6303771909088.797, "dur": 2.869, + "args": { + "External id": 153646,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771909089.466, "dur": 1.720, + "args": { + "External id": 153647,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771909090.037, "dur": 0.949, + "args": { + "External id": 153648,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909096.466, "dur": 5.100, + "args": { + "External id": 153649,"Record function id": 0, "Ev Idx": 2096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909098.146, "dur": 2.520, + "args": { + "External id": 153650,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771909098.806, "dur": 1.400, + "args": { + "External id": 153651,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771909099.166, "dur": 0.871, + "args": { + "External id": 
153652,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771909105.397, "dur": 208.469, + "args": { + "External id": 153653,"Record function id": 0, "Sequence number": 3058738, "Fwd thread id": 1, "Ev Idx": 2100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771909106.786, "dur": 168.060, + "args": { + "External id": 153654,"Sequence number": 3058738, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2101 + } + }, + { + "ph": "f", "id": 244, "pid": 5714, "tid": 6744, "ts": 6303771909106.786, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771909171.006, "dur": 21.930, + "args": { + "External id": 153655,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771909210.256, "dur": 12.910, + "args": { + "External id": 
153656,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771909240.606, "dur": 15.740, + "args": { + "External id": 153657,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771909282.686, "dur": 25.840, + "args": { + "External id": 153658,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909325.996, "dur": 11.750, + "args": { + "External id": 
153659,"Record function id": 0, "Ev Idx": 2106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771909328.956, "dur": 7.400, + "args": { + "External id": 153660,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771909331.216, "dur": 4.450, + "args": { + "External id": 153661,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771909332.266, "dur": 3.170, + "args": { + "External id": 153662,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771909341.956, "dur": 726.978, + "args": { + "External id": 153663,"Record function id": 0, "Sequence number": 3058737, "Fwd thread id": 1, "Ev Idx": 2110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771909343.986, "dur": 719.338, + "args": { + "External id": 153664,"Sequence number": 3058737, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2111 + } + }, + { + "ph": "f", "id": 245, "pid": 5714, "tid": 6744, "ts": 6303771909343.986, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.8)", "pid": 5714, "tid": 
6744, + "ts": 6303771909361.886, "dur": 24.080, + "args": { + "External id": 153665,"Record function id": 0, "Ev Idx": 2112 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6303771909394.276, "dur": 52.200, + "args": { + "External id": 153666,"Record function id": 0, "Ev Idx": 2113 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.8)", "pid": 5714, "tid": 6744, + "ts": 6303771909453.076, "dur": 604.888, + "args": { + "External id": 153667,"Record function id": 0, "Ev Idx": 2114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771909518.636, "dur": 7.420, + "args": { + "External id": 153668,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771909534.105, "dur": 3.320, + "args": { + "External id": 153669,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771909548.845, "dur": 91.660, + "args": { + "External id": 153670,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2117 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771909558.585, "dur": 78.500, + "args": { + "External id": 153671,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771909575.805, "dur": 5.660, + "args": { + "External id": 153672,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771909585.345, "dur": 30.670, + "args": { + "External id": 153673,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771909586.805, "dur": 28.860, + "args": { + "External id": 153674,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909589.365, "dur": 5.890, + "args": { + "External id": 
153675,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771909596.455, "dur": 18.610, + "args": { + "External id": 153676,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771909702.525, "dur": 8.330, + "args": { + "External id": 153677,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771909704.365, "dur": 5.980, + "args": { + "External id": 153678,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771909727.045, "dur": 81.920, + "args": { + "External id": 153679,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 
5714, "tid": 6744, + "ts": 6303771909742.375, "dur": 63.560, + "args": { + "External id": 153680,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2127, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771909754.495, "dur": 47.490, + "args": { + "External id": 153681,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771909821.555, "dur": 3.070, + "args": { + "External id": 153682,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2129, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909874.985, "dur": 3.720, + 
"args": { + "External id": 153683,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909908.055, "dur": 1.990, + "args": { + "External id": 153684,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909927.235, "dur": 1.169, + "args": { + "External id": 153685,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909941.935, "dur": 0.780, + "args": { + "External id": 153686,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909955.195, "dur": 0.780, + "args": { + "External id": 153687,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
6744, + "ts": 6303771909967.935, "dur": 1.509, + "args": { + "External id": 153688,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909981.624, "dur": 0.731, + "args": { + "External id": 153689,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771909995.695, "dur": 1.080, + "args": { + "External id": 153690,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910009.584, "dur": 0.700, + "args": { + "External id": 153691,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771910080.574, "dur": 1343.627, + "args": { + "External id": 153692,"Record function id": 0, "Ev Idx": 2139 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6303771910093.914, "dur": 812.059, + "args": { + 
"External id": 153693,"Record function id": 0, "Ev Idx": 2140 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6303771910104.814, "dur": 243.750, + "args": { + "External id": 153694,"Record function id": 0, "Ev Idx": 2141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910185.834, "dur": 4.180, + "args": { + "External id": 153695,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910194.124, "dur": 1.000, + "args": { + "External id": 153696,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910196.854, "dur": 0.630, + "args": { + "External id": 153697,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910198.924, "dur": 0.670, + "args": { + "External id": 153698,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910200.824, "dur": 0.570, + "args": { + "External id": 153699,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1], []], "Input Dims": [[589824], []], "Ev Idx": 2146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910202.874, "dur": 0.600, + "args": { + "External id": 153700,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910204.804, "dur": 0.790, + "args": { + "External id": 153701,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910206.894, "dur": 0.710, + "args": { + "External id": 153702,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910208.834, "dur": 1.390, + "args": { + "External id": 153703,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771910211.564, "dur": 0.720, + "args": { + "External id": 153704,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771910225.554, "dur": 95.270, + "args": { + "External id": 153705,"Record 
function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771910236.724, "dur": 80.570, + "args": { + "External id": 153706,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771910247.864, "dur": 5.990, + "args": { + "External id": 153707,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771910256.094, "dur": 31.870, + "args": { + "External id": 153708,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], 
[]], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771910257.524, "dur": 30.070, + "args": { + "External id": 153709,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910259.974, "dur": 6.980, + "args": { + "External id": 153710,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771910268.054, "dur": 19.060, + "args": { + "External id": 153711,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2158 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.6", "pid": 5714, "tid": 6744, + "ts": 6303771910429.014, "dur": 468.968, + "args": { + "External id": 153712,"Record function id": 0, "Ev Idx": 2159 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6303771910444.883, "dur": 439.379, + "args": { + "External id": 153713,"Record function id": 0, "Ev Idx": 2160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771910505.113, "dur": 7.260, + "args": { + "External id": 153714,"Record function id": 0, "Concrete Inputs": 
["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771910522.763, "dur": 21.040, + "args": { + "External id": 153715,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910525.953, "dur": 1.230, + "args": { + "External id": 153716,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910529.573, "dur": 0.280, + "args": { + "External id": 153717,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910531.133, "dur": 0.300, + "args": { + "External id": 153718,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771910532.623, "dur": 1.160, + "args": { + "External id": 153719,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910534.883, "dur": 0.270, + "args": { + "External id": 153720,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910536.583, "dur": 0.220, + "args": { + "External id": 153721,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910537.783, "dur": 0.260, + "args": { + "External id": 153722,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910539.263, "dur": 0.240, + "args": { + "External id": 153723,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2170 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910540.623, "dur": 0.270, + "args": { + "External id": 153724,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771910551.663, "dur": 21.680, + "args": { + "External id": 153725,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771910602.903, "dur": 88.960, + "args": { + "External id": 153726,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771910612.373, "dur": 6.620, + "args": { + "External id": 153727,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], 
[], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771910623.263, "dur": 8.040, + "args": { + "External id": 153728,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771910625.623, "dur": 5.320, + "args": { + "External id": 153729,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910628.443, "dur": 0.770, + "args": { + "External id": 153730,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771910638.383, "dur": 19.070, + "args": { + "External id": 153731,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910640.413, "dur": 1.150, + "args": { + "External id": 153732,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910642.883, "dur": 0.280, + "args": { + "External id": 153733,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910644.393, "dur": 0.240, + "args": { + "External id": 153734,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910646.113, "dur": 0.420, + "args": { + "External id": 153735,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910647.933, "dur": 0.260, + "args": { + "External id": 153736,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910649.363, "dur": 0.230, + "args": { + "External id": 153737,"Record function id": 0, 
"Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910650.773, "dur": 0.260, + "args": { + "External id": 153738,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910652.063, "dur": 0.340, + "args": { + "External id": 153739,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771910654.343, "dur": 1.020, + "args": { + "External id": 153740,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771910667.593, "dur": 16.500, + "args": { + "External id": 153741,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], 
[147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771910742.863, "dur": 77.870, + "args": { + "External id": 153742,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771910757.243, "dur": 60.700, + "args": { + "External id": 153743,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2190, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771910768.493, "dur": 45.310, + "args": { + "External id": 153744,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771910832.202, "dur": 3.111, + "args": { + "External id": 153745,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": 
"Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2192, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771910911.362, "dur": 501.699, + "args": { + "External id": 153746,"Sequence number": 3058736, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2193 + } + }, + { + "ph": "f", "id": 246, "pid": 5714, "tid": 6744, "ts": 6303771910911.362, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771910975.222, "dur": 32.190, + "args": { + "External id": 153747,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771911035.612, "dur": 23.210, + "args": { + "External id": 153748,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771911075.562, "dur": 33.920, + "args": { + "External id": 153749,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771911122.132, "dur": 25.680, + "args": { + "External id": 153750,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771911157.672, "dur": 20.120, + "args": { + "External id": 153751,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771911187.812, "dur": 23.760, + "args": { + "External id": 153752,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], 
"Ev Idx": 2199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771911224.812, "dur": 21.870, + "args": { + "External id": 153753,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771911273.452, "dur": 36.309, + "args": { + "External id": 153754,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771911332.321, "dur": 18.740, + "args": { + "External id": 153755,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2202 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771911370.131, "dur": 19.930, + "args": { + "External id": 153756,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911439.111, "dur": 12.720, + "args": { + "External id": 153757,"Record function id": 0, "Ev Idx": 2204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911442.511, "dur": 7.830, + "args": { + "External id": 153758,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771911445.281, "dur": 4.150, + "args": { + "External id": 153759,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771911446.391, "dur": 2.780, + "args": { + "External id": 153760,"Record function id": 0, "Concrete Inputs": [""], 
"Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911457.051, "dur": 7.070, + "args": { + "External id": 153761,"Record function id": 0, "Ev Idx": 2208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911458.901, "dur": 4.130, + "args": { + "External id": 153762,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771911459.791, "dur": 2.700, + "args": { + "External id": 153763,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771911460.411, "dur": 1.870, + "args": { + "External id": 153764,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911468.431, "dur": 5.440, + "args": { + "External id": 153765,"Record function id": 0, "Ev Idx": 2212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911470.121, "dur": 2.650, + "args": { + "External id": 153766,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2213 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771911470.841, "dur": 1.470, + "args": { + "External id": 153767,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771911471.311, "dur": 0.800, + "args": { + "External id": 153768,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911477.821, "dur": 5.440, + "args": { + "External id": 153769,"Record function id": 0, "Ev Idx": 2216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911479.541, "dur": 2.690, + "args": { + "External id": 153770,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771911480.201, "dur": 1.570, + "args": { + "External id": 153771,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771911480.601, "dur": 0.980, + "args": { + "External id": 153772,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 
5714, "tid": 6744, + "ts": 6303771911487.231, "dur": 342.349, + "args": { + "External id": 153773,"Record function id": 0, "Sequence number": 3058735, "Fwd thread id": 1, "Ev Idx": 2220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771911488.801, "dur": 331.949, + "args": { + "External id": 153774,"Sequence number": 3058735, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2221 + } + }, + { + "ph": "f", "id": 247, "pid": 5714, "tid": 6744, "ts": 6303771911488.801, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771911553.981, "dur": 42.940, + "args": { + "External id": 153775,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771911612.641, "dur": 20.630, + "args": { + "External id": 153776,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771911660.321, "dur": 134.710, + "args": { + "External id": 153777,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771911715.481, "dur": 7.010, + "args": { + "External id": 153778,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771911724.300, "dur": 4.540, + "args": { + "External id": 153779,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911844.230, "dur": 12.080, + "args": { + "External id": 153780,"Record function id": 0, "Ev Idx": 2227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771911847.470, "dur": 7.390, + "args": { + "External id": 153781,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771911849.940, "dur": 3.920, + "args": { + "External id": 153782,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771911851.040, "dur": 2.550, + "args": { + "External id": 153783,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771911860.990, "dur": 190.390, + "args": { + "External id": 153784,"Record function id": 0, "Sequence number": 3058734, "Fwd thread id": 1, "Ev Idx": 2231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771911862.640, "dur": 181.380, + "args": { + "External id": 153785,"Sequence number": 3058734, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2232 + } + }, + { + "ph": "f", "id": 248, "pid": 5714, "tid": 6744, "ts": 6303771911862.640, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771911878.310, "dur": 38.710, + "args": { + "External id": 153786,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771911881.790, "dur": 7.440, + "args": { + "External id": 153787,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771911890.570, "dur": 25.760, + "args": { + "External id": 153788,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771911926.350, "dur": 7.570, + "args": { + "External id": 153789,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771911928.310, "dur": 5.180, + "args": { + "External id": 153790,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912064.680, "dur": 181.050, + "args": { + "External id": 153791,"Record function 
id": 0, "Sequence number": 3058733, "Fwd thread id": 1, "Ev Idx": 2238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912067.700, "dur": 169.750, + "args": { + "External id": 153792,"Sequence number": 3058733, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2239 + } + }, + { + "ph": "f", "id": 249, "pid": 5714, "tid": 6744, "ts": 6303771912067.700, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771912083.440, "dur": 35.940, + "args": { + "External id": 153793,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771912086.820, "dur": 7.420, + "args": { + "External id": 153794,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771912095.490, "dur": 23.150, + "args": { + "External id": 153795,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771912128.970, 
"dur": 9.000, + "args": { + "External id": 153796,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771912130.890, "dur": 6.590, + "args": { + "External id": 153797,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912258.910, "dur": 377.119, + "args": { + "External id": 153798,"Record function id": 0, "Sequence number": 3058732, "Fwd thread id": 1, "Ev Idx": 2245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912262.059, "dur": 359.839, + "args": { + "External id": 153799,"Sequence number": 3058732, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2246 + } + }, + { + "ph": "f", "id": 250, "pid": 5714, "tid": 6744, "ts": 6303771912262.059, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771912345.089, "dur": 46.750, + "args": { + "External id": 153800,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771912406.129, "dur": 27.330, + "args": { + "External id": 153801,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771912444.659, "dur": 25.360, + "args": { + "External id": 153802,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771912482.809, "dur": 20.810, + "args": { + "External id": 153803,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771912512.849, "dur": 17.000, + "args": { + "External id": 153804,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771912538.219, "dur": 16.100, + "args": { + "External id": 153805,"Record 
function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771912575.619, "dur": 22.940, + "args": { + "External id": 153806,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912653.658, "dur": 15.240, + "args": { + "External id": 153807,"Record function id": 0, "Ev Idx": 2254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912657.489, "dur": 9.609, + "args": { + "External id": 153808,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771912660.909, "dur": 5.229, + "args": { + "External id": 153809,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771912662.358, 
"dur": 3.480, + "args": { + "External id": 153810,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912675.049, "dur": 8.000, + "args": { + "External id": 153811,"Record function id": 0, "Ev Idx": 2258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912677.289, "dur": 4.440, + "args": { + "External id": 153812,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771912678.249, "dur": 2.789, + "args": { + "External id": 153813,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771912678.878, "dur": 1.911, + "args": { + "External id": 153814,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912688.318, "dur": 6.580, + "args": { + "External id": 153815,"Record function id": 0, "Ev Idx": 2262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912690.409, "dur": 3.160, + "args": { + "External id": 153816,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], 
"Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771912691.298, "dur": 1.651, + "args": { + "External id": 153817,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771912691.798, "dur": 0.920, + "args": { + "External id": 153818,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912700.318, "dur": 231.110, + "args": { + "External id": 153819,"Record function id": 0, "Sequence number": 3058731, "Fwd thread id": 1, "Ev Idx": 2266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912702.298, "dur": 197.040, + "args": { + "External id": 153820,"Sequence number": 3058731, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2267 + } + }, + { + "ph": "f", "id": 251, "pid": 5714, "tid": 6744, "ts": 6303771912702.298, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771912786.018, "dur": 26.890, + "args": { + "External id": 153821,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": 
"/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771912830.388, "dur": 14.480, + "args": { + "External id": 153822,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771912862.948, "dur": 16.600, + "args": { + "External id": 153823,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2270 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771912907.838, "dur": 17.990, + "args": { + "External id": 153824,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912943.918, "dur": 11.090, + "args": { + "External id": 153825,"Record function id": 0, "Ev Idx": 2272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771912946.918, "dur": 6.680, + "args": { + "External id": 153826,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771912949.178, "dur": 3.680, + "args": { + "External id": 153827,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771912950.218, "dur": 2.400, + "args": { + "External id": 153828,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912959.628, "dur": 810.188, + "args": { + "External id": 153829,"Record function id": 0, "Sequence number": 3058730, "Fwd thread id": 1, "Ev Idx": 2276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771912961.258, "dur": 801.758, + "args": { + "External id": 153830,"Sequence number": 3058730, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2277 + } + }, + { + "ph": "f", "id": 252, "pid": 5714, "tid": 6744, "ts": 6303771912961.258, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6303771912979.608, "dur": 25.860, + "args": { + "External id": 153831,"Record function id": 0, "Ev Idx": 2278 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6303771913014.078, "dur": 55.320, + "args": { + "External id": 153832,"Record function id": 0, "Ev Idx": 2279 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.7)", "pid": 5714, "tid": 6744, + "ts": 6303771913076.908, "dur": 679.538, + "args": { + "External id": 153833,"Record function id": 0, "Ev Idx": 2280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771913148.448, "dur": 8.069, + "args": { + "External id": 153834,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913165.188, "dur": 4.420, + "args": { + "External id": 153835,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev 
Idx": 2282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771913181.977, "dur": 93.920, + "args": { + "External id": 153836,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771913192.297, "dur": 79.940, + "args": { + "External id": 153837,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771913207.797, "dur": 6.000, + "args": { + "External id": 153838,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771913217.797, "dur": 32.860, + "args": { + "External id": 153839,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], 
[]], "Ev Idx": 2286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771913219.757, "dur": 30.580, + "args": { + "External id": 153840,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913221.967, "dur": 6.460, + "args": { + "External id": 153841,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771913229.747, "dur": 20.020, + "args": { + "External id": 153842,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771913353.727, "dur": 9.280, + "args": { + "External id": 153843,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771913355.577, "dur": 6.810, + "args": { + "External id": 153844,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771913380.877, "dur": 85.280, + "args": { + "External id": 153845,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771913396.067, "dur": 66.800, + "args": { + "External id": 153846,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2293, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771913407.817, "dur": 50.690, + "args": { + "External id": 153847,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771913478.517, "dur": 3.250, + "args": { + "External id": 153848,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", 
"dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2295, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913535.227, "dur": 4.020, + "args": { + "External id": 153849,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913571.367, "dur": 1.409, + "args": { + "External id": 153850,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913594.407, "dur": 1.940, + "args": { + "External id": 153851,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913613.776, "dur": 0.891, + "args": { + "External id": 153852,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913630.936, "dur": 0.910, + "args": { + "External id": 153853,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913646.706, "dur": 0.940, + "args": { + "External id": 153854,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913662.586, "dur": 1.730, + "args": { + "External id": 153855,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913679.076, "dur": 1.180, + "args": { + "External id": 153856,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913697.276, "dur": 0.910, + "args": { + "External id": 153857,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", 
"[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771913784.046, "dur": 1284.207, + "args": { + "External id": 153858,"Record function id": 0, "Ev Idx": 2305 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6303771913799.736, "dur": 818.358, + "args": { + "External id": 153859,"Record function id": 0, "Ev Idx": 2306 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6303771913811.096, "dur": 240.870, + "args": { + "External id": 153860,"Record function id": 0, "Ev Idx": 2307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913899.426, "dur": 3.720, + "args": { + "External id": 153861,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913907.286, "dur": 0.850, + "args": { + "External id": 153862,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913909.946, "dur": 1.440, + "args": { + "External id": 153863,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2310 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913912.896, "dur": 0.560, + "args": { + "External id": 153864,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913914.746, "dur": 0.560, + "args": { + "External id": 153865,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913916.746, "dur": 0.630, + "args": { + "External id": 153866,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913918.776, "dur": 0.860, + "args": { + "External id": 153867,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913920.956, "dur": 0.930, + "args": { + "External id": 153868,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913923.126, "dur": 0.560, + "args": { + "External id": 153869,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771913924.886, "dur": 0.580, + "args": { + "External id": 153870,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771913938.656, "dur": 86.159, + "args": { + "External id": 153871,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771913949.926, "dur": 71.569, + "args": { + "External id": 153872,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771913960.376, "dur": 6.610, + "args": { + 
"External id": 153873,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771913969.455, "dur": 31.171, + "args": { + "External id": 153874,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771913970.815, "dur": 29.500, + "args": { + "External id": 153875,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771913972.955, "dur": 6.851, + "args": { + "External id": 153876,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771913980.946, "dur": 18.840, + "args": { + "External id": 153877,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2324 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::backward_prefetch for model.layers.5", "pid": 5714, "tid": 6744, + "ts": 6303771914132.035, "dur": 477.609, + "args": { + "External id": 153878,"Record function id": 0, "Ev Idx": 2325 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6303771914147.475, "dur": 448.379, + "args": { + "External id": 153879,"Record function id": 0, "Ev Idx": 2326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771914207.545, "dur": 7.430, + "args": { + "External id": 153880,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771914225.945, "dur": 17.050, + "args": { + "External id": 153881,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914228.755, "dur": 1.150, + "args": { + "External id": 153882,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914231.515, "dur": 0.200, + "args": { + "External id": 153883,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914232.605, "dur": 0.280, + "args": { + "External id": 153884,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914233.705, "dur": 0.250, + "args": { + "External id": 153885,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914234.825, "dur": 0.190, + "args": { + "External id": 153886,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914235.815, "dur": 1.150, + "args": { + "External id": 153887,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914237.805, "dur": 0.190, + "args": { + "External id": 153888,"Record function id": 0, "Concrete Inputs": ["", 
"[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914238.825, "dur": 0.180, + "args": { + "External id": 153889,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914239.825, "dur": 0.260, + "args": { + "External id": 153890,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771914250.585, "dur": 21.420, + "args": { + "External id": 153891,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771914310.245, "dur": 88.080, + "args": { + "External id": 153892,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": 
["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771914320.265, "dur": 7.260, + "args": { + "External id": 153893,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771914331.965, "dur": 7.960, + "args": { + "External id": 153894,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771914334.355, "dur": 5.170, + "args": { + "External id": 153895,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914336.945, "dur": 0.720, + "args": { + "External id": 153896,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771914347.285, "dur": 13.970, + "args": { + "External id": 153897,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914349.055, "dur": 0.340, + "args": { + "External id": 153898,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914350.425, "dur": 0.470, + "args": { + "External id": 153899,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914351.735, "dur": 0.900, + "args": { + "External id": 153900,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914353.555, "dur": 0.220, + "args": { + "External id": 153901,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 2348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914354.635, "dur": 0.170, + "args": { + "External id": 153902,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914355.655, "dur": 0.270, + "args": { + "External id": 153903,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914356.715, "dur": 0.250, + "args": { + "External id": 153904,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914357.785, "dur": 0.240, + "args": { + "External id": 153905,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771914358.845, "dur": 0.240, + "args": { + "External id": 153906,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771914372.075, "dur": 18.070, + "args": { + "External id": 153907,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771914450.785, "dur": 80.349, + "args": { + "External id": 153908,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771914464.745, "dur": 63.399, + "args": { + "External id": 153909,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev 
Idx": 2356, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771914476.145, "dur": 47.809, + "args": { + "External id": 153910,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771914542.524, "dur": 3.240, + "args": { + "External id": 153911,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2358, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771914623.364, "dur": 434.889, + "args": { + "External id": 153912,"Sequence number": 3058729, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2359 + } + }, + { + "ph": "f", "id": 253, "pid": 5714, "tid": 6744, "ts": 6303771914623.364, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771914688.764, "dur": 32.760, + "args": { + "External id": 153913,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], 
"Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771914748.514, "dur": 21.850, + "args": { + "External id": 153914,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771914785.334, "dur": 32.730, + "args": { + "External id": 153915,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771914830.984, "dur": 24.600, + "args": { + "External id": 153916,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771914864.904, "dur": 18.500, + "args": { + "External id": 
153917,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771914892.453, "dur": 21.971, + "args": { + "External id": 153918,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771914923.844, "dur": 17.420, + "args": { + "External id": 153919,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771914962.093, "dur": 19.390, + "args": { + "External id": 153920,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771914997.003, "dur": 12.640, + "args": { + "External id": 153921,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771915022.783, "dur": 14.990, + "args": { + "External id": 153922,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915081.783, "dur": 11.030, + "args": { + "External id": 153923,"Record function id": 0, "Ev Idx": 2370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915084.703, "dur": 6.820, + "args": { + "External id": 
153924,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771915087.143, "dur": 3.650, + "args": { + "External id": 153925,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771915088.073, "dur": 2.500, + "args": { + "External id": 153926,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915097.353, "dur": 5.040, + "args": { + "External id": 153927,"Record function id": 0, "Ev Idx": 2374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915099.093, "dur": 2.390, + "args": { + "External id": 153928,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771915099.783, "dur": 1.230, + "args": { + "External id": 153929,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771915100.153, "dur": 0.690, + "args": { + "External id": 153930,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[2048, 768]], "Ev Idx": 2377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915106.153, "dur": 4.660, + "args": { + "External id": 153931,"Record function id": 0, "Ev Idx": 2378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915107.673, "dur": 2.190, + "args": { + "External id": 153932,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771915108.323, "dur": 1.070, + "args": { + "External id": 153933,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771915108.673, "dur": 0.560, + "args": { + "External id": 153934,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915114.373, "dur": 5.950, + "args": { + "External id": 153935,"Record function id": 0, "Ev Idx": 2382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915116.213, "dur": 3.180, + "args": { + "External id": 153936,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 
6744, + "ts": 6303771915116.823, "dur": 2.170, + "args": { + "External id": 153937,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771915117.373, "dur": 1.450, + "args": { + "External id": 153938,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915124.073, "dur": 312.829, + "args": { + "External id": 153939,"Record function id": 0, "Sequence number": 3058728, "Fwd thread id": 1, "Ev Idx": 2386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915125.553, "dur": 303.779, + "args": { + "External id": 153940,"Sequence number": 3058728, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2387 + } + }, + { + "ph": "f", "id": 254, "pid": 5714, "tid": 6744, "ts": 6303771915125.553, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771915184.123, "dur": 37.630, + "args": { + "External id": 153941,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771915235.463, "dur": 18.250, + "args": { + "External id": 153942,"Record function id": 
0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771915278.273, "dur": 128.759, + "args": { + "External id": 153943,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771915337.143, "dur": 6.420, + "args": { + "External id": 153944,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771915345.172, "dur": 3.211, + "args": { + "External id": 153945,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", 
"Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915449.942, "dur": 11.010, + "args": { + "External id": 153946,"Record function id": 0, "Ev Idx": 2393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771915452.922, "dur": 6.750, + "args": { + "External id": 153947,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771915455.272, "dur": 3.580, + "args": { + "External id": 153948,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771915456.282, "dur": 2.350, + "args": { + "External id": 153949,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915465.112, "dur": 167.440, + "args": { + "External id": 153950,"Record function id": 0, "Sequence number": 3058727, "Fwd thread id": 1, "Ev Idx": 2397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915466.522, "dur": 159.260, + "args": { + "External id": 153951,"Sequence number": 3058727, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2398 + } + }, + { + "ph": "f", "id": 255, "pid": 5714, "tid": 6744, "ts": 6303771915466.522, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771915480.702, "dur": 34.940, + "args": { + "External id": 153952,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771915483.872, "dur": 6.510, + "args": { + "External id": 153953,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771915491.582, "dur": 23.450, + "args": { + "External id": 153954,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771915524.082, "dur": 6.930, + "args": { + "External id": 153955,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771915525.792, "dur": 4.830, + "args": { + "External id": 153956,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915643.712, "dur": 151.850, + "args": { + "External id": 153957,"Record function id": 0, "Sequence number": 3058726, "Fwd thread id": 1, "Ev Idx": 2404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915646.322, "dur": 142.249, + "args": { + "External id": 153958,"Sequence number": 3058726, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2405 + } + }, + { + "ph": "f", "id": 256, "pid": 5714, "tid": 6744, "ts": 6303771915646.322, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771915659.392, "dur": 30.360, + "args": { + "External id": 153959,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771915662.512, "dur": 6.110, + "args": { + "External id": 153960,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771915669.712, "dur": 19.450, + "args": { + "External id": 153961,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771915697.352, "dur": 6.690, + "args": { + "External id": 153962,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771915699.132, "dur": 4.460, + "args": { + "External id": 153963,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915806.562, "dur": 306.199, + "args": { + "External id": 153964,"Record function id": 0, "Sequence number": 3058725, "Fwd thread id": 1, "Ev Idx": 2411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771915809.362, "dur": 293.139, + "args": { + "External id": 153965,"Sequence number": 3058725, "Fwd thread id": 1, "Record function 
id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2412 + } + }, + { + "ph": "f", "id": 257, "pid": 5714, "tid": 6744, "ts": 6303771915809.362, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771915872.051, "dur": 39.160, + "args": { + "External id": 153966,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771915923.291, "dur": 23.140, + "args": { + "External id": 153967,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771915955.721, "dur": 21.900, + "args": { + "External id": 153968,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771915988.151, "dur": 17.770, + "args": { + "External id": 153969,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input 
Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771916014.461, "dur": 13.930, + "args": { + "External id": 153970,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771916035.571, "dur": 13.320, + "args": { + "External id": 153971,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771916067.771, "dur": 18.510, + "args": { + "External id": 153972,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916125.121, "dur": 10.780, + "args": { + "External id": 153973,"Record function id": 0, "Ev Idx": 2420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 
6303771916127.901, "dur": 6.760, + "args": { + "External id": 153974,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771916130.281, "dur": 3.730, + "args": { + "External id": 153975,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771916131.341, "dur": 2.430, + "args": { + "External id": 153976,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916140.231, "dur": 5.040, + "args": { + "External id": 153977,"Record function id": 0, "Ev Idx": 2424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916141.721, "dur": 2.610, + "args": { + "External id": 153978,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771916142.381, "dur": 1.430, + "args": { + "External id": 153979,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771916142.871, "dur": 0.750, + "args": { + "External id": 153980,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916148.951, "dur": 4.590, + "args": { + "External id": 153981,"Record function id": 0, "Ev Idx": 2428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916150.461, "dur": 2.190, + "args": { + "External id": 153982,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771916151.051, "dur": 1.170, + "args": { + "External id": 153983,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771916151.401, "dur": 0.660, + "args": { + "External id": 153984,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771916157.031, "dur": 199.449, + "args": { + "External id": 153985,"Record function id": 0, "Sequence number": 3058724, "Fwd thread id": 1, "Ev Idx": 2432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771916158.501, "dur": 168.959, + "args": { + "External id": 153986,"Sequence number": 3058724, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2433 + } + }, + { + "ph": "f", "id": 258, "pid": 5714, "tid": 6744, "ts": 6303771916158.501, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771916219.981, "dur": 21.160, + "args": { + "External id": 153987,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771916257.830, "dur": 12.280, + "args": { + "External id": 153988,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771916286.170, "dur": 22.860, + "args": { + "External id": 153989,"kernel_hash": 
"cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771916335.370, "dur": 16.560, + "args": { + "External id": 153990,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916367.960, "dur": 10.360, + "args": { + "External id": 153991,"Record function id": 0, "Ev Idx": 2438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771916370.850, "dur": 6.230, + "args": { + "External id": 153992,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771916372.930, "dur": 3.410, + "args": { + "External id": 153993,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2440 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771916373.850, "dur": 2.290, + "args": { + "External id": 153994,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771916382.460, "dur": 693.949, + "args": { + "External id": 153995,"Record function id": 0, "Sequence number": 3058723, "Fwd thread id": 1, "Ev Idx": 2442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771916383.990, "dur": 686.789, + "args": { + "External id": 153996,"Sequence number": 3058723, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2443 + } + }, + { + "ph": "f", "id": 259, "pid": 5714, "tid": 6744, "ts": 6303771916383.990, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6303771916401.110, "dur": 22.890, + "args": { + "External id": 153997,"Record function id": 0, "Ev Idx": 2444 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6303771916431.820, "dur": 51.110, + "args": { + "External id": 153998,"Record function id": 0, "Ev Idx": 2445 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.6)", "pid": 5714, "tid": 6744, + "ts": 6303771916489.420, "dur": 576.319, + "args": { + "External id": 153999,"Record function id": 0, "Ev Idx": 2446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 
6303771916552.670, "dur": 7.200, + "args": { + "External id": 154000,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771916567.850, "dur": 3.150, + "args": { + "External id": 154001,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771916582.120, "dur": 84.040, + "args": { + "External id": 154002,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771916591.590, "dur": 71.320, + "args": { + "External id": 154003,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771916605.340, "dur": 5.460, + "args": { + "External id": 154004,"Record function 
id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771916614.640, "dur": 28.390, + "args": { + "External id": 154005,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771916615.910, "dur": 26.860, + "args": { + "External id": 154006,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916618.060, "dur": 5.400, + "args": { + "External id": 154007,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771916624.550, "dur": 17.750, + "args": { + "External id": 154008,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 
6303771916726.360, "dur": 8.469, + "args": { + "External id": 154009,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771916728.249, "dur": 6.131, + "args": { + "External id": 154010,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771916750.509, "dur": 76.040, + "args": { + "External id": 154011,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771916763.449, "dur": 60.140, + "args": { + "External id": 154012,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, 
"Ev Idx": 2459, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771916774.279, "dur": 45.040, + "args": { + "External id": 154013,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771916837.569, "dur": 3.140, + "args": { + "External id": 154014,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2461, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916889.369, "dur": 3.640, + "args": { + "External id": 154015,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916920.549, "dur": 1.010, + "args": { + "External id": 154016,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 6744, + "ts": 6303771916938.189, "dur": 0.800, + "args": { + "External id": 154017,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916952.619, "dur": 0.760, + "args": { + "External id": 154018,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916966.609, "dur": 0.750, + "args": { + "External id": 154019,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916979.459, "dur": 0.720, + "args": { + "External id": 154020,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771916992.329, "dur": 0.700, + "args": { + "External id": 154021,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2468 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917005.039, "dur": 0.860, + "args": { + "External id": 154022,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917018.699, "dur": 0.680, + "args": { + "External id": 154023,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771917088.179, "dur": 1241.337, + "args": { + "External id": 154024,"Record function id": 0, "Ev Idx": 2471 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6303771917101.339, "dur": 769.148, + "args": { + "External id": 154025,"Record function id": 0, "Ev Idx": 2472 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6303771917112.299, "dur": 231.339, + "args": { + "External id": 154026,"Record function id": 0, "Ev Idx": 2473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917188.899, "dur": 3.520, + "args": { + "External id": 154027,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, 
"tid": 6744, + "ts": 6303771917196.219, "dur": 0.860, + "args": { + "External id": 154028,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917198.628, "dur": 0.640, + "args": { + "External id": 154029,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917200.739, "dur": 0.589, + "args": { + "External id": 154030,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917202.819, "dur": 0.589, + "args": { + "External id": 154031,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917204.708, "dur": 1.431, + "args": { + "External id": 154032,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917207.459, "dur": 0.709, + "args": { + "External id": 154033,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev 
Idx": 2480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917209.428, "dur": 0.451, + "args": { + "External id": 154034,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917211.248, "dur": 0.491, + "args": { + "External id": 154035,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771917213.019, "dur": 0.660, + "args": { + "External id": 154036,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771917226.168, "dur": 91.390, + "args": { + "External id": 154037,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771917236.898, "dur": 77.020, + "args": { + "External id": 154038,"Record function id": 0, "Concrete Inputs": 
["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771917246.388, "dur": 5.880, + "args": { + "External id": 154039,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771917254.278, "dur": 30.250, + "args": { + "External id": 154040,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771917255.778, "dur": 28.450, + "args": { + "External id": 154041,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917257.928, "dur": 6.730, + "args": { + "External id": 154042,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", 
"False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771917265.798, "dur": 17.940, + "args": { + "External id": 154043,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2490 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.4", "pid": 5714, "tid": 6744, + "ts": 6303771917421.068, "dur": 441.999, + "args": { + "External id": 154044,"Record function id": 0, "Ev Idx": 2491 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6303771917435.638, "dur": 414.539, + "args": { + "External id": 154045,"Record function id": 0, "Ev Idx": 2492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771917492.968, "dur": 6.960, + "args": { + "External id": 154046,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771917509.818, "dur": 18.020, + "args": { + "External id": 154047,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, 
+ "ts": 6303771917512.578, "dur": 1.730, + "args": { + "External id": 154048,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917516.028, "dur": 0.290, + "args": { + "External id": 154049,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917517.268, "dur": 0.290, + "args": { + "External id": 154050,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917518.548, "dur": 0.250, + "args": { + "External id": 154051,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917519.628, "dur": 0.230, + "args": { + "External id": 154052,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2499 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917520.818, "dur": 0.250, + "args": { + "External id": 154053,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917521.878, "dur": 0.170, + "args": { + "External id": 154054,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917522.968, "dur": 0.260, + "args": { + "External id": 154055,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917524.048, "dur": 1.120, + "args": { + "External id": 154056,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771917535.178, "dur": 20.280, + "args": { + "External id": 154057,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], 
[1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771917583.178, "dur": 82.860, + "args": { + "External id": 154058,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771917592.238, "dur": 6.709, + "args": { + "External id": 154059,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771917602.618, "dur": 7.649, + "args": { + "External id": 154060,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771917604.918, "dur": 4.989, + "args": { + "External id": 154061,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", 
"Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917607.538, "dur": 0.700, + "args": { + "External id": 154062,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771917617.218, "dur": 15.669, + "args": { + "External id": 154063,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917618.998, "dur": 0.429, + "args": { + "External id": 154064,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917620.507, "dur": 0.271, + "args": { + "External id": 154065,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917621.707, "dur": 0.240, + "args": { + "External id": 154066,"Record function id": 0, 
"Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917623.927, "dur": 0.231, + "args": { + "External id": 154067,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917625.047, "dur": 0.251, + "args": { + "External id": 154068,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917626.207, "dur": 1.320, + "args": { + "External id": 154069,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917628.398, "dur": 0.229, + "args": { + "External id": 154070,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917629.547, "dur": 
0.251, + "args": { + "External id": 154071,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771917630.627, "dur": 0.251, + "args": { + "External id": 154072,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771917642.438, "dur": 16.189, + "args": { + "External id": 154073,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771917714.087, "dur": 75.080, + "args": { + "External id": 154074,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771917727.767, "dur": 58.610, + "args": { + "External id": 154075,"Record function id": 0, "Collective name": "_allgather_base", 
"Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2522, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771917738.447, "dur": 43.960, + "args": { + "External id": 154076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771917800.197, "dur": 3.030, + "args": { + "External id": 154077,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2524, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771917875.497, "dur": 443.449, + "args": { + "External id": 154078,"Sequence number": 3058722, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2525 + } + }, + { + "ph": "f", "id": 260, "pid": 5714, "tid": 6744, "ts": 6303771917875.497, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771917936.707, "dur": 31.140, + "args": { + "External id": 154079,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771917996.837, "dur": 21.749, + "args": { + "External id": 154080,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771918033.937, "dur": 33.789, + "args": { + "External id": 154081,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2528 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771918080.277, "dur": 24.369, + "args": { + "External id": 154082,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771918113.497, "dur": 18.649, + "args": { + "External id": 154083,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771918142.256, "dur": 21.730, + "args": { + "External id": 154084,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771918173.416, "dur": 17.880, + "args": { + "External id": 154085,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771918212.606, "dur": 20.010, + "args": { + "External id": 154086,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, 
"Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771918248.066, "dur": 13.010, + "args": { + "External id": 154087,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771918274.666, "dur": 15.670, + "args": { + "External id": 154088,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 
2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918343.196, "dur": 11.380, + "args": { + "External id": 154089,"Record function id": 0, "Ev Idx": 2536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918346.336, "dur": 6.960, + "args": { + "External id": 154090,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771918348.806, "dur": 3.730, + "args": { + "External id": 154091,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771918349.836, "dur": 2.450, + "args": { + "External id": 154092,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918359.216, "dur": 5.460, + "args": { + "External id": 154093,"Record function id": 0, "Ev Idx": 2540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918361.046, "dur": 2.610, + "args": { + "External id": 154094,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771918361.816, "dur": 1.380, + "args": { + "External id": 154095,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771918362.276, "dur": 0.740, + "args": { + "External id": 154096,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918368.546, "dur": 4.840, + "args": { + "External id": 154097,"Record function id": 0, "Ev Idx": 2544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918370.166, "dur": 2.320, + "args": { + "External id": 154098,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771918370.876, "dur": 1.190, + "args": { + "External id": 154099,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771918371.226, "dur": 0.670, + "args": { + "External id": 154100,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 
6744, + "ts": 6303771918377.086, "dur": 4.630, + "args": { + "External id": 154101,"Record function id": 0, "Ev Idx": 2548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918378.586, "dur": 2.200, + "args": { + "External id": 154102,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771918379.226, "dur": 1.140, + "args": { + "External id": 154103,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771918379.566, "dur": 0.640, + "args": { + "External id": 154104,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771918385.526, "dur": 301.219, + "args": { + "External id": 154105,"Record function id": 0, "Sequence number": 3058721, "Fwd thread id": 1, "Ev Idx": 2552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771918386.966, "dur": 292.149, + "args": { + "External id": 154106,"Sequence number": 3058721, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2553 + } + }, + { + "ph": "f", "id": 261, "pid": 5714, "tid": 6744, "ts": 6303771918386.966, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771918445.006, "dur": 38.070, + "args": { + "External id": 154107,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771918496.356, "dur": 18.200, + "args": { + "External id": 154108,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771918538.265, "dur": 118.710, + "args": { + "External id": 154109,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771918587.925, "dur": 6.130, + "args": { + 
"External id": 154110,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771918595.755, "dur": 3.380, + "args": { + "External id": 154111,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918699.675, "dur": 17.640, + "args": { + "External id": 154112,"Record function id": 0, "Ev Idx": 2559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771918709.115, "dur": 6.740, + "args": { + "External id": 154113,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771918711.695, "dur": 3.280, + "args": { + "External id": 154114,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771918712.535, "dur": 2.210, + "args": { + "External id": 154115,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771918721.685, "dur": 166.810, + "args": { + "External id": 154116,"Record function id": 0, "Sequence number": 3058720, "Fwd thread id": 1, "Ev Idx": 2563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771918723.215, "dur": 159.290, + "args": { + "External id": 154117,"Sequence number": 3058720, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2564 + } + }, + { + "ph": "f", "id": 262, "pid": 5714, "tid": 6744, "ts": 6303771918723.215, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771918737.445, "dur": 34.120, + "args": { + "External id": 154118,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771918740.535, "dur": 6.260, + "args": { + "External id": 154119,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771918748.005, "dur": 22.880, + "args": { + "External id": 154120,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input 
Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771918779.765, "dur": 8.320, + "args": { + "External id": 154121,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771918781.995, "dur": 5.670, + "args": { + "External id": 154122,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771918899.475, "dur": 150.189, + "args": { + "External id": 154123,"Record function id": 0, "Sequence number": 3058719, "Fwd thread id": 1, "Ev Idx": 2570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771918902.135, "dur": 140.559, + "args": { + "External id": 154124,"Sequence number": 3058719, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2571 + } + }, + { + "ph": "f", "id": 263, "pid": 5714, "tid": 6744, "ts": 6303771918902.135, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771918915.345, "dur": 29.450, + "args": { + "External id": 
154125,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771918918.275, "dur": 6.100, + "args": { + "External id": 154126,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771918925.484, "dur": 18.740, + "args": { + "External id": 154127,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771918952.964, "dur": 6.280, + "args": { + "External id": 154128,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771918954.464, "dur": 4.391, + "args": { + "External id": 154129,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2576 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771919060.454, "dur": 313.320, + "args": { + "External id": 154130,"Record function id": 0, "Sequence number": 3058718, "Fwd thread id": 1, "Ev Idx": 2577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771919063.094, "dur": 300.410, + "args": { + "External id": 154131,"Sequence number": 3058718, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2578 + } + }, + { + "ph": "f", "id": 264, "pid": 5714, "tid": 6744, "ts": 6303771919063.094, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771919124.204, "dur": 38.410, + "args": { + "External id": 154132,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771919174.234, "dur": 23.390, + "args": { + "External id": 154133,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771919207.824, "dur": 21.480, + "args": { + "External id": 154134,"Record function id": 0, "Concrete 
Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771919239.374, "dur": 17.680, + "args": { + "External id": 154135,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771919265.024, "dur": 14.250, + "args": { + "External id": 154136,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771919286.234, "dur": 21.970, + "args": { + "External id": 154137,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771919327.484, "dur": 19.200, + "args": { + "External id": 154138,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919386.434, "dur": 11.800, + "args": { + "External id": 154139,"Record function id": 0, "Ev Idx": 2586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919389.243, "dur": 7.711, + "args": { + "External id": 154140,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771919391.683, "dur": 4.551, + "args": { + "External id": 154141,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771919392.603, "dur": 3.431, + "args": { + "External id": 154142,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919402.694, "dur": 4.900, + "args": { + "External id": 154143,"Record function id": 0, "Ev Idx": 2590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919404.323, "dur": 2.311, + "args": { + "External id": 154144,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[768, 768]], "Ev Idx": 2591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771919405.003, "dur": 1.180, + "args": { + "External id": 154145,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771919405.383, "dur": 0.631, + "args": { + "External id": 154146,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919411.254, "dur": 4.780, + "args": { + "External id": 154147,"Record function id": 0, "Ev Idx": 2594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919412.863, "dur": 2.191, + "args": { + "External id": 154148,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771919413.514, "dur": 1.080, + "args": { + "External id": 154149,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771919413.854, "dur": 0.580, + "args": { + "External id": 154150,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771919419.683, "dur": 189.180, + "args": { + "External id": 154151,"Record function id": 0, "Sequence number": 3058717, "Fwd thread id": 1, "Ev Idx": 2598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771919421.103, "dur": 158.460, + "args": { + "External id": 154152,"Sequence number": 3058717, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2599 + } + }, + { + "ph": "f", "id": 265, "pid": 5714, "tid": 6744, "ts": 6303771919421.103, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771919483.573, "dur": 21.150, + "args": { + "External id": 154153,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771919520.053, "dur": 12.440, + "args": { + "External id": 154154,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": 
"/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771919547.883, "dur": 14.980, + "args": { + "External id": 154155,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771919587.403, "dur": 16.310, + "args": { + "External id": 154156,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919620.033, "dur": 10.710, + "args": { + "External id": 154157,"Record function id": 0, "Ev Idx": 2604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771919622.693, "dur": 6.680, + 
"args": { + "External id": 154158,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771919624.763, "dur": 4.000, + "args": { + "External id": 154159,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771919625.533, "dur": 3.030, + "args": { + "External id": 154160,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771919634.863, "dur": 700.509, + "args": { + "External id": 154161,"Record function id": 0, "Sequence number": 3058716, "Fwd thread id": 1, "Ev Idx": 2608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771919636.373, "dur": 693.228, + "args": { + "External id": 154162,"Sequence number": 3058716, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2609 + } + }, + { + "ph": "f", "id": 266, "pid": 5714, "tid": 6744, "ts": 6303771919636.373, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6303771919653.113, "dur": 22.900, + "args": { + "External id": 154163,"Record function id": 0, "Ev Idx": 2610 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::post_backward_reshard (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6303771919683.453, "dur": 49.750, + "args": { + "External id": 154164,"Record function id": 0, "Ev Idx": 2611 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.5)", "pid": 5714, "tid": 6744, + "ts": 6303771919740.003, "dur": 584.338, + "args": { + "External id": 154165,"Record function id": 0, "Ev Idx": 2612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771919803.323, "dur": 7.180, + "args": { + "External id": 154166,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771919818.462, "dur": 3.180, + "args": { + "External id": 154167,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771919832.862, "dur": 83.691, + "args": { + "External id": 154168,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771919841.962, "dur": 71.391, + "args": { + "External id": 154169,"Record function id": 0, "Concrete Inputs": 
["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771919856.393, "dur": 5.489, + "args": { + "External id": 154170,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771919865.433, "dur": 28.589, + "args": { + "External id": 154171,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771919866.722, "dur": 26.980, + "args": { + "External id": 154172,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771919868.722, "dur": 5.600, + "args": { + "External id": 154173,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], 
[], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771919875.442, "dur": 17.740, + "args": { + "External id": 154174,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771919978.292, "dur": 8.090, + "args": { + "External id": 154175,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771919979.852, "dur": 5.990, + "args": { + "External id": 154176,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771920001.362, "dur": 76.040, + "args": { + "External id": 154177,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771920014.662, "dur": 59.750, + "args": { + "External id": 154178,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": 
"default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2625, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771920025.552, "dur": 44.970, + "args": { + "External id": 154179,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771920088.622, "dur": 3.340, + "args": { + "External id": 154180,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2627, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920140.152, "dur": 3.620, + "args": { + "External id": 154181,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], 
[], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920171.292, "dur": 1.680, + "args": { + "External id": 154182,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920192.152, "dur": 0.970, + "args": { + "External id": 154183,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920207.462, "dur": 0.720, + "args": { + "External id": 154184,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920220.312, "dur": 0.630, + "args": { + "External id": 154185,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920232.312, "dur": 1.550, + "args": { + "External id": 154186,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920245.112, "dur": 0.670, + "args": { + "External id": 154187,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920257.842, "dur": 0.890, + "args": { + "External id": 154188,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920270.012, "dur": 0.780, + "args": { + "External id": 154189,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771920347.012, "dur": 1245.267, + "args": { + "External id": 154190,"Record function id": 0, "Ev Idx": 2637 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6303771920360.272, "dur": 763.758, + "args": { + "External id": 154191,"Record function id": 0, "Ev Idx": 2638 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 5714, 
"tid": 6744, + "ts": 6303771920371.171, "dur": 223.800, + "args": { + "External id": 154192,"Record function id": 0, "Ev Idx": 2639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920449.871, "dur": 3.880, + "args": { + "External id": 154193,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920457.511, "dur": 0.840, + "args": { + "External id": 154194,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920460.111, "dur": 0.650, + "args": { + "External id": 154195,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920462.371, "dur": 0.610, + "args": { + "External id": 154196,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920464.391, "dur": 0.510, + "args": { + "External id": 154197,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920466.261, "dur": 
0.550, + "args": { + "External id": 154198,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920468.371, "dur": 0.860, + "args": { + "External id": 154199,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920470.531, "dur": 0.570, + "args": { + "External id": 154200,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920472.561, "dur": 1.250, + "args": { + "External id": 154201,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771920475.231, "dur": 0.570, + "args": { + "External id": 154202,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771920488.531, "dur": 81.000, + "args": { + "External id": 154203,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", 
"TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771920499.411, "dur": 66.810, + "args": { + "External id": 154204,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771920508.871, "dur": 5.940, + "args": { + "External id": 154205,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771920517.071, "dur": 29.620, + "args": { + "External id": 154206,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 
6303771920518.341, "dur": 28.050, + "args": { + "External id": 154207,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920520.351, "dur": 6.120, + "args": { + "External id": 154208,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771920527.551, "dur": 18.380, + "args": { + "External id": 154209,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2656 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.3", "pid": 5714, "tid": 6744, + "ts": 6303771920672.641, "dur": 443.619, + "args": { + "External id": 154210,"Record function id": 0, "Ev Idx": 2657 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6303771920686.241, "dur": 417.359, + "args": { + "External id": 154211,"Record function id": 0, "Ev Idx": 2658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771920743.980, "dur": 6.980, + "args": { + "External id": 154212,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], 
[], [], []], "Ev Idx": 2659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771920761.571, "dur": 17.529, + "args": { + "External id": 154213,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920764.231, "dur": 1.120, + "args": { + "External id": 154214,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920766.791, "dur": 0.180, + "args": { + "External id": 154215,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920767.871, "dur": 0.240, + "args": { + "External id": 154216,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920769.171, "dur": 0.889, + "args": { + "External id": 154217,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920771.200, "dur": 0.300, + "args": { + "External id": 154218,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920772.551, "dur": 0.189, + "args": { + "External id": 154219,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920773.691, "dur": 0.289, + "args": { + "External id": 154220,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920775.011, "dur": 0.229, + "args": { + "External id": 154221,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920776.171, "dur": 0.309, + "args": { + "External id": 154222,"Record function id": 0, "Concrete Inputs": 
["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771920786.591, "dur": 20.809, + "args": { + "External id": 154223,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771920835.150, "dur": 82.060, + "args": { + "External id": 154224,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 2671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771920844.130, "dur": 6.270, + "args": { + "External id": 154225,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 
6303771920854.310, "dur": 7.380, + "args": { + "External id": 154226,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771920856.540, "dur": 4.780, + "args": { + "External id": 154227,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920858.990, "dur": 0.690, + "args": { + "External id": 154228,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771920868.520, "dur": 14.910, + "args": { + "External id": 154229,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920870.130, "dur": 1.120, + "args": { + "External id": 154230,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2677 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920872.660, "dur": 0.280, + "args": { + "External id": 154231,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920873.950, "dur": 0.180, + "args": { + "External id": 154232,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920875.050, "dur": 0.260, + "args": { + "External id": 154233,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920876.120, "dur": 0.240, + "args": { + "External id": 154234,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920877.190, "dur": 0.160, + "args": { + "External id": 154235,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 2682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920878.260, "dur": 0.220, + "args": { + "External id": 154236,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920879.330, "dur": 0.230, + "args": { + "External id": 154237,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771920880.640, "dur": 1.030, + "args": { + "External id": 154238,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771920893.200, "dur": 16.270, + "args": { + "External id": 154239,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 
6744, + "ts": 6303771920965.560, "dur": 76.380, + "args": { + "External id": 154240,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771920980.220, "dur": 58.910, + "args": { + "External id": 154241,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2688, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771920991.070, "dur": 44.180, + "args": { + "External id": 154242,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771921053.730, "dur": 3.070, + "args": { + "External id": 154243,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", 
"ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2690, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771921128.970, "dur": 453.829, + "args": { + "External id": 154244,"Sequence number": 3058715, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2691 + } + }, + { + "ph": "f", "id": 267, "pid": 5714, "tid": 6744, "ts": 6303771921128.970, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921190.410, "dur": 31.389, + "args": { + "External id": 154245,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771921250.019, "dur": 21.970, + "args": { + "External id": 154246,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], 
[4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771921287.449, "dur": 42.050, + "args": { + "External id": 154247,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921343.319, "dur": 24.910, + "args": { + "External id": 154248,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921378.309, "dur": 19.580, + "args": { + "External id": 154249,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921407.289, "dur": 22.180, + "args": { + "External id": 154250,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921446.169, "dur": 18.210, + "args": { + "External 
id": 154251,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771921486.369, "dur": 19.460, + "args": { + "External id": 154252,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771921521.139, "dur": 12.490, + "args": { + "External id": 154253,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771921547.439, "dur": 15.300, + "args": { + "External id": 
154254,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921605.689, "dur": 10.940, + "args": { + "External id": 154255,"Record function id": 0, "Ev Idx": 2702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921608.729, "dur": 6.600, + "args": { + "External id": 154256,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771921611.199, "dur": 3.410, + "args": { + "External id": 154257,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771921612.049, "dur": 2.320, + "args": { + "External id": 154258,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921621.309, "dur": 4.880, + "args": { + "External id": 154259,"Record function id": 0, "Ev Idx": 2706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921622.909, "dur": 2.369, + "args": { + "External id": 154260,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771921623.609, "dur": 1.209, + "args": { + "External id": 154261,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771921623.989, "dur": 0.669, + "args": { + "External id": 154262,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921630.029, "dur": 4.860, + "args": { + "External id": 154263,"Record function id": 0, "Ev Idx": 2710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921631.598, "dur": 2.280, + "args": { + "External id": 154264,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771921632.238, "dur": 1.211, + "args": { + "External id": 154265,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771921632.649, "dur": 0.620, + "args": { + "External id": 154266,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921638.449, "dur": 4.569, + "args": { + "External id": 154267,"Record function id": 0, "Ev Idx": 2714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921639.949, "dur": 2.149, + "args": { + "External id": 154268,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771921640.538, "dur": 1.131, + "args": { + "External id": 154269,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771921640.969, "dur": 0.529, + "args": { + "External id": 154270,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771921646.638, "dur": 305.580, + "args": { + "External id": 154271,"Record function id": 0, "Sequence number": 
3058714, "Fwd thread id": 1, "Ev Idx": 2718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771921648.009, "dur": 296.709, + "args": { + "External id": 154272,"Sequence number": 3058714, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2719 + } + }, + { + "ph": "f", "id": 268, "pid": 5714, "tid": 6744, "ts": 6303771921648.009, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921707.658, "dur": 38.280, + "args": { + "External id": 154273,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771921758.118, "dur": 18.480, + "args": { + "External id": 154274,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771921801.758, "dur": 120.400, + "args": { + "External id": 154275,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": 
[[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771921851.538, "dur": 7.210, + "args": { + "External id": 154276,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771921860.418, "dur": 3.350, + "args": { + "External id": 154277,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921965.278, "dur": 10.830, + "args": { + "External id": 154278,"Record function id": 0, "Ev Idx": 2725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771921968.318, "dur": 6.500, + "args": { + "External id": 154279,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, 
"tid": 6744, + "ts": 6303771921970.708, "dur": 3.250, + "args": { + "External id": 154280,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771921971.568, "dur": 2.200, + "args": { + "External id": 154281,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771921980.268, "dur": 166.949, + "args": { + "External id": 154282,"Record function id": 0, "Sequence number": 3058713, "Fwd thread id": 1, "Ev Idx": 2729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771921981.758, "dur": 159.379, + "args": { + "External id": 154283,"Sequence number": 3058713, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2730 + } + }, + { + "ph": "f", "id": 269, "pid": 5714, "tid": 6744, "ts": 6303771921981.758, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771921996.208, "dur": 34.540, + "args": { + "External id": 154284,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771921999.098, "dur": 6.220, + "args": { + "External id": 154285,"Record function id": 0, "Concrete 
Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771922006.508, "dur": 23.610, + "args": { + "External id": 154286,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771922039.538, "dur": 7.010, + "args": { + "External id": 154287,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771922041.448, "dur": 4.650, + "args": { + "External id": 154288,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922159.348, "dur": 156.689, + "args": { + "External id": 154289,"Record function id": 0, "Sequence number": 3058712, "Fwd thread id": 1, "Ev Idx": 2736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 
5714, "tid": 6744, + "ts": 6303771922161.957, "dur": 146.800, + "args": { + "External id": 154290,"Sequence number": 3058712, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2737 + } + }, + { + "ph": "f", "id": 270, "pid": 5714, "tid": 6744, "ts": 6303771922161.957, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771922174.567, "dur": 29.580, + "args": { + "External id": 154291,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771922177.397, "dur": 6.010, + "args": { + "External id": 154292,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771922184.587, "dur": 18.990, + "args": { + "External id": 154293,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771922211.737, "dur": 6.570, + "args": { + "External id": 154294,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", 
"Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771922213.547, "dur": 4.400, + "args": { + "External id": 154295,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922327.397, "dur": 302.139, + "args": { + "External id": 154296,"Record function id": 0, "Sequence number": 3058711, "Fwd thread id": 1, "Ev Idx": 2743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922330.307, "dur": 289.379, + "args": { + "External id": 154297,"Sequence number": 3058711, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 2744 + } + }, + { + "ph": "f", "id": 271, "pid": 5714, "tid": 6744, "ts": 6303771922330.307, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771922391.917, "dur": 39.150, + "args": { + "External id": 154298,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 
2745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771922442.927, "dur": 23.610, + "args": { + "External id": 154299,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771922475.907, "dur": 21.020, + "args": { + "External id": 154300,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771922507.247, "dur": 17.869, + "args": { + "External id": 154301,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771922532.356, "dur": 14.171, + "args": { + "External id": 154302,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771922554.587, "dur": 13.400, + "args": { + "External id": 154303,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], 
"Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771922586.036, "dur": 17.900, + "args": { + "External id": 154304,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922641.916, "dur": 10.770, + "args": { + "External id": 154305,"Record function id": 0, "Ev Idx": 2752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922644.676, "dur": 6.590, + "args": { + "External id": 154306,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771922647.096, "dur": 3.420, + "args": { + "External id": 154307,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771922647.956, "dur": 2.340, + "args": { + "External id": 154308,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input 
Dims": [[768, 768]], "Ev Idx": 2755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922657.026, "dur": 4.790, + "args": { + "External id": 154309,"Record function id": 0, "Ev Idx": 2756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922658.576, "dur": 2.310, + "args": { + "External id": 154310,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771922659.216, "dur": 1.220, + "args": { + "External id": 154311,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771922659.586, "dur": 0.690, + "args": { + "External id": 154312,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922665.466, "dur": 4.790, + "args": { + "External id": 154313,"Record function id": 0, "Ev Idx": 2760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922667.056, "dur": 2.250, + "args": { + "External id": 154314,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + 
"ts": 6303771922667.656, "dur": 1.230, + "args": { + "External id": 154315,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771922668.106, "dur": 0.620, + "args": { + "External id": 154316,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922674.226, "dur": 190.600, + "args": { + "External id": 154317,"Record function id": 0, "Sequence number": 3058710, "Fwd thread id": 1, "Ev Idx": 2764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922675.706, "dur": 160.790, + "args": { + "External id": 154318,"Sequence number": 3058710, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2765 + } + }, + { + "ph": "f", "id": 272, "pid": 5714, "tid": 6744, "ts": 6303771922675.706, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771922737.966, "dur": 20.560, + "args": { + "External id": 154319,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input 
Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771922775.066, "dur": 12.650, + "args": { + "External id": 154320,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771922803.656, "dur": 15.370, + "args": { + "External id": 154321,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771922843.996, "dur": 16.410, + "args": { + "External id": 154322,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922876.186, "dur": 10.030, + "args": { + "External id": 154323,"Record function id": 0, "Ev Idx": 2770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771922878.806, "dur": 6.130, + "args": { + "External id": 154324,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771922880.826, "dur": 3.460, + "args": { + "External id": 154325,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771922881.546, "dur": 2.540, + "args": { + "External id": 154326,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922890.506, "dur": 700.748, + "args": { + "External id": 154327,"Record function id": 0, "Sequence number": 3058709, "Fwd thread id": 1, "Ev Idx": 2774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771922892.036, "dur": 693.918, + "args": { + "External id": 154328,"Sequence number": 3058709, "Fwd thread id": 1, "Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2775 + } + }, + { + "ph": "f", "id": 273, "pid": 5714, "tid": 6744, "ts": 6303771922892.036, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6303771922908.146, "dur": 22.120, + "args": { + "External id": 154329,"Record function id": 0, "Ev Idx": 2776 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6303771922937.826, "dur": 52.289, + "args": { + "External id": 154330,"Record function id": 0, "Ev Idx": 2777 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.4)", "pid": 5714, "tid": 6744, + "ts": 6303771922996.975, "dur": 583.989, + "args": { + "External id": 154331,"Record function id": 0, "Ev Idx": 2778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771923060.575, "dur": 7.420, + "args": { + "External id": 154332,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923075.995, "dur": 3.230, + "args": { + "External id": 154333,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771923090.045, "dur": 84.920, + "args": { + "External id": 154334,"Record function id": 0, "Concrete Inputs": 
["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771923099.235, "dur": 72.430, + "args": { + "External id": 154335,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771923113.945, "dur": 5.820, + "args": { + "External id": 154336,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771923123.305, "dur": 29.230, + "args": { + "External id": 154337,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771923124.585, "dur": 27.680, + "args": { + "External id": 154338,"Record function id": 0, "Concrete 
Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923126.745, "dur": 5.570, + "args": { + "External id": 154339,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771923133.405, "dur": 18.350, + "args": { + "External id": 154340,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771923234.915, "dur": 7.970, + "args": { + "External id": 154341,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771923236.485, "dur": 5.950, + "args": { + "External id": 154342,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771923257.295, 
"dur": 86.400, + "args": { + "External id": 154343,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771923272.285, "dur": 68.300, + "args": { + "External id": 154344,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2791, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771923283.385, "dur": 53.190, + "args": { + "External id": 154345,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 2792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771923355.765, "dur": 3.150, + "args": { + "External id": 154346,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete 
Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2793, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923406.965, "dur": 3.600, + "args": { + "External id": 154347,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923437.665, "dur": 1.489, + "args": { + "External id": 154348,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923455.874, "dur": 0.920, + "args": { + "External id": 154349,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923470.605, "dur": 0.709, + "args": { + "External id": 154350,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771923483.914, "dur": 0.660, + "args": { + "External id": 154351,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923496.485, "dur": 0.660, + "args": { + "External id": 154352,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923509.534, "dur": 0.671, + "args": { + "External id": 154353,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923522.054, "dur": 0.850, + "args": { + "External id": 154354,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923534.054, "dur": 0.680, + "args": { + "External id": 154355,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2802 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771923602.784, "dur": 1220.598, + "args": { + "External id": 154356,"Record function id": 0, "Ev Idx": 2803 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6303771923615.564, "dur": 762.599, + "args": { + "External id": 154357,"Record function id": 0, "Ev Idx": 2804 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6303771923625.714, "dur": 219.720, + "args": { + "External id": 154358,"Record function id": 0, "Ev Idx": 2805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923702.104, "dur": 3.360, + "args": { + "External id": 154359,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923709.404, "dur": 0.870, + "args": { + "External id": 154360,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923712.004, "dur": 1.200, + "args": { + "External id": 154361,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923714.704, "dur": 0.450, + "args": { + "External id": 154362,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input 
type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923716.474, "dur": 0.550, + "args": { + "External id": 154363,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923718.294, "dur": 0.510, + "args": { + "External id": 154364,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923720.154, "dur": 0.690, + "args": { + "External id": 154365,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923722.094, "dur": 0.580, + "args": { + "External id": 154366,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923724.144, "dur": 0.580, + "args": { + "External id": 154367,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771923725.974, "dur": 0.580, + "args": 
{ + "External id": 154368,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771923738.754, "dur": 81.010, + "args": { + "External id": 154369,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771923749.374, "dur": 67.010, + "args": { + "External id": 154370,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771923758.854, "dur": 6.090, + "args": { + "External id": 154371,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], 
[], [], [], []], "Ev Idx": 2818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771923767.374, "dur": 29.330, + "args": { + "External id": 154372,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771923768.764, "dur": 27.620, + "args": { + "External id": 154373,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771923770.714, "dur": 6.140, + "args": { + "External id": 154374,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771923777.944, "dur": 17.910, + "args": { + "External id": 154375,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2822 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.2", "pid": 5714, "tid": 6744, + "ts": 6303771923922.664, "dur": 448.308, + "args": { + "External id": 154376,"Record function id": 0, "Ev Idx": 2823 + } + }, + { + "ph": "X", "cat": "user_annotation", 
"name": "FSDP::all_gather (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6303771923936.253, "dur": 421.899, + "args": { + "External id": 154377,"Record function id": 0, "Ev Idx": 2824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771923992.673, "dur": 6.880, + "args": { + "External id": 154378,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771924009.583, "dur": 16.940, + "args": { + "External id": 154379,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924012.213, "dur": 1.140, + "args": { + "External id": 154380,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924015.003, "dur": 0.270, + "args": { + "External id": 154381,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924016.223, "dur": 
0.300, + "args": { + "External id": 154382,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924017.333, "dur": 0.250, + "args": { + "External id": 154383,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924018.503, "dur": 0.240, + "args": { + "External id": 154384,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924019.723, "dur": 0.710, + "args": { + "External id": 154385,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924021.323, "dur": 0.230, + "args": { + "External id": 154386,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924022.523, "dur": 0.240, + "args": { + "External id": 154387,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924023.813, "dur": 0.230, + "args": { + "External id": 154388,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771924034.353, "dur": 20.360, + "args": { + "External id": 154389,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771924082.073, "dur": 82.150, + "args": { + "External id": 154390,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], 
[393216]], [], [], [], [], [], []], "Ev Idx": 2837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771924091.163, "dur": 6.460, + "args": { + "External id": 154391,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771924101.443, "dur": 7.500, + "args": { + "External id": 154392,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771924103.633, "dur": 4.950, + "args": { + "External id": 154393,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 2840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924106.173, "dur": 0.740, + "args": { + "External id": 154394,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 2841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771924115.933, "dur": 14.710, + "args": { + "External id": 154395,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input 
type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924117.833, "dur": 0.460, + "args": { + "External id": 154396,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924119.453, "dur": 0.320, + "args": { + "External id": 154397,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924120.623, "dur": 0.680, + "args": { + "External id": 154398,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924122.263, "dur": 0.170, + "args": { + "External id": 154399,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924123.313, "dur": 0.280, + "args": { + "External id": 154400,"Record function id": 0, "Concrete Inputs": ["", 
"[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924124.323, "dur": 0.250, + "args": { + "External id": 154401,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924125.293, "dur": 0.330, + "args": { + "External id": 154402,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924126.553, "dur": 0.180, + "args": { + "External id": 154403,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771924127.583, "dur": 0.250, + "args": { + "External id": 154404,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771924140.323, "dur": 16.270, + "args": { 
+ "External id": 154405,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 2852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771924212.323, "dur": 76.050, + "args": { + "External id": 154406,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 2853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771924227.033, "dur": 58.510, + "args": { + "External id": 154407,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2854, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771924237.563, "dur": 43.990, + "args": { + "External id": 154408,"Record function id": 0, "Concrete Inputs": 
[""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 2855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771924307.433, "dur": 3.340, + "args": { + "External id": 154409,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2856, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771924383.363, "dur": 430.108, + "args": { + "External id": 154410,"Sequence number": 3058708, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2857 + } + }, + { + "ph": "f", "id": 274, "pid": 5714, "tid": 6744, "ts": 6303771924383.363, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924444.332, "dur": 31.230, + "args": { + "External id": 154411,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 2858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771924503.902, "dur": 21.240, + "args": { + "External id": 
154412,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 2859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771924539.782, "dur": 33.340, + "args": { + "External id": 154413,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 2860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924584.972, "dur": 24.380, + "args": { + "External id": 154414,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924618.892, "dur": 18.750, + "args": { + "External id": 154415,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2862 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924646.712, "dur": 21.670, + "args": { + "External id": 154416,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 2863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924678.022, "dur": 17.700, + "args": { + "External id": 154417,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 2864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771924717.212, "dur": 19.370, + "args": { + "External id": 154418,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771924751.992, "dur": 12.130, + "args": { + "External id": 154419,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": 
["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771924777.642, "dur": 15.469, + "args": { + "External id": 154420,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 2867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924836.911, "dur": 10.671, + "args": { + "External id": 154421,"Record function id": 0, "Ev Idx": 2868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924839.791, "dur": 6.500, + "args": { + "External id": 154422,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771924842.171, "dur": 3.340, + "args": { + 
"External id": 154423,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771924843.031, "dur": 2.280, + "args": { + "External id": 154424,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924852.071, "dur": 5.031, + "args": { + "External id": 154425,"Record function id": 0, "Ev Idx": 2872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924853.771, "dur": 2.351, + "args": { + "External id": 154426,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771924854.471, "dur": 1.191, + "args": { + "External id": 154427,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771924854.891, "dur": 0.591, + "args": { + "External id": 154428,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924860.982, "dur": 4.720, + "args": { + "External id": 154429,"Record function id": 0, "Ev Idx": 2876 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924862.562, "dur": 2.229, + "args": { + "External id": 154430,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771924863.151, "dur": 1.231, + "args": { + "External id": 154431,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771924863.602, "dur": 0.609, + "args": { + "External id": 154432,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 2879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924869.241, "dur": 4.500, + "args": { + "External id": 154433,"Record function id": 0, "Ev Idx": 2880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771924870.721, "dur": 2.120, + "args": { + "External id": 154434,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771924871.281, "dur": 1.160, + "args": { + "External id": 154435,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 
5714, "tid": 6744, + "ts": 6303771924871.741, "dur": 0.540, + "args": { + "External id": 154436,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 2883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771924877.211, "dur": 299.750, + "args": { + "External id": 154437,"Record function id": 0, "Sequence number": 3058707, "Fwd thread id": 1, "Ev Idx": 2884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771924878.661, "dur": 290.890, + "args": { + "External id": 154438,"Sequence number": 3058707, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2885 + } + }, + { + "ph": "f", "id": 275, "pid": 5714, "tid": 6744, "ts": 6303771924878.661, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924936.891, "dur": 37.480, + "args": { + "External id": 154439,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771924986.651, "dur": 18.220, + "args": { + "External id": 154440,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771925028.171, "dur": 118.730, + "args": { + "External id": 154441,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 2888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771925077.181, "dur": 6.660, + "args": { + "External id": 154442,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771925085.531, "dur": 3.550, + "args": { + "External id": 154443,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 
6744, + "ts": 6303771925189.941, "dur": 10.960, + "args": { + "External id": 154444,"Record function id": 0, "Ev Idx": 2891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771925193.191, "dur": 6.420, + "args": { + "External id": 154445,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771925195.541, "dur": 3.200, + "args": { + "External id": 154446,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771925196.401, "dur": 2.140, + "args": { + "External id": 154447,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925205.191, "dur": 171.239, + "args": { + "External id": 154448,"Record function id": 0, "Sequence number": 3058706, "Fwd thread id": 1, "Ev Idx": 2895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925206.701, "dur": 163.529, + "args": { + "External id": 154449,"Sequence number": 3058706, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2896 + } + }, + { + "ph": "f", "id": 276, "pid": 5714, "tid": 6744, "ts": 6303771925206.701, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771925221.030, "dur": 34.040, + "args": { + "External id": 154450,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771925224.250, "dur": 6.300, + "args": { + "External id": 154451,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771925231.770, "dur": 22.620, + "args": { + "External id": 154452,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771925263.661, "dur": 6.649, + "args": { + "External id": 154453,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771925265.290, "dur": 4.531, + "args": { + "External id": 154454,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", 
"ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925387.960, "dur": 150.450, + "args": { + "External id": 154455,"Record function id": 0, "Sequence number": 3058705, "Fwd thread id": 1, "Ev Idx": 2902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925390.660, "dur": 140.800, + "args": { + "External id": 154456,"Sequence number": 3058705, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 2903 + } + }, + { + "ph": "f", "id": 277, "pid": 5714, "tid": 6744, "ts": 6303771925390.660, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771925403.870, "dur": 30.180, + "args": { + "External id": 154457,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 2904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771925406.910, "dur": 6.150, + "args": { + "External id": 154458,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771925414.240, "dur": 19.190, + "args": { + "External id": 
154459,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 2906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771925441.960, "dur": 6.950, + "args": { + "External id": 154460,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 2907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771925444.020, "dur": 4.480, + "args": { + "External id": 154461,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925549.330, "dur": 302.199, + "args": { + "External id": 154462,"Record function id": 0, "Sequence number": 3058704, "Fwd thread id": 1, "Ev Idx": 2909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925552.220, "dur": 289.099, + "args": { + "External id": 154463,"Sequence number": 3058704, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 
12, 64]], "Ev Idx": 2910 + } + }, + { + "ph": "f", "id": 278, "pid": 5714, "tid": 6744, "ts": 6303771925552.220, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771925614.620, "dur": 38.990, + "args": { + "External id": 154464,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771925665.440, "dur": 22.720, + "args": { + "External id": 154465,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771925697.229, "dur": 21.091, + "args": { + "External id": 154466,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 2913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771925728.260, "dur": 17.409, + "args": { + "External id": 154467,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771925753.249, "dur": 13.980, + "args": { + "External id": 154468,"Record function id": 0, 
"Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771925774.609, "dur": 13.490, + "args": { + "External id": 154469,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 2916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771925807.159, "dur": 18.000, + "args": { + "External id": 154470,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 2917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771925864.139, "dur": 11.160, + "args": { + "External id": 154471,"Record function id": 0, "Ev Idx": 2918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771925866.849, "dur": 7.160, + "args": { + "External id": 154472,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2919 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771925869.269, "dur": 4.030, + "args": { + "External id": 154473,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771925870.219, "dur": 2.870, + "args": { + "External id": 154474,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771925879.679, "dur": 4.970, + "args": { + "External id": 154475,"Record function id": 0, "Ev Idx": 2922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771925881.299, "dur": 2.410, + "args": { + "External id": 154476,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771925881.979, "dur": 1.300, + "args": { + "External id": 154477,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771925882.369, "dur": 0.740, + "args": { + "External id": 154478,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 
6744, + "ts": 6303771925888.349, "dur": 4.850, + "args": { + "External id": 154479,"Record function id": 0, "Ev Idx": 2926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771925889.929, "dur": 2.340, + "args": { + "External id": 154480,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771925890.559, "dur": 1.280, + "args": { + "External id": 154481,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771925890.999, "dur": 0.670, + "args": { + "External id": 154482,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 2929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925896.799, "dur": 191.850, + "args": { + "External id": 154483,"Record function id": 0, "Sequence number": 3058703, "Fwd thread id": 1, "Ev Idx": 2930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771925898.239, "dur": 161.780, + "args": { + "External id": 154484,"Sequence number": 3058703, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2931 + } + }, + { + "ph": "f", "id": 279, "pid": 5714, "tid": 6744, "ts": 6303771925898.239, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771925960.249, "dur": 21.210, + "args": { + "External id": 154485,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 2932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771925998.829, "dur": 12.750, + "args": { + "External id": 154486,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 2933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771926027.519, "dur": 14.810, + "args": { + "External id": 154487,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 2934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771926067.409, "dur": 16.860, + "args": { + "External id": 154488,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 2935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771926099.819, "dur": 9.810, + "args": { + "External id": 154489,"Record function id": 0, "Ev Idx": 2936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771926102.459, "dur": 5.810, + "args": { + "External id": 154490,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771926104.449, "dur": 3.180, + "args": { + "External id": 154491,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771926105.199, "dur": 2.230, + "args": { + "External id": 154492,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 2939 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771926113.579, "dur": 699.508, + "args": { + "External id": 154493,"Record function id": 0, "Sequence number": 3058702, "Fwd thread id": 1, "Ev Idx": 2940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771926115.139, "dur": 692.568, + "args": { + "External id": 154494,"Sequence number": 3058702, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 2941 + } + }, + { + "ph": "f", "id": 280, "pid": 5714, "tid": 6744, "ts": 6303771926115.139, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6303771926131.928, "dur": 22.380, + "args": { + "External id": 154495,"Record function id": 0, "Ev Idx": 2942 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6303771926161.908, "dur": 49.920, + "args": { + "External id": 154496,"Record function id": 0, "Ev Idx": 2943 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.3)", "pid": 5714, "tid": 6744, + "ts": 6303771926218.498, "dur": 584.009, + "args": { + "External id": 154497,"Record function id": 0, "Ev Idx": 2944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771926281.398, "dur": 7.240, + "args": { + "External id": 154498,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2945 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926303.588, "dur": 3.620, + "args": { + "External id": 154499,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771926318.868, "dur": 84.650, + "args": { + "External id": 154500,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771926328.318, "dur": 71.850, + "args": { + "External id": 154501,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 2948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771926341.298, "dur": 5.790, + "args": { + "External id": 154502,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 
6303771926350.898, "dur": 29.520, + "args": { + "External id": 154503,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 2950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771926352.248, "dur": 27.880, + "args": { + "External id": 154504,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 2951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926354.268, "dur": 6.010, + "args": { + "External id": 154505,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771926361.438, "dur": 18.210, + "args": { + "External id": 154506,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 2953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771926463.988, "dur": 8.030, + "args": { + "External id": 154507,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 2954 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771926465.538, "dur": 5.990, + "args": { + "External id": 154508,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771926486.918, "dur": 78.100, + "args": { + "External id": 154509,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 2956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771926501.398, "dur": 60.800, + "args": { + "External id": 154510,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 2957, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771926512.468, "dur": 45.880, + "args": { + "External id": 154511,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": 
[[1]], "Input Dims": [[7079424]], "Ev Idx": 2958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771926576.918, "dur": 3.100, + "args": { + "External id": 154512,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 2959, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926628.807, "dur": 3.651, + "args": { + "External id": 154513,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926660.067, "dur": 1.160, + "args": { + "External id": 154514,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926677.387, "dur": 1.040, + "args": { + "External id": 154515,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], 
"Ev Idx": 2962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926692.227, "dur": 0.810, + "args": { + "External id": 154516,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926705.337, "dur": 0.710, + "args": { + "External id": 154517,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926718.267, "dur": 0.800, + "args": { + "External id": 154518,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926730.887, "dur": 0.730, + "args": { + "External id": 154519,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926743.637, "dur": 0.840, + "args": { + "External id": 154520,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": 
[[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926756.217, "dur": 0.810, + "args": { + "External id": 154521,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771926824.487, "dur": 1254.747, + "args": { + "External id": 154522,"Record function id": 0, "Ev Idx": 2969 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6303771926837.747, "dur": 789.038, + "args": { + "External id": 154523,"Record function id": 0, "Ev Idx": 2970 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6303771926848.497, "dur": 222.620, + "args": { + "External id": 154524,"Record function id": 0, "Ev Idx": 2971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926925.437, "dur": 3.480, + "args": { + "External id": 154525,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 2972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926932.837, "dur": 0.890, + "args": { + "External id": 154526,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926935.487, "dur": 0.850, + "args": { + "External id": 154527,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926937.787, "dur": 0.640, + "args": { + "External id": 154528,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926939.817, "dur": 0.630, + "args": { + "External id": 154529,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926941.717, "dur": 0.760, + "args": { + "External id": 154530,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 2977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926943.917, "dur": 0.760, + "args": { + "External id": 154531,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 2978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926945.957, "dur": 0.640, + "args": { + "External id": 154532,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], 
"Input Dims": [[1572864], []], "Ev Idx": 2979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926947.977, "dur": 0.950, + "args": { + "External id": 154533,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771926950.267, "dur": 0.670, + "args": { + "External id": 154534,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 2981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771926963.737, "dur": 81.060, + "args": { + "External id": 154535,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771926974.437, "dur": 66.969, + "args": { + "External id": 154536,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input 
Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 2983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771926983.667, "dur": 5.800, + "args": { + "External id": 154537,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771926991.647, "dur": 29.650, + "args": { + "External id": 154538,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 2985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771926992.967, "dur": 28.019, + "args": { + "External id": 154539,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 2986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771926994.957, "dur": 6.280, + "args": { + "External id": 154540,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771927002.337, "dur": 18.140, 
+ "args": { + "External id": 154541,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 2988 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.1", "pid": 5714, "tid": 6744, + "ts": 6303771927148.716, "dur": 469.749, + "args": { + "External id": 154542,"Record function id": 0, "Ev Idx": 2989 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6303771927162.546, "dur": 443.009, + "args": { + "External id": 154543,"Record function id": 0, "Ev Idx": 2990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771927219.386, "dur": 7.120, + "args": { + "External id": 154544,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 2991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771927236.786, "dur": 17.780, + "args": { + "External id": 154545,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 2992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927239.436, "dur": 1.130, + "args": { + "External id": 154546,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2993 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927242.406, "dur": 0.280, + "args": { + "External id": 154547,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927243.576, "dur": 0.270, + "args": { + "External id": 154548,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927244.746, "dur": 0.640, + "args": { + "External id": 154549,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927246.296, "dur": 0.250, + "args": { + "External id": 154550,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927247.416, "dur": 0.300, + "args": { + "External id": 154551,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[1769856], [], [], []], "Ev Idx": 2998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927248.676, "dur": 0.260, + "args": { + "External id": 154552,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 2999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927249.836, "dur": 0.270, + "args": { + "External id": 154553,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927250.966, "dur": 0.260, + "args": { + "External id": 154554,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771927262.296, "dur": 20.220, + "args": { + "External id": 154555,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", 
"pid": 5714, "tid": 6744, + "ts": 6303771927333.476, "dur": 84.390, + "args": { + "External id": 154556,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771927343.006, "dur": 6.870, + "args": { + "External id": 154557,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771927354.016, "dur": 7.800, + "args": { + "External id": 154558,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771927356.226, "dur": 5.180, + "args": { + "External id": 154559,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927358.696, "dur": 0.910, + "args": { + "External id": 154560,"Record function id": 0, 
"Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771927368.846, "dur": 14.610, + "args": { + "External id": 154561,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927370.686, "dur": 0.340, + "args": { + "External id": 154562,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927372.126, "dur": 0.280, + "args": { + "External id": 154563,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927373.286, "dur": 0.290, + "args": { + "External id": 154564,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771927374.596, "dur": 0.270, + "args": { + "External id": 154565,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927375.776, "dur": 0.260, + "args": { + "External id": 154566,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927376.936, "dur": 0.350, + "args": { + "External id": 154567,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927378.376, "dur": 0.250, + "args": { + "External id": 154568,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927379.426, "dur": 0.270, + "args": { + "External id": 154569,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3016 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771927380.606, "dur": 0.270, + "args": { + "External id": 154570,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771927393.146, "dur": 16.800, + "args": { + "External id": 154571,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771927467.385, "dur": 76.560, + "args": { + "External id": 154572,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771927482.525, "dur": 58.560, + "args": { + "External id": 154573,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", 
"ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3020, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771927493.016, "dur": 44.300, + "args": { + "External id": 154574,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771927555.665, "dur": 3.211, + "args": { + "External id": 154575,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3022, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771927632.065, "dur": 437.379, + "args": { + "External id": 154576,"Sequence number": 3058701, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3023 + } + }, + { + "ph": "f", "id": 281, "pid": 5714, "tid": 6744, "ts": 6303771927632.065, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 
5714, "tid": 6744, + "ts": 6303771927693.625, "dur": 30.950, + "args": { + "External id": 154577,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771927752.895, "dur": 22.360, + "args": { + "External id": 154578,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771927790.365, "dur": 33.150, + "args": { + "External id": 154579,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771927835.135, "dur": 24.660, + "args": { + "External id": 154580,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771927869.455, "dur": 18.980, + "args": { + "External id": 154581,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771927897.645, "dur": 22.210, + "args": { + "External id": 154582,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771927929.775, "dur": 17.940, + "args": { + "External id": 154583,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771927969.144, "dur": 19.871, + "args": { + "External id": 154584,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 
1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771928005.044, "dur": 12.880, + "args": { + "External id": 154585,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771928032.304, "dur": 16.970, + "args": { + "External id": 154586,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928092.594, "dur": 11.040, + "args": { + 
"External id": 154587,"Record function id": 0, "Ev Idx": 3034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928095.414, "dur": 6.930, + "args": { + "External id": 154588,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771928097.764, "dur": 3.800, + "args": { + "External id": 154589,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771928098.574, "dur": 2.770, + "args": { + "External id": 154590,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928108.264, "dur": 4.990, + "args": { + "External id": 154591,"Record function id": 0, "Ev Idx": 3038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928109.874, "dur": 2.400, + "args": { + "External id": 154592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771928110.534, "dur": 1.310, + "args": { + "External id": 154593,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3040 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771928110.924, "dur": 0.740, + "args": { + "External id": 154594,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928116.984, "dur": 4.710, + "args": { + "External id": 154595,"Record function id": 0, "Ev Idx": 3042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928118.444, "dur": 2.310, + "args": { + "External id": 154596,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771928119.054, "dur": 1.270, + "args": { + "External id": 154597,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771928119.494, "dur": 0.670, + "args": { + "External id": 154598,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928125.294, "dur": 4.940, + "args": { + "External id": 154599,"Record function id": 0, "Ev Idx": 3046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928127.064, "dur": 2.240, + "args": { + "External id": 154600,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771928127.694, "dur": 1.200, + "args": { + "External id": 154601,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771928128.044, "dur": 0.680, + "args": { + "External id": 154602,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928133.924, "dur": 312.139, + "args": { + "External id": 154603,"Record function id": 0, "Sequence number": 3058700, "Fwd thread id": 1, "Ev Idx": 3050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928135.554, "dur": 303.300, + "args": { + "External id": 154604,"Sequence number": 3058700, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3051 + } + }, + { + "ph": "f", "id": 282, "pid": 5714, "tid": 6744, "ts": 6303771928135.554, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771928193.864, "dur": 37.830, + "args": { + "External id": 154605,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 
16384], [16384, 768], [768, 768]], "Ev Idx": 3052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771928244.094, "dur": 18.210, + "args": { + "External id": 154606,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771928285.244, "dur": 131.039, + "args": { + "External id": 154607,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771928345.524, "dur": 6.810, + "args": { + "External id": 154608,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771928353.944, "dur": 3.490, + "args": { + "External id": 154609,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928459.094, "dur": 10.679, + "args": { + "External id": 154610,"Record function id": 0, "Ev Idx": 3057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771928462.094, "dur": 6.479, + "args": { + "External id": 154611,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771928464.403, "dur": 3.330, + "args": { + "External id": 154612,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771928465.254, "dur": 2.249, + "args": { + "External id": 154613,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928473.853, "dur": 163.880, + "args": { + "External id": 154614,"Record function id": 0, "Sequence number": 3058699, "Fwd thread id": 1, "Ev Idx": 3061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928475.493, "dur": 156.300, + "args": { + "External id": 154615,"Sequence number": 3058699, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3062 + } + }, + { + "ph": "f", "id": 283, "pid": 5714, "tid": 6744, "ts": 6303771928475.493, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771928489.723, "dur": 34.250, + "args": { + "External id": 154616,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771928492.783, "dur": 6.340, + "args": { + "External id": 154617,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771928500.323, "dur": 23.090, + "args": { + "External id": 154618,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771928532.273, "dur": 6.710, + "args": { + "External id": 154619,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], 
"Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771928534.073, "dur": 4.440, + "args": { + "External id": 154620,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928649.013, "dur": 157.720, + "args": { + "External id": 154621,"Record function id": 0, "Sequence number": 3058698, "Fwd thread id": 1, "Ev Idx": 3068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928651.923, "dur": 148.270, + "args": { + "External id": 154622,"Sequence number": 3058698, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3069 + } + }, + { + "ph": "f", "id": 284, "pid": 5714, "tid": 6744, "ts": 6303771928651.923, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771928665.003, "dur": 37.340, + "args": { + "External id": 154623,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771928667.763, "dur": 6.120, + "args": { + "External id": 154624,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771928682.113, "dur": 19.580, + "args": { + "External id": 154625,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771928710.353, "dur": 6.800, + "args": { + "External id": 154626,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771928712.503, "dur": 4.260, + "args": { + "External id": 154627,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928817.782, "dur": 301.730, + "args": { + "External id": 154628,"Record function id": 0, "Sequence number": 3058697, "Fwd thread id": 1, "Ev Idx": 
3075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771928820.522, "dur": 288.210, + "args": { + "External id": 154629,"Sequence number": 3058697, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3076 + } + }, + { + "ph": "f", "id": 285, "pid": 5714, "tid": 6744, "ts": 6303771928820.522, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771928882.013, "dur": 38.899, + "args": { + "External id": 154630,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771928933.132, "dur": 23.250, + "args": { + "External id": 154631,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771928966.142, "dur": 21.520, + "args": { + "External id": 154632,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + 
"ts": 6303771928997.872, "dur": 17.750, + "args": { + "External id": 154633,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771929023.352, "dur": 13.760, + "args": { + "External id": 154634,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771929043.882, "dur": 13.540, + "args": { + "External id": 154635,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771929075.372, "dur": 17.880, + "args": { + "External id": 154636,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 
6744, + "ts": 6303771929132.192, "dur": 11.040, + "args": { + "External id": 154637,"Record function id": 0, "Ev Idx": 3084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929134.992, "dur": 6.950, + "args": { + "External id": 154638,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771929137.342, "dur": 3.880, + "args": { + "External id": 154639,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771929138.362, "dur": 2.650, + "args": { + "External id": 154640,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929147.602, "dur": 5.320, + "args": { + "External id": 154641,"Record function id": 0, "Ev Idx": 3088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929149.462, "dur": 2.510, + "args": { + "External id": 154642,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771929150.232, "dur": 1.280, + "args": { + "External id": 154643,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 
1]], "Input Dims": [[768, 768]], "Ev Idx": 3090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771929150.602, "dur": 0.740, + "args": { + "External id": 154644,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929156.522, "dur": 4.760, + "args": { + "External id": 154645,"Record function id": 0, "Ev Idx": 3092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929158.052, "dur": 2.270, + "args": { + "External id": 154646,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771929158.722, "dur": 1.170, + "args": { + "External id": 154647,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771929159.062, "dur": 0.680, + "args": { + "External id": 154648,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771929165.192, "dur": 200.939, + "args": { + "External id": 154649,"Record function id": 0, "Sequence number": 3058696, "Fwd thread id": 1, "Ev Idx": 3096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771929166.662, "dur": 170.179, + "args": { + "External id": 154650,"Sequence number": 3058696, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3097 + } + }, + { + "ph": "f", "id": 286, "pid": 5714, "tid": 6744, "ts": 6303771929166.662, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771929228.592, "dur": 20.910, + "args": { + "External id": 154651,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771929266.501, "dur": 12.560, + "args": { + "External id": 154652,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3099 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771929295.721, "dur": 22.951, + "args": { + "External id": 154653,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771929344.632, "dur": 16.969, + "args": { + "External id": 154654,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929377.791, "dur": 10.240, + "args": { + "External id": 154655,"Record function id": 0, "Ev Idx": 3102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771929380.721, "dur": 6.030, + "args": { + "External id": 154656,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771929382.791, "dur": 3.280, + "args": { + 
"External id": 154657,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771929383.581, "dur": 2.290, + "args": { + "External id": 154658,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771929392.231, "dur": 686.709, + "args": { + "External id": 154659,"Record function id": 0, "Sequence number": 3058695, "Fwd thread id": 1, "Ev Idx": 3106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771929393.611, "dur": 679.949, + "args": { + "External id": 154660,"Sequence number": 3058695, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3107 + } + }, + { + "ph": "f", "id": 287, "pid": 5714, "tid": 6744, "ts": 6303771929393.611, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6303771929410.061, "dur": 22.550, + "args": { + "External id": 154661,"Record function id": 0, "Ev Idx": 3108 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 6303771929440.071, "dur": 51.110, + "args": { + "External id": 154662,"Record function id": 0, "Ev Idx": 3109 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.2)", "pid": 5714, "tid": 6744, + "ts": 
6303771929498.021, "dur": 570.369, + "args": { + "External id": 154663,"Record function id": 0, "Ev Idx": 3110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771929561.051, "dur": 7.300, + "args": { + "External id": 154664,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771929575.941, "dur": 3.570, + "args": { + "External id": 154665,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771929590.531, "dur": 87.570, + "args": { + "External id": 154666,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771929599.821, "dur": 74.960, + "args": { + "External id": 154667,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 
1769856]], "Ev Idx": 3114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771929616.171, "dur": 7.730, + "args": { + "External id": 154668,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771929626.651, "dur": 28.930, + "args": { + "External id": 154669,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771929627.991, "dur": 27.300, + "args": { + "External id": 154670,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929630.141, "dur": 5.910, + "args": { + "External id": 154671,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771929637.141, "dur": 17.660, + "args": { + "External id": 154672,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", 
"long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771929737.980, "dur": 8.131, + "args": { + "External id": 154673,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771929739.751, "dur": 5.889, + "args": { + "External id": 154674,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771929761.551, "dur": 76.939, + "args": { + "External id": 154675,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771929775.891, "dur": 59.639, + "args": { + "External id": 154676,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], 
"Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3123, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771929787.071, "dur": 44.579, + "args": { + "External id": 154677,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771929850.160, "dur": 3.070, + "args": { + "External id": 154678,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3125, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929899.680, "dur": 3.720, + "args": { + "External id": 154679,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929930.000, "dur": 1.190, + "args": { + "External id": 154680,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input 
type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929946.820, "dur": 1.040, + "args": { + "External id": 154681,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929960.250, "dur": 0.830, + "args": { + "External id": 154682,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929972.930, "dur": 0.830, + "args": { + "External id": 154683,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929985.250, "dur": 0.930, + "args": { + "External id": 154684,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771929997.640, "dur": 0.790, + "args": { + "External id": 154685,"Record function id": 0, "Concrete 
Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930010.050, "dur": 0.850, + "args": { + "External id": 154686,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930021.820, "dur": 0.900, + "args": { + "External id": 154687,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771930090.420, "dur": 1242.847, + "args": { + "External id": 154688,"Record function id": 0, "Ev Idx": 3135 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6303771930103.790, "dur": 770.918, + "args": { + "External id": 154689,"Record function id": 0, "Ev Idx": 3136 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6303771930113.980, "dur": 232.909, + "args": { + "External id": 154690,"Record function id": 0, "Ev Idx": 3137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930194.510, "dur": 3.629, + "args": { + "External id": 154691,"Record function id": 0, "Concrete 
Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930202.019, "dur": 0.940, + "args": { + "External id": 154692,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930204.759, "dur": 0.840, + "args": { + "External id": 154693,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930207.070, "dur": 0.740, + "args": { + "External id": 154694,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930209.590, "dur": 0.740, + "args": { + "External id": 154695,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930211.559, "dur": 0.771, + "args": { + "External id": 154696,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 
6303771930213.759, "dur": 0.751, + "args": { + "External id": 154697,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930215.750, "dur": 0.720, + "args": { + "External id": 154698,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930217.859, "dur": 0.631, + "args": { + "External id": 154699,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771930219.770, "dur": 0.700, + "args": { + "External id": 154700,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771930232.770, "dur": 87.979, + "args": { + "External id": 154701,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 
3148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771930243.070, "dur": 74.269, + "args": { + "External id": 154702,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771930252.539, "dur": 6.220, + "args": { + "External id": 154703,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771930260.579, "dur": 29.410, + "args": { + "External id": 154704,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771930261.930, "dur": 27.749, + "args": { + "External id": 154705,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3152 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930264.090, "dur": 6.179, + "args": { + "External id": 154706,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771930271.429, "dur": 17.820, + "args": { + "External id": 154707,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3154 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::backward_prefetch for model.layers.0", "pid": 5714, "tid": 6744, + "ts": 6303771930426.759, "dur": 440.919, + "args": { + "External id": 154708,"Record function id": 0, "Ev Idx": 3155 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6303771930441.259, "dur": 414.149, + "args": { + "External id": 154709,"Record function id": 0, "Ev Idx": 3156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771930499.439, "dur": 7.260, + "args": { + "External id": 154710,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771930516.509, "dur": 17.930, + "args": { + "External id": 154711,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], 
"Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930519.389, "dur": 1.160, + "args": { + "External id": 154712,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930522.159, "dur": 0.380, + "args": { + "External id": 154713,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930523.599, "dur": 0.320, + "args": { + "External id": 154714,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930524.949, "dur": 0.320, + "args": { + "External id": 154715,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930526.169, "dur": 0.320, + "args": { + "External id": 154716,"Record function id": 0, "Concrete 
Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930527.359, "dur": 0.330, + "args": { + "External id": 154717,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930528.629, "dur": 0.300, + "args": { + "External id": 154718,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930529.929, "dur": 0.280, + "args": { + "External id": 154719,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930531.119, "dur": 0.370, + "args": { + "External id": 154720,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771930541.979, "dur": 
20.530, + "args": { + "External id": 154721,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 6744, + "ts": 6303771930590.349, "dur": 81.440, + "args": { + "External id": 154722,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771930599.649, "dur": 6.570, + "args": { + "External id": 154723,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 6744, + "ts": 6303771930610.039, "dur": 7.150, + "args": { + "External id": 154724,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 6744, + "ts": 6303771930612.299, "dur": 4.530, + "args": { + "External id": 154725,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930614.599, "dur": 0.690, + "args": { + "External id": 154726,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 6744, + "ts": 6303771930624.058, "dur": 13.931, + "args": { + "External id": 154727,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930625.578, "dur": 0.351, + "args": { + "External id": 154728,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930626.938, "dur": 0.320, + "args": { + "External id": 154729,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 3176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930628.149, "dur": 0.220, + "args": { + "External id": 154730,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930629.278, "dur": 0.340, + "args": { + "External id": 154731,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930630.498, "dur": 0.600, + "args": { + "External id": 154732,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930631.969, "dur": 0.289, + "args": { + "External id": 154733,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930633.218, "dur": 0.311, + "args": { + "External id": 154734,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930634.418, "dur": 0.291, + "args": { + "External id": 154735,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771930635.558, "dur": 0.200, + "args": { + "External id": 154736,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 6744, + "ts": 6303771930648.018, "dur": 16.440, + "args": { + "External id": 154737,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 6744, + "ts": 6303771930719.758, "dur": 75.240, + "args": { + "External id": 154738,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3185 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771930734.158, "dur": 58.030, + "args": { + "External id": 154739,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3186, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 6744, + "ts": 6303771930744.588, "dur": 43.720, + "args": { + "External id": 154740,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771930806.338, "dur": 2.990, + "args": { + "External id": 154741,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3188, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771930879.728, "dur": 442.669, + "args": { + "External id": 154742,"Sequence number": 3058694, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3189 + } + }, + { + "ph": "f", "id": 288, "pid": 5714, "tid": 6744, "ts": 6303771930879.728, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771930940.608, "dur": 30.800, + "args": { + "External id": 154743,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771930998.968, "dur": 22.640, + "args": { + "External id": 154744,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771931036.458, "dur": 33.370, + "args": { + "External id": 154745,"Record function id": 0, "Concrete 
Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771931081.657, "dur": 24.451, + "args": { + "External id": 154746,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771931115.517, "dur": 18.800, + "args": { + "External id": 154747,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771931143.417, "dur": 21.770, + "args": { + "External id": 154748,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771931175.347, "dur": 17.820, + "args": { + "External id": 154749,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 
5714, "tid": 6744, + "ts": 6303771931214.327, "dur": 18.670, + "args": { + "External id": 154750,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771931248.517, "dur": 12.540, + "args": { + "External id": 154751,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771931274.427, "dur": 15.490, + "args": { + "External id": 154752,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931346.847, "dur": 11.010, + "args": { + "External id": 154753,"Record function id": 0, "Ev Idx": 3200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931349.797, "dur": 6.830, + "args": { + "External id": 154754,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771931352.257, "dur": 3.620, + "args": { + "External id": 154755,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771931353.137, "dur": 2.520, + "args": { + "External id": 154756,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931362.377, "dur": 4.940, + "args": { + "External id": 154757,"Record function id": 0, "Ev Idx": 3204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931363.907, "dur": 2.400, + "args": { + 
"External id": 154758,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771931364.607, "dur": 1.240, + "args": { + "External id": 154759,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771931364.967, "dur": 0.710, + "args": { + "External id": 154760,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931371.297, "dur": 4.620, + "args": { + "External id": 154761,"Record function id": 0, "Ev Idx": 3208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931372.757, "dur": 2.240, + "args": { + "External id": 154762,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771931373.367, "dur": 1.230, + "args": { + "External id": 154763,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771931373.837, "dur": 0.590, + "args": { + "External id": 154764,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931379.497, "dur": 4.510, + "args": { + "External id": 154765,"Record function id": 0, "Ev Idx": 3212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931380.957, "dur": 2.140, + "args": { + "External id": 154766,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771931381.557, "dur": 1.080, + "args": { + "External id": 154767,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771931381.907, "dur": 0.580, + "args": { + "External id": 154768,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771931387.617, "dur": 301.539, + "args": { + "External id": 154769,"Record function id": 0, "Sequence number": 3058693, "Fwd thread id": 1, "Ev Idx": 3216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771931388.947, "dur": 292.969, + "args": { + "External id": 154770,"Sequence number": 3058693, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": 
[[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3217 + } + }, + { + "ph": "f", "id": 289, "pid": 5714, "tid": 6744, "ts": 6303771931388.947, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771931448.107, "dur": 38.380, + "args": { + "External id": 154771,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771931498.067, "dur": 18.069, + "args": { + "External id": 154772,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771931539.387, "dur": 119.379, + "args": { + "External id": 154773,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 
64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771931589.107, "dur": 6.449, + "args": { + "External id": 154774,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771931597.127, "dur": 3.580, + "args": { + "External id": 154775,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931702.406, "dur": 10.640, + "args": { + "External id": 154776,"Record function id": 0, "Ev Idx": 3223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771931705.356, "dur": 6.480, + "args": { + "External id": 154777,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771931707.666, "dur": 3.330, + "args": { + "External id": 154778,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771931708.496, "dur": 2.290, + 
"args": { + "External id": 154779,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771931717.096, "dur": 163.680, + "args": { + "External id": 154780,"Record function id": 0, "Sequence number": 3058692, "Fwd thread id": 1, "Ev Idx": 3227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771931718.776, "dur": 155.910, + "args": { + "External id": 154781,"Sequence number": 3058692, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3228 + } + }, + { + "ph": "f", "id": 290, "pid": 5714, "tid": 6744, "ts": 6303771931718.776, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771931733.246, "dur": 33.860, + "args": { + "External id": 154782,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771931736.346, "dur": 6.480, + "args": { + "External id": 154783,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771931743.976, "dur": 
22.540, + "args": { + "External id": 154784,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771931775.526, "dur": 6.780, + "args": { + "External id": 154785,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771931777.356, "dur": 4.520, + "args": { + "External id": 154786,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771931892.146, "dur": 148.949, + "args": { + "External id": 154787,"Record function id": 0, "Sequence number": 3058691, "Fwd thread id": 1, "Ev Idx": 3234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771931894.756, "dur": 139.999, + "args": { + "External id": 154788,"Sequence number": 3058691, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3235 + } + }, + { + "ph": "f", "id": 291, "pid": 5714, 
"tid": 6744, "ts": 6303771931894.756, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771931907.976, "dur": 29.650, + "args": { + "External id": 154789,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771931911.056, "dur": 6.310, + "args": { + "External id": 154790,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771931918.456, "dur": 18.630, + "args": { + "External id": 154791,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771931945.376, "dur": 6.380, + "args": { + "External id": 154792,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771931947.106, "dur": 4.250, + "args": { + "External id": 154793,"Record function id": 0, "Concrete Inputs": 
["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771932051.895, "dur": 311.030, + "args": { + "External id": 154794,"Record function id": 0, "Sequence number": 3058690, "Fwd thread id": 1, "Ev Idx": 3241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771932054.446, "dur": 298.289, + "args": { + "External id": 154795,"Sequence number": 3058690, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3242 + } + }, + { + "ph": "f", "id": 292, "pid": 5714, "tid": 6744, "ts": 6303771932054.446, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771932118.515, "dur": 38.790, + "args": { + "External id": 154796,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771932168.805, "dur": 22.580, + "args": { + "External id": 154797,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 
16384], [16384, 768], [768, 768]], "Ev Idx": 3244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771932200.335, "dur": 21.720, + "args": { + "External id": 154798,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771932232.015, "dur": 17.040, + "args": { + "External id": 154799,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771932256.915, "dur": 14.110, + "args": { + "External id": 154800,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771932278.115, "dur": 13.490, + "args": { + "External id": 154801,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771932317.865, "dur": 18.860, + "args": { + "External id": 154802,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 
0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932375.625, "dur": 11.080, + "args": { + "External id": 154803,"Record function id": 0, "Ev Idx": 3250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932378.505, "dur": 6.800, + "args": { + "External id": 154804,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771932380.945, "dur": 3.670, + "args": { + "External id": 154805,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771932381.765, "dur": 2.630, + "args": { + "External id": 154806,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932390.865, "dur": 5.070, + "args": { + "External id": 154807,"Record function id": 0, "Ev Idx": 3254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932392.445, "dur": 2.510, + "args": { + "External id": 154808,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771932393.155, "dur": 1.350, + "args": { + "External id": 154809,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771932393.515, "dur": 0.830, + "args": { + "External id": 154810,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932399.595, "dur": 4.760, + "args": { + "External id": 154811,"Record function id": 0, "Ev Idx": 3258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932401.075, "dur": 2.370, + "args": { + "External id": 154812,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771932401.695, "dur": 1.330, + "args": { + "External id": 154813,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771932402.175, "dur": 0.680, 
+ "args": { + "External id": 154814,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771932408.025, "dur": 189.959, + "args": { + "External id": 154815,"Record function id": 0, "Sequence number": 3058689, "Fwd thread id": 1, "Ev Idx": 3262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771932409.425, "dur": 159.989, + "args": { + "External id": 154816,"Sequence number": 3058689, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3263 + } + }, + { + "ph": "f", "id": 293, "pid": 5714, "tid": 6744, "ts": 6303771932409.425, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771932471.294, "dur": 21.131, + "args": { + "External id": 154817,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771932508.745, "dur": 12.169, + 
"args": { + "External id": 154818,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771932537.524, "dur": 14.670, + "args": { + "External id": 154819,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771932577.094, "dur": 16.250, + "args": { + "External id": 154820,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932609.244, "dur": 10.070, + 
"args": { + "External id": 154821,"Record function id": 0, "Ev Idx": 3268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771932612.084, "dur": 5.960, + "args": { + "External id": 154822,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771932614.164, "dur": 3.190, + "args": { + "External id": 154823,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771932614.914, "dur": 2.240, + "args": { + "External id": 154824,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771932623.524, "dur": 687.259, + "args": { + "External id": 154825,"Record function id": 0, "Sequence number": 3058688, "Fwd thread id": 1, "Ev Idx": 3272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771932624.954, "dur": 679.939, + "args": { + "External id": 154826,"Sequence number": 3058688, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3273 + } + }, + { + "ph": "f", "id": 294, "pid": 5714, "tid": 6744, "ts": 6303771932624.954, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate 
(model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6303771932641.254, "dur": 21.480, + "args": { + "External id": 154827,"Record function id": 0, "Ev Idx": 3274 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6303771932670.384, "dur": 49.040, + "args": { + "External id": 154828,"Record function id": 0, "Ev Idx": 3275 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.1)", "pid": 5714, "tid": 6744, + "ts": 6303771932725.814, "dur": 566.649, + "args": { + "External id": 154829,"Record function id": 0, "Ev Idx": 3276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771932788.034, "dur": 7.190, + "args": { + "External id": 154830,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771932803.144, "dur": 3.310, + "args": { + "External id": 154831,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771932817.524, "dur": 82.549, + "args": { + "External id": 154832,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3279 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771932826.704, "dur": 70.069, + "args": { + "External id": 154833,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771932839.344, "dur": 5.710, + "args": { + "External id": 154834,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771932849.004, "dur": 28.629, + "args": { + "External id": 154835,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771932850.324, "dur": 27.029, + "args": { + "External id": 154836,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771932852.484, "dur": 5.760, + "args": 
{ + "External id": 154837,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771932859.414, "dur": 17.470, + "args": { + "External id": 154838,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771932960.953, "dur": 8.030, + "args": { + "External id": 154839,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771932962.473, "dur": 5.990, + "args": { + "External id": 154840,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771932984.183, "dur": 76.060, + "args": { + "External id": 154841,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771932997.853, "dur": 59.440, + "args": { + "External id": 154842,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3289, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771933008.623, "dur": 44.690, + "args": { + "External id": 154843,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771933071.293, "dur": 3.100, + "args": { + "External id": 154844,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3291, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 
6303771933121.573, "dur": 3.680, + "args": { + "External id": 154845,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933151.943, "dur": 1.190, + "args": { + "External id": 154846,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933168.693, "dur": 0.980, + "args": { + "External id": 154847,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933182.673, "dur": 0.780, + "args": { + "External id": 154848,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933195.533, "dur": 0.730, + "args": { + "External id": 154849,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933208.483, "dur": 0.690, + "args": { + "External id": 154850,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933220.923, "dur": 0.740, + "args": { + "External id": 154851,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933233.773, "dur": 0.700, + "args": { + "External id": 154852,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933245.693, "dur": 0.720, + "args": { + "External id": 154853,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771933322.463, "dur": 772.798, + "args": { + "External id": 154854,"Record function id": 0, "Ev Idx": 3301 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_backward (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 
6303771933336.063, "dur": 313.619, + "args": { + "External id": 154855,"Record function id": 0, "Ev Idx": 3302 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6303771933346.663, "dur": 220.129, + "args": { + "External id": 154856,"Record function id": 0, "Ev Idx": 3303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933425.272, "dur": 3.580, + "args": { + "External id": 154857,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933432.842, "dur": 0.840, + "args": { + "External id": 154858,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933435.322, "dur": 0.680, + "args": { + "External id": 154859,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933437.512, "dur": 0.740, + "args": { + "External id": 154860,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933439.532, "dur": 0.600, + "args": { + "External id": 154861,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933441.412, "dur": 0.720, + "args": { + "External id": 154862,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933443.492, "dur": 0.600, + "args": { + "External id": 154863,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933445.322, "dur": 0.640, + "args": { + "External id": 154864,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933447.222, "dur": 0.570, + "args": { + "External id": 154865,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771933449.032, "dur": 0.490, + "args": { + "External id": 154866,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771933461.402, "dur": 
79.270, + "args": { + "External id": 154867,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 6744, + "ts": 6303771933471.702, "dur": 65.720, + "args": { + "External id": 154868,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771933481.152, "dur": 5.860, + "args": { + "External id": 154869,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771933489.082, "dur": 28.750, + "args": { + "External id": 154870,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", 
""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771933490.362, "dur": 27.190, + "args": { + "External id": 154871,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771933492.312, "dur": 5.920, + "args": { + "External id": 154872,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771933499.252, "dur": 17.840, + "args": { + "External id": 154873,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771933656.242, "dur": 429.159, + "args": { + "External id": 154874,"Sequence number": 3058687, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3321 + } + }, + { + "ph": "f", "id": 295, "pid": 5714, "tid": 6744, "ts": 6303771933656.242, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771933717.072, "dur": 
30.080, + "args": { + "External id": 154875,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [2048, 1], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 5714, "tid": 6744, + "ts": 6303771933773.702, "dur": 22.109, + "args": { + "External id": 154876,"kernel_hash": "cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/pr/cprtzoov4iygpqz6sh3uv5dr6wfbeyc3fy5rhgi2t4i6wfbusdna.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [16384, 2048], [8, 2048, 2048], [8, 2048, 2048], [8, 2048, 2048], []], "Ev Idx": 3323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::bmm", "pid": 5714, "tid": 6744, + "ts": 6303771933810.551, "dur": 32.420, + "args": { + "External id": 154877,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[0, 1, 768], [0, 2048, 1], [1572864, 2048, 1]], "Input Dims": [[1, 768, 16384], [1, 16384, 2048], [1, 768, 2048]], "Ev Idx": 3324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771933855.191, "dur": 24.410, + "args": { + "External id": 154878,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 
16384], [16384, 768], [2048, 768]], "Ev Idx": 3325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771933889.191, "dur": 18.710, + "args": { + "External id": 154879,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771933917.331, "dur": 21.860, + "args": { + "External id": 154880,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 2048], [768, 1], [768, 1]], "Input Dims": [[2048, 16384], [16384, 768], [2048, 768]], "Ev Idx": 3327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771933948.551, "dur": 18.200, + "args": { + "External id": 154881,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771933988.061, "dur": 19.650, + "args": { + "External id": 154882,"kernel_hash": "cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/pb/cpbn5nxeqyu26vfv65g36sowfx5dbtv5esqqwgp35j4lrpmreuhc.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[768, 1], [768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 
98304, 1, 768], [], []], "Input Dims": [[16384, 768], [16384, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771934023.231, "dur": 12.460, + "args": { + "External id": 154883,"kernel_hash": "cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/qf/cqf4x522u6wzb2m5ubpo3rfkiohqmphle7wug4hosaimltqmnook.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 5714, "tid": 6744, + "ts": 6303771934049.881, "dur": 15.160, + "args": { + "External id": 154884,"kernel_hash": "c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/3p/c3pjdddu6i2stfu75g6wysppqbwkextdgx533o5mvzcr4uptl3zz.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [], []], "Input Dims": [[8, 2048, 768], [16384, 768], [768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [], []], "Ev Idx": 3331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934108.391, "dur": 11.330, + "args": { + "External id": 154885,"Record function id": 0, "Ev Idx": 3332 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934111.431, "dur": 7.020, + "args": { + "External id": 154886,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771934113.791, "dur": 3.870, + "args": { + "External id": 154887,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771934114.801, "dur": 2.630, + "args": { + "External id": 154888,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934124.211, "dur": 5.090, + "args": { + "External id": 154889,"Record function id": 0, "Ev Idx": 3336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934125.841, "dur": 2.540, + "args": { + "External id": 154890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771934126.551, "dur": 1.400, + "args": { + "External id": 154891,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 
6303771934126.951, "dur": 0.840, + "args": { + "External id": 154892,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934133.011, "dur": 4.780, + "args": { + "External id": 154893,"Record function id": 0, "Ev Idx": 3340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934134.471, "dur": 2.310, + "args": { + "External id": 154894,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771934135.111, "dur": 1.250, + "args": { + "External id": 154895,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771934135.521, "dur": 0.680, + "args": { + "External id": 154896,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 3343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934141.481, "dur": 4.690, + "args": { + "External id": 154897,"Record function id": 0, "Ev Idx": 3344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934143.061, "dur": 2.200, + "args": { + "External id": 154898,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771934143.641, "dur": 1.200, + "args": { + "External id": 154899,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771934144.001, "dur": 0.670, + "args": { + "External id": 154900,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 3347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934149.741, "dur": 309.579, + "args": { + "External id": 154901,"Record function id": 0, "Sequence number": 3058686, "Fwd thread id": 1, "Ev Idx": 3348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934151.061, "dur": 300.659, + "args": { + "External id": 154902,"Sequence number": 3058686, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3349 + } + }, + { + "ph": "f", "id": 296, "pid": 5714, "tid": 6744, "ts": 6303771934151.061, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771934208.991, "dur": 37.339, + "args": { + "External id": 154903,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3350 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771934258.050, "dur": 18.460, + "args": { + "External id": 154904,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_backward", "pid": 5714, "tid": 6744, + "ts": 6303771934307.650, "dur": 121.590, + "args": { + "External id": 154905,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar", "long int"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [24576, 2048, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 12, 2048], [8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [2]], "Ev Idx": 3352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771934358.850, "dur": 6.730, + "args": { + "External id": 154906,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 
6303771934367.250, "dur": 3.440, + "args": { + "External id": 154907,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934472.350, "dur": 10.620, + "args": { + "External id": 154908,"Record function id": 0, "Ev Idx": 3355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771934475.190, "dur": 6.570, + "args": { + "External id": 154909,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771934477.590, "dur": 3.290, + "args": { + "External id": 154910,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771934478.410, "dur": 2.240, + "args": { + "External id": 154911,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934487.200, "dur": 181.000, + "args": { + "External id": 154912,"Record function id": 0, "Sequence number": 3058685, "Fwd thread id": 1, "Ev Idx": 3359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 
6744, + "ts": 6303771934488.750, "dur": 171.660, + "args": { + "External id": 154913,"Sequence number": 3058685, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3360 + } + }, + { + "ph": "f", "id": 297, "pid": 5714, "tid": 6744, "ts": 6303771934488.750, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771934503.100, "dur": 34.400, + "args": { + "External id": 154914,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771934506.230, "dur": 6.410, + "args": { + "External id": 154915,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771934513.820, "dur": 23.100, + "args": { + "External id": 154916,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771934545.760, "dur": 6.950, + "args": { + "External id": 154917,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", 
""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771934547.600, "dur": 4.680, + "args": { + "External id": 154918,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934682.480, "dur": 174.489, + "args": { + "External id": 154919,"Record function id": 0, "Sequence number": 3058684, "Fwd thread id": 1, "Ev Idx": 3366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934686.749, "dur": 162.870, + "args": { + "External id": 154920,"Sequence number": 3058684, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 3367 + } + }, + { + "ph": "f", "id": 298, "pid": 5714, "tid": 6744, "ts": 6303771934686.749, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 6744, + "ts": 6303771934703.200, "dur": 36.589, + "args": { + "External id": 154921,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", ""], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771934706.920, "dur": 7.600, + "args": { + "External id": 
154922,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771934716.020, "dur": 23.009, + "args": { + "External id": 154923,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], []], "Ev Idx": 3370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 6744, + "ts": 6303771934750.200, "dur": 8.320, + "args": { + "External id": 154924,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771934752.280, "dur": 5.740, + "args": { + "External id": 154925,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934868.709, "dur": 321.059, + "args": { + "External id": 154926,"Record function id": 0, "Sequence number": 3058683, "Fwd thread id": 1, "Ev Idx": 3373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771934871.839, "dur": 307.649, + "args": { + "External id": 154927,"Sequence number": 3058683, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64]], "Ev Idx": 3374 + } + }, + { + "ph": "f", "id": 299, "pid": 5714, "tid": 6744, "ts": 6303771934871.839, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771934938.149, "dur": 41.070, + "args": { + "External id": 154928,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771934991.609, "dur": 24.750, + "args": { + "External id": 154929,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771935026.629, "dur": 23.010, + "args": { + "External id": 154930,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 768], [768, 1], [768, 1]], "Input Dims": [[768, 16384], [16384, 768], [768, 768]], "Ev Idx": 3377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771935060.959, "dur": 18.590, + "args": { + 
"External id": 154931,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771935087.389, "dur": 15.070, + "args": { + "External id": 154932,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 6744, + "ts": 6303771935109.999, "dur": 14.349, + "args": { + "External id": 154933,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [768, 1], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_0", "pid": 5714, "tid": 6744, + "ts": 6303771935143.859, "dur": 18.860, + "args": { + "External id": 154934,"kernel_hash": "czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/zm/czmbsbpurxsfwlctj2shaz2gt4lhd4go2fkil72h2kdhqkxepocp.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [16384, 768], [16384, 768], []], "Ev Idx": 3381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935203.188, "dur": 11.400, + 
"args": { + "External id": 154935,"Record function id": 0, "Ev Idx": 3382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935205.919, "dur": 7.320, + "args": { + "External id": 154936,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771935208.499, "dur": 3.949, + "args": { + "External id": 154937,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771935209.468, "dur": 2.731, + "args": { + "External id": 154938,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935219.178, "dur": 5.220, + "args": { + "External id": 154939,"Record function id": 0, "Ev Idx": 3386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935220.798, "dur": 2.560, + "args": { + "External id": 154940,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771935221.518, "dur": 1.390, + "args": { + "External id": 154941,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3388 + 
} + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771935221.928, "dur": 0.800, + "args": { + "External id": 154942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935228.338, "dur": 5.110, + "args": { + "External id": 154943,"Record function id": 0, "Ev Idx": 3390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935229.908, "dur": 2.550, + "args": { + "External id": 154944,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771935230.548, "dur": 1.450, + "args": { + "External id": 154945,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771935231.068, "dur": 0.760, + "args": { + "External id": 154946,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 3393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771935237.388, "dur": 211.200, + "args": { + "External id": 154947,"Record function id": 0, "Sequence number": 3058682, "Fwd thread id": 1, "Ev Idx": 3394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 
6303771935238.808, "dur": 179.260, + "args": { + "External id": 154948,"Sequence number": 3058682, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3395 + } + }, + { + "ph": "f", "id": 300, "pid": 5714, "tid": 6744, "ts": 6303771935238.808, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 5714, "tid": 6744, + "ts": 6303771935312.618, "dur": 23.370, + "args": { + "External id": 154949,"kernel_hash": "crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee", "grid": "grid(98304,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "98304", "128"], "kernel_file": "/tmp/torchinductor_root/rz/crz27cg5tuoyv3x5267afh3vxhst2bejx22wcohc5hp655ooi3ee.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "float", "float", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [2048, 1, 1], [98304, 98304, 1, 768], [], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [8, 2048, 1], [1, 1, 768, 128], [], []], "Ev Idx": 3396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 5714, "tid": 6744, + "ts": 6303771935354.398, "dur": 13.200, + "args": { + "External id": 154950,"kernel_hash": "c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg", "grid": "grid(768,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "768", "128"], "kernel_file": "/tmp/torchinductor_root/4c/c4cf3h7thxzyvihlhvaqfv7b3fxfmfh5mch4temkcvixghtzlgzg.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[98304, 98304, 1, 768], [768, 768, 1], [], []], "Input Dims": [[1, 1, 768, 128], [1, 1, 768], [], []], "Ev Idx": 3397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 5714, "tid": 6744, + "ts": 6303771935384.868, "dur": 15.530, + "args": { + "External id": 154951,"kernel_hash": "cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/z7/cz7p7k4z6cvaki3xlgfie7khmrsimzyeisof4kqpvhaezifu3jto.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "float", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 1], [1], [1572864, 768, 1], [2048, 1, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 1], [8, 2048, 768], [], []], "Ev Idx": 3398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771935426.338, "dur": 17.090, + "args": { + "External id": 154952,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935460.778, "dur": 10.750, + "args": { + "External id": 154953,"Record function id": 0, "Ev Idx": 3400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771935463.598, "dur": 6.610, + "args": { + "External id": 154954,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771935465.798, "dur": 3.710, + "args": { + "External id": 154955,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771935466.688, "dur": 2.570, + "args": { + "External id": 154956,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[768]], "Ev Idx": 3403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771935476.348, "dur": 789.678, + "args": { + "External id": 154957,"Record function id": 0, "Sequence number": 3058681, "Fwd thread id": 1, "Ev Idx": 3404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771935477.928, "dur": 781.588, + "args": { + "External id": 154958,"Sequence number": 3058681, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3405 + } + }, + { + "ph": "f", "id": 301, "pid": 5714, "tid": 6744, "ts": 6303771935477.928, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6303771935495.708, "dur": 23.820, + "args": { + "External id": 154959,"Record function id": 0, "Ev Idx": 3406 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reshard (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6303771935527.338, "dur": 54.729, + "args": { + "External id": 154960,"Record function id": 0, "Ev Idx": 3407 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce (model.layers.0)", "pid": 5714, "tid": 6744, + "ts": 6303771935589.558, "dur": 664.148, + 
"args": { + "External id": 154961,"Record function id": 0, "Ev Idx": 3408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771935670.147, "dur": 10.250, + "args": { + "External id": 154962,"Record function id": 0, "Concrete Inputs": ["[7079424]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771935690.767, "dur": 4.390, + "args": { + "External id": 154963,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771935710.217, "dur": 103.980, + "args": { + "External id": 154964,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771935722.377, "dur": 88.240, + "args": { + "External id": 154965,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[[1], [768, 1], [768, 1], [768, 1], [768, 1], [1], [768, 1], [768, 1], [2048, 1]], [], [], [1769856, 1]], "Input Dims": [[[768], [768, 768], [768, 768], [768, 768], [768, 768], [768], [2048, 768], [2048, 768], [768, 2048]], [], [], [4, 1769856]], "Ev Idx": 3412 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771935738.767, "dur": 7.330, + "args": { + "External id": 154966,"Record function id": 0, "Concrete Inputs": ["[3514]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771935751.137, "dur": 37.590, + "args": { + "External id": 154967,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], [], []], "Ev Idx": 3414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771935753.127, "dur": 35.260, + "args": { + "External id": 154968,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[3514], [], [], [], [], [], []], "Ev Idx": 3415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771935755.817, "dur": 7.540, + "args": { + "External id": 154969,"Record function id": 0, "Concrete Inputs": ["[3514]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771935764.807, "dur": 23.060, + "args": { + "External id": 154970,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": 
[[1], [1], []], "Input Dims": [[3514], [3514], []], "Ev Idx": 3417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771935880.627, "dur": 8.720, + "args": { + "External id": 154971,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[7079424], [], [], [], [], []], "Ev Idx": 3418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771935882.327, "dur": 6.480, + "args": { + "External id": 154972,"Record function id": 0, "Concrete Inputs": ["[1769856]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771935906.537, "dur": 85.710, + "args": { + "External id": 154973,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[1769856], [7079424], [], [], [], []], "Ev Idx": 3420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771935922.217, "dur": 66.660, + "args": { + "External id": 154974,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1769856, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": 
"[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[7079424], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3421, "In msg nelems": 7079424 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771935934.527, "dur": 50.060, + "args": { + "External id": 154975,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[7079424]], "Ev Idx": 3422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771936004.877, "dur": 3.440, + "args": { + "External id": 154976,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3423, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936061.317, "dur": 4.220, + "args": { + "External id": 154977,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936096.166, "dur": 1.180, + "args": { + "External id": 154978,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "192"], "Input type": ["float", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936115.696, "dur": 1.150, + "args": { + "External id": 154979,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "147648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936131.636, "dur": 0.830, + "args": { + "External id": 154980,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "295104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936145.796, "dur": 1.020, + "args": { + "External id": 154981,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "442560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936159.806, "dur": 0.900, + "args": { + "External id": 154982,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936173.596, "dur": 1.010, + "args": { + "External id": 154983,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", 
"590208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936187.416, "dur": 0.850, + "args": { + "External id": 154984,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "983424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771936201.576, "dur": 0.920, + "args": { + "External id": 154985,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "1376640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771936278.926, "dur": 233.579, + "args": { + "External id": 154986,"Record function id": 0, "Sequence number": 3058680, "Fwd thread id": 1, "Ev Idx": 3433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunctionBackward", "pid": 5714, "tid": 6744, + "ts": 6303771936281.876, "dur": 198.560, + "args": { + "External id": 154987,"Sequence number": 3058680, "Fwd thread id": 1, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3434 + } + }, + { + "ph": "f", "id": 302, "pid": 5714, "tid": 6744, "ts": 6303771936281.876, + "cat": "fwdbwd", "name": "fwdbwd", "bp": "e" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_0", "pid": 5714, "tid": 6744, + "ts": 6303771936372.736, "dur": 26.190, 
+ "args": { + "External id": 154988,"kernel_hash": "c25lgxev5g5pgqmgeas3rsbfpey3d2wvz72yqf537xeysxmqtd4y", "grid": "grid(24576000,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "24576000"], "kernel_file": "/tmp/torchinductor_root/25/c25lgxev5g5pgqmgeas3rsbfpey3d2wvz72yqf537xeysxmqtd4y.py", "kernel_backend": "triton", "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 3435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_1", "pid": 5714, "tid": 6744, + "ts": 6303771936412.126, "dur": 17.230, + "args": { + "External id": 154989,"kernel_hash": "c272mj7qj3kjbzyvvqn5kn2ut5n2c42t7wgsqj2sturgngae2y3a", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/27/c272mj7qj3kjbzyvvqn5kn2ut5n2c42t7wgsqj2sturgngae2y3a.py", "kernel_backend": "triton", "Input type": ["long int", "c10::BFloat16", "float", "Scalar"], "Input Strides": [[2048, 1], [1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048], [8, 2048, 768], [32000, 768], []], "Ev Idx": 3436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_dense_backward_2", "pid": 5714, "tid": 6744, + "ts": 6303771936447.006, "dur": 17.410, + "args": { + "External id": 154990,"kernel_hash": "cg2ylt27tmwmnxcgudqpetr6cqsy6lzmizyy2xuskasljjabrsvm", "grid": "grid(24576000,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "24576000"], "kernel_file": "/tmp/torchinductor_root/g2/cg2ylt27tmwmnxcgudqpetr6cqsy6lzmizyy2xuskasljjabrsvm.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 3437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 6744, + "ts": 6303771936489.476, "dur": 18.149, + "args": { + "External id": 
154991,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 3438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "autograd::engine::evaluate_function: torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771936525.605, "dur": 11.960, + "args": { + "External id": 154992,"Record function id": 0, "Ev Idx": 3439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "torch::autograd::AccumulateGrad", "pid": 5714, "tid": 6744, + "ts": 6303771936528.965, "dur": 7.151, + "args": { + "External id": 154993,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 6744, + "ts": 6303771936531.545, "dur": 3.851, + "args": { + "External id": 154994,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 6744, + "ts": 6303771936532.445, "dur": 2.720, + "args": { + "External id": 154995,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 3442 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::root_post_backward_callback", "pid": 5714, "tid": 6744, + "ts": 6303771936555.436, "dur": 1900.105, + "args": { + "External id": 154996,"Record function id": 0, "Ev Idx": 3443 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_accumulate", "pid": 5714, "tid": 6744, + "ts": 6303771936573.665, "dur": 32.440, + "args": { + "External id": 154997,"Record function id": 0, "Ev Idx": 3444 + } + }, + { + "ph": "X", 
"cat": "user_annotation", "name": "FSDP::post_backward_reshard", "pid": 5714, "tid": 6744, + "ts": 6303771936615.545, "dur": 224.660, + "args": { + "External id": 154998,"Record function id": 0, "Ev Idx": 3445 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_backward_reduce", "pid": 5714, "tid": 6744, + "ts": 6303771936848.355, "dur": 1380.827, + "args": { + "External id": 154999,"Record function id": 0, "Ev Idx": 3446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771936969.764, "dur": 9.680, + "args": { + "External id": 155000,"Record function id": 0, "Concrete Inputs": ["[52894464]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 6744, + "ts": 6303771936989.535, "dur": 4.269, + "args": { + "External id": 155001,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[52894464], []], "Ev Idx": 3448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771937010.624, "dur": 178.740, + "args": { + "External id": 155002,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[], [], [], [13223616, 1]], "Input Dims": [[], [], [], [4, 13223616]], "Ev Idx": 3449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_chunk_cat", "pid": 5714, "tid": 6744, + "ts": 6303771937025.524, "dur": 159.330, + "args": { + "External id": 155003,"Record function id": 0, "Concrete Inputs": ["", "0", "4", ""], "Input type": ["TensorList", "Scalar", "Scalar", "float"], "Input Strides": [[], [], [], [13223616, 1]], "Input Dims": [[], [], [], [4, 13223616]], "Ev 
Idx": 3450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771937095.164, "dur": 7.780, + "args": { + "External id": 155004,"Record function id": 0, "Concrete Inputs": ["[26063]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 6744, + "ts": 6303771937122.584, "dur": 37.510, + "args": { + "External id": 155005,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[26063], [], [], [], [], [], [], []], "Ev Idx": 3452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 6744, + "ts": 6303771937124.274, "dur": 35.440, + "args": { + "External id": 155006,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[26063], [], [], [], [], [], []], "Ev Idx": 3453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937127.084, "dur": 7.850, + "args": { + "External id": 155007,"Record function id": 0, "Concrete Inputs": ["[26063]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 6744, + "ts": 6303771937136.334, "dur": 22.740, + "args": { + "External id": 155008,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", 
"Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[26063], [26063], []], "Ev Idx": 3455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 6744, + "ts": 6303771937286.014, "dur": 10.020, + "args": { + "External id": 155009,"Record function id": 0, "Concrete Inputs": ["", "[13223616]", "", "", "", "False"], "Input type": ["float", "ScalarList", "", "", "", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[52894464], [], [], [], [], []], "Ev Idx": 3456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 6744, + "ts": 6303771937288.124, "dur": 7.340, + "args": { + "External id": 155010,"Record function id": 0, "Concrete Inputs": ["[13223616]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_reduce_scatter_base_", "pid": 5714, "tid": 6744, + "ts": 6303771937324.544, "dur": 95.939, + "args": { + "External id": 155011,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "-1"], "Input type": ["float", "float", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], [], []], "Input Dims": [[13223616], [52894464], [], [], [], []], "Ev Idx": 3458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771937341.104, "dur": 75.759, + "args": { + "External id": 155012,"Record function id": 0, "Collective name": "_reduce_scatter_base", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 13223616, "Process Group Name": "0", "Input type": ["float", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], 
"Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[52894464], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3459, "In msg nelems": 52894464 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 5714, "tid": 6744, + "ts": 6303771937355.404, "dur": 56.579, + "args": { + "External id": 155013,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[52894464]], "Ev Idx": 3460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 6744, + "ts": 6303771937433.943, "dur": 3.820, + "args": { + "External id": 155014,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3461, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937497.013, "dur": 4.810, + "args": { + "External id": 155015,"Record function id": 0, "Concrete Inputs": ["", "[8000, 768]", "[768, 1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937537.083, "dur": 1.330, + "args": { + "External id": 155016,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], 
"Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937557.543, "dur": 1.340, + "args": { + "External id": 155017,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6144192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937575.513, "dur": 1.020, + "args": { + "External id": 155018,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6291648"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937592.153, "dur": 1.230, + "args": { + "External id": 155019,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6439104"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937610.513, "dur": 1.250, + "args": { + "External id": 155020,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "6586560"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937631.973, "dur": 1.340, + "args": { + "External id": 155021,"Record 
function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937651.673, "dur": 1.340, + "args": { + "External id": 155022,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "6734208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937672.383, "dur": 1.380, + "args": { + "External id": 155023,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "7127424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937692.623, "dur": 1.170, + "args": { + "External id": 155024,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "7520640"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937711.783, "dur": 1.470, + "args": { + "External id": 155025,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937732.173, 
"dur": 1.460, + "args": { + "External id": 155026,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "7914048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937751.763, "dur": 1.330, + "args": { + "External id": 155027,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8061504"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937769.203, "dur": 1.050, + "args": { + "External id": 155028,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8208960"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937785.333, "dur": 1.160, + "args": { + "External id": 155029,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "8356416"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937801.753, "dur": 1.140, + "args": { + "External id": 155030,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937818.013, "dur": 1.160, + "args": { + "External id": 155031,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "8504064"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937834.482, "dur": 1.071, + "args": { + "External id": 155032,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "8897280"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937850.793, "dur": 1.089, + "args": { + "External id": 155033,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "9290496"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937867.133, "dur": 1.029, + "args": { + "External id": 155034,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937883.482, "dur": 1.071, + "args": { + "External id": 155035,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9683904"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], 
[]], "Ev Idx": 3482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937899.913, "dur": 1.309, + "args": { + "External id": 155036,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9831360"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937916.332, "dur": 1.390, + "args": { + "External id": 155037,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "9978816"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937932.812, "dur": 1.190, + "args": { + "External id": 155038,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "10126272"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937948.782, "dur": 1.130, + "args": { + "External id": 155039,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937964.632, "dur": 1.090, + "args": { + "External id": 155040,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "10273920"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], 
"Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937980.672, "dur": 1.140, + "args": { + "External id": 155041,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "10667136"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771937997.342, "dur": 1.260, + "args": { + "External id": 155042,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "11060352"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938013.982, "dur": 1.170, + "args": { + "External id": 155043,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938030.642, "dur": 1.030, + "args": { + "External id": 155044,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11453760"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938046.972, "dur": 1.050, + "args": { + "External id": 155045,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", 
"11601216"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938063.692, "dur": 1.040, + "args": { + "External id": 155046,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11748672"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938080.122, "dur": 1.180, + "args": { + "External id": 155047,"Record function id": 0, "Concrete Inputs": ["", "[192, 768]", "[768, 1]", "11896128"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938097.142, "dur": 1.110, + "args": { + "External id": 155048,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938114.172, "dur": 1.020, + "args": { + "External id": 155049,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "12043776"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938130.932, "dur": 0.950, + "args": { + "External id": 
155050,"Record function id": 0, "Concrete Inputs": ["", "[512, 768]", "[768, 1]", "12436992"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938146.732, "dur": 1.040, + "args": { + "External id": 155051,"Record function id": 0, "Concrete Inputs": ["", "[192, 2048]", "[2048, 1]", "12830208"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 6744, + "ts": 6303771938163.042, "dur": 1.040, + "args": { + "External id": 155052,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3499 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "ProfilerStep#9727", "pid": 5714, "tid": 5714, + "ts": 6303771452771.651, "dur": 688865.778, + "args": { + "External id": 147457,"Record function id": 0, "Ev Idx": 3500 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.zero_grad#AdamW.zero_grad", "pid": 5714, "tid": 5714, + "ts": 6303771452808.761, "dur": 225.919, + "args": { + "External id": 147458,"Record function id": 0, "Ev Idx": 3501 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "enumerate(DataLoader)#_StatefulMultiProcessingDataLoaderIter.__next__", "pid": 5714, "tid": 5714, + "ts": 6303771453071.380, "dur": 1426.647, + "args": { + "External id": 147459,"Record function id": 0, "Ev Idx": 3502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771453780.858, "dur": 6.520, + "args": { + "External id": 
147460,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5714, "tid": 5714, + "ts": 6303771453805.829, "dur": 6.009, + "args": { + "External id": 147461,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771454114.018, "dur": 3.120, + "args": { + "External id": 147462,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5714, "tid": 5714, + "ts": 6303771454124.818, "dur": 2.930, + "args": { + "External id": 147463,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771454422.687, "dur": 3.110, + "args": { + "External id": 147464,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3507 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "aten::set_", "pid": 5714, "tid": 5714, + "ts": 6303771454433.547, "dur": 2.900, + "args": { + "External id": 147465,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "0", "[8, 4096]", "[4096, 1]"], "Input type": ["long int", "", "Scalar", "ScalarList", "ScalarList"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[0], [], [], [], []], "Ev Idx": 3508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771454806.746, "dur": 11.600, + "args": { + "External id": 147466,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], []], "Ev Idx": 3509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771454813.006, "dur": 2.240, + "args": { + "External id": 147467,"Record function id": 0, "Concrete Inputs": ["", "[8, 4096]", "[4096, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 3510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771454820.396, "dur": 4.610, + "args": { + "External id": 147468,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], []], "Ev Idx": 3511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771454822.386, "dur": 1.150, + "args": { + "External id": 147469,"Record function 
id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 3512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771454843.156, "dur": 135.490, + "args": { + "External id": 147470,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], [], []], "Ev Idx": 3513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771454850.516, "dur": 127.740, + "args": { + "External id": 147471,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], []], "Ev Idx": 3514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771454855.946, "dur": 8.740, + "args": { + "External id": 147472,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "[2048, 1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771454866.486, "dur": 111.150, + "args": { + "External id": 147473,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], 
[4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771454873.306, "dur": 0.250, + "args": { + "External id": 147474,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 3517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand_as", "pid": 5714, "tid": 5714, + "ts": 6303771454875.576, "dur": 6.070, + "args": { + "External id": 147475,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["long int", "long int"], "Input Strides": [[4096, 1], [2048, 1]], "Input Dims": [[8, 2048], [8, 2048]], "Ev Idx": 3518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 5714, + "ts": 6303771454878.786, "dur": 2.670, + "args": { + "External id": 147476,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], []], "Input Dims": [[8, 2048], [], []], "Ev Idx": 3519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771454880.456, "dur": 0.620, + "args": { + "External id": 147477,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771454883.336, "dur": 50.750, + "args": { + "External id": 147478,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 3521 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771454884.816, "dur": 48.910, + "args": { + "External id": 147479,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 3522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771454886.496, "dur": 8.450, + "args": { + "External id": 147480,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 3523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771454888.626, "dur": 5.870, + "args": { + "External id": 147481,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771454895.756, "dur": 37.600, + "args": { + "External id": 147482,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771454935.536, "dur": 40.700, + "args": { + "External id": 147483,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771454986.946, "dur": 76.180, + "args": { + "External id": 147484,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], [], []], "Ev Idx": 3527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771454988.516, "dur": 74.330, + "args": { + "External id": 147485,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], []], "Ev Idx": 3528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771454991.786, "dur": 5.570, + "args": { + "External id": 147486,"Record function id": 0, "Concrete Inputs": ["[8, 4096]", "[4096, 1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771454998.306, "dur": 63.910, + "args": { + "External id": 147487,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[4096, 1], [4096, 1], []], "Input Dims": [[8, 4096], [8, 4096], []], "Ev Idx": 3530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::arange", "pid": 5714, "tid": 5714, + "ts": 6303771455075.406, "dur": 36.589, + "args": { + "External id": 147488,"Record function id": 0, "Concrete 
Inputs": ["0", "2048", "", "", "", "False"], "Input type": ["Scalar", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771455078.226, "dur": 5.040, + "args": { + "External id": 147489,"Record function id": 0, "Concrete Inputs": ["[0]", "4", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::arange", "pid": 5714, "tid": 5714, + "ts": 6303771455085.016, "dur": 26.570, + "args": { + "External id": 147490,"Record function id": 0, "Concrete Inputs": ["0", "2048", "1", ""], "Input type": ["Scalar", "Scalar", "Scalar", "long int"], "Input Strides": [[], [], [], [1]], "Input Dims": [[], [], [], [0]], "Ev Idx": 3533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771455089.206, "dur": 4.040, + "args": { + "External id": 147491,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["long int", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 3534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::repeat", "pid": 5714, "tid": 5714, + "ts": 6303771455121.006, "dur": 54.020, + "args": { + "External id": 147492,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 3535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 5714, + "ts": 6303771455123.846, "dur": 4.300, + "args": { + "External id": 147493,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048]", "False"], "Input type": 
["long int", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[2048], [], []], "Ev Idx": 3536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455126.475, "dur": 1.260, + "args": { + "External id": 147494,"Record function id": 0, "Concrete Inputs": ["", "[1, 2048]", "[2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 3537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771455129.186, "dur": 6.440, + "args": { + "External id": 147495,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6303771455138.495, "dur": 2.011, + "args": { + "External id": 147496,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[2048, 1]], "Input Dims": [[8, 2048]], "Ev Idx": 3539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5714, "tid": 5714, + "ts": 6303771455142.346, "dur": 4.100, + "args": { + "External id": 147497,"Record function id": 0, "Concrete Inputs": ["", "0", "1", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455145.795, "dur": 0.411, + "args": { + "External id": 147498,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 1]", "[2048, 1, 2048]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], 
[]], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 3541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5714, "tid": 5714, + "ts": 6303771455147.275, "dur": 1.620, + "args": { + "External id": 147499,"Record function id": 0, "Concrete Inputs": ["", "1", "2048", "2048"], "Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 2048], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 3542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455148.186, "dur": 0.540, + "args": { + "External id": 147500,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "[2048, 2048, 2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1, 2048], [], [], []], "Input Dims": [[8, 2048, 1], [], [], []], "Ev Idx": 3543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand_as", "pid": 5714, "tid": 5714, + "ts": 6303771455149.986, "dur": 3.960, + "args": { + "External id": 147501,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["long int", "long int"], "Input Strides": [[2048, 1], [2048, 2048, 2048, 1]], "Input Dims": [[1, 2048], [8, 1, 1, 2048]], "Ev Idx": 3544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::expand", "pid": 5714, "tid": 5714, + "ts": 6303771455150.826, "dur": 2.940, + "args": { + "External id": 147502,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "False"], "Input type": ["long int", "ScalarList", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[1, 2048], [], []], "Ev Idx": 3545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455152.955, "dur": 0.640, + "args": { + "External id": 147503,"Record function id": 0, "Concrete Inputs": ["", "[8, 1, 1, 2048]", "[0, 2048, 2048, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], 
"Input Strides": [[2048, 1], [], [], []], "Input Dims": [[1, 2048], [], [], []], "Ev Idx": 3546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771455154.806, "dur": 19.469, + "args": { + "External id": 147504,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 2048, 2048, 1], [0, 2048, 2048, 1], []], "Input Dims": [[8, 1, 1, 2048], [8, 1, 1, 2048], []], "Ev Idx": 3547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771455182.455, "dur": 25.150, + "args": { + "External id": 147505,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "3", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[2048, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 3548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771455183.626, "dur": 23.719, + "args": { + "External id": 147506,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "3", "", "", "", "False", ""], "Input type": ["long int", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[2048, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], [], []], "Ev Idx": 3549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455186.946, "dur": 5.329, + "args": { + "External id": 147507,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "[2048, 1]", "3", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + 
"ts": 6303771455193.266, "dur": 13.489, + "args": { + "External id": 147508,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["int", "long int", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 3551 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::root_pre_forward", "pid": 5714, "tid": 5714, + "ts": 6303771455278.695, "dur": 150.320, + "args": { + "External id": 147509,"Record function id": 0, "Ev Idx": 3552 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::inputs_to_device", "pid": 5714, "tid": 5714, + "ts": 6303771455360.195, "dur": 55.780, + "args": { + "External id": 147510,"Record function id": 0, "Ev Idx": 3553 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771455436.925, "dur": 42.690, + "args": { + "External id": 147511,"Record function id": 0, "Ev Idx": 3554 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward", "pid": 5714, "tid": 5714, + "ts": 6303771455488.625, "dur": 1344.157, + "args": { + "External id": 147512,"Record function id": 0, "Ev Idx": 3555 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather", "pid": 5714, "tid": 5714, + "ts": 6303771455496.125, "dur": 742.248, + "args": { + "External id": 147513,"Record function id": 0, "Ev Idx": 3556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771455583.814, "dur": 10.840, + "args": { + "External id": 147514,"Record function id": 0, "Concrete Inputs": ["[13223616]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771455610.585, "dur": 100.439, + 
"args": { + "External id": 147515,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["c10::BFloat16", "", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[13223616], [], []], "Ev Idx": 3558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455616.185, "dur": 1.640, + "args": { + "External id": 147516,"Record function id": 0, "Concrete Inputs": ["", "[6144000]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455619.585, "dur": 0.329, + "args": { + "External id": 147517,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455622.045, "dur": 0.400, + "args": { + "External id": 147518,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6144192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455624.854, "dur": 0.231, + "args": { + "External id": 147519,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6291648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771455625.994, "dur": 1.180, + "args": { + "External id": 147520,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6439104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455629.605, "dur": 0.280, + "args": { + "External id": 147521,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6586560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455631.854, "dur": 0.231, + "args": { + "External id": 147522,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455633.045, "dur": 1.289, + "args": { + "External id": 147523,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6734208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455636.654, "dur": 0.260, + "args": { + "External id": 147524,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7127424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3567 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455637.905, "dur": 0.209, + "args": { + "External id": 147525,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7520640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455640.114, "dur": 0.311, + "args": { + "External id": 147526,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455642.354, "dur": 0.211, + "args": { + "External id": 147527,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "7914048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455643.434, "dur": 1.371, + "args": { + "External id": 147528,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8061504"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455646.785, "dur": 0.300, + "args": { + "External id": 147529,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8208960"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[13223616], [], [], []], "Ev Idx": 3572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455648.904, "dur": 0.260, + "args": { + "External id": 147530,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8356416"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455650.104, "dur": 1.450, + "args": { + "External id": 147531,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455653.774, "dur": 0.210, + "args": { + "External id": 147532,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8504064"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455654.844, "dur": 0.220, + "args": { + "External id": 147533,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8897280"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455657.164, "dur": 0.310, + "args": { + "External id": 147534,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "9290496"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455659.514, "dur": 0.210, + "args": { + "External id": 147535,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455660.604, "dur": 0.900, + "args": { + "External id": 147536,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9683904"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455663.714, "dur": 0.230, + "args": { + "External id": 147537,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9831360"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455665.894, "dur": 0.220, + "args": { + "External id": 147538,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9978816"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455667.134, "dur": 1.440, + "args": { + "External id": 147539,"Record function 
id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "10126272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455670.744, "dur": 0.250, + "args": { + "External id": 147540,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455671.874, "dur": 0.220, + "args": { + "External id": 147541,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10273920"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455673.944, "dur": 0.210, + "args": { + "External id": 147542,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10667136"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455676.694, "dur": 0.210, + "args": { + "External id": 147543,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "11060352"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771455677.764, "dur": 1.370, + "args": { + "External id": 147544,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455681.164, "dur": 0.220, + "args": { + "External id": 147545,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11453760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455683.364, "dur": 0.220, + "args": { + "External id": 147546,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11601216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455684.484, "dur": 1.090, + "args": { + "External id": 147547,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11748672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455687.624, "dur": 0.210, + "args": { + "External id": 147548,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11896128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3591 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455688.734, "dur": 0.200, + "args": { + "External id": 147549,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455691.094, "dur": 0.380, + "args": { + "External id": 147550,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12043776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455693.464, "dur": 0.210, + "args": { + "External id": 147551,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12436992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455694.644, "dur": 0.950, + "args": { + "External id": 147552,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12830208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455697.524, "dur": 0.210, + "args": { + "External id": 147553,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771455729.494, "dur": 38.180, + "args": { + "External id": 147554,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 3597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771455838.284, "dur": 167.369, + "args": { + "External id": 147555,"Record function id": 0, "Concrete Inputs": ["", "", "13223616", "4", "0", "15", ""], "Input type": ["TensorList", "", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], []], "Ev Idx": 3598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771455851.024, "dur": 7.240, + "args": { + "External id": 147556,"Record function id": 0, "Concrete Inputs": ["[52894464]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771455863.384, "dur": 8.720, + "args": { + "External id": 147557,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "13223616"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[52894464], [], [], []], "Ev Idx": 3600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771455865.874, "dur": 5.880, + "args": { + "External id": 147558,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "13223616", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", 
"Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[52894464], [], [], [], []], "Ev Idx": 3601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455869.284, "dur": 0.730, + "args": { + "External id": 147559,"Record function id": 0, "Concrete Inputs": ["", "[13223616]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[52894464], [], [], []], "Ev Idx": 3602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771455880.054, "dur": 71.960, + "args": { + "External id": 147560,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["c10::BFloat16", "", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[13223616], [], []], "Ev Idx": 3603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455881.674, "dur": 0.470, + "args": { + "External id": 147561,"Record function id": 0, "Concrete Inputs": ["", "[6144000]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455884.104, "dur": 0.190, + "args": { + "External id": 147562,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6144000"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455884.984, "dur": 0.270, + "args": { + "External id": 147563,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6144192"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455886.214, "dur": 0.170, + "args": { + "External id": 147564,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6291648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455888.114, "dur": 0.980, + "args": { + "External id": 147565,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6439104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455889.974, "dur": 0.150, + "args": { + "External id": 147566,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "6586560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455890.804, "dur": 0.160, + "args": { + "External id": 147567,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "6734016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455892.734, "dur": 0.250, + "args": { + "External id": 147568,"Record function 
id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "6734208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455893.864, "dur": 0.240, + "args": { + "External id": 147569,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7127424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455895.594, "dur": 0.150, + "args": { + "External id": 147570,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "7520640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455896.554, "dur": 0.210, + "args": { + "External id": 147571,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "7913856"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455897.444, "dur": 1.290, + "args": { + "External id": 147572,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "7914048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771455900.324, "dur": 0.950, + "args": { + "External id": 147573,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8061504"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455901.954, "dur": 0.170, + "args": { + "External id": 147574,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8208960"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455903.754, "dur": 0.260, + "args": { + "External id": 147575,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "8356416"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455905.464, "dur": 0.170, + "args": { + "External id": 147576,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "8503872"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455906.304, "dur": 0.150, + "args": { + "External id": 147577,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8504064"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3620 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455909.014, "dur": 0.460, + "args": { + "External id": 147578,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "8897280"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455910.144, "dur": 0.150, + "args": { + "External id": 147579,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "9290496"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455911.004, "dur": 1.140, + "args": { + "External id": 147580,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "9683712"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455913.714, "dur": 0.900, + "args": { + "External id": 147581,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9683904"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455915.354, "dur": 0.150, + "args": { + "External id": 147582,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9831360"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[13223616], [], [], []], "Ev Idx": 3625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455917.404, "dur": 0.250, + "args": { + "External id": 147583,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "9978816"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455919.014, "dur": 0.220, + "args": { + "External id": 147584,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "10126272"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455920.004, "dur": 0.150, + "args": { + "External id": 147585,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "10273728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455922.904, "dur": 0.170, + "args": { + "External id": 147586,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10273920"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455923.734, "dur": 0.160, + "args": { + "External id": 147587,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "10667136"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455924.594, "dur": 1.540, + "args": { + "External id": 147588,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "11060352"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455927.714, "dur": 1.050, + "args": { + "External id": 147589,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "11453568"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455929.434, "dur": 0.150, + "args": { + "External id": 147590,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11453760"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455931.194, "dur": 0.260, + "args": { + "External id": 147591,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11601216"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455933.404, "dur": 0.150, + "args": { + "External id": 147592,"Record 
function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11748672"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455934.214, "dur": 0.160, + "args": { + "External id": 147593,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "11896128"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455936.844, "dur": 0.160, + "args": { + "External id": 147594,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "12043584"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455937.774, "dur": 0.160, + "args": { + "External id": 147595,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12043776"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455938.634, "dur": 1.070, + "args": { + "External id": 147596,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12436992"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + 
"ts": 6303771455941.384, "dur": 0.820, + "args": { + "External id": 147597,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "12830208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771455942.874, "dur": 0.160, + "args": { + "External id": 147598,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "13223424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[13223616], [], [], []], "Ev Idx": 3641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771455968.344, "dur": 24.180, + "args": { + "External id": 147599,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 3642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771456062.193, "dur": 100.830, + "args": { + "External id": 147600,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[52894464], [13223616], [], [], []], "Ev Idx": 3643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771456087.024, "dur": 72.539, + "args": { + "External id": 147601,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg 
nelems": 52894464, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[13223616], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3644, "In msg nelems": 13223616 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771456100.273, "dur": 54.610, + "args": { + "External id": 147602,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[13223616]], "Ev Idx": 3645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771456178.413, "dur": 3.340, + "args": { + "External id": 147603,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3646, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out", "pid": 5714, "tid": 5714, + "ts": 6303771456249.983, "dur": 437.069, + "args": { + "External id": 147604,"Record function id": 0, "Ev Idx": 3647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456427.553, "dur": 4.090, + "args": { + "External id": 147605,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[52894464], []], "Ev Idx": 3648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456436.023, "dur": 0.760, + "args": { + "External id": 147606,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[24576000], []], "Ev Idx": 3649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456438.553, "dur": 0.660, + "args": { + "External id": 147607,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456440.763, "dur": 0.530, + "args": { + "External id": 147608,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456443.613, "dur": 0.480, + "args": { + "External id": 147609,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456445.453, "dur": 0.560, + "args": { + "External id": 147610,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456447.363, "dur": 1.509, + "args": { + "External id": 
147611,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456450.143, "dur": 0.909, + "args": { + "External id": 147612,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456453.252, "dur": 0.731, + "args": { + "External id": 147613,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456456.092, "dur": 0.460, + "args": { + "External id": 147614,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456457.823, "dur": 0.529, + "args": { + "External id": 147615,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456459.552, "dur": 0.531, + "args": { + "External id": 147616,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", 
"pid": 5714, "tid": 5714, + "ts": 6303771456462.432, "dur": 0.531, + "args": { + "External id": 147617,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456464.092, "dur": 0.520, + "args": { + "External id": 147618,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456465.752, "dur": 1.420, + "args": { + "External id": 147619,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456468.343, "dur": 0.700, + "args": { + "External id": 147620,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456471.523, "dur": 0.449, + "args": { + "External id": 147621,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456473.872, "dur": 0.471, + "args": { + "External id": 147622,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": 
[[1572864], []], "Ev Idx": 3665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456475.632, "dur": 0.531, + "args": { + "External id": 147623,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456477.303, "dur": 0.469, + "args": { + "External id": 147624,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456480.252, "dur": 0.460, + "args": { + "External id": 147625,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456481.903, "dur": 0.460, + "args": { + "External id": 147626,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456483.543, "dur": 1.280, + "args": { + "External id": 147627,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456486.032, "dur": 0.771, + "args": { + "External id": 147628,"Record function id": 0, "Concrete Inputs": ["", "[4, 
-1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456489.032, "dur": 0.520, + "args": { + "External id": 147629,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456491.852, "dur": 0.591, + "args": { + "External id": 147630,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456493.752, "dur": 0.571, + "args": { + "External id": 147631,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456495.492, "dur": 0.471, + "args": { + "External id": 147632,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456498.003, "dur": 0.589, + "args": { + "External id": 147633,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456504.543, "dur": 
0.500, + "args": { + "External id": 147634,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456506.283, "dur": 1.089, + "args": { + "External id": 147635,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456508.583, "dur": 0.909, + "args": { + "External id": 147636,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456511.603, "dur": 0.540, + "args": { + "External id": 147637,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456514.152, "dur": 0.440, + "args": { + "External id": 147638,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456515.903, "dur": 0.500, + "args": { + "External id": 147639,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3682 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456517.543, "dur": 0.460, + "args": { + "External id": 147640,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456520.192, "dur": 0.540, + "args": { + "External id": 147641,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456522.003, "dur": 0.549, + "args": { + "External id": 147642,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771456523.732, "dur": 1.300, + "args": { + "External id": 147643,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771456543.712, "dur": 105.660, + "args": { + "External id": 147644,"Record function id": 0, "Concrete Inputs": ["", "", "1", ""], "Input type": ["c10::BFloat16", "", "Scalar", "TensorList"], "Input Strides": [[13223616, 1], [], [], []], "Input Dims": [[4, 13223616], [], [], []], "Ev Idx": 3687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771456560.302, "dur": 84.160, + "args": { + "External id": 147645,"Record function id": 0, "Concrete 
Inputs": ["", "", "1", ""], "Input type": ["c10::BFloat16", "", "Scalar", "TensorList"], "Input Strides": [[13223616, 1], [], [], []], "Input Dims": [[4, 13223616], [], [], []], "Ev Idx": 3688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771456574.432, "dur": 2.880, + "args": { + "External id": 147646,"Record function id": 0, "Concrete Inputs": ["[2750]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771456581.552, "dur": 36.360, + "args": { + "External id": 147647,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[2750], [], [], [], [], [], [], []], "Ev Idx": 3690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771456583.172, "dur": 34.410, + "args": { + "External id": 147648,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[2750], [], [], [], [], [], []], "Ev Idx": 3691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771456586.532, "dur": 7.170, + "args": { + "External id": 147649,"Record function id": 0, "Concrete Inputs": ["[2750]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + 
"ts": 6303771456595.012, "dur": 22.080, + "args": { + "External id": 147650,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2750], [2750], []], "Ev Idx": 3693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771456893.342, "dur": 20.940, + "args": { + "External id": 147651,"Record function id": 0, "Ev Idx": 3694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/0", "pid": 5714, "tid": 5714, + "ts": 6303771456915.311, "dur": 147.200, + "args": { + "External id": 147652,"Record function id": 0, "Ev Idx": 3695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771456944.891, "dur": 107.260, + "args": { + "External id": 147653,"Sequence number": 3058680, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "long int"], "Input Strides": [[768, 1], [2048, 1]], "Input Dims": [[32000, 768], [8, 2048]], "Ev Idx": 3696 + } + }, + { + "ph": "s", "id": 302, "pid": 5714, "tid": 5714, "ts": 6303771456944.891, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_embedding_0", "pid": 5714, "tid": 5714, + "ts": 6303771456993.862, "dur": 27.939, + "args": { + "External id": 147654,"kernel_hash": "chx7cxfd4w3vbh4d6l24hldpnxluepxuj4zcshyicrtcgke24jvt", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/hx/chx7cxfd4w3vbh4d6l24hldpnxluepxuj4zcshyicrtcgke24jvt.py", "kernel_backend": "triton", "Input type": ["long int", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048], [32000, 768], [8, 2048, 768], []], "Ev Idx": 3697 + } + }, + { + "ph": "X", 
"cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771457108.071, "dur": 46.390, + "args": { + "External id": 147655,"Record function id": 0, "Ev Idx": 3698 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6303771457163.771, "dur": 937.788, + "args": { + "External id": 147656,"Record function id": 0, "Ev Idx": 3699 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6303771457171.481, "dur": 447.139, + "args": { + "External id": 147657,"Record function id": 0, "Ev Idx": 3700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771457226.901, "dur": 8.340, + "args": { + "External id": 147658,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771457244.731, "dur": 22.180, + "args": { + "External id": 147659,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457248.331, "dur": 1.120, + "args": { + "External id": 147660,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3703 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457250.761, "dur": 0.190, + "args": { + "External id": 147661,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457253.011, "dur": 0.250, + "args": { + "External id": 147662,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457254.821, "dur": 1.080, + "args": { + "External id": 147663,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457256.581, "dur": 1.260, + "args": { + "External id": 147664,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457258.531, "dur": 0.240, + "args": { + "External id": 147665,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 3708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457260.171, "dur": 0.160, + "args": { + "External id": 147666,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457262.021, "dur": 0.240, + "args": { + "External id": 147667,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457263.691, "dur": 0.150, + "args": { + "External id": 147668,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771457273.661, "dur": 21.930, + "args": { + "External id": 147669,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, 
"tid": 5714, + "ts": 6303771457331.771, "dur": 90.019, + "args": { + "External id": 147670,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771457341.001, "dur": 8.900, + "args": { + "External id": 147671,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771457353.890, "dur": 7.900, + "args": { + "External id": 147672,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771457356.141, "dur": 5.309, + "args": { + "External id": 147673,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457359.041, "dur": 0.740, + "args": { + "External id": 147674,"Record function id": 0, "Concrete 
Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771457368.950, "dur": 17.851, + "args": { + "External id": 147675,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457370.501, "dur": 1.329, + "args": { + "External id": 147676,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457373.941, "dur": 0.289, + "args": { + "External id": 147677,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457374.910, "dur": 0.240, + "args": { + "External id": 147678,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771457376.221, "dur": 0.160, + "args": { + "External id": 147679,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457378.301, "dur": 0.220, + "args": { + "External id": 147680,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457379.170, "dur": 0.151, + "args": { + "External id": 147681,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457380.390, "dur": 1.000, + "args": { + "External id": 147682,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457382.030, "dur": 0.260, + "args": { + "External id": 147683,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3726 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457382.930, "dur": 1.131, + "args": { + "External id": 147684,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771457396.610, "dur": 17.491, + "args": { + "External id": 147685,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771457471.630, "dur": 81.700, + "args": { + "External id": 147686,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771457488.240, "dur": 62.290, + "args": { + "External id": 147687,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", 
"ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3730, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771457498.590, "dur": 48.040, + "args": { + "External id": 147688,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771457567.590, "dur": 3.090, + "args": { + "External id": 147689,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3732, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6303771457636.380, "dur": 328.099, + "args": { + "External id": 147690,"Record function id": 0, "Ev Idx": 3733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457709.710, "dur": 3.590, + "args": { + "External id": 147691,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3734 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457717.100, "dur": 0.670, + "args": { + "External id": 147692,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457719.240, "dur": 0.660, + "args": { + "External id": 147693,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457721.420, "dur": 0.480, + "args": { + "External id": 147694,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457723.030, "dur": 0.530, + "args": { + "External id": 147695,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457724.670, "dur": 0.450, + "args": { + "External id": 147696,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457727.640, "dur": 0.550, + "args": { + "External id": 147697,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457729.300, "dur": 1.690, + "args": { + "External id": 147698,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457732.120, "dur": 0.480, + "args": { + "External id": 147699,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771457733.690, "dur": 0.530, + "args": { + "External id": 147700,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771457747.040, "dur": 192.289, + "args": { + "External id": 147701,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771457757.540, 
"dur": 178.419, + "args": { + "External id": 147702,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771457771.090, "dur": 113.730, + "args": { + "External id": 147703,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771457886.960, "dur": 29.179, + "args": { + "External id": 147704,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771457888.209, "dur": 27.650, + "args": { + "External id": 147705,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771457891.120, "dur": 5.589, + "args": { + 
"External id": 147706,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771457897.709, "dur": 17.740, + "args": { + "External id": 147707,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771458059.269, "dur": 19.700, + "args": { + "External id": 147708,"Sequence number": 3058681, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3751 + } + }, + { + "ph": "s", "id": 301, "pid": 5714, "tid": 5714, "ts": 6303771458059.269, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771458069.479, "dur": 5.770, + "args": { + "External id": 147709,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771458071.499, "dur": 3.300, + "args": { + "External id": 147710,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458130.009, "dur": 16.100, + "args": { + "External id": 147711,"Record function id": 0, "Ev Idx": 3754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771458147.179, "dur": 1510.166, + "args": { + "External id": 147712,"Record function id": 0, "Ev Idx": 3755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771458169.029, "dur": 106.390, + "args": { + "External id": 147713,"Sequence number": 3058682, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 3756 + } + }, + { + "ph": "s", "id": 300, "pid": 5714, "tid": 5714, "ts": 6303771458169.029, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771458216.069, "dur": 25.780, + "args": { + "External id": 147714,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 3757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771458253.779, "dur": 4.449, + "args": { + "External id": 147715,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771458255.128, "dur": 2.880, + "args": { + "External id": 147716,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458303.788, "dur": 16.760, + "args": { + "External id": 147717,"Record function id": 0, "Ev Idx": 3760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771458321.428, "dur": 958.728, + "args": { + "External id": 147718,"Record function id": 0, "Ev Idx": 3761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771458344.528, "dur": 187.890, + "args": { + "External id": 147719,"Sequence number": 3058683, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 3762 + } + }, + { + "ph": "s", "id": 299, "pid": 5714, "tid": 5714, "ts": 6303771458344.528, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771458373.148, "dur": 35.520, + "args": { + "External id": 147720,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 
5714, + "ts": 6303771458421.658, "dur": 16.610, + "args": { + "External id": 147721,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771458448.388, "dur": 14.900, + "args": { + "External id": 147722,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771458496.688, "dur": 4.700, + "args": { + "External id": 147723,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771458508.638, "dur": 1.040, + "args": { + "External id": 147724,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771458514.008, "dur": 1.290, + "args": { + "External id": 147725,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache 
Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458552.018, "dur": 12.580, + "args": { + "External id": 147726,"Record function id": 0, "Ev Idx": 3769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771458565.608, "dur": 443.629, + "args": { + "External id": 147727,"Record function id": 0, "Ev Idx": 3770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458586.478, "dur": 4.430, + "args": { + "External id": 147728,"Record function id": 0, "Ev Idx": 3771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771458591.488, "dur": 227.419, + "args": { + "External id": 147729,"Record function id": 0, "Ev Idx": 3772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771458604.638, "dur": 213.249, + "args": { + "External id": 147730,"Sequence number": 3058684, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3773 + } + }, + { + "ph": "s", "id": 298, "pid": 5714, "tid": 5714, "ts": 6303771458604.638, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458609.968, "dur": 7.690, + "args": { + "External id": 147731,"Record function id": 0, "Ev Idx": 3774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771458618.818, "dur": 190.689, + "args": { + "External id": 147732,"Record function id": 0, "Ev Idx": 
3775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458645.638, "dur": 4.880, + "args": { + "External id": 147733,"Record function id": 0, "Ev Idx": 3776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771458651.138, "dur": 132.200, + "args": { + "External id": 147734,"Record function id": 0, "Ev Idx": 3777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458655.458, "dur": 8.120, + "args": { + "External id": 147735,"Record function id": 0, "Ev Idx": 3778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771458664.128, "dur": 116.419, + "args": { + "External id": 147736,"Record function id": 0, "Ev Idx": 3779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458700.467, "dur": 9.600, + "args": { + "External id": 147737,"Record function id": 0, "Ev Idx": 3780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771458711.178, "dur": 68.349, + "args": { + "External id": 147738,"Record function id": 0, "Ev Idx": 3781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771458744.518, "dur": 23.760, + "args": { + "External id": 147739,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", 
"Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458789.447, "dur": 3.651, + "args": { + "External id": 147740,"Record function id": 0, "Ev Idx": 3783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771458793.867, "dur": 15.050, + "args": { + "External id": 147741,"Record function id": 0, "Ev Idx": 3784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458823.427, "dur": 5.810, + "args": { + "External id": 147742,"Record function id": 0, "Ev Idx": 3785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771458829.957, "dur": 178.910, + "args": { + "External id": 147743,"Record function id": 0, "Ev Idx": 3786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458833.547, "dur": 1.850, + "args": { + "External id": 147744,"Record function id": 0, "Ev Idx": 3787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771458835.907, "dur": 171.770, + "args": { + "External id": 147745,"Record function id": 0, "Ev Idx": 3788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771458847.837, "dur": 158.890, + "args": { + "External id": 147746,"Sequence number": 3058685, "Fwd thread id": 0, "Record function id": 
0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3789 + } + }, + { + "ph": "s", "id": 297, "pid": 5714, "tid": 5714, "ts": 6303771458847.837, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458852.547, "dur": 4.080, + "args": { + "External id": 147747,"Record function id": 0, "Ev Idx": 3790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771458857.277, "dur": 143.160, + "args": { + "External id": 147748,"Record function id": 0, "Ev Idx": 3791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458876.017, "dur": 2.410, + "args": { + "External id": 147749,"Record function id": 0, "Ev Idx": 3792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771458879.077, "dur": 101.630, + "args": { + "External id": 147750,"Record function id": 0, "Ev Idx": 3793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458882.197, "dur": 3.060, + "args": { + "External id": 147751,"Record function id": 0, "Ev Idx": 3794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771458885.877, "dur": 93.240, + "args": { + "External id": 147752,"Record function id": 0, "Ev Idx": 3795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458917.417, "dur": 4.260, + "args": { + "External id": 
147753,"Record function id": 0, "Ev Idx": 3796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771458922.517, "dur": 55.750, + "args": { + "External id": 147754,"Record function id": 0, "Ev Idx": 3797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771458949.297, "dur": 19.170, + "args": { + "External id": 147755,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771458985.987, "dur": 2.140, + "args": { + "External id": 147756,"Record function id": 0, "Ev Idx": 3799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771458988.787, "dur": 11.010, + "args": { + "External id": 147757,"Record function id": 0, "Ev Idx": 3800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771459018.997, "dur": 17.010, + "args": { + "External id": 147758,"Record function id": 0, "Ev Idx": 3801 + } + }, + 
{ + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771459036.857, "dur": 242.259, + "args": { + "External id": 147759,"Record function id": 0, "Ev Idx": 3802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771459059.247, "dur": 209.899, + "args": { + "External id": 147760,"Sequence number": 3058686, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 3803 + } + }, + { + "ph": "s", "id": 296, "pid": 5714, "tid": 5714, "ts": 6303771459059.247, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771459084.337, "dur": 110.689, + "args": { + "External id": 147761,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 3804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771459121.497, "dur": 11.070, + "args": { + "External id": 147762,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], 
[]], "Ev Idx": 3805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459124.207, "dur": 7.510, + "args": { + "External id": 147763,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771459134.297, "dur": 5.100, + "args": { + "External id": 147764,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771459140.317, "dur": 2.180, + "args": { + "External id": 147765,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771459145.347, "dur": 3.110, + "args": { + "External id": 147766,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771459210.637, "dur": 27.340, + "args": { + "External id": 147767,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771459288.736, "dur": 38.330, + "args": { + "External id": 147768,"Record function id": 0, "Ev Idx": 3811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771459328.136, "dur": 326.980, + "args": { + "External id": 147769,"Record function id": 0, "Ev Idx": 3812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771459353.636, "dur": 291.900, + "args": { + "External id": 147770,"Sequence number": 3058687, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 3813 + } + }, + { + "ph": "s", "id": 295, "pid": 5714, "tid": 5714, "ts": 6303771459353.636, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771459404.556, "dur": 26.480, + "args": { + "External id": 147771,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], 
[1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 3814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771459445.176, "dur": 25.550, + "args": { + "External id": 147772,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771459481.996, "dur": 15.520, + "args": { + "External id": 147773,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771459520.076, "dur": 18.570, + "args": { + "External id": 147774,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 3817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771459550.046, "dur": 24.470, + "args": { + "External id": 147775,"Record function id": 0, "Concrete Inputs": ["", "", ""], 
"Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771459595.046, "dur": 15.850, + "args": { + "External id": 147776,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3819 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.0)", "pid": 5714, "tid": 5714, + "ts": 6303771459695.685, "dur": 51.600, + "args": { + "External id": 147777,"Record function id": 0, "Ev Idx": 3820 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771459807.715, "dur": 40.930, + "args": { + "External id": 147778,"Record function id": 0, "Ev Idx": 3821 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6303771459857.095, "dur": 832.578, + "args": { + "External id": 147779,"Record function id": 0, "Ev Idx": 3822 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6303771459864.135, "dur": 443.339, + "args": { + "External id": 147780,"Record function id": 0, "Ev Idx": 3823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771459920.835, "dur": 7.920, + "args": { + "External id": 
147781,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771459938.455, "dur": 21.670, + "args": { + "External id": 147782,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459942.915, "dur": 1.280, + "args": { + "External id": 147783,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459945.545, "dur": 0.200, + "args": { + "External id": 147784,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459946.435, "dur": 0.240, + "args": { + "External id": 147785,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6303771459948.845, "dur": 1.000, + "args": { + "External id": 147786,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459950.495, "dur": 0.270, + "args": { + "External id": 147787,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459951.465, "dur": 1.130, + "args": { + "External id": 147788,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459953.665, "dur": 0.170, + "args": { + "External id": 147789,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459954.505, "dur": 0.150, + "args": { + "External id": 147790,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3833 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771459956.575, "dur": 0.260, + "args": { + "External id": 147791,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771459966.825, "dur": 21.060, + "args": { + "External id": 147792,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771460016.185, "dur": 87.339, + "args": { + "External id": 147793,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771460025.435, "dur": 6.470, + "args": { + "External id": 147794,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771460035.765, "dur": 7.590, + "args": { + "External id": 147795,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771460038.055, "dur": 4.960, + "args": { + "External id": 147796,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460040.685, "dur": 0.690, + "args": { + "External id": 147797,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771460050.264, "dur": 17.191, + "args": { + "External id": 147798,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460051.935, "dur": 1.200, + "args": { + "External id": 147799,"Record function id": 0, "Concrete Inputs": ["", "[192]", 
"[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460054.324, "dur": 0.171, + "args": { + "External id": 147800,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460056.404, "dur": 0.320, + "args": { + "External id": 147801,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460057.624, "dur": 0.160, + "args": { + "External id": 147802,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460058.424, "dur": 0.151, + "args": { + "External id": 147803,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460060.284, "dur": 0.160, + "args": { + "External id": 
147804,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460061.444, "dur": 0.160, + "args": { + "External id": 147805,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460062.295, "dur": 1.249, + "args": { + "External id": 147806,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460064.204, "dur": 0.960, + "args": { + "External id": 147807,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771460078.524, "dur": 16.851, + "args": { + "External id": 147808,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], 
[393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771460153.864, "dur": 81.950, + "args": { + "External id": 147809,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771460171.124, "dur": 61.790, + "args": { + "External id": 147810,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3853, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771460181.944, "dur": 46.820, + "args": { + "External id": 147811,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771460249.694, "dur": 3.180, + "args": { + "External id": 147812,"Record function id": 0, "Collective name": "wait", "Process Group 
Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3855, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6303771460326.424, "dur": 232.030, + "args": { + "External id": 147813,"Record function id": 0, "Ev Idx": 3856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460403.294, "dur": 3.590, + "args": { + "External id": 147814,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460410.834, "dur": 0.720, + "args": { + "External id": 147815,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460413.124, "dur": 0.600, + "args": { + "External id": 147816,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460415.054, "dur": 0.540, + "args": { + "External id": 147817,"Record function id": 0, 
"Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460416.774, "dur": 0.450, + "args": { + "External id": 147818,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460419.534, "dur": 0.480, + "args": { + "External id": 147819,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460421.704, "dur": 0.480, + "args": { + "External id": 147820,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460423.364, "dur": 1.140, + "args": { + "External id": 147821,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460425.684, "dur": 0.530, + "args": { + "External id": 147822,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 
6303771460428.424, "dur": 0.440, + "args": { + "External id": 147823,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771460440.764, "dur": 92.170, + "args": { + "External id": 147824,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771460458.704, "dur": 70.879, + "args": { + "External id": 147825,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771460469.784, "dur": 6.150, + "args": { + "External id": 147826,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], 
[], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771460477.934, "dur": 31.449, + "args": { + "External id": 147827,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771460479.884, "dur": 29.210, + "args": { + "External id": 147828,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 3871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771460483.004, "dur": 6.690, + "args": { + "External id": 147829,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771460490.744, "dur": 17.870, + "args": { + "External id": 147830,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771460648.703, "dur": 18.700, + "args": { + "External id": 147831,"Sequence number": 3058688, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3874 + } + }, + { + "ph": "s", "id": 294, "pid": 5714, "tid": 5714, "ts": 6303771460648.703, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771460658.313, "dur": 5.780, + "args": { + "External id": 147832,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771460660.413, "dur": 3.210, + "args": { + "External id": 147833,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771460716.553, "dur": 9.380, + "args": { + "External id": 147834,"Record function id": 0, "Ev Idx": 3877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771460726.993, "dur": 1341.237, + "args": { + "External id": 147835,"Record function id": 0, "Ev Idx": 3878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771460746.773, "dur": 98.650, + "args": { + "External id": 147836,"Sequence number": 3058689, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 3879 + } + }, + { + "ph": "s", "id": 293, "pid": 5714, "tid": 5714, "ts": 6303771460746.773, + 
"cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771460788.673, "dur": 24.630, + "args": { + "External id": 147837,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 3880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771460824.803, "dur": 4.650, + "args": { + "External id": 147838,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771460825.983, "dur": 3.230, + "args": { + "External id": 147839,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771460864.833, "dur": 9.870, + "args": { + "External id": 147840,"Record function id": 0, "Ev Idx": 3883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771460875.513, "dur": 863.958, + "args": { + "External id": 147841,"Record function id": 0, "Ev Idx": 3884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771460895.563, "dur": 169.769, + "args": { + "External id": 147842,"Sequence number": 3058690, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 3885 + } + }, + { + "ph": "s", "id": 292, "pid": 5714, "tid": 5714, "ts": 6303771460895.563, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771460920.013, "dur": 29.260, + "args": { + "External id": 147843,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771460960.333, "dur": 15.489, + "args": { + "External id": 147844,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771460986.202, "dur": 15.051, + "args": { + "External id": 147845,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771461032.042, "dur": 3.500, + "args": { + "External id": 
147846,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771461042.542, "dur": 1.011, + "args": { + "External id": 147847,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771461047.972, "dur": 1.180, + "args": { + "External id": 147848,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 3891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461083.022, "dur": 8.100, + "args": { + "External id": 147849,"Record function id": 0, "Ev Idx": 3892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771461091.922, "dur": 403.350, + "args": { + "External id": 147850,"Record function id": 0, "Ev Idx": 3893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461108.392, "dur": 2.730, + "args": { + "External id": 147851,"Record function id": 0, "Ev Idx": 3894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771461111.812, "dur": 194.670, + "args": { + "External id": 147852,"Record function id": 0, "Ev Idx": 3895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, 
"tid": 5714, + "ts": 6303771461123.442, "dur": 181.820, + "args": { + "External id": 147853,"Sequence number": 3058691, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3896 + } + }, + { + "ph": "s", "id": 291, "pid": 5714, "tid": 5714, "ts": 6303771461123.442, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461128.362, "dur": 5.780, + "args": { + "External id": 147854,"Record function id": 0, "Ev Idx": 3897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771461135.142, "dur": 155.110, + "args": { + "External id": 147855,"Record function id": 0, "Ev Idx": 3898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461156.382, "dur": 3.100, + "args": { + "External id": 147856,"Record function id": 0, "Ev Idx": 3899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771461160.142, "dur": 108.630, + "args": { + "External id": 147857,"Record function id": 0, "Ev Idx": 3900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461163.022, "dur": 3.670, + "args": { + "External id": 147858,"Record function id": 0, "Ev Idx": 3901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771461167.282, "dur": 99.310, + "args": { + "External id": 147859,"Record function id": 0, "Ev Idx": 3902 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461199.392, "dur": 5.240, + "args": { + "External id": 147860,"Record function id": 0, "Ev Idx": 3903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771461205.602, "dur": 60.070, + "args": { + "External id": 147861,"Record function id": 0, "Ev Idx": 3904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771461233.552, "dur": 21.550, + "args": { + "External id": 147862,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461274.152, "dur": 2.800, + "args": { + "External id": 147863,"Record function id": 0, "Ev Idx": 3906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771461277.562, "dur": 12.040, + "args": { + "External id": 147864,"Record function id": 0, "Ev Idx": 3907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache 
Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461310.442, "dur": 4.100, + "args": { + "External id": 147865,"Record function id": 0, "Ev Idx": 3908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771461315.132, "dur": 179.689, + "args": { + "External id": 147866,"Record function id": 0, "Ev Idx": 3909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461318.102, "dur": 2.270, + "args": { + "External id": 147867,"Record function id": 0, "Ev Idx": 3910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771461320.902, "dur": 173.039, + "args": { + "External id": 147868,"Record function id": 0, "Ev Idx": 3911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771461333.322, "dur": 159.670, + "args": { + "External id": 147869,"Sequence number": 3058692, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 3912 + } + }, + { + "ph": "s", "id": 290, "pid": 5714, "tid": 5714, "ts": 6303771461333.322, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461337.892, "dur": 4.600, + "args": { + "External id": 147870,"Record function id": 0, "Ev Idx": 3913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771461343.132, "dur": 143.349, + "args": { + "External id": 147871,"Record function id": 0, "Ev Idx": 
3914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461362.522, "dur": 2.760, + "args": { + "External id": 147872,"Record function id": 0, "Ev Idx": 3915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771461366.032, "dur": 100.680, + "args": { + "External id": 147873,"Record function id": 0, "Ev Idx": 3916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461369.522, "dur": 3.720, + "args": { + "External id": 147874,"Record function id": 0, "Ev Idx": 3917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771461373.832, "dur": 91.029, + "args": { + "External id": 147875,"Record function id": 0, "Ev Idx": 3918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461402.721, "dur": 4.660, + "args": { + "External id": 147876,"Record function id": 0, "Ev Idx": 3919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771461408.241, "dur": 55.811, + "args": { + "External id": 147877,"Record function id": 0, "Ev Idx": 3920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771461435.301, "dur": 19.051, + "args": { + "External id": 147878,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", 
"Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 3921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461471.392, "dur": 2.540, + "args": { + "External id": 147879,"Record function id": 0, "Ev Idx": 3922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771461474.521, "dur": 11.391, + "args": { + "External id": 147880,"Record function id": 0, "Ev Idx": 3923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461504.081, "dur": 8.710, + "args": { + "External id": 147881,"Record function id": 0, "Ev Idx": 3924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771461513.631, "dur": 224.760, + "args": { + "External id": 147882,"Record function id": 0, "Ev Idx": 3925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771461533.131, "dur": 195.910, + "args": { + "External id": 147883,"Sequence number": 3058693, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 3926 + } + }, + { + "ph": "s", "id": 289, "pid": 5714, "tid": 5714, "ts": 6303771461533.131, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771461554.931, "dur": 102.950, + "args": { + "External id": 147884,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 3927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771461585.611, "dur": 11.060, + "args": { + "External id": 147885,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 3928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771461588.301, "dur": 7.380, + "args": { + "External id": 147886,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771461598.341, "dur": 5.260, + "args": { + "External id": 147887,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3930 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771461604.841, "dur": 2.240, + "args": { + "External id": 147888,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771461609.731, "dur": 4.460, + "args": { + "External id": 147889,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771461673.131, "dur": 27.260, + "args": { + "External id": 147890,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 3933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771461747.741, "dur": 18.080, + "args": { + "External id": 147891,"Record function id": 0, "Ev Idx": 3934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771461766.731, "dur": 298.779, + "args": { + "External id": 147892,"Record function id": 0, "Ev Idx": 3935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771461789.711, "dur": 265.989, + "args": { + "External id": 147893,"Sequence number": 3058694, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", 
"c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 3936 + } + }, + { + "ph": "s", "id": 288, "pid": 5714, "tid": 5714, "ts": 6303771461789.711, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771461836.031, "dur": 24.760, + "args": { + "External id": 147894,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 3937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771461873.940, "dur": 23.760, + "args": { + "External id": 147895,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771461907.171, "dur": 14.780, + "args": { + "External id": 147896,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input 
Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 3939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771461941.800, "dur": 17.560, + "args": { + "External id": 147897,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 3940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771461970.070, "dur": 21.430, + "args": { + "External id": 147898,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 3941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771462011.560, "dur": 14.100, + "args": { + "External id": 147899,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 3942 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::post_forward (model.layers.1)", "pid": 5714, "tid": 5714, + "ts": 6303771462105.360, "dur": 49.120, + "args": { + "External id": 147900,"Record function id": 0, "Ev Idx": 3943 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771462210.890, "dur": 42.650, + "args": { + "External id": 147901,"Record function id": 0, "Ev Idx": 3944 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6303771462262.140, "dur": 818.448, + "args": { + "External id": 147902,"Record function id": 0, "Ev Idx": 3945 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6303771462269.750, "dur": 441.059, + "args": { + "External id": 147903,"Record function id": 0, "Ev Idx": 3946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771462335.390, "dur": 8.189, + "args": { + "External id": 147904,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771462352.779, "dur": 19.460, + "args": { + "External id": 147905,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462355.839, "dur": 1.191, + "args": { + "External id": 147906,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", 
"0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462359.259, "dur": 0.211, + "args": { + "External id": 147907,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462360.279, "dur": 0.191, + "args": { + "External id": 147908,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462361.399, "dur": 1.780, + "args": { + "External id": 147909,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462363.859, "dur": 0.160, + "args": { + "External id": 147910,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462364.670, "dur": 0.220, + "args": { + "External id": 
147911,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462366.850, "dur": 0.149, + "args": { + "External id": 147912,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462367.679, "dur": 0.240, + "args": { + "External id": 147913,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462368.910, "dur": 0.160, + "args": { + "External id": 147914,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771462378.739, "dur": 22.000, + "args": { + "External id": 147915,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], 
[393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771462428.729, "dur": 85.350, + "args": { + "External id": 147916,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 3959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771462437.889, "dur": 6.020, + "args": { + "External id": 147917,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771462447.769, "dur": 9.110, + "args": { + "External id": 147918,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771462450.139, "dur": 6.370, + "args": { + "External id": 147919,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 3962 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462453.219, "dur": 1.620, + "args": { + "External id": 147920,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 3963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771462463.789, "dur": 16.350, + "args": { + "External id": 147921,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 3964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462465.199, "dur": 0.950, + "args": { + "External id": 147922,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462467.349, "dur": 0.160, + "args": { + "External id": 147923,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462468.229, "dur": 0.170, + "args": { + "External id": 147924,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input 
Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462470.489, "dur": 0.160, + "args": { + "External id": 147925,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462471.299, "dur": 0.250, + "args": { + "External id": 147926,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462472.249, "dur": 0.960, + "args": { + "External id": 147927,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462474.229, "dur": 0.160, + "args": { + "External id": 147928,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462475.029, "dur": 0.160, + "args": { + "External id": 147929,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462476.619, "dur": 1.230, + "args": { + "External id": 147930,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 3973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771462489.869, "dur": 16.770, + "args": { + "External id": 147931,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 3974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771462564.239, "dur": 82.350, + "args": { + "External id": 147932,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 3975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771462581.469, "dur": 62.290, + "args": { + "External id": 147933,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], 
"Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 3976, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771462591.739, "dur": 47.910, + "args": { + "External id": 147934,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 3977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771462660.339, "dur": 3.170, + "args": { + "External id": 147935,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 3978, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6303771462728.669, "dur": 220.129, + "args": { + "External id": 147936,"Record function id": 0, "Ev Idx": 3979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462801.778, "dur": 3.591, + "args": { + "External id": 
147937,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 3980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462809.169, "dur": 0.989, + "args": { + "External id": 147938,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462811.689, "dur": 0.609, + "args": { + "External id": 147939,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462813.829, "dur": 0.609, + "args": { + "External id": 147940,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462816.718, "dur": 0.480, + "args": { + "External id": 147941,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462818.349, "dur": 0.469, + "args": { + "External id": 147942,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 3985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", 
"pid": 5714, "tid": 5714, + "ts": 6303771462820.569, "dur": 0.460, + "args": { + "External id": 147943,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 3986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462822.158, "dur": 1.371, + "args": { + "External id": 147944,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462826.098, "dur": 0.520, + "args": { + "External id": 147945,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771462827.758, "dur": 0.440, + "args": { + "External id": 147946,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 3989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771462840.029, "dur": 83.449, + "args": { + "External id": 147947,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 
393216], [4, 393216]]], "Ev Idx": 3990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771462850.408, "dur": 69.720, + "args": { + "External id": 147948,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 3991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771462860.098, "dur": 6.090, + "args": { + "External id": 147949,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771462868.218, "dur": 32.180, + "args": { + "External id": 147950,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 3993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771462869.588, "dur": 30.490, + "args": { + "External id": 147951,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], 
"Ev Idx": 3994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771462872.408, "dur": 6.660, + "args": { + "External id": 147952,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 3995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771462881.368, "dur": 18.290, + "args": { + "External id": 147953,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 3996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771463038.158, "dur": 19.360, + "args": { + "External id": 147954,"Sequence number": 3058695, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 3997 + } + }, + { + "ph": "s", "id": 287, "pid": 5714, "tid": 5714, "ts": 6303771463038.158, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771463048.128, "dur": 5.960, + "args": { + "External id": 147955,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 3998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771463050.238, "dur": 3.390, + "args": { + "External id": 147956,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 3999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463106.818, "dur": 9.420, + "args": { + "External id": 147957,"Record function id": 0, "Ev Idx": 4000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771463117.108, "dur": 1391.557, + "args": { + "External id": 147958,"Record function id": 0, "Ev Idx": 4001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771463136.318, "dur": 101.530, + "args": { + "External id": 147959,"Sequence number": 3058696, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4002 + } + }, + { + "ph": "s", "id": 286, "pid": 5714, "tid": 5714, "ts": 6303771463136.318, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771463180.588, "dur": 24.880, + "args": { + "External id": 147960,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, 
"tid": 5714, + "ts": 6303771463216.848, "dur": 4.580, + "args": { + "External id": 147961,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771463218.028, "dur": 3.149, + "args": { + "External id": 147962,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463257.397, "dur": 9.431, + "args": { + "External id": 147963,"Record function id": 0, "Ev Idx": 4006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771463267.668, "dur": 883.418, + "args": { + "External id": 147964,"Record function id": 0, "Ev Idx": 4007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771463286.717, "dur": 200.160, + "args": { + "External id": 147965,"Sequence number": 3058697, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4008 + } + }, + { + "ph": "s", "id": 285, "pid": 5714, "tid": 5714, "ts": 6303771463286.717, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771463322.687, "dur": 30.170, + "args": { + "External id": 147966,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], 
[1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771463364.237, "dur": 15.860, + "args": { + "External id": 147967,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771463390.047, "dur": 14.970, + "args": { + "External id": 147968,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771463450.607, "dur": 3.920, + "args": { + "External id": 147969,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771463462.277, "dur": 1.040, + "args": { + "External id": 147970,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771463467.457, "dur": 1.480, + "args": { + "External id": 147971,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], 
"Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463504.967, "dur": 8.080, + "args": { + "External id": 147972,"Record function id": 0, "Ev Idx": 4015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771463513.957, "dur": 392.649, + "args": { + "External id": 147973,"Record function id": 0, "Ev Idx": 4016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463529.887, "dur": 2.950, + "args": { + "External id": 147974,"Record function id": 0, "Ev Idx": 4017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771463533.437, "dur": 189.150, + "args": { + "External id": 147975,"Record function id": 0, "Ev Idx": 4018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771463545.327, "dur": 176.169, + "args": { + "External id": 147976,"Sequence number": 3058698, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4019 + } + }, + { + "ph": "s", "id": 284, "pid": 5714, "tid": 5714, "ts": 6303771463545.327, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463549.747, "dur": 4.960, + "args": { + "External id": 147977,"Record function id": 0, "Ev Idx": 4020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771463555.657, "dur": 157.079, + "args": { + "External id": 147978,"Record function id": 0, "Ev Idx": 4021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463576.747, "dur": 3.650, + "args": { + "External id": 147979,"Record function id": 0, "Ev Idx": 4022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771463581.057, "dur": 110.310, + "args": { + "External id": 147980,"Record function id": 0, "Ev Idx": 4023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463583.607, "dur": 4.080, + "args": { + "External id": 147981,"Record function id": 0, "Ev Idx": 4024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771463588.217, "dur": 100.970, + "args": { + "External id": 147982,"Record function id": 0, "Ev Idx": 4025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463621.387, "dur": 5.340, + "args": { + "External id": 147983,"Record function id": 0, "Ev Idx": 4026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771463627.737, "dur": 60.450, + "args": { + "External id": 147984,"Record function id": 0, "Ev Idx": 4027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771463656.436, "dur": 21.271, + "args": { + "External id": 147985,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": 
"/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463696.707, "dur": 3.029, + "args": { + "External id": 147986,"Record function id": 0, "Ev Idx": 4029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771463700.347, "dur": 11.709, + "args": { + "External id": 147987,"Record function id": 0, "Ev Idx": 4030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463725.967, "dur": 3.780, + "args": { + "External id": 147988,"Record function id": 0, "Ev Idx": 4031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771463730.336, "dur": 175.910, + "args": { + "External id": 147989,"Record function id": 0, "Ev Idx": 4032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463733.156, "dur": 2.220, + "args": { + "External id": 147990,"Record function id": 0, "Ev Idx": 4033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771463735.876, "dur": 169.500, + "args": { + "External id": 147991,"Record function id": 0, "Ev Idx": 4034 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771463747.326, "dur": 157.050, + "args": { + "External id": 147992,"Sequence number": 3058699, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4035 + } + }, + { + "ph": "s", "id": 283, "pid": 5714, "tid": 5714, "ts": 6303771463747.326, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463751.866, "dur": 4.410, + "args": { + "External id": 147993,"Record function id": 0, "Ev Idx": 4036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771463756.956, "dur": 140.660, + "args": { + "External id": 147994,"Record function id": 0, "Ev Idx": 4037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463775.936, "dur": 2.780, + "args": { + "External id": 147995,"Record function id": 0, "Ev Idx": 4038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771463779.376, "dur": 98.540, + "args": { + "External id": 147996,"Record function id": 0, "Ev Idx": 4039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463782.236, "dur": 3.680, + "args": { + "External id": 147997,"Record function id": 0, "Ev Idx": 4040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771463786.466, "dur": 89.520, + "args": { + "External id": 147998,"Record 
function id": 0, "Ev Idx": 4041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463815.636, "dur": 4.570, + "args": { + "External id": 147999,"Record function id": 0, "Ev Idx": 4042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771463820.966, "dur": 54.190, + "args": { + "External id": 148000,"Record function id": 0, "Ev Idx": 4043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771463846.806, "dur": 18.740, + "args": { + "External id": 148001,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463883.026, "dur": 2.580, + "args": { + "External id": 148002,"Record function id": 0, "Ev Idx": 4045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771463886.236, "dur": 10.840, + "args": { + "External id": 148003,"Record function id": 0, "Ev Idx": 4046 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771463915.326, "dur": 8.400, + "args": { + "External id": 148004,"Record function id": 0, "Ev Idx": 4047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771463924.476, "dur": 225.510, + "args": { + "External id": 148005,"Record function id": 0, "Ev Idx": 4048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771463944.296, "dur": 196.039, + "args": { + "External id": 148006,"Sequence number": 3058700, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4049 + } + }, + { + "ph": "s", "id": 282, "pid": 5714, "tid": 5714, "ts": 6303771463944.296, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771463965.826, "dur": 103.000, + "args": { + "External id": 148007,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771463997.126, "dur": 11.300, + "args": { + "External id": 148008,"Record function id": 0, 
"Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771463999.886, "dur": 7.660, + "args": { + "External id": 148009,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771464010.046, "dur": 6.580, + "args": { + "External id": 148010,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771464017.716, "dur": 2.150, + "args": { + "External id": 148011,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771464022.396, "dur": 3.330, + "args": { + "External id": 148012,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + 
"ts": 6303771464084.336, "dur": 27.259, + "args": { + "External id": 148013,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771464159.126, "dur": 17.369, + "args": { + "External id": 148014,"Record function id": 0, "Ev Idx": 4057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771464177.346, "dur": 328.539, + "args": { + "External id": 148015,"Record function id": 0, "Ev Idx": 4058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771464199.805, "dur": 295.850, + "args": { + "External id": 148016,"Sequence number": 3058701, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4059 + } + }, + { + "ph": "s", "id": 281, "pid": 5714, "tid": 5714, "ts": 6303771464199.805, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771464248.625, "dur": 25.050, + "args": { + "External id": 148017,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": 
"/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771464286.915, "dur": 41.400, + "args": { + "External id": 148018,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771464340.615, "dur": 17.400, + "args": { + "External id": 148019,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771464381.095, "dur": 17.170, + "args": { + "External id": 148020,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], 
[16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771464408.895, "dur": 22.390, + "args": { + "External id": 148021,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771464450.425, "dur": 13.820, + "args": { + "External id": 148022,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4065 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.2)", "pid": 5714, "tid": 5714, + "ts": 6303771464545.205, "dur": 48.309, + "args": { + "External id": 148023,"Record function id": 0, "Ev Idx": 4066 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771464649.394, "dur": 40.580, + "args": { + "External id": 148024,"Record function id": 0, "Ev Idx": 4067 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6303771464697.954, "dur": 836.849, + "args": { + "External id": 148025,"Record function id": 0, "Ev Idx": 4068 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 
6303771464705.284, "dur": 446.449, + "args": { + "External id": 148026,"Record function id": 0, "Ev Idx": 4069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771464762.014, "dur": 7.780, + "args": { + "External id": 148027,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771464778.974, "dur": 19.620, + "args": { + "External id": 148028,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464781.834, "dur": 1.160, + "args": { + "External id": 148029,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464785.324, "dur": 0.280, + "args": { + "External id": 148030,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464786.394, "dur": 0.240, + "args": { + "External id": 148031,"Record function id": 0, "Concrete 
Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464787.614, "dur": 1.900, + "args": { + "External id": 148032,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464790.184, "dur": 0.160, + "args": { + "External id": 148033,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464791.244, "dur": 0.160, + "args": { + "External id": 148034,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464792.914, "dur": 0.310, + "args": { + "External id": 148035,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464793.884, "dur": 0.140, + 
"args": { + "External id": 148036,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464794.964, "dur": 0.250, + "args": { + "External id": 148037,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771464812.504, "dur": 22.070, + "args": { + "External id": 148038,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771464863.624, "dur": 86.890, + "args": { + "External id": 148039,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4082 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771464873.064, "dur": 7.490, + "args": { + "External id": 148040,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771464884.474, "dur": 7.140, + "args": { + "External id": 148041,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771464886.714, "dur": 4.570, + "args": { + "External id": 148042,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464888.974, "dur": 0.690, + "args": { + "External id": 148043,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771464898.524, "dur": 17.250, + "args": { + "External id": 148044,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], 
"Input Dims": [[1769856], [], []], "Ev Idx": 4087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464900.594, "dur": 1.070, + "args": { + "External id": 148045,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464902.434, "dur": 1.210, + "args": { + "External id": 148046,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464904.304, "dur": 0.230, + "args": { + "External id": 148047,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464905.564, "dur": 0.160, + "args": { + "External id": 148048,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464907.414, "dur": 0.250, + "args": { + "External id": 148049,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464908.314, "dur": 0.150, + "args": { + "External id": 148050,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464909.484, "dur": 0.170, + "args": { + "External id": 148051,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464911.294, "dur": 0.160, + "args": { + "External id": 148052,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771464912.414, "dur": 1.040, + "args": { + "External id": 148053,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771464926.384, "dur": 16.840, + "args": { + "External id": 148054,"Record function id": 0, "Concrete Inputs": ["", 
"", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771465001.193, "dur": 83.151, + "args": { + "External id": 148055,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771465019.184, "dur": 62.140, + "args": { + "External id": 148056,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4099, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771465029.833, "dur": 47.500, + "args": { + "External id": 148057,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input 
Dims": [[1769856]], "Ev Idx": 4100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771465100.253, "dur": 3.170, + "args": { + "External id": 148058,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4101, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6303771465169.813, "dur": 233.330, + "args": { + "External id": 148059,"Record function id": 0, "Ev Idx": 4102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465246.393, "dur": 3.660, + "args": { + "External id": 148060,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465253.883, "dur": 0.740, + "args": { + "External id": 148061,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465256.153, "dur": 0.500, + "args": { + "External id": 148062,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465258.173, "dur": 0.450, + "args": { + "External id": 148063,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465259.783, "dur": 0.470, + "args": { + "External id": 148064,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465261.333, "dur": 0.570, + "args": { + "External id": 148065,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465263.133, "dur": 0.530, + "args": { + "External id": 148066,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465266.263, "dur": 2.010, + "args": { + "External id": 148067,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465269.473, "dur": 0.450, + "args": { + "External id": 
148068,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465271.013, "dur": 0.430, + "args": { + "External id": 148069,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771465283.713, "dur": 93.230, + "args": { + "External id": 148070,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771465294.043, "dur": 79.550, + "args": { + "External id": 148071,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4114 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771465315.023, "dur": 6.140, + "args": { + "External id": 148072,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771465323.613, "dur": 30.540, + "args": { + "External id": 148073,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771465324.903, "dur": 28.980, + "args": { + "External id": 148074,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771465327.803, "dur": 6.180, + "args": { + "External id": 148075,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771465335.023, "dur": 18.360, + "args": { + "External id": 148076,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], 
[903], []], "Ev Idx": 4119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771465493.443, "dur": 18.369, + "args": { + "External id": 148077,"Sequence number": 3058702, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4120 + } + }, + { + "ph": "s", "id": 280, "pid": 5714, "tid": 5714, "ts": 6303771465493.443, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771465502.612, "dur": 5.771, + "args": { + "External id": 148078,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771465504.743, "dur": 3.240, + "args": { + "External id": 148079,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771465561.022, "dur": 9.070, + "args": { + "External id": 148080,"Record function id": 0, "Ev Idx": 4123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771465571.042, "dur": 1353.977, + "args": { + "External id": 148081,"Record function id": 0, "Ev Idx": 4124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771465591.462, "dur": 100.050, + "args": { + "External id": 148082,"Sequence number": 3058703, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4125 + } + }, + { + "ph": "s", "id": 279, "pid": 5714, "tid": 5714, "ts": 6303771465591.462, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771465635.332, "dur": 23.540, + "args": { + "External id": 148083,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771465670.382, "dur": 4.930, + "args": { + "External id": 148084,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771465671.442, "dur": 3.670, + "args": { + "External id": 148085,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771465710.712, "dur": 9.300, + "args": { + "External id": 148086,"Record function id": 0, "Ev 
Idx": 4129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771465720.842, "dur": 875.658, + "args": { + "External id": 148087,"Record function id": 0, "Ev Idx": 4130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771465740.572, "dur": 172.810, + "args": { + "External id": 148088,"Sequence number": 3058704, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4131 + } + }, + { + "ph": "s", "id": 278, "pid": 5714, "tid": 5714, "ts": 6303771465740.572, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771465765.282, "dur": 29.920, + "args": { + "External id": 148089,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771465806.072, "dur": 16.420, + "args": { + "External id": 148090,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771465833.872, "dur": 14.800, + "args": { + "External id": 148091,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771465879.132, "dur": 3.990, + "args": { + "External id": 148092,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771465890.322, "dur": 0.920, + "args": { + "External id": 148093,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771465895.542, "dur": 1.489, + "args": { + "External id": 148094,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771465930.831, "dur": 8.231, + "args": { + "External id": 148095,"Record function id": 0, "Ev Idx": 4138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771465939.922, "dur": 409.059, + "args": { + "External id": 148096,"Record function id": 0, "Ev Idx": 4139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771465965.091, "dur": 2.740, + "args": { + "External id": 148097,"Record function id": 0, "Ev Idx": 4140 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771465968.431, "dur": 186.420, + "args": { + "External id": 148098,"Record function id": 0, "Ev Idx": 4141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771465980.282, "dur": 173.479, + "args": { + "External id": 148099,"Sequence number": 3058705, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4142 + } + }, + { + "ph": "s", "id": 277, "pid": 5714, "tid": 5714, "ts": 6303771465980.282, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771465985.171, "dur": 4.540, + "args": { + "External id": 148100,"Record function id": 0, "Ev Idx": 4143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771465990.371, "dur": 155.050, + "args": { + "External id": 148101,"Record function id": 0, "Ev Idx": 4144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466012.231, "dur": 3.210, + "args": { + "External id": 148102,"Record function id": 0, "Ev Idx": 4145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771466016.131, "dur": 107.890, + "args": { + "External id": 148103,"Record function id": 0, "Ev Idx": 4146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466018.761, "dur": 3.610, + "args": { + "External id": 
148104,"Record function id": 0, "Ev Idx": 4147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771466023.031, "dur": 98.980, + "args": { + "External id": 148105,"Record function id": 0, "Ev Idx": 4148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466055.001, "dur": 5.020, + "args": { + "External id": 148106,"Record function id": 0, "Ev Idx": 4149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771466061.051, "dur": 60.010, + "args": { + "External id": 148107,"Record function id": 0, "Ev Idx": 4150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771466089.771, "dur": 20.780, + "args": { + "External id": 148108,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466129.421, "dur": 2.860, + "args": { + "External id": 148109,"Record function id": 0, "Ev Idx": 4152 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771466132.861, "dur": 11.940, + "args": { + "External id": 148110,"Record function id": 0, "Ev Idx": 4153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466158.201, "dur": 3.720, + "args": { + "External id": 148111,"Record function id": 0, "Ev Idx": 4154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771466162.521, "dur": 186.109, + "args": { + "External id": 148112,"Record function id": 0, "Ev Idx": 4155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466165.221, "dur": 2.010, + "args": { + "External id": 148113,"Record function id": 0, "Ev Idx": 4156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771466167.741, "dur": 180.080, + "args": { + "External id": 148114,"Record function id": 0, "Ev Idx": 4157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771466179.451, "dur": 167.159, + "args": { + "External id": 148115,"Sequence number": 3058706, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4158 + } + }, + { + "ph": "s", "id": 276, "pid": 5714, "tid": 5714, "ts": 6303771466179.451, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466183.621, "dur": 4.110, + "args": { + 
"External id": 148116,"Record function id": 0, "Ev Idx": 4159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771466188.421, "dur": 151.240, + "args": { + "External id": 148117,"Record function id": 0, "Ev Idx": 4160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466207.411, "dur": 2.880, + "args": { + "External id": 148118,"Record function id": 0, "Ev Idx": 4161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771466210.971, "dur": 107.660, + "args": { + "External id": 148119,"Record function id": 0, "Ev Idx": 4162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466214.351, "dur": 3.580, + "args": { + "External id": 148120,"Record function id": 0, "Ev Idx": 4163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771466218.471, "dur": 98.270, + "args": { + "External id": 148121,"Record function id": 0, "Ev Idx": 4164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466248.421, "dur": 4.440, + "args": { + "External id": 148122,"Record function id": 0, "Ev Idx": 4165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771466253.741, "dur": 62.220, + "args": { + "External id": 148123,"Record function id": 0, "Ev Idx": 4166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771466279.721, "dur": 25.350, + "args": { + "External id": 148124,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", 
"4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466323.981, "dur": 2.740, + "args": { + "External id": 148125,"Record function id": 0, "Ev Idx": 4168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771466327.301, "dur": 11.740, + "args": { + "External id": 148126,"Record function id": 0, "Ev Idx": 4169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466358.041, "dur": 8.480, + "args": { + "External id": 148127,"Record function id": 0, "Ev Idx": 4170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771466367.350, "dur": 228.190, + "args": { + "External id": 148128,"Record function id": 0, "Ev Idx": 4171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771466386.861, "dur": 198.959, + "args": { + "External id": 148129,"Sequence number": 3058707, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 
768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4172 + } + }, + { + "ph": "s", "id": 275, "pid": 5714, "tid": 5714, "ts": 6303771466386.861, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771466409.170, "dur": 103.220, + "args": { + "External id": 148130,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771466440.650, "dur": 11.280, + "args": { + "External id": 148131,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771466443.381, "dur": 7.689, + "args": { + "External id": 148132,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771466453.530, "dur": 5.570, + "args": { + "External 
id": 148133,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771466460.520, "dur": 2.230, + "args": { + "External id": 148134,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771466465.010, "dur": 3.450, + "args": { + "External id": 148135,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771466527.220, "dur": 27.730, + "args": { + "External id": 148136,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771466604.700, "dur": 17.600, + "args": { + "External id": 148137,"Record function id": 0, "Ev Idx": 4180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771466623.160, "dur": 299.379, + "args": { + "External id": 148138,"Record function id": 0, "Ev Idx": 4181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 
5714, "tid": 5714, + "ts": 6303771466645.570, "dur": 267.199, + "args": { + "External id": 148139,"Sequence number": 3058708, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4182 + } + }, + { + "ph": "s", "id": 274, "pid": 5714, "tid": 5714, "ts": 6303771466645.570, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771466691.230, "dur": 24.800, + "args": { + "External id": 148140,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771466729.430, "dur": 24.490, + "args": { + "External id": 148141,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4184 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771466764.920, "dur": 15.800, + "args": { + "External id": 148142,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771466800.629, "dur": 16.240, + "args": { + "External id": 148143,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771466827.920, "dur": 21.760, + "args": { + "External id": 148144,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771466869.029, "dur": 13.751, + "args": { + "External id": 148145,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", 
"kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4188 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.3)", "pid": 5714, "tid": 5714, + "ts": 6303771466961.339, "dur": 49.690, + "args": { + "External id": 148146,"Record function id": 0, "Ev Idx": 4189 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771467067.129, "dur": 41.570, + "args": { + "External id": 148147,"Record function id": 0, "Ev Idx": 4190 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6303771467117.149, "dur": 832.708, + "args": { + "External id": 148148,"Record function id": 0, "Ev Idx": 4191 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6303771467124.709, "dur": 448.739, + "args": { + "External id": 148149,"Record function id": 0, "Ev Idx": 4192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771467181.829, "dur": 8.010, + "args": { + "External id": 148150,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771467199.539, "dur": 20.000, + "args": { + "External id": 148151,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input 
Dims": [[1769856], [], []], "Ev Idx": 4194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467202.519, "dur": 1.180, + "args": { + "External id": 148152,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467206.189, "dur": 0.300, + "args": { + "External id": 148153,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467207.289, "dur": 0.170, + "args": { + "External id": 148154,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467208.499, "dur": 2.100, + "args": { + "External id": 148155,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467211.299, "dur": 0.180, + "args": { + "External id": 148156,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467212.489, "dur": 0.210, + "args": { + "External id": 148157,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467214.369, "dur": 0.330, + "args": { + "External id": 148158,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467215.369, "dur": 0.220, + "args": { + "External id": 148159,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467216.519, "dur": 0.160, + "args": { + "External id": 148160,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771467226.139, "dur": 22.100, + "args": { + "External id": 148161,"Record function id": 0, "Concrete Inputs": ["", 
"", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771467276.588, "dur": 95.630, + "args": { + "External id": 148162,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771467286.308, "dur": 6.191, + "args": { + "External id": 148163,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771467303.359, "dur": 8.569, + "args": { + "External id": 148164,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771467305.759, "dur": 5.809, + "args": { 
+ "External id": 148165,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467308.139, "dur": 1.769, + "args": { + "External id": 148166,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771467319.148, "dur": 17.900, + "args": { + "External id": 148167,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467320.959, "dur": 1.140, + "args": { + "External id": 148168,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467323.019, "dur": 0.269, + "args": { + "External id": 148169,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467323.959, "dur": 0.220, + "args": { + "External id": 148170,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467326.399, "dur": 0.160, + "args": { + "External id": 148171,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467327.228, "dur": 0.171, + "args": { + "External id": 148172,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467328.068, "dur": 1.300, + "args": { + "External id": 148173,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467330.288, "dur": 0.220, + "args": { + "External id": 148174,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], 
[]], "Ev Idx": 4217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467331.239, "dur": 0.240, + "args": { + "External id": 148175,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467333.439, "dur": 1.449, + "args": { + "External id": 148176,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771467346.848, "dur": 17.570, + "args": { + "External id": 148177,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771467423.718, "dur": 84.300, + "args": { + "External id": 148178,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 
5714, + "ts": 6303771467441.188, "dur": 63.780, + "args": { + "External id": 148179,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4222, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771467453.078, "dur": 47.840, + "args": { + "External id": 148180,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771467523.278, "dur": 3.080, + "args": { + "External id": 148181,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4224, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 
6303771467591.438, "dur": 220.829, + "args": { + "External id": 148182,"Record function id": 0, "Ev Idx": 4225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467666.818, "dur": 3.670, + "args": { + "External id": 148183,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467674.348, "dur": 0.770, + "args": { + "External id": 148184,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467676.628, "dur": 0.480, + "args": { + "External id": 148185,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467678.558, "dur": 0.480, + "args": { + "External id": 148186,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467680.218, "dur": 0.490, + "args": { + "External id": 148187,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467682.958, "dur": 0.580, + "args": { + 
"External id": 148188,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467684.878, "dur": 0.500, + "args": { + "External id": 148189,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467686.708, "dur": 1.370, + "args": { + "External id": 148190,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467689.278, "dur": 0.470, + "args": { + "External id": 148191,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467692.038, "dur": 0.450, + "args": { + "External id": 148192,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771467704.778, "dur": 81.600, + "args": { + "External id": 148193,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input 
Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771467715.287, "dur": 67.820, + "args": { + "External id": 148194,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771467725.198, "dur": 6.120, + "args": { + "External id": 148195,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771467733.387, "dur": 30.151, + "args": { + "External id": 148196,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771467734.687, "dur": 
28.491, + "args": { + "External id": 148197,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771467737.507, "dur": 5.831, + "args": { + "External id": 148198,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771467744.327, "dur": 18.400, + "args": { + "External id": 148199,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771467901.747, "dur": 23.460, + "args": { + "External id": 148200,"Sequence number": 3058709, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4243 + } + }, + { + "ph": "s", "id": 273, "pid": 5714, "tid": 5714, "ts": 6303771467901.747, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771467911.357, "dur": 10.400, + "args": { + "External id": 148201,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev 
Idx": 4244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771467918.017, "dur": 3.330, + "args": { + "External id": 148202,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771467977.907, "dur": 9.340, + "args": { + "External id": 148203,"Record function id": 0, "Ev Idx": 4246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771467988.177, "dur": 1349.647, + "args": { + "External id": 148204,"Record function id": 0, "Ev Idx": 4247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771468007.687, "dur": 97.500, + "args": { + "External id": 148205,"Sequence number": 3058710, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4248 + } + }, + { + "ph": "s", "id": 272, "pid": 5714, "tid": 5714, "ts": 6303771468007.687, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771468049.077, "dur": 24.100, + "args": { + "External id": 148206,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], 
"Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771468084.737, "dur": 4.350, + "args": { + "External id": 148207,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771468085.757, "dur": 3.110, + "args": { + "External id": 148208,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468124.247, "dur": 9.400, + "args": { + "External id": 148209,"Record function id": 0, "Ev Idx": 4252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771468134.357, "dur": 865.528, + "args": { + "External id": 148210,"Record function id": 0, "Ev Idx": 4253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771468153.726, "dur": 180.220, + "args": { + "External id": 148211,"Sequence number": 3058711, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4254 + } + }, + { + "ph": "s", "id": 271, "pid": 5714, "tid": 5714, "ts": 6303771468153.726, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", 
"pid": 5714, "tid": 5714, + "ts": 6303771468179.937, "dur": 29.580, + "args": { + "External id": 148212,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771468220.197, "dur": 16.089, + "args": { + "External id": 148213,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771468246.696, "dur": 14.110, + "args": { + "External id": 148214,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771468290.966, "dur": 3.900, + "args": { + "External id": 148215,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771468309.956, "dur": 0.940, + "args": { + "External id": 148216,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4259 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771468315.616, "dur": 1.560, + "args": { + "External id": 148217,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468351.916, "dur": 8.220, + "args": { + "External id": 148218,"Record function id": 0, "Ev Idx": 4261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771468361.016, "dur": 394.139, + "args": { + "External id": 148219,"Record function id": 0, "Ev Idx": 4262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468377.116, "dur": 2.650, + "args": { + "External id": 148220,"Record function id": 0, "Ev Idx": 4263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771468380.356, "dur": 188.550, + "args": { + "External id": 148221,"Record function id": 0, "Ev Idx": 4264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771468392.166, "dur": 175.650, + "args": { + "External id": 148222,"Sequence number": 3058712, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4265 + } + }, + { + "ph": "s", "id": 270, "pid": 5714, "tid": 5714, "ts": 6303771468392.166, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468397.516, "dur": 6.160, + "args": { + "External id": 148223,"Record function id": 0, "Ev Idx": 4266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771468404.346, "dur": 155.780, + "args": { + "External id": 148224,"Record function id": 0, "Ev Idx": 4267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468425.586, "dur": 3.040, + "args": { + "External id": 148225,"Record function id": 0, "Ev Idx": 4268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771468429.276, "dur": 109.390, + "args": { + "External id": 148226,"Record function id": 0, "Ev Idx": 4269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468431.946, "dur": 3.920, + "args": { + "External id": 148227,"Record function id": 0, "Ev Idx": 4270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771468436.496, "dur": 100.120, + "args": { + "External id": 148228,"Record function id": 0, "Ev Idx": 4271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468469.526, "dur": 5.220, + "args": { + "External id": 148229,"Record function id": 0, "Ev Idx": 4272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771468475.656, "dur": 60.110, + "args": { + "External id": 148230,"Record function id": 0, "Ev Idx": 4273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771468503.976, "dur": 20.810, + "args": { + "External id": 148231,"kernel_hash": 
"cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468544.056, "dur": 3.000, + "args": { + "External id": 148232,"Record function id": 0, "Ev Idx": 4275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771468547.636, "dur": 11.870, + "args": { + "External id": 148233,"Record function id": 0, "Ev Idx": 4276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468572.196, "dur": 3.650, + "args": { + "External id": 148234,"Record function id": 0, "Ev Idx": 4277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771468576.446, "dur": 178.269, + "args": { + "External id": 148235,"Record function id": 0, "Ev Idx": 4278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468579.066, "dur": 2.040, + "args": { + "External id": 148236,"Record function id": 0, "Ev Idx": 4279 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771468581.606, "dur": 172.299, + "args": { + "External id": 148237,"Record function id": 0, "Ev Idx": 4280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771468593.996, "dur": 158.929, + "args": { + "External id": 148238,"Sequence number": 3058713, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4281 + } + }, + { + "ph": "s", "id": 269, "pid": 5714, "tid": 5714, "ts": 6303771468593.996, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468598.425, "dur": 4.051, + "args": { + "External id": 148239,"Record function id": 0, "Ev Idx": 4282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771468603.165, "dur": 143.540, + "args": { + "External id": 148240,"Record function id": 0, "Ev Idx": 4283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468623.385, "dur": 3.100, + "args": { + "External id": 148241,"Record function id": 0, "Ev Idx": 4284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771468627.205, "dur": 99.180, + "args": { + "External id": 148242,"Record function id": 0, "Ev Idx": 4285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468630.316, "dur": 3.689, + "args": { + "External id": 
148243,"Record function id": 0, "Ev Idx": 4286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771468634.556, "dur": 90.169, + "args": { + "External id": 148244,"Record function id": 0, "Ev Idx": 4287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468663.336, "dur": 4.909, + "args": { + "External id": 148245,"Record function id": 0, "Ev Idx": 4288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771468668.956, "dur": 54.929, + "args": { + "External id": 148246,"Record function id": 0, "Ev Idx": 4289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771468695.425, "dur": 18.030, + "args": { + "External id": 148247,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468731.325, "dur": 2.680, + "args": { + "External id": 148248,"Record function id": 0, "Ev Idx": 4291 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771468734.575, "dur": 11.610, + "args": { + "External id": 148249,"Record function id": 0, "Ev Idx": 4292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771468764.405, "dur": 8.350, + "args": { + "External id": 148250,"Record function id": 0, "Ev Idx": 4293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771468773.555, "dur": 225.280, + "args": { + "External id": 148251,"Record function id": 0, "Ev Idx": 4294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771468793.165, "dur": 196.240, + "args": { + "External id": 148252,"Sequence number": 3058714, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4295 + } + }, + { + "ph": "s", "id": 268, "pid": 5714, "tid": 5714, "ts": 6303771468793.165, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771468814.215, "dur": 104.470, + "args": { + "External id": 148253,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], 
[], [], [], [], []], "Ev Idx": 4296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771468846.105, "dur": 11.120, + "args": { + "External id": 148254,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771468848.745, "dur": 7.610, + "args": { + "External id": 148255,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771468858.955, "dur": 5.540, + "args": { + "External id": 148256,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771468866.115, "dur": 2.220, + "args": { + "External id": 148257,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771468870.595, "dur": 4.110, + "args": { + "External id": 148258,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": 
["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771468934.125, "dur": 26.850, + "args": { + "External id": 148259,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771469008.265, "dur": 17.530, + "args": { + "External id": 148260,"Record function id": 0, "Ev Idx": 4303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771469026.685, "dur": 307.999, + "args": { + "External id": 148261,"Record function id": 0, "Ev Idx": 4304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771469048.835, "dur": 275.789, + "args": { + "External id": 148262,"Sequence number": 3058715, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4305 + } + }, + { + "ph": "s", "id": 267, "pid": 5714, "tid": 5714, "ts": 6303771469048.835, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771469093.875, "dur": 24.189, + "args": { + "External id": 148263,"kernel_hash": 
"cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771469132.315, "dur": 23.679, + "args": { + "External id": 148264,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771469165.554, "dur": 15.270, + "args": { + "External id": 148265,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771469202.114, "dur": 16.260, + "args": { + "External id": 148266,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", 
"kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771469229.104, "dur": 22.240, + "args": { + "External id": 148267,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771469271.834, "dur": 14.550, + "args": { + "External id": 148268,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4311 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.4)", "pid": 5714, "tid": 5714, + "ts": 6303771469374.204, "dur": 48.690, + "args": { + "External id": 148269,"Record function id": 0, "Ev Idx": 4312 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771469478.244, "dur": 40.399, + "args": { + "External id": 148270,"Record function id": 0, "Ev Idx": 4313 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6303771469526.454, "dur": 828.338, + "args": { + 
"External id": 148271,"Record function id": 0, "Ev Idx": 4314 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6303771469533.903, "dur": 440.779, + "args": { + "External id": 148272,"Record function id": 0, "Ev Idx": 4315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771469590.574, "dur": 7.869, + "args": { + "External id": 148273,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771469607.373, "dur": 20.980, + "args": { + "External id": 148274,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469610.273, "dur": 1.230, + "args": { + "External id": 148275,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469614.273, "dur": 0.330, + "args": { + "External id": 148276,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4319 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469615.433, "dur": 0.290, + "args": { + "External id": 148277,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469616.863, "dur": 2.080, + "args": { + "External id": 148278,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469619.633, "dur": 0.240, + "args": { + "External id": 148279,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469620.983, "dur": 0.360, + "args": { + "External id": 148280,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469622.963, "dur": 0.260, + "args": { + "External id": 148281,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[1769856], [], [], []], "Ev Idx": 4324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469623.883, "dur": 0.240, + "args": { + "External id": 148282,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469625.013, "dur": 0.220, + "args": { + "External id": 148283,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771469634.953, "dur": 21.910, + "args": { + "External id": 148284,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771469685.373, "dur": 89.840, + "args": { + "External id": 148285,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], 
[], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771469694.723, "dur": 6.360, + "args": { + "External id": 148286,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771469704.933, "dur": 7.980, + "args": { + "External id": 148287,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771469707.163, "dur": 5.400, + "args": { + "External id": 148288,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469709.343, "dur": 1.580, + "args": { + "External id": 148289,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771469719.513, "dur": 19.740, + "args": { + "External id": 148290,"Record function id": 0, 
"Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469721.503, "dur": 1.190, + "args": { + "External id": 148291,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469723.503, "dur": 0.170, + "args": { + "External id": 148292,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469726.803, "dur": 0.230, + "args": { + "External id": 148293,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469729.033, "dur": 0.290, + "args": { + "External id": 148294,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771469730.323, "dur": 0.170, + "args": { + "External id": 148295,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469731.183, "dur": 0.930, + "args": { + "External id": 148296,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469732.973, "dur": 0.230, + "args": { + "External id": 148297,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469733.843, "dur": 0.250, + "args": { + "External id": 148298,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771469736.213, "dur": 0.930, + "args": { + "External id": 148299,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4342 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771469749.523, "dur": 18.180, + "args": { + "External id": 148300,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771469825.433, "dur": 84.390, + "args": { + "External id": 148301,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771469843.283, "dur": 63.700, + "args": { + "External id": 148302,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4345, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, 
+ "ts": 6303771469853.613, "dur": 49.350, + "args": { + "External id": 148303,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771469923.653, "dur": 3.100, + "args": { + "External id": 148304,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4347, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6303771469992.273, "dur": 222.829, + "args": { + "External id": 148305,"Record function id": 0, "Ev Idx": 4348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470068.062, "dur": 3.600, + "args": { + "External id": 148306,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470075.452, "dur": 0.800, + "args": { + "External id": 148307,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, 
"tid": 5714, + "ts": 6303771470077.762, "dur": 0.510, + "args": { + "External id": 148308,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470079.712, "dur": 0.540, + "args": { + "External id": 148309,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470081.402, "dur": 0.550, + "args": { + "External id": 148310,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470084.102, "dur": 0.460, + "args": { + "External id": 148311,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470085.802, "dur": 0.570, + "args": { + "External id": 148312,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470087.652, "dur": 1.830, + "args": { + "External id": 148313,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], 
"Ev Idx": 4356 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470090.692, "dur": 0.450, + "args": { + "External id": 148314,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470093.322, "dur": 0.520, + "args": { + "External id": 148315,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771470105.752, "dur": 84.120, + "args": { + "External id": 148316,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771470115.932, "dur": 70.570, + "args": { + "External id": 148317,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], 
[[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771470127.002, "dur": 6.150, + "args": { + "External id": 148318,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771470135.162, "dur": 31.200, + "args": { + "External id": 148319,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771470136.412, "dur": 29.670, + "args": { + "External id": 148320,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771470139.382, "dur": 6.740, + "args": { + "External id": 148321,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771470147.232, "dur": 18.400, + "args": { + "External id": 
148322,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771470311.342, "dur": 19.410, + "args": { + "External id": 148323,"Sequence number": 3058716, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4366 + } + }, + { + "ph": "s", "id": 266, "pid": 5714, "tid": 5714, "ts": 6303771470311.342, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771470321.012, "dur": 5.990, + "args": { + "External id": 148324,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771470323.112, "dur": 3.350, + "args": { + "External id": 148325,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470381.962, "dur": 9.240, + "args": { + "External id": 148326,"Record function id": 0, "Ev Idx": 4369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771470392.042, "dur": 1369.236, + "args": { + "External id": 148327,"Record function id": 0, "Ev Idx": 4370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771470412.061, "dur": 100.500, + "args": { + "External id": 148328,"Sequence number": 3058717, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4371 + } + }, + { + "ph": "s", "id": 265, "pid": 5714, "tid": 5714, "ts": 6303771470412.061, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771470456.241, "dur": 23.900, + "args": { + "External id": 148329,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771470491.352, "dur": 4.939, + "args": { + "External id": 148330,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771470492.481, "dur": 3.600, + "args": { + "External id": 148331,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4374 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470532.131, "dur": 9.060, + "args": { + "External id": 148332,"Record function id": 0, "Ev Idx": 4375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771470541.981, "dur": 884.518, + "args": { + "External id": 148333,"Record function id": 0, "Ev Idx": 4376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771470561.231, "dur": 172.740, + "args": { + "External id": 148334,"Sequence number": 3058718, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4377 + } + }, + { + "ph": "s", "id": 264, "pid": 5714, "tid": 5714, "ts": 6303771470561.231, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771470586.231, "dur": 30.090, + "args": { + "External id": 148335,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771470626.961, "dur": 16.580, + "args": { + "External id": 148336,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771470654.081, 
"dur": 14.800, + "args": { + "External id": 148337,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771470699.451, "dur": 3.960, + "args": { + "External id": 148338,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771470710.651, "dur": 1.120, + "args": { + "External id": 148339,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771470716.531, "dur": 1.370, + "args": { + "External id": 148340,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470751.581, "dur": 8.170, + "args": { + "External id": 148341,"Record function id": 0, "Ev Idx": 4384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771470760.641, "dur": 398.029, + "args": { + "External id": 148342,"Record function id": 0, "Ev Idx": 4385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache 
Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470776.101, "dur": 2.770, + "args": { + "External id": 148343,"Record function id": 0, "Ev Idx": 4386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771470779.461, "dur": 189.629, + "args": { + "External id": 148344,"Record function id": 0, "Ev Idx": 4387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771470791.281, "dur": 176.649, + "args": { + "External id": 148345,"Sequence number": 3058719, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4388 + } + }, + { + "ph": "s", "id": 263, "pid": 5714, "tid": 5714, "ts": 6303771470791.281, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470797.091, "dur": 4.800, + "args": { + "External id": 148346,"Record function id": 0, "Ev Idx": 4389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771470802.591, "dur": 156.999, + "args": { + "External id": 148347,"Record function id": 0, "Ev Idx": 4390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470823.641, "dur": 2.930, + "args": { + "External id": 148348,"Record function id": 0, "Ev Idx": 4391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771470827.281, "dur": 110.779, + "args": { + "External id": 148349,"Record function id": 0, "Ev Idx": 
4392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470829.751, "dur": 3.620, + "args": { + "External id": 148350,"Record function id": 0, "Ev Idx": 4393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771470833.981, "dur": 101.919, + "args": { + "External id": 148351,"Record function id": 0, "Ev Idx": 4394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470868.420, "dur": 5.260, + "args": { + "External id": 148352,"Record function id": 0, "Ev Idx": 4395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771470874.751, "dur": 60.229, + "args": { + "External id": 148353,"Record function id": 0, "Ev Idx": 4396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771470903.240, "dur": 21.411, + "args": { + "External id": 148354,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470943.370, "dur": 3.080, + "args": { + "External id": 148355,"Record function id": 0, "Ev Idx": 4398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771470947.020, "dur": 11.850, + "args": { + "External id": 148356,"Record function id": 0, "Ev Idx": 4399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470972.780, "dur": 3.870, + "args": { + "External id": 148357,"Record function id": 0, "Ev Idx": 4400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771470977.250, "dur": 181.000, + "args": { + "External id": 148358,"Record function id": 0, "Ev Idx": 4401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470980.080, "dur": 2.170, + "args": { + "External id": 148359,"Record function id": 0, "Ev Idx": 4402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771470982.750, "dur": 174.600, + "args": { + "External id": 148360,"Record function id": 0, "Ev Idx": 4403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771470994.970, "dur": 161.400, + "args": { + "External id": 148361,"Sequence number": 3058720, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4404 + } + }, + { + "ph": "s", "id": 262, "pid": 5714, "tid": 5714, "ts": 6303771470994.970, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771470999.600, "dur": 4.350, + "args": { + "External id": 148362,"Record function id": 0, "Ev Idx": 4405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771471004.630, "dur": 145.220, + "args": { + "External id": 148363,"Record function id": 0, "Ev Idx": 4406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771471023.420, "dur": 3.010, + "args": { + "External id": 148364,"Record function id": 0, "Ev Idx": 4407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771471027.260, "dur": 102.120, + "args": { + "External id": 148365,"Record function id": 0, "Ev Idx": 4408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771471030.440, "dur": 3.680, + "args": { + "External id": 148366,"Record function id": 0, "Ev Idx": 4409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771471034.630, "dur": 92.490, + "args": { + "External id": 148367,"Record function id": 0, "Ev Idx": 4410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771471063.630, "dur": 4.490, + "args": { + "External id": 148368,"Record function id": 0, "Ev Idx": 4411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771471068.910, "dur": 57.430, + "args": { + "External id": 148369,"Record function id": 0, "Ev Idx": 4412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771471097.140, "dur": 19.080, + "args": { + "External id": 
148370,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771471134.560, "dur": 2.820, + "args": { + "External id": 148371,"Record function id": 0, "Ev Idx": 4414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771471138.090, "dur": 11.150, + "args": { + "External id": 148372,"Record function id": 0, "Ev Idx": 4415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771471167.740, "dur": 8.530, + "args": { + "External id": 148373,"Record function id": 0, "Ev Idx": 4416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771471177.100, "dur": 248.339, + "args": { + "External id": 148374,"Record function id": 0, "Ev Idx": 4417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771471196.690, "dur": 219.399, + "args": { + "External id": 148375,"Sequence number": 3058721, "Fwd thread id": 0, "Record 
function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4418 + } + }, + { + "ph": "s", "id": 261, "pid": 5714, "tid": 5714, "ts": 6303771471196.690, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771471218.230, "dur": 122.560, + "args": { + "External id": 148376,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771471249.990, "dur": 11.390, + "args": { + "External id": 148377,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771471252.740, "dur": 7.700, + "args": { + "External id": 148378,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], 
[], [], []], "Ev Idx": 4421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771471262.980, "dur": 5.620, + "args": { + "External id": 148379,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771471270.150, "dur": 2.240, + "args": { + "External id": 148380,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771471274.680, "dur": 3.310, + "args": { + "External id": 148381,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771471358.270, "dur": 28.689, + "args": { + "External id": 148382,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771471435.049, "dur": 17.340, + "args": { + "External id": 148383,"Record function id": 0, "Ev Idx": 4426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 
6303771471453.299, "dur": 305.359, + "args": { + "External id": 148384,"Record function id": 0, "Ev Idx": 4427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771471476.259, "dur": 272.319, + "args": { + "External id": 148385,"Sequence number": 3058722, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4428 + } + }, + { + "ph": "s", "id": 260, "pid": 5714, "tid": 5714, "ts": 6303771471476.259, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771471523.009, "dur": 24.760, + "args": { + "External id": 148386,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771471561.409, "dur": 24.410, + "args": { + "External id": 148387,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771471595.539, "dur": 16.140, + "args": { + "External id": 148388,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771471634.619, "dur": 16.460, + "args": { + "External id": 148389,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771471661.719, "dur": 22.400, + "args": { + "External id": 148390,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771471703.639, "dur": 14.210, + "args": { + "External id": 148391,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": 
"grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4434 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.5)", "pid": 5714, "tid": 5714, + "ts": 6303771471798.158, "dur": 159.010, + "args": { + "External id": 148392,"Record function id": 0, "Ev Idx": 4435 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771472013.498, "dur": 40.660, + "args": { + "External id": 148393,"Record function id": 0, "Ev Idx": 4436 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6303771472062.758, "dur": 927.488, + "args": { + "External id": 148394,"Record function id": 0, "Ev Idx": 4437 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6303771472070.328, "dur": 451.269, + "args": { + "External id": 148395,"Record function id": 0, "Ev Idx": 4438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771472127.538, "dur": 8.080, + "args": { + "External id": 148396,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771472145.178, "dur": 20.490, + "args": { + "External id": 148397,"Record function id": 0, 
"Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472148.278, "dur": 1.240, + "args": { + "External id": 148398,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472151.938, "dur": 0.300, + "args": { + "External id": 148399,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472152.958, "dur": 0.220, + "args": { + "External id": 148400,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472154.158, "dur": 2.010, + "args": { + "External id": 148401,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771472156.958, "dur": 0.290, + "args": { + "External id": 148402,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472158.428, "dur": 0.260, + "args": { + "External id": 148403,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472160.248, "dur": 0.270, + "args": { + "External id": 148404,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472161.188, "dur": 0.200, + "args": { + "External id": 148405,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472162.418, "dur": 0.150, + "args": { + "External id": 148406,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4449 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771472172.468, "dur": 22.240, + "args": { + "External id": 148407,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771472222.837, "dur": 94.100, + "args": { + "External id": 148408,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771472232.477, "dur": 6.251, + "args": { + "External id": 148409,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771472242.408, "dur": 7.969, + "args": { + "External id": 148410,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[7079424], [], [], []], "Ev Idx": 4453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771472244.617, "dur": 5.391, + "args": { + "External id": 148411,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472246.857, "dur": 1.531, + "args": { + "External id": 148412,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771472257.068, "dur": 16.740, + "args": { + "External id": 148413,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472258.768, "dur": 0.889, + "args": { + "External id": 148414,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472260.708, "dur": 0.160, + "args": { + "External id": 148415,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472261.537, "dur": 0.191, + "args": { + "External id": 148416,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472263.737, "dur": 0.251, + "args": { + "External id": 148417,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472264.977, "dur": 0.180, + "args": { + "External id": 148418,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472265.837, "dur": 1.091, + "args": { + "External id": 148419,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472267.857, "dur": 0.220, + "args": { + "External id": 148420,"Record function id": 0, 
"Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472268.788, "dur": 0.149, + "args": { + "External id": 148421,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472270.897, "dur": 0.800, + "args": { + "External id": 148422,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771472283.488, "dur": 25.329, + "args": { + "External id": 148423,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771472374.107, "dur": 81.630, + "args": { + "External id": 148424,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input 
Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771472391.107, "dur": 61.730, + "args": { + "External id": 148425,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4468, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771472401.607, "dur": 47.160, + "args": { + "External id": 148426,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771472471.137, "dur": 3.090, + "args": { + "External id": 148427,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4470, "In msg nelems": 0, "Rank": 
0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6303771472540.307, "dur": 219.869, + "args": { + "External id": 148428,"Record function id": 0, "Ev Idx": 4471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472613.957, "dur": 3.660, + "args": { + "External id": 148429,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472621.537, "dur": 0.690, + "args": { + "External id": 148430,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472623.767, "dur": 0.730, + "args": { + "External id": 148431,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472626.037, "dur": 0.520, + "args": { + "External id": 148432,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472627.707, "dur": 0.420, + "args": { + "External id": 148433,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], 
"Input Dims": [[589824], []], "Ev Idx": 4476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472629.227, "dur": 0.450, + "args": { + "External id": 148434,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472630.937, "dur": 0.610, + "args": { + "External id": 148435,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472632.917, "dur": 1.420, + "args": { + "External id": 148436,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472636.697, "dur": 0.460, + "args": { + "External id": 148437,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472638.257, "dur": 0.430, + "args": { + "External id": 148438,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771472651.156, "dur": 83.600, + "args": { + "External id": 148439,"Record function id": 
0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771472661.456, "dur": 69.880, + "args": { + "External id": 148440,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771472672.767, "dur": 6.149, + "args": { + "External id": 148441,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771472680.856, "dur": 31.280, + "args": { + "External id": 148442,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input 
Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771472683.296, "dur": 28.540, + "args": { + "External id": 148443,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771472686.487, "dur": 5.560, + "args": { + "External id": 148444,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771472693.147, "dur": 18.229, + "args": { + "External id": 148445,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771472949.506, "dur": 18.250, + "args": { + "External id": 148446,"Sequence number": 3058723, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4489 + } + }, + { + "ph": "s", "id": 259, "pid": 5714, "tid": 5714, "ts": 6303771472949.506, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771472958.646, "dur": 5.860, + "args": { + "External id": 148447,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771472960.696, "dur": 3.390, + "args": { + "External id": 148448,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473016.936, "dur": 9.070, + "args": { + "External id": 148449,"Record function id": 0, "Ev Idx": 4492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771473026.856, "dur": 1367.427, + "args": { + "External id": 148450,"Record function id": 0, "Ev Idx": 4493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771473046.416, "dur": 98.199, + "args": { + "External id": 148451,"Sequence number": 3058724, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4494 + } + }, + { + "ph": "s", "id": 258, "pid": 5714, "tid": 5714, "ts": 6303771473046.416, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771473089.166, "dur": 23.549, + "args": { + "External id": 148452,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": 
"/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771473124.055, "dur": 4.080, + "args": { + "External id": 148453,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771473125.146, "dur": 2.780, + "args": { + "External id": 148454,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473163.626, "dur": 9.160, + "args": { + "External id": 148455,"Record function id": 0, "Ev Idx": 4498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771473173.486, "dur": 878.998, + "args": { + "External id": 148456,"Record function id": 0, "Ev Idx": 4499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771473192.685, "dur": 185.060, + "args": { + "External id": 148457,"Sequence number": 3058725, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], 
[768, 768]], "Ev Idx": 4500 + } + }, + { + "ph": "s", "id": 257, "pid": 5714, "tid": 5714, "ts": 6303771473192.685, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771473217.695, "dur": 30.030, + "args": { + "External id": 148458,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771473258.645, "dur": 16.620, + "args": { + "External id": 148459,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771473286.905, "dur": 21.780, + "args": { + "External id": 148460,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771473341.505, "dur": 3.610, + "args": { + "External id": 148461,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771473352.725, "dur": 0.880, + "args": { + "External id": 148462,"Record function id": 0, "Concrete Inputs": 
["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771473357.945, "dur": 1.190, + "args": { + "External id": 148463,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473395.435, "dur": 8.310, + "args": { + "External id": 148464,"Record function id": 0, "Ev Idx": 4507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771473404.665, "dur": 403.619, + "args": { + "External id": 148465,"Record function id": 0, "Ev Idx": 4508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473420.645, "dur": 2.850, + "args": { + "External id": 148466,"Record function id": 0, "Ev Idx": 4509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771473424.075, "dur": 184.770, + "args": { + "External id": 148467,"Record function id": 0, "Ev Idx": 4510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771473435.915, "dur": 171.890, + "args": { + "External id": 148468,"Sequence number": 3058726, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 
2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4511 + } + }, + { + "ph": "s", "id": 256, "pid": 5714, "tid": 5714, "ts": 6303771473435.915, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473441.005, "dur": 4.690, + "args": { + "External id": 148469,"Record function id": 0, "Ev Idx": 4512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771473446.375, "dur": 153.939, + "args": { + "External id": 148470,"Record function id": 0, "Ev Idx": 4513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473467.345, "dur": 3.110, + "args": { + "External id": 148471,"Record function id": 0, "Ev Idx": 4514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771473471.165, "dur": 107.500, + "args": { + "External id": 148472,"Record function id": 0, "Ev Idx": 4515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473474.195, "dur": 3.440, + "args": { + "External id": 148473,"Record function id": 0, "Ev Idx": 4516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771473478.235, "dur": 98.390, + "args": { + "External id": 148474,"Record function id": 0, "Ev Idx": 4517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473511.355, "dur": 4.860, + "args": { + "External id": 148475,"Record function id": 0, "Ev Idx": 4518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771473517.115, "dur": 58.699, + "args": { + "External id": 148476,"Record function id": 0, "Ev Idx": 4519 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771473544.574, "dur": 20.460, + "args": { + "External id": 148477,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473584.134, "dur": 2.871, + "args": { + "External id": 148478,"Record function id": 0, "Ev Idx": 4521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771473587.545, "dur": 12.160, + "args": { + "External id": 148479,"Record function id": 0, "Ev Idx": 4522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473612.185, "dur": 3.669, + "args": { + "External id": 148480,"Record function id": 0, "Ev Idx": 4523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771473627.705, "dur": 180.079, + "args": { + "External id": 148481,"Record function id": 0, "Ev Idx": 4524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache 
Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473630.905, "dur": 2.380, + "args": { + "External id": 148482,"Record function id": 0, "Ev Idx": 4525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771473633.825, "dur": 173.049, + "args": { + "External id": 148483,"Record function id": 0, "Ev Idx": 4526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771473646.364, "dur": 159.530, + "args": { + "External id": 148484,"Sequence number": 3058727, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4527 + } + }, + { + "ph": "s", "id": 255, "pid": 5714, "tid": 5714, "ts": 6303771473646.364, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473651.204, "dur": 3.910, + "args": { + "External id": 148485,"Record function id": 0, "Ev Idx": 4528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771473655.774, "dur": 142.530, + "args": { + "External id": 148486,"Record function id": 0, "Ev Idx": 4529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473674.744, "dur": 3.840, + "args": { + "External id": 148487,"Record function id": 0, "Ev Idx": 4530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771473679.264, "dur": 99.200, + "args": { + "External id": 148488,"Record function id": 0, "Ev Idx": 
4531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473681.994, "dur": 3.430, + "args": { + "External id": 148489,"Record function id": 0, "Ev Idx": 4532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771473685.974, "dur": 90.640, + "args": { + "External id": 148490,"Record function id": 0, "Ev Idx": 4533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473714.894, "dur": 4.600, + "args": { + "External id": 148491,"Record function id": 0, "Ev Idx": 4534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771473720.204, "dur": 55.680, + "args": { + "External id": 148492,"Record function id": 0, "Ev Idx": 4535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771473746.944, "dur": 19.180, + "args": { + "External id": 148493,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473783.334, "dur": 2.690, + "args": { + "External id": 148494,"Record function id": 0, "Ev Idx": 4537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771473786.614, "dur": 11.220, + "args": { + "External id": 148495,"Record function id": 0, "Ev Idx": 4538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771473817.374, "dur": 7.770, + "args": { + "External id": 148496,"Record function id": 0, "Ev Idx": 4539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771473826.034, "dur": 225.279, + "args": { + "External id": 148497,"Record function id": 0, "Ev Idx": 4540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771473845.234, "dur": 196.690, + "args": { + "External id": 148498,"Sequence number": 3058728, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4541 + } + }, + { + "ph": "s", "id": 254, "pid": 5714, "tid": 5714, "ts": 6303771473845.234, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771473866.044, "dur": 103.150, + "args": { + "External id": 148499,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": 
[[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771473897.854, "dur": 11.290, + "args": { + "External id": 148500,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771473900.564, "dur": 7.640, + "args": { + "External id": 148501,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771473910.794, "dur": 5.140, + "args": { + "External id": 148502,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771473917.644, "dur": 2.400, + "args": { + "External id": 148503,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", 
"pid": 5714, "tid": 5714, + "ts": 6303771473922.214, "dur": 3.330, + "args": { + "External id": 148504,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771473984.854, "dur": 27.770, + "args": { + "External id": 148505,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771474060.433, "dur": 17.960, + "args": { + "External id": 148506,"Record function id": 0, "Ev Idx": 4549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771474079.284, "dur": 312.409, + "args": { + "External id": 148507,"Record function id": 0, "Ev Idx": 4550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771474102.173, "dur": 279.780, + "args": { + "External id": 148508,"Sequence number": 3058729, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4551 + } + }, + { + "ph": "s", "id": 253, "pid": 5714, "tid": 5714, "ts": 6303771474102.173, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771474147.313, "dur": 24.340, + "args": { + "External id": 148509,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771474185.163, "dur": 24.530, + "args": { + "External id": 148510,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771474220.503, "dur": 16.000, + "args": { + "External id": 148511,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771474257.353, "dur": 16.610, + "args": { + "External id": 148512,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function 
id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771474284.393, "dur": 30.570, + "args": { + "External id": 148513,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771474336.153, "dur": 14.940, + "args": { + "External id": 148514,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4557 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.6)", "pid": 5714, "tid": 5714, + "ts": 6303771474430.643, "dur": 48.920, + "args": { + "External id": 148515,"Record function id": 0, "Ev Idx": 4558 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771474536.423, "dur": 40.569, + "args": { + "External id": 148516,"Record function id": 0, "Ev Idx": 4559 + } + }, + { 
+ "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6303771474585.022, "dur": 823.028, + "args": { + "External id": 148517,"Record function id": 0, "Ev Idx": 4560 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6303771474591.632, "dur": 435.609, + "args": { + "External id": 148518,"Record function id": 0, "Ev Idx": 4561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771474647.432, "dur": 7.930, + "args": { + "External id": 148519,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771474664.802, "dur": 22.280, + "args": { + "External id": 148520,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474667.582, "dur": 1.210, + "args": { + "External id": 148521,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474671.142, "dur": 0.310, + "args": { + "External id": 148522,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474672.172, "dur": 0.170, + "args": { + "External id": 148523,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474673.312, "dur": 1.120, + "args": { + "External id": 148524,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474675.992, "dur": 0.170, + "args": { + "External id": 148525,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474677.172, "dur": 0.260, + "args": { + "External id": 148526,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474678.132, "dur": 1.300, + "args": { + "External id": 148527,"Record function 
id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474680.112, "dur": 0.220, + "args": { + "External id": 148528,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474683.482, "dur": 0.240, + "args": { + "External id": 148529,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771474693.582, "dur": 21.610, + "args": { + "External id": 148530,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771474743.632, "dur": 88.010, + "args": { + "External id": 148531,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", 
"15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771474753.352, "dur": 6.660, + "args": { + "External id": 148532,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771474763.832, "dur": 8.230, + "args": { + "External id": 148533,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771474766.092, "dur": 5.600, + "args": { + "External id": 148534,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474768.322, "dur": 1.720, + "args": { + "External id": 148535,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4578 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771474778.692, "dur": 16.920, + "args": { + "External id": 148536,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474780.372, "dur": 0.850, + "args": { + "External id": 148537,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474782.072, "dur": 0.170, + "args": { + "External id": 148538,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474782.912, "dur": 0.180, + "args": { + "External id": 148539,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474785.172, "dur": 0.240, + "args": { + "External id": 148540,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474786.072, "dur": 0.140, + "args": { + "External id": 148541,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474786.872, "dur": 1.310, + "args": { + "External id": 148542,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474789.102, "dur": 0.150, + "args": { + "External id": 148543,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474789.882, "dur": 0.150, + "args": { + "External id": 148544,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771474792.182, "dur": 1.300, + "args": { + "External id": 148545,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771474807.742, "dur": 16.380, + "args": { + "External id": 148546,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771474881.502, "dur": 81.749, + "args": { + "External id": 148547,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771474898.651, "dur": 61.691, + "args": { + "External id": 148548,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank 
start": 0, "Ev Idx": 4591, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771474909.102, "dur": 47.229, + "args": { + "External id": 148549,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771474976.911, "dur": 3.071, + "args": { + "External id": 148550,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4593, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6303771475045.491, "dur": 222.340, + "args": { + "External id": 148551,"Record function id": 0, "Ev Idx": 4594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475120.551, "dur": 3.700, + "args": { + "External id": 148552,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475128.041, "dur": 0.670, + "args": { + "External id": 148553,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", 
"ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475130.331, "dur": 0.520, + "args": { + "External id": 148554,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475132.261, "dur": 0.530, + "args": { + "External id": 148555,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475133.971, "dur": 0.550, + "args": { + "External id": 148556,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475136.571, "dur": 0.630, + "args": { + "External id": 148557,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475138.521, "dur": 0.450, + "args": { + "External id": 148558,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475140.281, "dur": 1.680, + "args": { + "External id": 
148559,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475143.141, "dur": 0.450, + "args": { + "External id": 148560,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475145.801, "dur": 0.500, + "args": { + "External id": 148561,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771475158.571, "dur": 83.390, + "args": { + "External id": 148562,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771475168.691, "dur": 69.960, + "args": { + "External id": 148563,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": 
[[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771475179.691, "dur": 6.090, + "args": { + "External id": 148564,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771475187.811, "dur": 30.510, + "args": { + "External id": 148565,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771475189.131, "dur": 28.930, + "args": { + "External id": 148566,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771475191.941, "dur": 6.520, + "args": { + "External id": 148567,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 4610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771475199.431, "dur": 18.160, + "args": { + "External id": 148568,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771475366.050, "dur": 18.880, + "args": { + "External id": 148569,"Sequence number": 3058730, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4612 + } + }, + { + "ph": "s", "id": 252, "pid": 5714, "tid": 5714, "ts": 6303771475366.050, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771475375.661, "dur": 5.909, + "args": { + "External id": 148570,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771475377.710, "dur": 3.391, + "args": { + "External id": 148571,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475434.690, "dur": 9.290, + "args": { + "External id": 148572,"Record function id": 0, "Ev Idx": 4615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + 
"ts": 6303771475444.880, "dur": 1328.347, + "args": { + "External id": 148573,"Record function id": 0, "Ev Idx": 4616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771475464.310, "dur": 96.780, + "args": { + "External id": 148574,"Sequence number": 3058731, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4617 + } + }, + { + "ph": "s", "id": 251, "pid": 5714, "tid": 5714, "ts": 6303771475464.310, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771475505.670, "dur": 23.520, + "args": { + "External id": 148575,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771475540.620, "dur": 4.550, + "args": { + "External id": 148576,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771475541.680, "dur": 3.300, + "args": { + "External id": 148577,"Record function id": 0, "Concrete 
Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475580.340, "dur": 9.090, + "args": { + "External id": 148578,"Record function id": 0, "Ev Idx": 4621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771475590.170, "dur": 855.058, + "args": { + "External id": 148579,"Record function id": 0, "Ev Idx": 4622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771475612.430, "dur": 173.280, + "args": { + "External id": 148580,"Sequence number": 3058732, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4623 + } + }, + { + "ph": "s", "id": 250, "pid": 5714, "tid": 5714, "ts": 6303771475612.430, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771475638.620, "dur": 29.530, + "args": { + "External id": 148581,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771475679.350, "dur": 16.490, + "args": { + "External id": 148582,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 
768], [768, 768], [16384, 768]], "Ev Idx": 4625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771475705.290, "dur": 14.850, + "args": { + "External id": 148583,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771475752.610, "dur": 2.780, + "args": { + "External id": 148584,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771475762.290, "dur": 1.930, + "args": { + "External id": 148585,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771475768.680, "dur": 1.220, + "args": { + "External id": 148586,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475803.509, "dur": 7.911, + "args": { + "External id": 148587,"Record function id": 0, "Ev Idx": 4630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771475812.249, 
"dur": 379.580, + "args": { + "External id": 148588,"Record function id": 0, "Ev Idx": 4631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475827.669, "dur": 2.760, + "args": { + "External id": 148589,"Record function id": 0, "Ev Idx": 4632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771475831.020, "dur": 181.329, + "args": { + "External id": 148590,"Record function id": 0, "Ev Idx": 4633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771475842.569, "dur": 168.610, + "args": { + "External id": 148591,"Sequence number": 3058733, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4634 + } + }, + { + "ph": "s", "id": 249, "pid": 5714, "tid": 5714, "ts": 6303771475842.569, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475847.169, "dur": 4.331, + "args": { + "External id": 148592,"Record function id": 0, "Ev Idx": 4635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771475852.200, "dur": 151.719, + "args": { + "External id": 148593,"Record function id": 0, "Ev Idx": 4636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475873.240, "dur": 3.169, + "args": { + "External id": 148594,"Record function id": 0, "Ev Idx": 4637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771475877.029, "dur": 105.690, + "args": { + "External id": 148595,"Record function id": 0, "Ev Idx": 4638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475879.640, "dur": 3.469, + "args": { + "External id": 148596,"Record function id": 0, "Ev Idx": 4639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771475883.720, "dur": 96.879, + "args": { + "External id": 148597,"Record function id": 0, "Ev Idx": 4640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475915.059, "dur": 5.110, + "args": { + "External id": 148598,"Record function id": 0, "Ev Idx": 4641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771475921.079, "dur": 58.620, + "args": { + "External id": 148599,"Record function id": 0, "Ev Idx": 4642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771475949.059, "dur": 20.360, + "args": { + "External id": 148600,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 
64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771475987.969, "dur": 2.910, + "args": { + "External id": 148601,"Record function id": 0, "Ev Idx": 4644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771475991.439, "dur": 11.850, + "args": { + "External id": 148602,"Record function id": 0, "Ev Idx": 4645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476015.959, "dur": 3.560, + "args": { + "External id": 148603,"Record function id": 0, "Ev Idx": 4646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771476020.099, "dur": 171.350, + "args": { + "External id": 148604,"Record function id": 0, "Ev Idx": 4647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476022.839, "dur": 2.100, + "args": { + "External id": 148605,"Record function id": 0, "Ev Idx": 4648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771476025.439, "dur": 165.270, + "args": { + "External id": 148606,"Record function id": 0, "Ev Idx": 4649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771476037.719, "dur": 152.090, + "args": { + "External id": 148607,"Sequence number": 3058734, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 
2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4650 + } + }, + { + "ph": "s", "id": 248, "pid": 5714, "tid": 5714, "ts": 6303771476037.719, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476041.539, "dur": 4.150, + "args": { + "External id": 148608,"Record function id": 0, "Ev Idx": 4651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771476046.349, "dur": 136.250, + "args": { + "External id": 148609,"Record function id": 0, "Ev Idx": 4652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476064.889, "dur": 2.920, + "args": { + "External id": 148610,"Record function id": 0, "Ev Idx": 4653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771476068.479, "dur": 94.900, + "args": { + "External id": 148611,"Record function id": 0, "Ev Idx": 4654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476071.109, "dur": 3.290, + "args": { + "External id": 148612,"Record function id": 0, "Ev Idx": 4655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771476074.929, "dur": 86.610, + "args": { + "External id": 148613,"Record function id": 0, "Ev Idx": 4656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476102.329, "dur": 4.510, + "args": { + "External id": 148614,"Record function id": 0, "Ev Idx": 4657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771476107.569, "dur": 53.240, + "args": { + "External id": 148615,"Record function id": 0, "Ev Idx": 4658 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771476132.819, "dur": 18.410, + "args": { + "External id": 148616,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476168.519, "dur": 2.650, + "args": { + "External id": 148617,"Record function id": 0, "Ev Idx": 4660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771476171.739, "dur": 10.400, + "args": { + "External id": 148618,"Record function id": 0, "Ev Idx": 4661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476200.609, "dur": 8.030, + "args": { + "External id": 148619,"Record function id": 0, "Ev Idx": 4662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771476209.439, "dur": 234.729, + "args": { + "External id": 148620,"Record function id": 0, "Ev Idx": 4663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", 
"pid": 5714, "tid": 5714, + "ts": 6303771476228.709, "dur": 206.099, + "args": { + "External id": 148621,"Sequence number": 3058735, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4664 + } + }, + { + "ph": "s", "id": 247, "pid": 5714, "tid": 5714, "ts": 6303771476228.709, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771476249.848, "dur": 110.920, + "args": { + "External id": 148622,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771476280.208, "dur": 11.831, + "args": { + "External id": 148623,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771476283.688, "dur": 7.531, + "args": { + "External id": 148624,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", 
"15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771476293.688, "dur": 12.131, + "args": { + "External id": 148625,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771476307.379, "dur": 2.269, + "args": { + "External id": 148626,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771476311.948, "dur": 4.560, + "args": { + "External id": 148627,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771476376.178, "dur": 27.330, + "args": { + "External id": 148628,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771476453.558, "dur": 17.640, + "args": { + "External id": 
148629,"Record function id": 0, "Ev Idx": 4672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771476472.098, "dur": 298.329, + "args": { + "External id": 148630,"Record function id": 0, "Ev Idx": 4673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771476494.728, "dur": 266.310, + "args": { + "External id": 148631,"Sequence number": 3058736, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4674 + } + }, + { + "ph": "s", "id": 246, "pid": 5714, "tid": 5714, "ts": 6303771476494.728, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771476539.848, "dur": 23.960, + "args": { + "External id": 148632,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 
5714, + "ts": 6303771476576.818, "dur": 24.280, + "args": { + "External id": 148633,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771476611.048, "dur": 15.380, + "args": { + "External id": 148634,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771476648.538, "dur": 16.340, + "args": { + "External id": 148635,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771476675.128, "dur": 22.130, + "args": { + "External id": 148636,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + 
"ts": 6303771476715.307, "dur": 14.331, + "args": { + "External id": 148637,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4680 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.7)", "pid": 5714, "tid": 5714, + "ts": 6303771476808.687, "dur": 48.650, + "args": { + "External id": 148638,"Record function id": 0, "Ev Idx": 4681 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771476912.857, "dur": 40.730, + "args": { + "External id": 148639,"Record function id": 0, "Ev Idx": 4682 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6303771476961.507, "dur": 821.358, + "args": { + "External id": 148640,"Record function id": 0, "Ev Idx": 4683 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6303771476968.687, "dur": 441.129, + "args": { + "External id": 148641,"Record function id": 0, "Ev Idx": 4684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771477024.907, "dur": 8.160, + "args": { + "External id": 148642,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771477042.427, "dur": 20.660, + "args": { + "External id": 148643,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477045.527, "dur": 1.210, + "args": { + "External id": 148644,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477047.987, "dur": 1.640, + "args": { + "External id": 148645,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477050.537, "dur": 0.320, + "args": { + "External id": 148646,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477052.027, "dur": 0.900, + "args": { + "External id": 148647,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": 
[[1769856], [], [], []], "Ev Idx": 4690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477054.657, "dur": 0.240, + "args": { + "External id": 148648,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477055.917, "dur": 0.180, + "args": { + "External id": 148649,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477056.847, "dur": 0.150, + "args": { + "External id": 148650,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477058.647, "dur": 0.170, + "args": { + "External id": 148651,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477059.857, "dur": 0.150, + "args": { + "External id": 148652,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771477069.517, "dur": 21.990, + "args": { + "External id": 148653,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771477120.217, "dur": 83.220, + "args": { + "External id": 148654,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771477129.577, "dur": 6.100, + "args": { + "External id": 148655,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771477139.337, "dur": 7.149, + "args": { + "External id": 148656,"Record function id": 0, 
"Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771477141.557, "dur": 4.580, + "args": { + "External id": 148657,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477143.817, "dur": 0.709, + "args": { + "External id": 148658,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771477153.257, "dur": 16.989, + "args": { + "External id": 148659,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477155.097, "dur": 0.840, + "args": { + "External id": 148660,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477157.766, 
"dur": 0.251, + "args": { + "External id": 148661,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477158.697, "dur": 0.160, + "args": { + "External id": 148662,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477159.846, "dur": 1.291, + "args": { + "External id": 148663,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477161.837, "dur": 0.169, + "args": { + "External id": 148664,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477162.666, "dur": 0.200, + "args": { + "External id": 148665,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477164.626, "dur": 0.260, + "args": { + "External id": 148666,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477165.526, "dur": 0.151, + "args": { + "External id": 148667,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477167.126, "dur": 0.880, + "args": { + "External id": 148668,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771477179.657, "dur": 16.229, + "args": { + "External id": 148669,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771477253.606, "dur": 91.050, + "args": { + "External id": 
148670,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771477270.576, "dur": 71.150, + "args": { + "External id": 148671,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4714, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771477281.666, "dur": 56.010, + "args": { + "External id": 148672,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771477359.556, "dur": 3.140, + "args": { + "External id": 148673,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", 
"[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4716, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6303771477427.296, "dur": 223.629, + "args": { + "External id": 148674,"Record function id": 0, "Ev Idx": 4717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477502.466, "dur": 3.860, + "args": { + "External id": 148675,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477510.106, "dur": 0.700, + "args": { + "External id": 148676,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477512.306, "dur": 0.580, + "args": { + "External id": 148677,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477514.306, "dur": 0.460, + "args": { + "External id": 148678,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477515.866, "dur": 0.660, + "args": { 
+ "External id": 148679,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477517.656, "dur": 0.440, + "args": { + "External id": 148680,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477519.376, "dur": 0.510, + "args": { + "External id": 148681,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477522.256, "dur": 1.550, + "args": { + "External id": 148682,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477525.006, "dur": 0.640, + "args": { + "External id": 148683,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477526.766, "dur": 0.540, + "args": { + "External id": 148684,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4727 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771477539.516, "dur": 85.660, + "args": { + "External id": 148685,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771477549.976, "dur": 71.909, + "args": { + "External id": 148686,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771477562.426, "dur": 6.080, + "args": { + "External id": 148687,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771477571.576, "dur": 30.880, + "args": { + "External id": 148688,"Record function id": 0, "Concrete Inputs": ["", "", 
"", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771477572.876, "dur": 29.309, + "args": { + "External id": 148689,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771477575.736, "dur": 6.620, + "args": { + "External id": 148690,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771477583.326, "dur": 18.430, + "args": { + "External id": 148691,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771477742.035, "dur": 18.290, + "args": { + "External id": 148692,"Sequence number": 3058737, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4735 + } + }, + { + "ph": "s", "id": 245, "pid": 5714, "tid": 5714, "ts": 6303771477742.035, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771477751.235, "dur": 5.790, + "args": { + "External id": 148693,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771477753.325, "dur": 3.240, + "args": { + "External id": 148694,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771477810.025, "dur": 9.000, + "args": { + "External id": 148695,"Record function id": 0, "Ev Idx": 4738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771477820.035, "dur": 1341.327, + "args": { + "External id": 148696,"Record function id": 0, "Ev Idx": 4739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771477839.585, "dur": 98.410, + "args": { + "External id": 148697,"Sequence number": 3058738, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4740 + } + }, + { + "ph": "s", "id": 244, "pid": 5714, "tid": 5714, "ts": 6303771477839.585, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771477882.205, "dur": 23.790, + "args": { + "External id": 148698,"kernel_hash": 
"cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771477917.305, "dur": 4.690, + "args": { + "External id": 148699,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771477918.495, "dur": 3.300, + "args": { + "External id": 148700,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771477957.005, "dur": 8.970, + "args": { + "External id": 148701,"Record function id": 0, "Ev Idx": 4744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771477966.685, "dur": 863.318, + "args": { + "External id": 148702,"Record function id": 0, "Ev Idx": 4745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771477991.935, "dur": 174.129, + "args": { + "External id": 148703,"Sequence number": 3058739, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4746 + } + }, + { + "ph": "s", "id": 243, "pid": 5714, "tid": 5714, "ts": 6303771477991.935, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771478019.445, "dur": 30.170, + "args": { + "External id": 148704,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771478060.675, "dur": 16.140, + "args": { + "External id": 148705,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771478085.815, "dur": 14.789, + "args": { + "External id": 148706,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771478132.535, "dur": 2.860, + "args": { + "External id": 148707,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4750 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771478142.154, "dur": 2.100, + "args": { + "External id": 148708,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771478148.784, "dur": 1.280, + "args": { + "External id": 148709,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478183.884, "dur": 8.930, + "args": { + "External id": 148710,"Record function id": 0, "Ev Idx": 4753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771478193.704, "dur": 394.990, + "args": { + "External id": 148711,"Record function id": 0, "Ev Idx": 4754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478208.954, "dur": 2.760, + "args": { + "External id": 148712,"Record function id": 0, "Ev Idx": 4755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771478212.344, "dur": 195.230, + "args": { + "External id": 148713,"Record function id": 0, "Ev Idx": 4756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771478224.314, "dur": 182.200, + "args": { + "External id": 148714,"Sequence number": 3058740, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4757 + } + }, + { + "ph": "s", "id": 242, "pid": 5714, "tid": 5714, "ts": 6303771478224.314, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478229.094, "dur": 4.300, + "args": { + "External id": 148715,"Record function id": 0, "Ev Idx": 4758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771478234.074, "dur": 164.770, + "args": { + "External id": 148716,"Record function id": 0, "Ev Idx": 4759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478254.454, "dur": 3.220, + "args": { + "External id": 148717,"Record function id": 0, "Ev Idx": 4760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771478258.354, "dur": 119.570, + "args": { + "External id": 148718,"Record function id": 0, "Ev Idx": 4761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478261.294, "dur": 3.560, + "args": { + "External id": 148719,"Record function id": 0, "Ev Idx": 4762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771478265.784, "dur": 110.100, + "args": { + "External id": 148720,"Record function id": 0, "Ev Idx": 4763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478305.144, "dur": 5.510, + "args": { + "External id": 148721,"Record function id": 0, "Ev Idx": 4764 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771478311.684, "dur": 63.300, + "args": { + "External id": 148722,"Record function id": 0, "Ev Idx": 4765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771478343.324, "dur": 21.370, + "args": { + "External id": 148723,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478383.214, "dur": 2.830, + "args": { + "External id": 148724,"Record function id": 0, "Ev Idx": 4767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771478386.604, "dur": 11.640, + "args": { + "External id": 148725,"Record function id": 0, "Ev Idx": 4768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478410.904, "dur": 3.530, + "args": { + "External id": 148726,"Record function id": 0, "Ev Idx": 4769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 
5714, + "ts": 6303771478415.074, "dur": 173.149, + "args": { + "External id": 148727,"Record function id": 0, "Ev Idx": 4770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478417.744, "dur": 2.130, + "args": { + "External id": 148728,"Record function id": 0, "Ev Idx": 4771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771478420.704, "dur": 166.730, + "args": { + "External id": 148729,"Record function id": 0, "Ev Idx": 4772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771478432.414, "dur": 154.089, + "args": { + "External id": 148730,"Sequence number": 3058741, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4773 + } + }, + { + "ph": "s", "id": 241, "pid": 5714, "tid": 5714, "ts": 6303771478432.414, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478436.404, "dur": 4.030, + "args": { + "External id": 148731,"Record function id": 0, "Ev Idx": 4774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771478441.054, "dur": 137.609, + "args": { + "External id": 148732,"Record function id": 0, "Ev Idx": 4775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478459.514, "dur": 2.610, + "args": { + "External id": 148733,"Record function id": 0, "Ev Idx": 4776 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771478463.074, "dur": 95.749, + "args": { + "External id": 148734,"Record function id": 0, "Ev Idx": 4777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478465.574, "dur": 3.570, + "args": { + "External id": 148735,"Record function id": 0, "Ev Idx": 4778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771478469.704, "dur": 87.379, + "args": { + "External id": 148736,"Record function id": 0, "Ev Idx": 4779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478497.603, "dur": 4.280, + "args": { + "External id": 148737,"Record function id": 0, "Ev Idx": 4780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771478502.623, "dur": 53.671, + "args": { + "External id": 148738,"Record function id": 0, "Ev Idx": 4781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771478528.083, "dur": 18.471, + "args": { + "External id": 148739,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input 
Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478563.763, "dur": 2.611, + "args": { + "External id": 148740,"Record function id": 0, "Ev Idx": 4783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771478566.914, "dur": 11.160, + "args": { + "External id": 148741,"Record function id": 0, "Ev Idx": 4784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478597.963, "dur": 8.260, + "args": { + "External id": 148742,"Record function id": 0, "Ev Idx": 4785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 5714, + "ts": 6303771478607.063, "dur": 221.890, + "args": { + "External id": 148743,"Record function id": 0, "Ev Idx": 4786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771478626.843, "dur": 192.810, + "args": { + "External id": 148744,"Sequence number": 3058742, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4787 + } + }, + { + "ph": "s", "id": 240, "pid": 5714, "tid": 5714, "ts": 6303771478626.843, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771478647.983, "dur": 101.850, + "args": { + "External id": 148745,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", 
"-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771478677.603, "dur": 12.160, + "args": { + "External id": 148746,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771478681.403, "dur": 7.500, + "args": { + "External id": 148747,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771478691.923, "dur": 5.960, + "args": { + "External id": 148748,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771478698.923, "dur": 2.250, + "args": { + "External id": 148749,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", 
"Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771478703.353, "dur": 3.050, + "args": { + "External id": 148750,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771478765.103, "dur": 27.360, + "args": { + "External id": 148751,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771478838.233, "dur": 18.100, + "args": { + "External id": 148752,"Record function id": 0, "Ev Idx": 4795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771478857.263, "dur": 301.389, + "args": { + "External id": 148753,"Record function id": 0, "Ev Idx": 4796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771478879.643, "dur": 269.079, + "args": { + "External id": 148754,"Sequence number": 3058743, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4797 + } + 
}, + { + "ph": "s", "id": 239, "pid": 5714, "tid": 5714, "ts": 6303771478879.643, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771478927.473, "dur": 24.480, + "args": { + "External id": 148755,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771478965.242, "dur": 25.011, + "args": { + "External id": 148756,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771479000.253, "dur": 15.480, + "args": { + "External id": 148757,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771479037.013, 
"dur": 16.439, + "args": { + "External id": 148758,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771479064.112, "dur": 22.170, + "args": { + "External id": 148759,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771479104.462, "dur": 14.050, + "args": { + "External id": 148760,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4803 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.8)", "pid": 5714, "tid": 5714, + "ts": 6303771479198.412, "dur": 48.190, + "args": { + "External id": 148761,"Record function id": 0, "Ev Idx": 4804 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": 
"FSDP::cast_forward_inputs", "pid": 5714, "tid": 5714, + "ts": 6303771479309.682, "dur": 41.380, + "args": { + "External id": 148762,"Record function id": 0, "Ev Idx": 4805 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::pre_forward (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6303771479359.422, "dur": 824.288, + "args": { + "External id": 148763,"Record function id": 0, "Ev Idx": 4806 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6303771479366.332, "dur": 443.719, + "args": { + "External id": 148764,"Record function id": 0, "Ev Idx": 4807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771479426.141, "dur": 7.831, + "args": { + "External id": 148765,"Record function id": 0, "Concrete Inputs": ["[1769856]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771479443.621, "dur": 22.660, + "args": { + "External id": 148766,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479446.772, "dur": 1.229, + "args": { + "External id": 148767,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6303771479450.352, "dur": 0.400, + "args": { + "External id": 148768,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479451.952, "dur": 0.460, + "args": { + "External id": 148769,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479453.072, "dur": 1.180, + "args": { + "External id": 148770,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479455.972, "dur": 0.180, + "args": { + "External id": 148771,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479457.061, "dur": 0.171, + "args": { + "External id": 148772,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4815 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479457.941, "dur": 1.091, + "args": { + "External id": 148773,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479461.772, "dur": 0.220, + "args": { + "External id": 148774,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479463.081, "dur": 0.140, + "args": { + "External id": 148775,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771479473.012, "dur": 22.239, + "args": { + "External id": 148776,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::all_gather_copy_in", "pid": 5714, "tid": 5714, + "ts": 6303771479524.171, "dur": 
88.360, + "args": { + "External id": 148777,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1769856", "4", "0", "15", ""], "Input type": ["TensorList", "ScalarList", "Scalar", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [], [], [], [], [], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [], [], [], [], [], []], "Ev Idx": 4820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771479533.471, "dur": 6.930, + "args": { + "External id": 148778,"Record function id": 0, "Concrete Inputs": ["[7079424]", "15", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::narrow", "pid": 5714, "tid": 5714, + "ts": 6303771479544.151, "dur": 8.200, + "args": { + "External id": 148779,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771479546.401, "dur": 5.550, + "args": { + "External id": 148780,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "1769856", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[7079424], [], [], [], []], "Ev Idx": 4823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479548.651, "dur": 1.640, + "args": { + "External id": 148781,"Record function id": 0, "Concrete Inputs": ["", "[1769856]", "[1]", "0"], "Input 
type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[7079424], [], [], []], "Ev Idx": 4824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes", "pid": 5714, "tid": 5714, + "ts": 6303771479559.001, "dur": 18.440, + "args": { + "External id": 148782,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1769856], [], []], "Ev Idx": 4825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479560.921, "dur": 1.290, + "args": { + "External id": 148783,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479563.031, "dur": 0.170, + "args": { + "External id": 148784,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "192"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479563.851, "dur": 0.260, + "args": { + "External id": 148785,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "147648"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479566.291, "dur": 0.160, + "args": { + "External id": 
148786,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "295104"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479567.101, "dur": 0.210, + "args": { + "External id": 148787,"Record function id": 0, "Concrete Inputs": ["", "[147456]", "[1]", "442560"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479568.651, "dur": 1.370, + "args": { + "External id": 148788,"Record function id": 0, "Concrete Inputs": ["", "[192]", "[1]", "590016"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479570.671, "dur": 0.150, + "args": { + "External id": 148789,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "590208"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479571.561, "dur": 0.220, + "args": { + "External id": 148790,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "983424"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4833 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + 
"ts": 6303771479573.931, "dur": 0.820, + "args": { + "External id": 148791,"Record function id": 0, "Concrete Inputs": ["", "[393216]", "[1]", "1376640"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[1769856], [], [], []], "Ev Idx": 4834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_copy_", "pid": 5714, "tid": 5714, + "ts": 6303771479588.961, "dur": 16.270, + "args": { + "External id": 148792,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["TensorList", "TensorList", "Scalar"], "Input Strides": [[[1], [1], [1], [1], [1], [1], [1], [1], [1]], [[1], [1], [1], [1], [1], [1], [1], [1], [1]], []], "Input Dims": [[[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], [[192], [147456], [147456], [147456], [147456], [192], [393216], [393216], [393216]], []], "Ev Idx": 4835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "c10d::_allgather_base_", "pid": 5714, "tid": 5714, + "ts": 6303771479663.461, "dur": 81.970, + "args": { + "External id": 148793,"Record function id": 0, "Concrete Inputs": ["", "", "", "False", "-1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "", "Scalar", "Scalar"], "Input Strides": [[1], [1], [], [], []], "Input Dims": [[7079424], [1769856], [], [], []], "Ev Idx": 4836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771479680.231, "dur": 62.230, + "args": { + "External id": 148794,"Record function id": 0, "Collective name": "_allgather_base", "Process Group Description": "default_pg", "dtype": "BFloat16", "Rank": 0, "Input Strides": [[1], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 7079424, "Process Group Name": "0", "Input type": ["c10::BFloat16", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank 
stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[1769856], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 4837, "In msg nelems": 1769856 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:_all_gather_base", "pid": 5714, "tid": 5714, + "ts": 6303771479691.471, "dur": 46.790, + "args": { + "External id": 148795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1]], "Input Dims": [[1769856]], "Ev Idx": 4838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771479759.781, "dur": 3.250, + "args": { + "External id": 148796,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 4839, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6303771479827.501, "dur": 221.789, + "args": { + "External id": 148797,"Record function id": 0, "Ev Idx": 4840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479903.300, "dur": 3.660, + "args": { + "External id": 148798,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[7079424], []], "Ev Idx": 4841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 
5714, "tid": 5714, + "ts": 6303771479910.860, "dur": 0.671, + "args": { + "External id": 148799,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], []], "Ev Idx": 4842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479913.071, "dur": 0.480, + "args": { + "External id": 148800,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479915.100, "dur": 0.431, + "args": { + "External id": 148801,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479916.631, "dur": 0.440, + "args": { + "External id": 148802,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479919.351, "dur": 0.480, + "args": { + "External id": 148803,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[589824], []], "Ev Idx": 4846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479921.091, "dur": 0.609, + "args": { + "External id": 148804,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[768], 
[]], "Ev Idx": 4847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479922.980, "dur": 1.531, + "args": { + "External id": 148805,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479925.691, "dur": 0.569, + "args": { + "External id": 148806,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771479928.620, "dur": 0.440, + "args": { + "External id": 148807,"Record function id": 0, "Concrete Inputs": ["", "[4, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[1572864], []], "Ev Idx": 4850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "fsdp::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771479941.180, "dur": 82.130, + "args": { + "External id": 148808,"Record function id": 0, "Concrete Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::split_with_sizes_copy", "pid": 5714, "tid": 5714, + "ts": 6303771479951.360, "dur": 68.600, + "args": { + "External id": 148809,"Record function id": 0, "Concrete 
Inputs": ["", "[192, 147456, 147456, 147456, 147456, 192, 393216, 393216, 393216]", "1", ""], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "TensorList"], "Input Strides": [[1769856, 1], [], [], [[192, 1], [147456, 1], [147456, 1], [147456, 1], [147456, 1], [192, 1], [393216, 1], [393216, 1], [393216, 1]]], "Input Dims": [[4, 1769856], [], [], [[4, 192], [4, 147456], [4, 147456], [4, 147456], [4, 147456], [4, 192], [4, 393216], [4, 393216], [4, 393216]]], "Ev Idx": 4852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771479962.640, "dur": 5.910, + "args": { + "External id": 148810,"Record function id": 0, "Concrete Inputs": ["[903]", "4", "", "", "True", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771479970.700, "dur": 30.610, + "args": { + "External id": 148811,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", "False", ""], "Input type": ["long int", "", "", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], [], []], "Ev Idx": 4854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771479972.000, "dur": 29.040, + "args": { + "External id": 148812,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "True", ""], "Input type": ["long int", "", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[903], [], [], [], [], [], []], "Ev Idx": 4855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771479975.150, "dur": 6.510, + "args": { + "External id": 148813,"Record function id": 0, "Concrete Inputs": ["[903]", "[1]", "4", 
"0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771479982.740, "dur": 17.870, + "args": { + "External id": 148814,"Record function id": 0, "Concrete Inputs": ["", "", "True"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[903], [903], []], "Ev Idx": 4857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RegisterPostBackwardFunction", "pid": 5714, "tid": 5714, + "ts": 6303771480141.010, "dur": 18.920, + "args": { + "External id": 148815,"Sequence number": 3058744, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4858 + } + }, + { + "ph": "s", "id": 238, "pid": 5714, "tid": 5714, "ts": 6303771480141.010, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771480150.580, "dur": 5.920, + "args": { + "External id": 148816,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771480152.700, "dur": 3.310, + "args": { + "External id": 148817,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480210.270, "dur": 9.380, 
+ "args": { + "External id": 148818,"Record function id": 0, "Ev Idx": 4861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 1/0", "pid": 5714, "tid": 5714, + "ts": 6303771480220.670, "dur": 1341.597, + "args": { + "External id": 148819,"Record function id": 0, "Ev Idx": 4862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771480239.750, "dur": 109.280, + "args": { + "External id": 148820,"Sequence number": 3058745, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4863 + } + }, + { + "ph": "s", "id": 237, "pid": 5714, "tid": 5714, "ts": 6303771480239.750, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771480281.340, "dur": 34.339, + "args": { + "External id": 148821,"kernel_hash": "cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/jo/cjo4mnzjfpkb4s5vvrpimijtkpdvmowtvt3nuufvuafades6vkkj.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [], []], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [768], [8, 2048, 768], [], []], "Ev Idx": 4864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771480328.219, "dur": 4.171, + "args": { + "External id": 148822,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4865 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771480329.310, "dur": 2.880, + "args": { + "External id": 148823,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 1]], "Input Dims": [[8, 2048, 768]], "Ev Idx": 4866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480368.310, "dur": 9.440, + "args": { + "External id": 148824,"Record function id": 0, "Ev Idx": 4867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 2/0", "pid": 5714, "tid": 5714, + "ts": 6303771480378.590, "dur": 844.687, + "args": { + "External id": 148825,"Record function id": 0, "Ev Idx": 4868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771480398.899, "dur": 169.680, + "args": { + "External id": 148826,"Sequence number": 3058746, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [768, 1], [768, 1], [768, 1]], "Input Dims": [[8, 2048, 768], [768, 768], [768, 768], [768, 768]], "Ev Idx": 4869 + } + }, + { + "ph": "s", "id": 236, "pid": 5714, "tid": 5714, "ts": 6303771480398.899, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771480423.419, "dur": 30.160, + "args": { + "External id": 148827,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771480464.049, "dur": 16.190, + "args": { + "External id": 148828,"Record 
function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771480490.239, "dur": 14.340, + "args": { + "External id": 148829,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 768], [16384, 768]], "Ev Idx": 4872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771480534.569, "dur": 3.890, + "args": { + "External id": 148830,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771480545.499, "dur": 0.870, + "args": { + "External id": 148831,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771480551.159, "dur": 1.750, + "args": { + "External id": 148832,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480585.609, "dur": 8.250, + "args": { + 
"External id": 148833,"Record function id": 0, "Ev Idx": 4876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 3/1", "pid": 5714, "tid": 5714, + "ts": 6303771480594.639, "dur": 386.559, + "args": { + "External id": 148834,"Record function id": 0, "Ev Idx": 4877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480610.229, "dur": 2.650, + "args": { + "External id": 148835,"Record function id": 0, "Ev Idx": 4878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771480613.519, "dur": 184.939, + "args": { + "External id": 148836,"Record function id": 0, "Ev Idx": 4879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771480625.659, "dur": 171.699, + "args": { + "External id": 148837,"Sequence number": 3058747, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4880 + } + }, + { + "ph": "s", "id": 235, "pid": 5714, "tid": 5714, "ts": 6303771480625.659, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480630.229, "dur": 5.530, + "args": { + "External id": 148838,"Record function id": 0, "Ev Idx": 4881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771480636.409, "dur": 151.449, + "args": { + "External id": 148839,"Record function id": 0, "Ev Idx": 4882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 
5714, "tid": 5714, + "ts": 6303771480657.109, "dur": 2.940, + "args": { + "External id": 148840,"Record function id": 0, "Ev Idx": 4883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771480660.749, "dur": 106.140, + "args": { + "External id": 148841,"Record function id": 0, "Ev Idx": 4884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480663.629, "dur": 3.540, + "args": { + "External id": 148842,"Record function id": 0, "Ev Idx": 4885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771480667.739, "dur": 96.979, + "args": { + "External id": 148843,"Record function id": 0, "Ev Idx": 4886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480699.659, "dur": 5.140, + "args": { + "External id": 148844,"Record function id": 0, "Ev Idx": 4887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771480705.679, "dur": 58.090, + "args": { + "External id": 148845,"Record function id": 0, "Ev Idx": 4888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771480733.209, "dur": 20.629, + "args": { + "External id": 148846,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", 
"Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480772.009, "dur": 2.860, + "args": { + "External id": 148847,"Record function id": 0, "Ev Idx": 4890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771480775.489, "dur": 11.780, + "args": { + "External id": 148848,"Record function id": 0, "Ev Idx": 4891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480801.938, "dur": 3.671, + "args": { + "External id": 148849,"Record function id": 0, "Ev Idx": 4892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 11/0", "pid": 5714, "tid": 5714, + "ts": 6303771480806.198, "dur": 174.490, + "args": { + "External id": 148850,"Record function id": 0, "Ev Idx": 4893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480809.449, "dur": 2.240, + "args": { + "External id": 148851,"Record function id": 0, "Ev Idx": 4894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 4/0", "pid": 5714, "tid": 5714, + "ts": 6303771480812.329, "dur": 167.559, + "args": { + "External id": 148852,"Record function id": 0, "Ev Idx": 4895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771480823.469, "dur": 155.499, + "args": { + "External id": 148853,"Sequence number": 3058748, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": 
["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4896 + } + }, + { + "ph": "s", "id": 234, "pid": 5714, "tid": 5714, "ts": 6303771480823.469, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480827.469, "dur": 3.849, + "args": { + "External id": 148854,"Record function id": 0, "Ev Idx": 4897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 5/0", "pid": 5714, "tid": 5714, + "ts": 6303771480831.969, "dur": 140.509, + "args": { + "External id": 148855,"Record function id": 0, "Ev Idx": 4898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480850.808, "dur": 2.770, + "args": { + "External id": 148856,"Record function id": 0, "Ev Idx": 4899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 6/0", "pid": 5714, "tid": 5714, + "ts": 6303771480854.228, "dur": 98.600, + "args": { + "External id": 148857,"Record function id": 0, "Ev Idx": 4900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480856.498, "dur": 3.460, + "args": { + "External id": 148858,"Record function id": 0, "Ev Idx": 4901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 7/0", "pid": 5714, "tid": 5714, + "ts": 6303771480860.528, "dur": 90.210, + "args": { + "External id": 148859,"Record function id": 0, "Ev Idx": 4902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480889.008, "dur": 4.870, + "args": { + "External id": 148860,"Record function id": 0, "Ev Idx": 4903 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "Torch-Compiled Region: 8/0", "pid": 5714, "tid": 5714, + "ts": 6303771480894.638, "dur": 55.280, + "args": { + "External id": 148861,"Record function id": 0, "Ev Idx": 4904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "rotary_embedding_kernel_0", "pid": 5714, "tid": 5714, + "ts": 6303771480921.858, "dur": 18.490, + "args": { + "External id": 148862,"kernel_hash": "cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "0", "8", "2048", "12", "64", "32", "4096", "16", "64", "False", "False", "False", "False"], "kernel_file": "/tmp/torchinductor_root/mg/cmgucohstpiwleho5gkrqkqjxo7yymr34qd6efw2lmx2xb4m4vyg.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 4905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480957.698, "dur": 2.720, + "args": { + "External id": 148863,"Record function id": 0, "Ev Idx": 4906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 10/0", "pid": 5714, "tid": 5714, + "ts": 6303771480961.058, "dur": 10.850, + "args": { + "External id": 148864,"Record function id": 0, "Ev Idx": 4907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771480989.888, "dur": 8.210, + "args": { + "External id": 148865,"Record function id": 0, "Ev Idx": 4908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 13/0", "pid": 5714, "tid": 
5714, + "ts": 6303771480998.898, "dur": 223.359, + "args": { + "External id": 148866,"Record function id": 0, "Ev Idx": 4909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771481018.388, "dur": 194.580, + "args": { + "External id": 148867,"Sequence number": 3058749, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [768, 1]], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [768, 768]], "Ev Idx": 4910 + } + }, + { + "ph": "s", "id": 233, "pid": 5714, "tid": 5714, "ts": 6303771481018.388, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771481039.508, "dur": 102.150, + "args": { + "External id": 148868,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771481070.208, "dur": 11.170, + "args": { + "External id": 148869,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771481072.838, "dur": 7.680, + "args": { + "External id": 148870,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771481083.308, "dur": 5.010, + "args": { + "External id": 148871,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771481089.548, "dur": 2.150, + "args": { + "External id": 148872,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771481094.028, "dur": 3.470, + "args": { + "External id": 148873,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771481156.638, "dur": 27.600, + "args": { + "External id": 148874,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [768, 1]], "Input Dims": [[16384, 768], [768, 
768], [16384, 768]], "Ev Idx": 4917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771481231.417, "dur": 18.080, + "args": { + "External id": 148875,"Record function id": 0, "Ev Idx": 4918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 14/0", "pid": 5714, "tid": 5714, + "ts": 6303771481250.397, "dur": 309.180, + "args": { + "External id": 148876,"Record function id": 0, "Ev Idx": 4919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771481273.477, "dur": 276.470, + "args": { + "External id": 148877,"Sequence number": 3058750, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], [1], [768, 1], [768, 1], [2048, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768], [768], [2048, 768], [2048, 768], [768, 2048]], "Ev Idx": 4920 + } + }, + { + "ph": "s", "id": 232, "pid": 5714, "tid": 5714, "ts": 6303771481273.477, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771481328.077, "dur": 25.350, + "args": { + "External id": 148878,"kernel_hash": "cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts", "grid": "grid(16384,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "", "", "16384", "768"], "kernel_file": "/tmp/torchinductor_root/bp/cbpl4cy2vmclpfs3uasqr7n2eucdas4liychvfmp25ysusizegts.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [1572864, 768, 1], [1572864, 768, 1], [1], [1572864, 768, 1], [1572864, 768, 1], [], 
[]], "Input Dims": [[8, 2048, 1], [8, 2048, 768], [8, 2048, 768], [768], [8, 2048, 768], [8, 2048, 768], [], []], "Ev Idx": 4921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771481366.197, "dur": 24.660, + "args": { + "External id": 148879,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771481400.867, "dur": 15.380, + "args": { + "External id": 148880,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768], [2048, 1]], "Input Dims": [[16384, 768], [768, 2048], [16384, 2048]], "Ev Idx": 4923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 5714, "tid": 5714, + "ts": 6303771481436.437, "dur": 16.940, + "args": { + "External id": 148881,"kernel_hash": "cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6", "grid": "grid(33554432,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "33554432"], "kernel_file": "/tmp/torchinductor_root/md/cmdczmkkidxbrfr4xu7sheesh2e5iejwhytsz7jdalljdrcu3us6.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[2048, 1], [2048, 1], [4194304, 2048, 1], []], "Input Dims": [[16384, 2048], [16384, 2048], [8, 2048, 2048], []], "Ev Idx": 4924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771481464.007, "dur": 21.800, + "args": { + "External id": 148882,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[2048, 1], [1, 2048], [768, 1]], "Input Dims": [[16384, 2048], [2048, 768], [16384, 768]], "Ev Idx": 4925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_poi_fused_add_2", "pid": 5714, "tid": 5714, + "ts": 6303771481504.167, "dur": 14.360, + "args": { + "External id": 148883,"kernel_hash": "c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3", "grid": "grid(12582912,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "12582912"], "kernel_file": "/tmp/torchinductor_root/3w/c3wckpd3jba65ohkmzgmc6xxcduxsz5t4lv6xnsldkfokmzyhfm3.py", "kernel_backend": "triton", "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4926 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward (model.layers.9)", "pid": 5714, "tid": 5714, + "ts": 6303771481598.337, "dur": 20.540, + "args": { + "External id": 148884,"Record function id": 0, "Ev Idx": 4927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771481668.827, "dur": 197.269, + "args": { + "External id": 148885,"Sequence number": 3058751, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 4928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771481671.427, "dur": 39.260, + "args": { + "External id": 148886,"Sequence number": 3058751, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 4929 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771481672.596, "dur": 37.760, + "args": { + "External id": 148887,"Sequence number": 3058751, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 4930 + } + }, + { + "ph": "s", "id": 231, "pid": 5714, "tid": 5714, "ts": 6303771481672.596, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771481676.936, "dur": 9.460, + "args": { + "External id": 148888,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771481687.647, "dur": 20.689, + "args": { + "External id": 148889,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771481712.347, "dur": 21.569, + "args": { + "External id": 148890,"Sequence number": 3058752, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4933 + } + }, + { + "ph": "s", "id": 230, "pid": 5714, "tid": 5714, "ts": 6303771481712.347, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771481716.127, "dur": 0.500, + "args": { + "External id": 148891,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771481717.327, "dur": 0.140, + "args": { + "External id": 148892,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 4935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771481736.507, "dur": 22.599, + "args": { + "External id": 148893,"Sequence number": 3058753, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 4936 + } + }, + { + "ph": "s", "id": 229, "pid": 5714, "tid": 5714, "ts": 6303771481736.507, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771481761.086, "dur": 20.200, + "args": { + "External id": 148894,"Sequence number": 3058754, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 4937 + } + }, + { + "ph": "s", "id": 228, "pid": 5714, "tid": 5714, "ts": 6303771481761.086, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 
5714, "tid": 5714, + "ts": 6303771481766.766, "dur": 12.190, + "args": { + "External id": 148895,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 4938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771481782.986, "dur": 20.210, + "args": { + "External id": 148896,"Sequence number": 3058755, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 4939 + } + }, + { + "ph": "s", "id": 227, "pid": 5714, "tid": 5714, "ts": 6303771481782.986, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771481806.996, "dur": 16.310, + "args": { + "External id": 148897,"Sequence number": 3058756, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 4940 + } + }, + { + "ph": "s", "id": 226, "pid": 5714, "tid": 5714, "ts": 6303771481806.996, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771481825.266, "dur": 24.430, + "args": { + "External id": 148898,"Sequence number": 3058757, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 4941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771481826.806, "dur": 22.650, + "args": { + "External id": 148899,"Sequence number": 3058757, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 4942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771481827.626, "dur": 21.580, + "args": { + "External id": 148900,"Sequence number": 3058757, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 4943 + } + }, + { + "ph": "s", "id": 225, "pid": 5714, "tid": 5714, "ts": 6303771481827.626, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771481831.586, "dur": 5.300, + "args": { + "External id": 148901,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771481837.946, "dur": 10.230, + "args": { + "External id": 148902,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 4945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771481851.576, "dur": 14.130, + "args": { + "External id": 148903,"Sequence number": 3058758, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 4946 + } + }, + { + "ph": "s", "id": 224, "pid": 5714, "tid": 5714, "ts": 6303771481851.576, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771481892.116, "dur": 61.680, + "args": { + "External id": 148904,"Sequence number": 3058759, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771481894.136, "dur": 9.420, + "args": { + "External id": 148905,"Sequence number": 3058759, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4948 + } + }, + { + "ph": "s", "id": 223, "pid": 5714, "tid": 5714, "ts": 6303771481894.136, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771481897.206, "dur": 4.900, + "args": { + "External id": 148906,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771481900.306, "dur": 1.440, + "args": { + "External id": 148907,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input 
Dims": [[768, 768], [], [], []], "Ev Idx": 4950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771481905.006, "dur": 48.390, + "args": { + "External id": 148908,"Sequence number": 3058760, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771481907.416, "dur": 4.090, + "args": { + "External id": 148909,"Sequence number": 3058760, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771481908.336, "dur": 2.840, + "args": { + "External id": 148910,"Sequence number": 3058760, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4953 + } + }, + { + "ph": "s", "id": 222, "pid": 5714, "tid": 5714, "ts": 6303771481908.336, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771481913.626, "dur": 33.040, + "args": { + "External id": 148911,"Sequence number": 3058761, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4954 + } + }, + { + "ph": "s", "id": 221, "pid": 5714, "tid": 5714, "ts": 6303771481913.626, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771481949.066, "dur": 3.540, + "args": { + "External id": 148912,"Sequence number": 3058762, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4955 + } + }, + { + "ph": "s", "id": 220, "pid": 5714, "tid": 5714, "ts": 6303771481949.066, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771481965.846, "dur": 44.730, + "args": { + "External id": 148913,"Sequence number": 3058763, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771481966.496, "dur": 6.080, + "args": { + "External id": 148914,"Sequence number": 3058763, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4957 + } + }, + { + "ph": "s", "id": 219, "pid": 5714, "tid": 5714, "ts": 6303771481966.496, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771481967.886, "dur": 3.310, + "args": { + "External id": 148915,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771481970.146, "dur": 0.760, + "args": { + "External 
id": 148916,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771481973.336, "dur": 36.980, + "args": { + "External id": 148917,"Sequence number": 3058764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771481974.996, "dur": 2.360, + "args": { + "External id": 148918,"Sequence number": 3058764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771481975.626, "dur": 1.570, + "args": { + "External id": 148919,"Sequence number": 3058764, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4962 + } + }, + { + "ph": "s", "id": 218, "pid": 5714, "tid": 5714, "ts": 6303771481975.626, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771481977.956, "dur": 27.350, + "args": { + "External id": 148920,"Sequence number": 3058765, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], 
[1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4963 + } + }, + { + "ph": "s", "id": 217, "pid": 5714, "tid": 5714, "ts": 6303771481977.956, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771482007.066, "dur": 2.480, + "args": { + "External id": 148921,"Sequence number": 3058766, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4964 + } + }, + { + "ph": "s", "id": 216, "pid": 5714, "tid": 5714, "ts": 6303771482007.066, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771482021.206, "dur": 42.430, + "args": { + "External id": 148922,"Sequence number": 3058767, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771482021.826, "dur": 4.270, + "args": { + "External id": 148923,"Sequence number": 3058767, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4966 + } + }, + { + "ph": "s", "id": 215, "pid": 5714, "tid": 5714, "ts": 6303771482021.826, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771482022.956, "dur": 2.290, + "args": { + "External id": 148924,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": 
[[768, 768], [], []], "Ev Idx": 4967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482024.266, "dur": 0.730, + "args": { + "External id": 148925,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771482027.756, "dur": 35.590, + "args": { + "External id": 148926,"Sequence number": 3058768, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 4969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482028.826, "dur": 3.150, + "args": { + "External id": 148927,"Sequence number": 3058768, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482029.456, "dur": 2.340, + "args": { + "External id": 148928,"Sequence number": 3058768, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4971 + } + }, + { + "ph": "s", "id": 214, "pid": 5714, "tid": 5714, "ts": 6303771482029.456, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771482032.596, "dur": 25.720, + "args": { + 
"External id": 148929,"Sequence number": 3058769, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 4972 + } + }, + { + "ph": "s", "id": 213, "pid": 5714, "tid": 5714, "ts": 6303771482032.596, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771482060.206, "dur": 2.540, + "args": { + "External id": 148930,"Sequence number": 3058770, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 4973 + } + }, + { + "ph": "s", "id": 212, "pid": 5714, "tid": 5714, "ts": 6303771482060.206, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482079.406, "dur": 4.290, + "args": { + "External id": 148931,"Sequence number": 3058771, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482080.096, "dur": 3.340, + "args": { + "External id": 148932,"Sequence number": 3058771, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4975 + } + }, + { + "ph": "s", "id": 211, "pid": 5714, "tid": 5714, "ts": 6303771482080.096, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 
5714, + "ts": 6303771482090.975, "dur": 2.211, + "args": { + "External id": 148933,"Sequence number": 3058772, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482091.506, "dur": 1.520, + "args": { + "External id": 148934,"Sequence number": 3058772, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4977 + } + }, + { + "ph": "s", "id": 210, "pid": 5714, "tid": 5714, "ts": 6303771482091.506, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482099.006, "dur": 3.209, + "args": { + "External id": 148935,"Sequence number": 3058773, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482100.555, "dur": 1.480, + "args": { + "External id": 148936,"Sequence number": 3058773, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 4979 + } + }, + { + "ph": "s", "id": 209, "pid": 5714, "tid": 5714, "ts": 6303771482100.555, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771482129.675, 
"dur": 139.260, + "args": { + "External id": 148937,"Sequence number": 3058774, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4980 + } + }, + { + "ph": "s", "id": 208, "pid": 5714, "tid": 5714, "ts": 6303771482129.675, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771482146.966, "dur": 9.249, + "args": { + "External id": 148938,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482148.966, "dur": 6.589, + "args": { + "External id": 148939,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771482284.195, "dur": 127.770, + "args": { + "External id": 148940,"Sequence number": 3058775, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 
1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 4983 + } + }, + { + "ph": "s", "id": 207, "pid": 5714, "tid": 5714, "ts": 6303771482284.195, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771482306.935, "dur": 11.230, + "args": { + "External id": 148941,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482309.155, "dur": 8.260, + "args": { + "External id": 148942,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6303771482439.355, "dur": 152.230, + "args": { + "External id": 148943,"Sequence number": 3058776, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 4986 + } + }, + { + "ph": "s", "id": 206, "pid": 5714, "tid": 5714, "ts": 6303771482439.355, + "cat": "fwdbwd", "name": "fwdbwd" + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771482459.265, "dur": 106.849, + "args": { + "External id": 148944,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 4987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771482494.165, "dur": 11.970, + "args": { + "External id": 148945,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 4988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482496.685, "dur": 8.580, + "args": { + "External id": 148946,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771482507.695, "dur": 4.800, + "args": { + "External id": 148947,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4990 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771482513.415, "dur": 2.140, + "args": { + "External id": 148948,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771482517.805, "dur": 4.740, + "args": { + "External id": 148949,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 4992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6303771482577.394, "dur": 3.860, + "args": { + "External id": 148950,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 4993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482597.474, "dur": 4.500, + "args": { + "External id": 148951,"Sequence number": 3058777, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482598.285, "dur": 3.460, + "args": { + "External id": 148952,"Sequence number": 3058777, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 4995 + } + 
}, + { + "ph": "s", "id": 205, "pid": 5714, "tid": 5714, "ts": 6303771482598.285, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771482613.514, "dur": 54.920, + "args": { + "External id": 148953,"Sequence number": 3058778, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 4996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771482615.405, "dur": 5.549, + "args": { + "External id": 148954,"Sequence number": 3058778, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 4997 + } + }, + { + "ph": "s", "id": 204, "pid": 5714, "tid": 5714, "ts": 6303771482615.405, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771482617.205, "dur": 2.869, + "args": { + "External id": 148955,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 4998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482618.614, "dur": 1.120, + "args": { + "External id": 148956,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 4999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771482621.814, "dur": 46.310, + "args": { + "External 
id": 148957,"Sequence number": 3058779, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482623.105, "dur": 3.560, + "args": { + "External id": 148958,"Sequence number": 3058779, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482625.005, "dur": 1.469, + "args": { + "External id": 148959,"Sequence number": 3058779, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5002 + } + }, + { + "ph": "s", "id": 203, "pid": 5714, "tid": 5714, "ts": 6303771482625.005, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771482627.334, "dur": 35.550, + "args": { + "External id": 148960,"Sequence number": 3058780, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5003 + } + }, + { + "ph": "s", "id": 202, "pid": 5714, "tid": 5714, "ts": 6303771482627.334, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771482665.014, "dur": 2.310, + "args": { + "External id": 148961,"Sequence number": 3058781, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5004 + } + }, + { + "ph": "s", "id": 201, "pid": 5714, "tid": 5714, "ts": 6303771482665.014, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771482679.774, "dur": 20.130, + "args": { + "External id": 148962,"Sequence number": 3058782, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5005 + } + }, + { + "ph": "s", "id": 200, "pid": 5714, "tid": 5714, "ts": 6303771482679.774, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771482717.434, "dur": 158.960, + "args": { + "External id": 148963,"Sequence number": 3058783, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771482719.134, "dur": 27.620, + "args": { + "External id": 148964,"Sequence number": 3058783, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771482720.134, 
"dur": 26.350, + "args": { + "External id": 148965,"Sequence number": 3058783, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5008 + } + }, + { + "ph": "s", "id": 199, "pid": 5714, "tid": 5714, "ts": 6303771482720.134, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482723.524, "dur": 7.090, + "args": { + "External id": 148966,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771482731.634, "dur": 13.530, + "args": { + "External id": 148967,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771482747.924, "dur": 18.520, + "args": { + "External id": 148968,"Sequence number": 3058784, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5011 + } + }, + { + "ph": "s", "id": 198, "pid": 5714, "tid": 5714, "ts": 6303771482747.924, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303771482750.274, "dur": 0.380, + "args": { + "External id": 148969,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771482751.334, "dur": 0.130, + "args": { + "External id": 148970,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771482769.204, "dur": 17.110, + "args": { + "External id": 148971,"Sequence number": 3058785, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5014 + } + }, + { + "ph": "s", "id": 197, "pid": 5714, "tid": 5714, "ts": 6303771482769.204, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771482787.654, "dur": 18.350, + "args": { + "External id": 148972,"Sequence number": 3058786, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5015 + } + }, + { + "ph": "s", "id": 196, "pid": 5714, "tid": 5714, "ts": 6303771482787.654, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771482793.884, "dur": 10.260, + "args": { + "External id": 148973,"Record function id": 0, "Concrete 
Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771482806.974, "dur": 13.540, + "args": { + "External id": 148974,"Sequence number": 3058787, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5017 + } + }, + { + "ph": "s", "id": 195, "pid": 5714, "tid": 5714, "ts": 6303771482806.974, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771482823.424, "dur": 14.190, + "args": { + "External id": 148975,"Sequence number": 3058788, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5018 + } + }, + { + "ph": "s", "id": 194, "pid": 5714, "tid": 5714, "ts": 6303771482823.424, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771482838.814, "dur": 20.180, + "args": { + "External id": 148976,"Sequence number": 3058789, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771482839.914, "dur": 18.840, + "args": { + "External id": 148977,"Sequence number": 3058789, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", 
"Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771482840.694, "dur": 17.830, + "args": { + "External id": 148978,"Sequence number": 3058789, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5021 + } + }, + { + "ph": "s", "id": 193, "pid": 5714, "tid": 5714, "ts": 6303771482840.694, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482843.704, "dur": 3.840, + "args": { + "External id": 148979,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771482848.404, "dur": 9.280, + "args": { + "External id": 148980,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771482861.814, "dur": 14.120, + "args": { + "External id": 148981,"Sequence number": 3058790, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5024 + } + }, + { + "ph": "s", "id": 192, "pid": 5714, "tid": 5714, "ts": 6303771482861.814, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771482896.834, "dur": 52.210, + "args": { + "External id": 148982,"Sequence number": 3058791, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771482897.694, "dur": 6.910, + "args": { + "External id": 148983,"Sequence number": 3058791, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5026 + } + }, + { + "ph": "s", "id": 191, "pid": 5714, "tid": 5714, "ts": 6303771482897.694, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771482899.714, "dur": 3.500, + "args": { + "External id": 148984,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482901.634, "dur": 1.260, + "args": { + "External id": 148985,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", 
"pid": 5714, "tid": 5714, + "ts": 6303771482905.344, "dur": 43.390, + "args": { + "External id": 148986,"Sequence number": 3058792, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482907.604, "dur": 4.370, + "args": { + "External id": 148987,"Sequence number": 3058792, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482908.314, "dur": 3.460, + "args": { + "External id": 148988,"Sequence number": 3058792, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5031 + } + }, + { + "ph": "s", "id": 190, "pid": 5714, "tid": 5714, "ts": 6303771482908.314, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771482912.644, "dur": 30.260, + "args": { + "External id": 148989,"Sequence number": 3058793, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5032 + } + }, + { + "ph": "s", "id": 189, "pid": 5714, "tid": 5714, "ts": 6303771482912.644, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771482945.444, "dur": 
2.500, + "args": { + "External id": 148990,"Sequence number": 3058794, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5033 + } + }, + { + "ph": "s", "id": 188, "pid": 5714, "tid": 5714, "ts": 6303771482945.444, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771482959.674, "dur": 56.110, + "args": { + "External id": 148991,"Sequence number": 3058795, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771482961.404, "dur": 15.760, + "args": { + "External id": 148992,"Sequence number": 3058795, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5035 + } + }, + { + "ph": "s", "id": 187, "pid": 5714, "tid": 5714, "ts": 6303771482961.404, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771482972.744, "dur": 3.370, + "args": { + "External id": 148993,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771482974.944, "dur": 0.880, + "args": { + "External id": 148994,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771482977.954, "dur": 37.539, + "args": { + "External id": 148995,"Sequence number": 3058796, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771482979.014, "dur": 3.430, + "args": { + "External id": 148996,"Sequence number": 3058796, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771482980.654, "dur": 1.510, + "args": { + "External id": 148997,"Sequence number": 3058796, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5040 + } + }, + { + "ph": "s", "id": 186, "pid": 5714, "tid": 5714, "ts": 6303771482980.654, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771482982.984, "dur": 26.180, + "args": { + "External id": 148998,"Sequence number": 3058797, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5041 + } + }, + { + "ph": "s", "id": 185, 
"pid": 5714, "tid": 5714, "ts": 6303771482982.984, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771483011.293, "dur": 3.491, + "args": { + "External id": 148999,"Sequence number": 3058798, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5042 + } + }, + { + "ph": "s", "id": 184, "pid": 5714, "tid": 5714, "ts": 6303771483011.293, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5714, "tid": 5714, + "ts": 6303771483039.344, "dur": 126.249, + "args": { + "External id": 149000,"Sequence number": 3058799, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5043 + } + }, + { + "ph": "s", "id": 183, "pid": 5714, "tid": 5714, "ts": 6303771483039.344, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771483066.613, "dur": 7.280, + "args": { + "External id": 149001,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771483103.673, "dur": 47.450, + "args": { + "External id": 149002,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 
1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771483104.573, "dur": 6.260, + "args": { + "External id": 149003,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771483105.993, "dur": 3.720, + "args": { + "External id": 149004,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483107.963, "dur": 1.220, + "args": { + "External id": 149005,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771483111.633, "dur": 39.080, + "args": { + "External id": 149006,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483114.263, "dur": 2.770, + "args": { + "External id": 149007,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5050 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483114.953, "dur": 1.900, + "args": { + "External id": 149008,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771483117.583, "dur": 28.800, + "args": { + "External id": 149009,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771483148.603, "dur": 1.140, + "args": { + "External id": 149010,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771483171.633, "dur": 19.470, + "args": { + "External id": 149011,"Sequence number": 3058800, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5054 + } + }, + { + "ph": "s", "id": 182, "pid": 5714, "tid": 5714, "ts": 6303771483171.633, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771483216.653, "dur": 166.350, + "args": { + "External id": 149012,"Sequence number": 3058801, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input 
type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771483218.093, "dur": 29.120, + "args": { + "External id": 149013,"Sequence number": 3058801, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771483220.293, "dur": 26.590, + "args": { + "External id": 149014,"Sequence number": 3058801, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5057 + } + }, + { + "ph": "s", "id": 181, "pid": 5714, "tid": 5714, "ts": 6303771483220.293, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483223.813, "dur": 6.620, + "args": { + "External id": 149015,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771483231.493, "dur": 13.910, + "args": { + "External id": 149016,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", 
"c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771483248.333, "dur": 16.550, + "args": { + "External id": 149017,"Sequence number": 3058802, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5060 + } + }, + { + "ph": "s", "id": 180, "pid": 5714, "tid": 5714, "ts": 6303771483248.333, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771483250.543, "dur": 0.380, + "args": { + "External id": 149018,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771483251.563, "dur": 0.160, + "args": { + "External id": 149019,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771483266.443, "dur": 16.670, + "args": { + "External id": 149020,"Sequence number": 3058803, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5063 + } + }, + { + "ph": "s", "id": 179, "pid": 5714, "tid": 5714, "ts": 6303771483266.443, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771483284.413, "dur": 25.400, + "args": { + "External id": 149021,"Sequence number": 3058804, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5064 + } + }, + { + "ph": "s", "id": 178, "pid": 5714, "tid": 5714, "ts": 6303771483284.413, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771483290.123, "dur": 17.590, + "args": { + "External id": 149022,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771483311.013, "dur": 16.600, + "args": { + "External id": 149023,"Sequence number": 3058805, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5066 + } + }, + { + "ph": "s", "id": 177, "pid": 5714, "tid": 5714, "ts": 6303771483311.013, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771483330.713, "dur": 14.710, + "args": { + "External id": 149024,"Sequence number": 3058806, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5067 + } + }, + { + "ph": "s", "id": 176, "pid": 5714, "tid": 5714, "ts": 6303771483330.713, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771483346.463, "dur": 21.050, + "args": { + "External id": 149025,"Sequence number": 3058807, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771483348.403, "dur": 18.880, + "args": { + "External id": 149026,"Sequence number": 3058807, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771483349.203, "dur": 17.860, + "args": { + "External id": 149027,"Sequence number": 3058807, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5070 + } + }, + { + "ph": "s", "id": 175, "pid": 5714, "tid": 5714, "ts": 6303771483349.203, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483352.083, "dur": 3.820, + "args": { + "External id": 149028,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], 
[], []], "Ev Idx": 5071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771483356.763, "dur": 9.450, + "args": { + "External id": 149029,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771483369.393, "dur": 13.170, + "args": { + "External id": 149030,"Sequence number": 3058808, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5073 + } + }, + { + "ph": "s", "id": 174, "pid": 5714, "tid": 5714, "ts": 6303771483369.393, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771483405.503, "dur": 54.469, + "args": { + "External id": 149031,"Sequence number": 3058809, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771483406.383, "dur": 7.800, + "args": { + "External id": 149032,"Sequence number": 3058809, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5075 + } + }, + { + "ph": "s", "id": 173, "pid": 5714, "tid": 5714, "ts": 6303771483406.383, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 
5714, + "ts": 6303771483409.513, "dur": 3.250, + "args": { + "External id": 149033,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483411.203, "dur": 1.240, + "args": { + "External id": 149034,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771483414.933, "dur": 44.699, + "args": { + "External id": 149035,"Sequence number": 3058810, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483416.263, "dur": 5.270, + "args": { + "External id": 149036,"Sequence number": 3058810, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483416.953, "dur": 4.360, + "args": { + "External id": 149037,"Sequence number": 3058810, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5080 + } + }, + { + "ph": 
"s", "id": 172, "pid": 5714, "tid": 5714, "ts": 6303771483416.953, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771483422.173, "dur": 30.619, + "args": { + "External id": 149038,"Sequence number": 3058811, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5081 + } + }, + { + "ph": "s", "id": 171, "pid": 5714, "tid": 5714, "ts": 6303771483422.173, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771483455.372, "dur": 3.440, + "args": { + "External id": 149039,"Sequence number": 3058812, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5082 + } + }, + { + "ph": "s", "id": 170, "pid": 5714, "tid": 5714, "ts": 6303771483455.372, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771483470.403, "dur": 44.369, + "args": { + "External id": 149040,"Sequence number": 3058813, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771483471.012, "dur": 5.720, + "args": { + "External id": 149041,"Sequence number": 3058813, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5084 + } + }, + 
{ + "ph": "s", "id": 169, "pid": 5714, "tid": 5714, "ts": 6303771483471.012, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771483472.503, "dur": 3.280, + "args": { + "External id": 149042,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483474.743, "dur": 0.760, + "args": { + "External id": 149043,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771483477.472, "dur": 37.040, + "args": { + "External id": 149044,"Sequence number": 3058814, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483478.443, "dur": 2.400, + "args": { + "External id": 149045,"Sequence number": 3058814, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483479.072, "dur": 1.600, + "args": { + "External id": 149046,"Sequence number": 3058814, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5089 + } + }, + { + "ph": "s", "id": 168, "pid": 5714, "tid": 5714, "ts": 6303771483479.072, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771483482.723, "dur": 26.629, + "args": { + "External id": 149047,"Sequence number": 3058815, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5090 + } + }, + { + "ph": "s", "id": 167, "pid": 5714, "tid": 5714, "ts": 6303771483482.723, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771483511.223, "dur": 2.560, + "args": { + "External id": 149048,"Sequence number": 3058816, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5091 + } + }, + { + "ph": "s", "id": 166, "pid": 5714, "tid": 5714, "ts": 6303771483511.223, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771483524.472, "dur": 42.270, + "args": { + "External id": 149049,"Sequence number": 3058817, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771483525.063, "dur": 5.389, + "args": { + "External id": 149050,"Sequence 
number": 3058817, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5093 + } + }, + { + "ph": "s", "id": 165, "pid": 5714, "tid": 5714, "ts": 6303771483525.063, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771483526.303, "dur": 3.229, + "args": { + "External id": 149051,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483528.403, "dur": 0.849, + "args": { + "External id": 149052,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771483531.183, "dur": 35.289, + "args": { + "External id": 149053,"Sequence number": 3058818, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483533.143, "dur": 2.660, + "args": { + "External id": 149054,"Sequence number": 3058818, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5097 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483533.763, "dur": 1.889, + "args": { + "External id": 149055,"Sequence number": 3058818, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5098 + } + }, + { + "ph": "s", "id": 164, "pid": 5714, "tid": 5714, "ts": 6303771483533.763, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771483536.403, "dur": 26.059, + "args": { + "External id": 149056,"Sequence number": 3058819, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5099 + } + }, + { + "ph": "s", "id": 163, "pid": 5714, "tid": 5714, "ts": 6303771483536.403, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771483564.182, "dur": 1.650, + "args": { + "External id": 149057,"Sequence number": 3058820, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5100 + } + }, + { + "ph": "s", "id": 162, "pid": 5714, "tid": 5714, "ts": 6303771483564.182, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483579.902, "dur": 3.560, + "args": { + "External id": 149058,"Sequence number": 3058821, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 
5101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483581.082, "dur": 2.170, + "args": { + "External id": 149059,"Sequence number": 3058821, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5102 + } + }, + { + "ph": "s", "id": 161, "pid": 5714, "tid": 5714, "ts": 6303771483581.082, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483590.342, "dur": 4.140, + "args": { + "External id": 149060,"Sequence number": 3058822, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483590.912, "dur": 3.340, + "args": { + "External id": 149061,"Sequence number": 3058822, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5104 + } + }, + { + "ph": "s", "id": 160, "pid": 5714, "tid": 5714, "ts": 6303771483590.912, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771483598.972, "dur": 1.710, + "args": { + "External id": 149062,"Sequence number": 3058823, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5105 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771483599.442, "dur": 1.080, + "args": { + "External id": 149063,"Sequence number": 3058823, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5106 + } + }, + { + "ph": "s", "id": 159, "pid": 5714, "tid": 5714, "ts": 6303771483599.442, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771483625.602, "dur": 124.200, + "args": { + "External id": 149064,"Sequence number": 3058824, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5107 + } + }, + { + "ph": "s", "id": 158, "pid": 5714, "tid": 5714, "ts": 6303771483625.602, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771483641.042, "dur": 10.330, + "args": { + "External id": 149065,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483643.322, "dur": 7.420, + "args": { + "External id": 149066,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771483764.752, "dur": 113.880, + "args": { + "External id": 149067,"Sequence number": 3058825, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5110 + } + }, + { + "ph": "s", "id": 157, "pid": 5714, "tid": 5714, "ts": 6303771483764.752, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771483778.472, "dur": 10.130, + "args": { + "External id": 149068,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483780.722, "dur": 7.300, + "args": { + "External id": 149069,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6303771483902.702, "dur": 143.649, + "args": { + "External id": 149070,"Sequence number": 
3058826, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5113 + } + }, + { + "ph": "s", "id": 156, "pid": 5714, "tid": 5714, "ts": 6303771483902.702, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771483920.702, "dur": 102.629, + "args": { + "External id": 149071,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771483951.742, "dur": 12.009, + "args": { + "External id": 149072,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771483954.651, "dur": 8.200, + "args": { + "External id": 149073,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 
12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771483965.422, "dur": 5.360, + "args": { + "External id": 149074,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771483971.671, "dur": 2.280, + "args": { + "External id": 149075,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771483976.111, "dur": 3.931, + "args": { + "External id": 149076,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6303771484032.921, "dur": 3.440, + "args": { + "External id": 149077,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484051.691, "dur": 5.370, + "args": { + "External id": 149078,"Sequence number": 3058827, "Fwd thread 
id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484052.951, "dur": 3.820, + "args": { + "External id": 149079,"Sequence number": 3058827, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5122 + } + }, + { + "ph": "s", "id": 155, "pid": 5714, "tid": 5714, "ts": 6303771484052.951, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484069.371, "dur": 54.040, + "args": { + "External id": 149080,"Sequence number": 3058828, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484070.131, "dur": 5.430, + "args": { + "External id": 149081,"Sequence number": 3058828, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5124 + } + }, + { + "ph": "s", "id": 154, "pid": 5714, "tid": 5714, "ts": 6303771484070.131, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484071.861, "dur": 2.800, + "args": { + "External id": 149082,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input 
Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484073.171, "dur": 1.130, + "args": { + "External id": 149083,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484076.421, "dur": 46.650, + "args": { + "External id": 149084,"Sequence number": 3058829, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484079.091, "dur": 2.050, + "args": { + "External id": 149085,"Sequence number": 3058829, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484079.601, "dur": 1.360, + "args": { + "External id": 149086,"Sequence number": 3058829, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5129 + } + }, + { + "ph": "s", "id": 153, "pid": 5714, "tid": 5714, "ts": 6303771484079.601, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 
6303771484081.841, "dur": 35.950, + "args": { + "External id": 149087,"Sequence number": 3058830, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5130 + } + }, + { + "ph": "s", "id": 152, "pid": 5714, "tid": 5714, "ts": 6303771484081.841, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484119.871, "dur": 2.360, + "args": { + "External id": 149088,"Sequence number": 3058831, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5131 + } + }, + { + "ph": "s", "id": 151, "pid": 5714, "tid": 5714, "ts": 6303771484119.871, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771484133.731, "dur": 19.010, + "args": { + "External id": 149089,"Sequence number": 3058832, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5132 + } + }, + { + "ph": "s", "id": 150, "pid": 5714, "tid": 5714, "ts": 6303771484133.731, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771484170.111, "dur": 170.219, + "args": { + "External id": 149090,"Sequence number": 3058833, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], 
"Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771484173.961, "dur": 26.730, + "args": { + "External id": 149091,"Sequence number": 3058833, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771484175.061, "dur": 25.340, + "args": { + "External id": 149092,"Sequence number": 3058833, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5135 + } + }, + { + "ph": "s", "id": 149, "pid": 5714, "tid": 5714, "ts": 6303771484175.061, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484178.581, "dur": 5.660, + "args": { + "External id": 149093,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771484185.211, "dur": 13.900, + "args": { + "External id": 149094,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], 
[8, 2048, 768], []], "Ev Idx": 5137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771484201.821, "dur": 19.120, + "args": { + "External id": 149095,"Sequence number": 3058834, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5138 + } + }, + { + "ph": "s", "id": 148, "pid": 5714, "tid": 5714, "ts": 6303771484201.821, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771484204.311, "dur": 0.390, + "args": { + "External id": 149096,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771484206.561, "dur": 0.150, + "args": { + "External id": 149097,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771484222.441, "dur": 18.290, + "args": { + "External id": 149098,"Sequence number": 3058835, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5141 + } + }, + { + "ph": "s", "id": 147, "pid": 5714, "tid": 5714, "ts": 6303771484222.441, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 
6303771484242.121, "dur": 16.270, + "args": { + "External id": 149099,"Sequence number": 3058836, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5142 + } + }, + { + "ph": "s", "id": 146, "pid": 5714, "tid": 5714, "ts": 6303771484242.121, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771484246.341, "dur": 10.200, + "args": { + "External id": 149100,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771484259.461, "dur": 13.240, + "args": { + "External id": 149101,"Sequence number": 3058837, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5144 + } + }, + { + "ph": "s", "id": 145, "pid": 5714, "tid": 5714, "ts": 6303771484259.461, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771484275.741, "dur": 15.290, + "args": { + "External id": 149102,"Sequence number": 3058838, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5145 + } + }, + { + "ph": "s", "id": 144, "pid": 5714, "tid": 5714, "ts": 6303771484275.741, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771484292.111, "dur": 30.800, + "args": 
{ + "External id": 149103,"Sequence number": 3058839, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771484293.261, "dur": 29.410, + "args": { + "External id": 149104,"Sequence number": 3058839, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771484294.071, "dur": 28.290, + "args": { + "External id": 149105,"Sequence number": 3058839, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5148 + } + }, + { + "ph": "s", "id": 143, "pid": 5714, "tid": 5714, "ts": 6303771484294.071, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484305.101, "dur": 4.560, + "args": { + "External id": 149106,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 
6303771484310.591, "dur": 10.700, + "args": { + "External id": 149107,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771484324.941, "dur": 14.940, + "args": { + "External id": 149108,"Sequence number": 3058840, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5151 + } + }, + { + "ph": "s", "id": 142, "pid": 5714, "tid": 5714, "ts": 6303771484324.941, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484358.821, "dur": 53.380, + "args": { + "External id": 149109,"Sequence number": 3058841, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484360.770, "dur": 6.700, + "args": { + "External id": 149110,"Sequence number": 3058841, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5153 + } + }, + { + "ph": "s", "id": 141, "pid": 5714, "tid": 5714, "ts": 6303771484360.770, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484362.621, "dur": 3.489, + "args": { + "External id": 149111,"Record function id": 0, "Concrete 
Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484364.381, "dur": 1.320, + "args": { + "External id": 149112,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484368.221, "dur": 43.669, + "args": { + "External id": 149113,"Sequence number": 3058842, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484370.161, "dur": 4.520, + "args": { + "External id": 149114,"Sequence number": 3058842, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484371.950, "dur": 2.560, + "args": { + "External id": 149115,"Sequence number": 3058842, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5158 + } + }, + { + "ph": "s", "id": 140, "pid": 5714, "tid": 5714, "ts": 6303771484371.950, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771484375.301, "dur": 31.560, + "args": { + "External id": 149116,"Sequence number": 3058843, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5159 + } + }, + { + "ph": "s", "id": 139, "pid": 5714, "tid": 5714, "ts": 6303771484375.301, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484408.861, "dur": 2.240, + "args": { + "External id": 149117,"Sequence number": 3058844, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5160 + } + }, + { + "ph": "s", "id": 138, "pid": 5714, "tid": 5714, "ts": 6303771484408.861, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484422.481, "dur": 42.829, + "args": { + "External id": 149118,"Sequence number": 3058845, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484423.061, "dur": 5.660, + "args": { + "External id": 149119,"Sequence number": 3058845, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5162 + } + }, + { + "ph": "s", "id": 137, "pid": 5714, "tid": 5714, "ts": 6303771484423.061, + "cat": "fwdbwd", "name": 
"fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484425.261, "dur": 2.600, + "args": { + "External id": 149120,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484426.630, "dur": 0.840, + "args": { + "External id": 149121,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484429.481, "dur": 35.549, + "args": { + "External id": 149122,"Sequence number": 3058846, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484430.581, "dur": 3.269, + "args": { + "External id": 149123,"Sequence number": 3058846, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484432.201, "dur": 1.489, + "args": { + "External id": 149124,"Sequence number": 3058846, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5167 + } + }, + { + "ph": "s", "id": 136, "pid": 5714, "tid": 5714, "ts": 6303771484432.201, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771484434.421, "dur": 25.349, + "args": { + "External id": 149125,"Sequence number": 3058847, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5168 + } + }, + { + "ph": "s", "id": 135, "pid": 5714, "tid": 5714, "ts": 6303771484434.421, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484461.930, "dur": 2.390, + "args": { + "External id": 149126,"Sequence number": 3058848, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5169 + } + }, + { + "ph": "s", "id": 134, "pid": 5714, "tid": 5714, "ts": 6303771484461.930, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5714, "tid": 5714, + "ts": 6303771484486.880, "dur": 116.780, + "args": { + "External id": 149127,"Sequence number": 3058849, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5170 + } + }, + { + "ph": "s", "id": 133, "pid": 5714, "tid": 5714, "ts": 6303771484486.880, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 
5714, + "ts": 6303771484510.690, "dur": 7.620, + "args": { + "External id": 149128,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484543.040, "dur": 47.910, + "args": { + "External id": 149129,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484543.890, "dur": 7.510, + "args": { + "External id": 149130,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484546.770, "dur": 3.480, + "args": { + "External id": 149131,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484548.710, "dur": 1.170, + "args": { + "External id": 149132,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484552.280, "dur": 38.280, + "args": { + 
"External id": 149133,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484553.780, "dur": 3.700, + "args": { + "External id": 149134,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484555.400, "dur": 1.900, + "args": { + "External id": 149135,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771484558.140, "dur": 28.480, + "args": { + "External id": 149136,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484588.430, "dur": 1.120, + "args": { + "External id": 149137,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771484609.290, "dur": 19.540, + "args": { + "External id": 149138,"Sequence number": 3058850, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": 
["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5181 + } + }, + { + "ph": "s", "id": 132, "pid": 5714, "tid": 5714, "ts": 6303771484609.290, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771484653.460, "dur": 155.509, + "args": { + "External id": 149139,"Sequence number": 3058851, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771484654.950, "dur": 29.090, + "args": { + "External id": 149140,"Sequence number": 3058851, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771484656.110, "dur": 27.650, + "args": { + "External id": 149141,"Sequence number": 3058851, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5184 + } + }, + { + "ph": "s", "id": 131, "pid": 5714, "tid": 5714, "ts": 6303771484656.110, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", 
"pid": 5714, "tid": 5714, + "ts": 6303771484661.220, "dur": 5.810, + "args": { + "External id": 149142,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771484668.040, "dur": 14.280, + "args": { + "External id": 149143,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771484685.210, "dur": 17.550, + "args": { + "External id": 149144,"Sequence number": 3058852, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5187 + } + }, + { + "ph": "s", "id": 130, "pid": 5714, "tid": 5714, "ts": 6303771484685.210, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771484687.760, "dur": 0.430, + "args": { + "External id": 149145,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771484688.880, "dur": 0.160, + "args": { + "External id": 149146,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": 
[[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771484704.230, "dur": 17.090, + "args": { + "External id": 149147,"Sequence number": 3058853, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5190 + } + }, + { + "ph": "s", "id": 129, "pid": 5714, "tid": 5714, "ts": 6303771484704.230, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771484722.640, "dur": 15.870, + "args": { + "External id": 149148,"Sequence number": 3058854, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5191 + } + }, + { + "ph": "s", "id": 128, "pid": 5714, "tid": 5714, "ts": 6303771484722.640, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771484727.200, "dur": 9.540, + "args": { + "External id": 149149,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771484739.500, "dur": 14.080, + "args": { + "External id": 149150,"Sequence number": 3058855, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5193 + } + }, + { + "ph": "s", "id": 127, 
"pid": 5714, "tid": 5714, "ts": 6303771484739.500, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771484756.480, "dur": 13.890, + "args": { + "External id": 149151,"Sequence number": 3058856, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5194 + } + }, + { + "ph": "s", "id": 126, "pid": 5714, "tid": 5714, "ts": 6303771484756.480, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771484771.470, "dur": 22.219, + "args": { + "External id": 149152,"Sequence number": 3058857, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771484772.530, "dur": 20.959, + "args": { + "External id": 149153,"Sequence number": 3058857, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771484774.470, "dur": 18.799, + "args": { + "External id": 149154,"Sequence number": 3058857, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], 
[], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5197 + } + }, + { + "ph": "s", "id": 125, "pid": 5714, "tid": 5714, "ts": 6303771484774.470, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484778.330, "dur": 3.750, + "args": { + "External id": 149155,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771484783.010, "dur": 9.370, + "args": { + "External id": 149156,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771484795.509, "dur": 13.040, + "args": { + "External id": 149157,"Sequence number": 3058858, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5200 + } + }, + { + "ph": "s", "id": 124, "pid": 5714, "tid": 5714, "ts": 6303771484795.509, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484829.960, "dur": 53.809, + "args": { + "External id": 149158,"Sequence number": 3058859, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], 
"Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484830.769, "dur": 8.291, + "args": { + "External id": 149159,"Sequence number": 3058859, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5202 + } + }, + { + "ph": "s", "id": 123, "pid": 5714, "tid": 5714, "ts": 6303771484830.769, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484832.749, "dur": 4.920, + "args": { + "External id": 149160,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484836.009, "dur": 1.360, + "args": { + "External id": 149161,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484839.929, "dur": 43.491, + "args": { + "External id": 149162,"Sequence number": 3058860, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484841.240, "dur": 3.960, + "args": { + "External id": 149163,"Sequence number": 3058860, "Fwd thread id": 0, "Record 
function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484841.920, "dur": 3.109, + "args": { + "External id": 149164,"Sequence number": 3058860, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5207 + } + }, + { + "ph": "s", "id": 122, "pid": 5714, "tid": 5714, "ts": 6303771484841.920, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771484846.809, "dur": 31.371, + "args": { + "External id": 149165,"Sequence number": 3058861, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5208 + } + }, + { + "ph": "s", "id": 121, "pid": 5714, "tid": 5714, "ts": 6303771484846.809, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484880.360, "dur": 2.189, + "args": { + "External id": 149166,"Sequence number": 3058862, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5209 + } + }, + { + "ph": "s", "id": 120, "pid": 5714, "tid": 5714, "ts": 6303771484880.360, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484894.399, "dur": 44.500, + "args": { + "External id": 149167,"Sequence 
number": 3058863, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484894.999, "dur": 5.570, + "args": { + "External id": 149168,"Sequence number": 3058863, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5211 + } + }, + { + "ph": "s", "id": 119, "pid": 5714, "tid": 5714, "ts": 6303771484894.999, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484896.249, "dur": 3.430, + "args": { + "External id": 149169,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484898.689, "dur": 0.710, + "args": { + "External id": 149170,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484901.279, "dur": 37.370, + "args": { + "External id": 149171,"Sequence number": 3058864, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5214 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484902.259, "dur": 2.250, + "args": { + "External id": 149172,"Sequence number": 3058864, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484902.889, "dur": 1.450, + "args": { + "External id": 149173,"Sequence number": 3058864, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5216 + } + }, + { + "ph": "s", "id": 118, "pid": 5714, "tid": 5714, "ts": 6303771484902.889, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771484905.079, "dur": 28.220, + "args": { + "External id": 149174,"Sequence number": 3058865, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5217 + } + }, + { + "ph": "s", "id": 117, "pid": 5714, "tid": 5714, "ts": 6303771484905.079, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484935.049, "dur": 2.910, + "args": { + "External id": 149175,"Sequence number": 3058866, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5218 + } + }, + { + "ph": "s", "id": 116, "pid": 5714, "tid": 5714, "ts": 6303771484935.049, + "cat": 
"fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771484948.509, "dur": 41.930, + "args": { + "External id": 149176,"Sequence number": 3058867, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771484949.059, "dur": 5.040, + "args": { + "External id": 149177,"Sequence number": 3058867, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5220 + } + }, + { + "ph": "s", "id": 115, "pid": 5714, "tid": 5714, "ts": 6303771484949.059, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771484950.169, "dur": 3.030, + "args": { + "External id": 149178,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771484951.239, "dur": 1.710, + "args": { + "External id": 149179,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771484956.019, "dur": 34.140, + "args": { + "External id": 149180,"Sequence number": 3058868, "Fwd thread id": 0, "Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771484956.959, "dur": 2.730, + "args": { + "External id": 149181,"Sequence number": 3058868, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771484958.139, "dur": 1.390, + "args": { + "External id": 149182,"Sequence number": 3058868, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5225 + } + }, + { + "ph": "s", "id": 114, "pid": 5714, "tid": 5714, "ts": 6303771484958.139, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771484960.249, "dur": 24.210, + "args": { + "External id": 149183,"Sequence number": 3058869, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5226 + } + }, + { + "ph": "s", "id": 113, "pid": 5714, "tid": 5714, "ts": 6303771484960.249, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771484986.019, "dur": 3.500, + "args": { + "External id": 149184,"Sequence number": 3058870, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5227 + } + }, + { + "ph": "s", "id": 112, "pid": 5714, "tid": 5714, "ts": 6303771484986.019, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485003.289, "dur": 3.130, + "args": { + "External id": 149185,"Sequence number": 3058871, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5228 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485003.959, "dur": 2.170, + "args": { + "External id": 149186,"Sequence number": 3058871, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5229 + } + }, + { + "ph": "s", "id": 111, "pid": 5714, "tid": 5714, "ts": 6303771485003.959, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485013.779, "dur": 2.290, + "args": { + "External id": 149187,"Sequence number": 3058872, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485014.269, "dur": 1.620, + "args": { + "External id": 149188,"Sequence number": 3058872, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5231 + } + }, + { + "ph": "s", "id": 110, "pid": 5714, "tid": 5714, "ts": 6303771485014.269, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485021.329, "dur": 3.160, + "args": { + "External id": 149189,"Sequence number": 3058873, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485023.059, "dur": 1.270, + "args": { + "External id": 149190,"Sequence number": 3058873, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5233 + } + }, + { + "ph": "s", "id": 109, "pid": 5714, "tid": 5714, "ts": 6303771485023.059, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771485049.129, "dur": 122.440, + "args": { + "External id": 149191,"Sequence number": 3058874, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5234 + } + }, + { + "ph": "s", "id": 108, "pid": 5714, "tid": 5714, "ts": 6303771485049.129, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, 
+ "ts": 6303771485064.489, "dur": 10.390, + "args": { + "External id": 149192,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485066.649, "dur": 7.610, + "args": { + "External id": 149193,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771485185.889, "dur": 119.870, + "args": { + "External id": 149194,"Sequence number": 3058875, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5237 + } + }, + { + "ph": "s", "id": 107, "pid": 5714, "tid": 5714, "ts": 6303771485185.889, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771485199.339, "dur": 11.780, + "args": { + "External id": 149195,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5238 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485201.779, "dur": 8.650, + "args": { + "External id": 149196,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6303771485331.328, "dur": 144.020, + "args": { + "External id": 149197,"Sequence number": 3058876, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5240 + } + }, + { + "ph": "s", "id": 106, "pid": 5714, "tid": 5714, "ts": 6303771485331.328, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 5714, "tid": 5714, + "ts": 6303771485348.338, "dur": 103.370, + "args": { + "External id": 149198,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5241 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771485382.318, "dur": 10.890, + "args": { + "External id": 149199,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485384.608, "dur": 7.670, + "args": { + "External id": 149200,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771485395.108, "dur": 4.160, + "args": { + "External id": 149201,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771485400.188, "dur": 2.110, + "args": { + "External id": 149202,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771485404.518, "dur": 3.690, + "args": { + "External id": 149203,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6303771485461.198, "dur": 3.510, + "args": { + "External id": 149204,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485480.378, "dur": 4.890, + "args": { + "External id": 149205,"Sequence number": 3058877, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485481.468, "dur": 3.540, + "args": { + "External id": 149206,"Sequence number": 3058877, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5249 + } + }, + { + "ph": "s", "id": 105, "pid": 5714, "tid": 5714, "ts": 6303771485481.468, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771485496.358, "dur": 55.640, + "args": { + "External id": 149207,"Sequence number": 3058878, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771485497.148, "dur": 6.790, + 
"args": { + "External id": 149208,"Sequence number": 3058878, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5251 + } + }, + { + "ph": "s", "id": 104, "pid": 5714, "tid": 5714, "ts": 6303771485497.148, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771485500.228, "dur": 2.830, + "args": { + "External id": 149209,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485501.598, "dur": 1.100, + "args": { + "External id": 149210,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771485504.828, "dur": 46.820, + "args": { + "External id": 149211,"Sequence number": 3058879, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485506.068, "dur": 2.970, + "args": { + "External id": 149212,"Sequence number": 3058879, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 
5255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485507.458, "dur": 1.430, + "args": { + "External id": 149213,"Sequence number": 3058879, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5256 + } + }, + { + "ph": "s", "id": 103, "pid": 5714, "tid": 5714, "ts": 6303771485507.458, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771485509.738, "dur": 35.170, + "args": { + "External id": 149214,"Sequence number": 3058880, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5257 + } + }, + { + "ph": "s", "id": 102, "pid": 5714, "tid": 5714, "ts": 6303771485509.738, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771485547.508, "dur": 3.270, + "args": { + "External id": 149215,"Sequence number": 3058881, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5258 + } + }, + { + "ph": "s", "id": 101, "pid": 5714, "tid": 5714, "ts": 6303771485547.508, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771485562.148, "dur": 19.290, + "args": { + "External id": 149216,"Sequence number": 3058882, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], 
[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5259 + } + }, + { + "ph": "s", "id": 100, "pid": 5714, "tid": 5714, "ts": 6303771485562.148, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771485597.608, "dur": 158.039, + "args": { + "External id": 149217,"Sequence number": 3058883, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771485600.218, "dur": 29.490, + "args": { + "External id": 149218,"Sequence number": 3058883, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771485603.238, "dur": 26.180, + "args": { + "External id": 149219,"Sequence number": 3058883, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5262 + } + }, + { + "ph": "s", "id": 99, "pid": 5714, "tid": 5714, "ts": 6303771485603.238, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485606.828, "dur": 6.520, + "args": { + "External id": 149220,"Record 
function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771485614.328, "dur": 13.790, + "args": { + "External id": 149221,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771485630.968, "dur": 16.910, + "args": { + "External id": 149222,"Sequence number": 3058884, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5265 + } + }, + { + "ph": "s", "id": 98, "pid": 5714, "tid": 5714, "ts": 6303771485630.968, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771485633.338, "dur": 0.370, + "args": { + "External id": 149223,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771485634.468, "dur": 0.160, + "args": { + "External id": 149224,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5267 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771485649.318, "dur": 18.720, + "args": { + "External id": 149225,"Sequence number": 3058885, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5268 + } + }, + { + "ph": "s", "id": 97, "pid": 5714, "tid": 5714, "ts": 6303771485649.318, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771485669.308, "dur": 15.840, + "args": { + "External id": 149226,"Sequence number": 3058886, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5269 + } + }, + { + "ph": "s", "id": 96, "pid": 5714, "tid": 5714, "ts": 6303771485669.308, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771485673.198, "dur": 10.120, + "args": { + "External id": 149227,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771485686.108, "dur": 13.410, + "args": { + "External id": 149228,"Sequence number": 3058887, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5271 + } + }, + { + "ph": "s", "id": 95, "pid": 5714, "tid": 5714, "ts": 6303771485686.108, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771485702.538, "dur": 14.009, + "args": { + "External id": 149229,"Sequence number": 3058888, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5272 + } + }, + { + "ph": "s", "id": 94, "pid": 5714, "tid": 5714, "ts": 6303771485702.538, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771485718.758, "dur": 21.349, + "args": { + "External id": 149230,"Sequence number": 3058889, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771485719.907, "dur": 19.980, + "args": { + "External id": 149231,"Sequence number": 3058889, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771485720.698, "dur": 18.980, + "args": { + "External id": 149232,"Sequence number": 3058889, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5275 + } + }, + { + "ph": "s", "id": 
93, "pid": 5714, "tid": 5714, "ts": 6303771485720.698, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485724.367, "dur": 3.940, + "args": { + "External id": 149233,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771485729.158, "dur": 9.589, + "args": { + "External id": 149234,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771485741.947, "dur": 13.251, + "args": { + "External id": 149235,"Sequence number": 3058890, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5278 + } + }, + { + "ph": "s", "id": 92, "pid": 5714, "tid": 5714, "ts": 6303771485741.947, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771485773.858, "dur": 54.389, + "args": { + "External id": 149236,"Sequence number": 3058891, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771485774.598, "dur": 7.349, + "args": { + "External id": 149237,"Sequence number": 3058891, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5280 + } + }, + { + "ph": "s", "id": 91, "pid": 5714, "tid": 5714, "ts": 6303771485774.598, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771485777.347, "dur": 3.240, + "args": { + "External id": 149238,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485778.967, "dur": 1.300, + "args": { + "External id": 149239,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771485782.687, "dur": 45.230, + "args": { + "External id": 149240,"Sequence number": 3058892, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485783.958, "dur": 4.240, + "args": { + "External id": 149241,"Sequence number": 3058892, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input 
Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485785.758, "dur": 2.260, + "args": { + "External id": 149242,"Sequence number": 3058892, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5285 + } + }, + { + "ph": "s", "id": 90, "pid": 5714, "tid": 5714, "ts": 6303771485785.758, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771485789.777, "dur": 32.500, + "args": { + "External id": 149243,"Sequence number": 3058893, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5286 + } + }, + { + "ph": "s", "id": 89, "pid": 5714, "tid": 5714, "ts": 6303771485789.777, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771485824.467, "dur": 2.560, + "args": { + "External id": 149244,"Sequence number": 3058894, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5287 + } + }, + { + "ph": "s", "id": 88, "pid": 5714, "tid": 5714, "ts": 6303771485824.467, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771485838.277, "dur": 42.840, + "args": { + "External id": 149245,"Sequence number": 3058895, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": 
["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771485839.017, "dur": 5.300, + "args": { + "External id": 149246,"Sequence number": 3058895, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5289 + } + }, + { + "ph": "s", "id": 87, "pid": 5714, "tid": 5714, "ts": 6303771485839.017, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771485841.327, "dur": 2.170, + "args": { + "External id": 149247,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485842.367, "dur": 0.790, + "args": { + "External id": 149248,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771485845.057, "dur": 35.790, + "args": { + "External id": 149249,"Sequence number": 3058896, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485846.587, "dur": 
3.320, + "args": { + "External id": 149250,"Sequence number": 3058896, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485847.267, "dur": 2.490, + "args": { + "External id": 149251,"Sequence number": 3058896, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5294 + } + }, + { + "ph": "s", "id": 86, "pid": 5714, "tid": 5714, "ts": 6303771485847.267, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771485850.477, "dur": 25.590, + "args": { + "External id": 149252,"Sequence number": 3058897, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5295 + } + }, + { + "ph": "s", "id": 85, "pid": 5714, "tid": 5714, "ts": 6303771485850.477, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771485877.837, "dur": 2.270, + "args": { + "External id": 149253,"Sequence number": 3058898, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5296 + } + }, + { + "ph": "s", "id": 84, "pid": 5714, "tid": 5714, "ts": 6303771485877.837, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 
5714, "tid": 5714, + "ts": 6303771485903.477, "dur": 114.540, + "args": { + "External id": 149254,"Sequence number": 3058899, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5297 + } + }, + { + "ph": "s", "id": 83, "pid": 5714, "tid": 5714, "ts": 6303771485903.477, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771485926.327, "dur": 6.600, + "args": { + "External id": 149255,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771485957.287, "dur": 48.010, + "args": { + "External id": 149256,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771485958.177, "dur": 7.030, + "args": { + "External id": 149257,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771485960.877, "dur": 3.070, + "args": { + "External id": 149258,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": 
[[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771485962.397, "dur": 1.170, + "args": { + "External id": 149259,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771485966.057, "dur": 38.870, + "args": { + "External id": 149260,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771485967.527, "dur": 3.880, + "args": { + "External id": 149261,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771485969.497, "dur": 1.750, + "args": { + "External id": 149262,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771485972.017, "dur": 28.910, + "args": { + "External id": 149263,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev 
Idx": 5306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771486002.837, "dur": 1.070, + "args": { + "External id": 149264,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771486023.697, "dur": 20.230, + "args": { + "External id": 149265,"Sequence number": 3058900, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5308 + } + }, + { + "ph": "s", "id": 82, "pid": 5714, "tid": 5714, "ts": 6303771486023.697, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771486067.577, "dur": 155.760, + "args": { + "External id": 149266,"Sequence number": 3058901, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771486068.997, "dur": 28.100, + "args": { + "External id": 149267,"Sequence number": 3058901, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771486070.187, "dur": 26.620, + "args": { + "External id": 149268,"Sequence number": 3058901, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5311 + } + }, + { + "ph": "s", "id": 81, "pid": 5714, "tid": 5714, "ts": 6303771486070.187, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486073.507, "dur": 5.990, + "args": { + "External id": 149269,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771486081.617, "dur": 13.900, + "args": { + "External id": 149270,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771486098.227, "dur": 17.350, + "args": { + "External id": 149271,"Sequence number": 3058902, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5314 + } + }, + { + "ph": "s", "id": 80, "pid": 5714, "tid": 5714, "ts": 6303771486098.227, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771486100.627, "dur": 0.390, + "args": { + "External id": 149272,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771486101.707, "dur": 0.160, + "args": { + "External id": 149273,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771486117.507, "dur": 16.960, + "args": { + "External id": 149274,"Sequence number": 3058903, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5317 + } + }, + { + "ph": "s", "id": 79, "pid": 5714, "tid": 5714, "ts": 6303771486117.507, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771486135.837, "dur": 15.120, + "args": { + "External id": 149275,"Sequence number": 3058904, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5318 + } + }, + { + "ph": "s", "id": 78, "pid": 5714, "tid": 5714, "ts": 6303771486135.837, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771486139.526, "dur": 9.671, 
+ "args": { + "External id": 149276,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771486153.126, "dur": 14.251, + "args": { + "External id": 149277,"Sequence number": 3058905, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5320 + } + }, + { + "ph": "s", "id": 77, "pid": 5714, "tid": 5714, "ts": 6303771486153.126, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771486170.117, "dur": 13.689, + "args": { + "External id": 149278,"Sequence number": 3058906, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5321 + } + }, + { + "ph": "s", "id": 76, "pid": 5714, "tid": 5714, "ts": 6303771486170.117, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771486184.866, "dur": 23.920, + "args": { + "External id": 149279,"Sequence number": 3058907, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771486185.986, "dur": 22.520, + "args": { + "External id": 149280,"Sequence number": 3058907, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", 
""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771486186.766, "dur": 21.560, + "args": { + "External id": 149281,"Sequence number": 3058907, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5324 + } + }, + { + "ph": "s", "id": 75, "pid": 5714, "tid": 5714, "ts": 6303771486186.766, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486193.266, "dur": 3.831, + "args": { + "External id": 149282,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771486197.997, "dur": 9.229, + "args": { + "External id": 149283,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771486210.466, "dur": 12.420, + "args": { + "External id": 149284,"Sequence number": 3058908, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5327 + } + }, + { + "ph": "s", "id": 74, "pid": 5714, "tid": 5714, "ts": 6303771486210.466, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771486244.546, "dur": 60.170, + "args": { + "External id": 149285,"Sequence number": 3058909, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771486245.416, "dur": 8.440, + "args": { + "External id": 149286,"Sequence number": 3058909, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5329 + } + }, + { + "ph": "s", "id": 73, "pid": 5714, "tid": 5714, "ts": 6303771486245.416, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771486247.476, "dur": 4.960, + "args": { + "External id": 149287,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486250.796, "dur": 1.340, + "args": { + "External id": 149288,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5331 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771486254.606, "dur": 49.740, + "args": { + "External id": 149289,"Sequence number": 3058910, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486255.906, "dur": 3.060, + "args": { + "External id": 149290,"Sequence number": 3058910, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486256.546, "dur": 2.210, + "args": { + "External id": 149291,"Sequence number": 3058910, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5334 + } + }, + { + "ph": "s", "id": 72, "pid": 5714, "tid": 5714, "ts": 6303771486256.546, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771486259.566, "dur": 31.520, + "args": { + "External id": 149292,"Sequence number": 3058911, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5335 + } + }, + { + "ph": "s", "id": 71, "pid": 5714, "tid": 5714, "ts": 6303771486259.566, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 
5714, "tid": 5714, + "ts": 6303771486293.006, "dur": 3.160, + "args": { + "External id": 149293,"Sequence number": 3058912, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5336 + } + }, + { + "ph": "s", "id": 70, "pid": 5714, "tid": 5714, "ts": 6303771486293.006, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771486316.146, "dur": 46.370, + "args": { + "External id": 149294,"Sequence number": 3058913, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771486316.746, "dur": 5.380, + "args": { + "External id": 149295,"Sequence number": 3058913, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5338 + } + }, + { + "ph": "s", "id": 69, "pid": 5714, "tid": 5714, "ts": 6303771486316.746, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771486318.226, "dur": 2.910, + "args": { + "External id": 149296,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486319.926, "dur": 0.890, + "args": { + "External id": 149297,"Record function id": 0, "Concrete Inputs": ["", "[768, 
768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771486323.876, "dur": 38.330, + "args": { + "External id": 149298,"Sequence number": 3058914, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486324.956, "dur": 2.470, + "args": { + "External id": 149299,"Sequence number": 3058914, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486325.616, "dur": 1.660, + "args": { + "External id": 149300,"Sequence number": 3058914, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5343 + } + }, + { + "ph": "s", "id": 68, "pid": 5714, "tid": 5714, "ts": 6303771486325.616, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771486327.966, "dur": 27.970, + "args": { + "External id": 149301,"Sequence number": 3058915, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5344 + } 
+ }, + { + "ph": "s", "id": 67, "pid": 5714, "tid": 5714, "ts": 6303771486327.966, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771486357.966, "dur": 3.520, + "args": { + "External id": 149302,"Sequence number": 3058916, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5345 + } + }, + { + "ph": "s", "id": 66, "pid": 5714, "tid": 5714, "ts": 6303771486357.966, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771486372.476, "dur": 42.260, + "args": { + "External id": 149303,"Sequence number": 3058917, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771486373.156, "dur": 4.010, + "args": { + "External id": 149304,"Sequence number": 3058917, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5347 + } + }, + { + "ph": "s", "id": 65, "pid": 5714, "tid": 5714, "ts": 6303771486373.156, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771486374.406, "dur": 1.930, + "args": { + "External id": 149305,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5348 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486375.386, "dur": 0.710, + "args": { + "External id": 149306,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771486377.886, "dur": 36.600, + "args": { + "External id": 149307,"Sequence number": 3058918, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486379.726, "dur": 3.130, + "args": { + "External id": 149308,"Sequence number": 3058918, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486380.336, "dur": 2.340, + "args": { + "External id": 149309,"Sequence number": 3058918, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5352 + } + }, + { + "ph": "s", "id": 64, "pid": 5714, "tid": 5714, "ts": 6303771486380.336, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771486383.416, "dur": 25.840, + "args": { + "External id": 149310,"Sequence number": 3058919, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5353 + } + }, + { + "ph": "s", "id": 63, "pid": 5714, "tid": 5714, "ts": 6303771486383.416, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771486410.986, "dur": 2.860, + "args": { + "External id": 149311,"Sequence number": 3058920, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5354 + } + }, + { + "ph": "s", "id": 62, "pid": 5714, "tid": 5714, "ts": 6303771486410.986, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486429.006, "dur": 3.070, + "args": { + "External id": 149312,"Sequence number": 3058921, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486429.726, "dur": 2.170, + "args": { + "External id": 149313,"Sequence number": 3058921, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5356 + } + }, + { + "ph": "s", "id": 61, "pid": 5714, "tid": 5714, "ts": 6303771486429.726, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486438.556, "dur": 3.220, + "args": { + "External id": 
149314,"Sequence number": 3058922, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486440.196, "dur": 1.420, + "args": { + "External id": 149315,"Sequence number": 3058922, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5358 + } + }, + { + "ph": "s", "id": 60, "pid": 5714, "tid": 5714, "ts": 6303771486440.196, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486446.456, "dur": 2.600, + "args": { + "External id": 149316,"Sequence number": 3058923, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486447.856, "dur": 1.050, + "args": { + "External id": 149317,"Sequence number": 3058923, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 12, 64]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5360 + } + }, + { + "ph": "s", "id": 59, "pid": 5714, "tid": 5714, "ts": 6303771486447.856, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771486473.616, "dur": 132.680, + "args": { + "External id": 149318,"Sequence number": 3058924, 
"Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], [], [], [], [], []], "Ev Idx": 5361 + } + }, + { + "ph": "s", "id": 58, "pid": 5714, "tid": 5714, "ts": 6303771486473.616, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771486489.026, "dur": 9.140, + "args": { + "External id": 149319,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486490.756, "dur": 6.720, + "args": { + "External id": 149320,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "RotaryEmbeddingFunction", "pid": 5714, "tid": 5714, + "ts": 6303771486620.685, "dur": 107.550, + "args": { + "External id": 149321,"Sequence number": 3058925, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "False", "False", "0", "", "4096"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [32, 1], [32, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [4096, 32], [4096, 32], 
[], [], [], [], []], "Ev Idx": 5364 + } + }, + { + "ph": "s", "id": 57, "pid": 5714, "tid": 5714, "ts": 6303771486620.685, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771486633.885, "dur": 9.940, + "args": { + "External id": 149322,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486635.996, "dur": 7.220, + "args": { + "External id": 149323,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FlashAttnFunc", "pid": 5714, "tid": 5714, + "ts": 6303771486753.105, "dur": 143.530, + "args": { + "External id": 149324,"Sequence number": 3058926, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "", "True", "", "0.", "", "False", "False", "True"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "", "Scalar", "", "Scalar", "", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], [], []], "Ev Idx": 5367 + } + }, + { + "ph": "s", "id": 56, "pid": 5714, "tid": 5714, "ts": 6303771486753.105, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "flash_attn::_flash_attn_forward", "pid": 
5714, "tid": 5714, + "ts": 6303771486769.755, "dur": 104.370, + "args": { + "External id": 149325,"Record function id": 0, "Concrete Inputs": ["", "", "", "0.", "0.125", "True", "-1", "-1", "0.", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[1572864, 768, 64, 1], [1572864, 768, 64, 1], [1572864, 768, 64, 1], [], [], [], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [8, 2048, 12, 64], [8, 2048, 12, 64], [], [], [], [], [], [], [], []], "Ev Idx": 5368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771486801.695, "dur": 12.760, + "args": { + "External id": 149326,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", ""], "Input type": ["c10::BFloat16", "", "", "", "", ""], "Input Strides": [[1572864, 768, 64, 1], [], [], [], [], []], "Input Dims": [[8, 2048, 12, 64], [], [], [], [], []], "Ev Idx": 5369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486805.115, "dur": 8.340, + "args": { + "External id": 149327,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 12, 64]", "[1572864, 768, 64, 1]", "15", "0", "", ""], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771486817.495, "dur": 4.390, + "args": { + "External id": 149328,"Record function id": 0, "Concrete Inputs": ["[8, 12, 2048]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 
6303771486823.175, "dur": 2.320, + "args": { + "External id": 149329,"Record function id": 0, "Concrete Inputs": ["[0]", "15", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771486827.775, "dur": 3.790, + "args": { + "External id": 149330,"Record function id": 0, "Concrete Inputs": ["[2]", "4", "", "", "", ""], "Input type": ["ScalarList", "Scalar", "", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::alias", "pid": 5714, "tid": 5714, + "ts": 6303771486883.575, "dur": 3.410, + "args": { + "External id": 149331,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[1572864, 768, 64, 1]], "Input Dims": [[8, 2048, 12, 64]], "Ev Idx": 5374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486901.485, "dur": 4.940, + "args": { + "External id": 149332,"Sequence number": 3058927, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486902.325, "dur": 3.860, + "args": { + "External id": 149333,"Sequence number": 3058927, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, -1]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 64, 1], []], "Input Dims": [[8, 2048, 12, 64], []], "Ev Idx": 5376 + } + }, + { + "ph": "s", "id": 55, "pid": 5714, "tid": 5714, "ts": 6303771486902.325, + 
"cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771486917.445, "dur": 55.470, + "args": { + "External id": 149334,"Sequence number": 3058928, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [768, 768], []], "Ev Idx": 5377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771486919.095, "dur": 5.730, + "args": { + "External id": 149335,"Sequence number": 3058928, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[768, 768]], "Ev Idx": 5378 + } + }, + { + "ph": "s", "id": 54, "pid": 5714, "tid": 5714, "ts": 6303771486919.095, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771486920.825, "dur": 3.120, + "args": { + "External id": 149336,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[768, 768], [], []], "Ev Idx": 5379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771486922.525, "dur": 1.080, + "args": { + "External id": 149337,"Record function id": 0, "Concrete Inputs": ["", "[768, 768]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[768, 768], [], [], []], "Ev Idx": 5380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771486925.695, "dur": 46.880, + "args": { + "External id": 149338,"Sequence number": 3058929, "Fwd thread id": 0, "Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 768]], "Ev Idx": 5381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771486926.925, "dur": 3.430, + "args": { + "External id": 149339,"Sequence number": 3058929, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771486928.605, "dur": 1.580, + "args": { + "External id": 149340,"Sequence number": 3058929, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5383 + } + }, + { + "ph": "s", "id": 53, "pid": 5714, "tid": 5714, "ts": 6303771486928.605, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771486931.115, "dur": 34.690, + "args": { + "External id": 149341,"Sequence number": 3058930, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 768]], "Ev Idx": 5384 + } + }, + { + "ph": "s", "id": 52, "pid": 5714, "tid": 5714, "ts": 6303771486931.115, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771486968.305, "dur": 3.380, + "args": { + "External id": 149342,"Sequence number": 3058931, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": 
["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5385 + } + }, + { + "ph": "s", "id": 51, "pid": 5714, "tid": 5714, "ts": 6303771486968.305, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771486983.095, "dur": 19.980, + "args": { + "External id": 149343,"Sequence number": 3058932, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5386 + } + }, + { + "ph": "s", "id": 50, "pid": 5714, "tid": 5714, "ts": 6303771486983.095, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rms_norm", "pid": 5714, "tid": 5714, + "ts": 6303771487020.685, "dur": 155.959, + "args": { + "External id": 149344,"Sequence number": 3058933, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[768]", "", "9.9999999999999995e-07"], "Input type": ["c10::BFloat16", "ScalarList", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [], [1], []], "Input Dims": [[8, 2048, 768], [], [768], []], "Ev Idx": 5387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771487022.155, "dur": 26.049, + "args": { + "External id": 149345,"Sequence number": 3058933, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771487023.375, "dur": 24.560, + "args": { + "External id": 149346,"Sequence number": 3058933, "Fwd 
thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "6", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5389 + } + }, + { + "ph": "s", "id": 49, "pid": 5714, "tid": 5714, "ts": 6303771487023.375, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487026.525, "dur": 5.540, + "args": { + "External id": 149347,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771487033.015, "dur": 13.640, + "args": { + "External id": 149348,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771487049.335, "dur": 19.060, + "args": { + "External id": 149349,"Sequence number": 3058934, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5392 + } + }, + { + "ph": "s", "id": 48, "pid": 5714, "tid": 5714, "ts": 6303771487049.335, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771487051.815, "dur": 0.389, + "args": { + "External id": 149350,"Record function 
id": 0, "Concrete Inputs": ["", "2"], "Input type": ["float", "Scalar"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771487052.875, "dur": 0.169, + "args": { + "External id": 149351,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mean", "pid": 5714, "tid": 5714, + "ts": 6303771487071.075, "dur": 17.229, + "args": { + "External id": 149352,"Sequence number": 3058935, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[2]", "True", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5395 + } + }, + { + "ph": "s", "id": 47, "pid": 5714, "tid": 5714, "ts": 6303771487071.075, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771487089.644, "dur": 15.680, + "args": { + "External id": 149353,"Sequence number": 3058936, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "9.9837779998779297e-07", "1"], "Input type": ["float", "Scalar", "Scalar"], "Input Strides": [[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5396 + } + }, + { + "ph": "s", "id": 46, "pid": 5714, "tid": 5714, "ts": 6303771487089.644, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771487094.075, "dur": 9.500, + "args": { + "External id": 149354,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": 
[[2048, 1, 1], [], []], "Input Dims": [[8, 2048, 1], [], []], "Ev Idx": 5397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::rsqrt", "pid": 5714, "tid": 5714, + "ts": 6303771487106.335, "dur": 14.160, + "args": { + "External id": 149355,"Sequence number": 3058937, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[2048, 1, 1]], "Input Dims": [[8, 2048, 1]], "Ev Idx": 5398 + } + }, + { + "ph": "s", "id": 45, "pid": 5714, "tid": 5714, "ts": 6303771487106.335, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771487123.644, "dur": 15.780, + "args": { + "External id": 149356,"Sequence number": 3058938, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1572864, 768, 1], [2048, 1, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 1]], "Ev Idx": 5399 + } + }, + { + "ph": "s", "id": 44, "pid": 5714, "tid": 5714, "ts": 6303771487123.644, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::type_as", "pid": 5714, "tid": 5714, + "ts": 6303771487140.554, "dur": 19.860, + "args": { + "External id": 149357,"Sequence number": 3058939, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1]], "Input Dims": [[8, 2048, 768], [8, 2048, 768]], "Ev Idx": 5400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771487141.664, "dur": 18.510, + "args": { + "External id": 149358,"Sequence number": 3058939, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], [], []], "Input 
Dims": [[8, 2048, 768], [], [], [], [], [], [], []], "Ev Idx": 5401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771487142.464, "dur": 17.490, + "args": { + "External id": 149359,"Sequence number": 3058939, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", ""], "Input Strides": [[1572864, 768, 1], [], [], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], [], [], []], "Ev Idx": 5402 + } + }, + { + "ph": "s", "id": 43, "pid": 5714, "tid": 5714, "ts": 6303771487142.464, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487144.974, "dur": 3.900, + "args": { + "External id": 149360,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 768]", "[1572864, 768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771487149.764, "dur": 9.230, + "args": { + "External id": 149361,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771487162.254, "dur": 13.960, + "args": { + "External id": 149362,"Sequence number": 3058940, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1]], "Input Dims": [[8, 2048, 768], [768]], "Ev Idx": 5405 + } + }, + { + 
"ph": "s", "id": 42, "pid": 5714, "tid": 5714, "ts": 6303771487162.254, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771487195.174, "dur": 52.400, + "args": { + "External id": 149363,"Sequence number": 3058941, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771487197.094, "dur": 6.480, + "args": { + "External id": 149364,"Sequence number": 3058941, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5407 + } + }, + { + "ph": "s", "id": 41, "pid": 5714, "tid": 5714, "ts": 6303771487197.094, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771487199.004, "dur": 3.200, + "args": { + "External id": 149365,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487200.584, "dur": 1.310, + "args": { + "External id": 149366,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[2048, 768], [], [], []], "Ev Idx": 5409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771487204.314, "dur": 42.930, + "args": { + "External id": 
149367,"Sequence number": 3058942, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771487205.634, "dur": 4.650, + "args": { + "External id": 149368,"Sequence number": 3058942, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771487207.824, "dur": 2.300, + "args": { + "External id": 149369,"Sequence number": 3058942, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5412 + } + }, + { + "ph": "s", "id": 40, "pid": 5714, "tid": 5714, "ts": 6303771487207.824, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771487210.894, "dur": 31.290, + "args": { + "External id": 149370,"Sequence number": 3058943, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5413 + } + }, + { + "ph": "s", "id": 39, "pid": 5714, "tid": 5714, "ts": 6303771487210.894, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771487244.244, "dur": 2.180, + "args": { + "External id": 149371,"Sequence number": 3058944, "Fwd thread id": 0, "Record 
function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5414 + } + }, + { + "ph": "s", "id": 38, "pid": 5714, "tid": 5714, "ts": 6303771487244.244, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771487257.904, "dur": 52.060, + "args": { + "External id": 149372,"Sequence number": 3058945, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[1572864, 768, 1], [768, 1], []], "Input Dims": [[8, 2048, 768], [2048, 768], []], "Ev Idx": 5415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771487258.434, "dur": 5.650, + "args": { + "External id": 149373,"Sequence number": 3058945, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[2048, 768]], "Ev Idx": 5416 + } + }, + { + "ph": "s", "id": 37, "pid": 5714, "tid": 5714, "ts": 6303771487258.434, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771487261.154, "dur": 2.020, + "args": { + "External id": 149374,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[2048, 768], [], []], "Ev Idx": 5417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487262.164, "dur": 0.730, + "args": { + "External id": 149375,"Record function id": 0, "Concrete Inputs": ["", "[768, 2048]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": 
[[2048, 768], [], [], []], "Ev Idx": 5418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771487264.834, "dur": 44.820, + "args": { + "External id": 149376,"Sequence number": 3058946, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1572864, 768, 1], [1, 768]], "Input Dims": [[8, 2048, 768], [768, 2048]], "Ev Idx": 5419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771487265.824, "dur": 4.590, + "args": { + "External id": 149377,"Sequence number": 3058946, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771487267.774, "dur": 2.470, + "args": { + "External id": 149378,"Sequence number": 3058946, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[16384, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5421 + } + }, + { + "ph": "s", "id": 36, "pid": 5714, "tid": 5714, "ts": 6303771487267.774, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771487271.014, "dur": 24.810, + "args": { + "External id": 149379,"Sequence number": 3058947, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[16384, 768], [768, 2048]], "Ev Idx": 5422 + } + }, + { + "ph": "s", "id": 35, "pid": 5714, "tid": 5714, "ts": 6303771487271.014, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771487305.184, "dur": 3.700, + "args": { + "External id": 149380,"Sequence number": 3058948, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[16384, 2048], []], "Ev Idx": 5423 + } + }, + { + "ph": "s", "id": 34, "pid": 5714, "tid": 5714, "ts": 6303771487305.184, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "SwiGLULinearFunction", "pid": 5714, "tid": 5714, + "ts": 6303771487332.394, "dur": 118.280, + "args": { + "External id": 149381,"Sequence number": 3058949, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [8, 2048, 2048], [768, 2048], []], "Ev Idx": 5424 + } + }, + { + "ph": "s", "id": 33, "pid": 5714, "tid": 5714, "ts": 6303771487332.394, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771487356.574, "dur": 8.320, + "args": { + "External id": 149382,"Record function id": 0, "Concrete Inputs": ["[8, 2048, 2048]", "15", "", "", "", "0"], "Input type": ["ScalarList", "Scalar", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771487390.114, "dur": 48.380, + "args": { + "External id": 149383,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[4194304, 2048, 1], [2048, 1], []], "Input Dims": [[8, 2048, 2048], [768, 2048], []], "Ev Idx": 5426 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771487391.124, "dur": 6.060, + "args": { + "External id": 149384,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[2048, 1]], "Input Dims": [[768, 2048]], "Ev Idx": 5427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771487392.744, "dur": 3.170, + "args": { + "External id": 149385,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[2048, 1], [], []], "Input Dims": [[768, 2048], [], []], "Ev Idx": 5428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487394.274, "dur": 1.280, + "args": { + "External id": 149386,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[1, 2048]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[2048, 1], [], [], []], "Input Dims": [[768, 2048], [], [], []], "Ev Idx": 5429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771487398.004, "dur": 40.140, + "args": { + "External id": 149387,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[4194304, 2048, 1], [1, 2048]], "Input Dims": [[8, 2048, 2048], [2048, 768]], "Ev Idx": 5430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reshape", "pid": 5714, "tid": 5714, + "ts": 6303771487399.674, "dur": 3.680, + "args": { + "External id": 149388,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771487401.354, "dur": 1.850, + "args": 
{ + "External id": 149389,"Record function id": 0, "Concrete Inputs": ["", "[16384, 2048]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[4194304, 2048, 1], []], "Input Dims": [[8, 2048, 2048], []], "Ev Idx": 5432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771487403.964, "dur": 30.210, + "args": { + "External id": 149390,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[2048, 1], [1, 2048]], "Input Dims": [[16384, 2048], [2048, 768]], "Ev Idx": 5433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_unsafe_view", "pid": 5714, "tid": 5714, + "ts": 6303771487435.984, "dur": 1.150, + "args": { + "External id": 149391,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771487456.264, "dur": 19.390, + "args": { + "External id": 149392,"Sequence number": 3058950, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[1572864, 768, 1], [1572864, 768, 1], []], "Input Dims": [[8, 2048, 768], [8, 2048, 768], []], "Ev Idx": 5435 + } + }, + { + "ph": "s", "id": 32, "pid": 5714, "tid": 5714, "ts": 6303771487456.264, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5714, "tid": 5714, + "ts": 6303771487488.954, "dur": 32.489, + "args": { + "External id": 149393,"Sequence number": 3058951, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "-2"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[[1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1]], []], "Input Dims": [[[8, 
2048, 768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 768]], []], "Ev Idx": 5436 + } + }, + { + "ph": "s", "id": 31, "pid": 5714, "tid": 5714, "ts": 6303771487488.954, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::cat", "pid": 5714, "tid": 5714, + "ts": 6303771487494.723, "dur": 21.560, + "args": { + "External id": 149394,"Record function id": 0, "Concrete Inputs": ["", "2"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[[1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1], [1572864, 768, 1]], []], "Input Dims": [[[8, 2048, 768], [8, 2048, 768], [8, 2048, 768], [8, 2048, 768]], []], "Ev Idx": 5437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771487518.643, "dur": 1.091, + "args": { + "External id": 149395,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 3072], []], "Ev Idx": 5438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "TorchDynamo Cache Lookup", "pid": 5714, "tid": 5714, + "ts": 6303771487545.143, "dur": 22.311, + "args": { + "External id": 149396,"Record function id": 0, "Ev Idx": 5439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Torch-Compiled Region: 0/1", "pid": 5714, "tid": 5714, + "ts": 6303771487568.423, "dur": 181.410, + "args": { + "External id": 149397,"Record function id": 0, "Ev Idx": 5440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "CompiledFunction", "pid": 5714, "tid": 5714, + "ts": 6303771487632.563, "dur": 105.170, + "args": { + "External id": 149398,"Sequence number": 3058952, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "8", "2048", "4", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "c10::BFloat16"], "Input Strides": [[1], [], [], [], [6291456, 3072, 768, 1]], "Input Dims": [[768], [], [], [], [8, 2048, 4, 768]], "Ev Idx": 
5441 + } + }, + { + "ph": "s", "id": 30, "pid": 5714, "tid": 5714, "ts": 6303771487632.563, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 5714, "tid": 5714, + "ts": 6303771487683.383, "dur": 24.810, + "args": { + "External id": 149399,"kernel_hash": "cwefpfej5pwum5b4hu7een5otcjqe4vo2l2suze5lxgbdcyqp62t", "grid": "grid(65536,)", "Record function id": 0, "stream": 0, "Concrete Inputs": ["", "", "", "", "65536", "768"], "kernel_file": "/tmp/torchinductor_root/we/cwefpfej5pwum5b4hu7een5otcjqe4vo2l2suze5lxgbdcyqp62t.py", "kernel_backend": "triton", "Input type": ["float", "c10::BFloat16", "c10::BFloat16", "c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[8192, 4, 1, 1], [6291456, 3072, 768, 1], [1], [6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 1], [8, 2048, 4, 768], [768], [8, 2048, 4, 768], [], []], "Ev Idx": 5442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771487806.083, "dur": 31.770, + "args": { + "External id": 149400,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771487808.083, "dur": 7.980, + "args": { + "External id": 149401,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771487817.943, "dur": 19.570, + "args": { + "External id": 149402,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], 
"Input Dims": [[1]], "Ev Idx": 5445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771487820.243, "dur": 16.440, + "args": { + "External id": 149403,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771487844.543, "dur": 15.220, + "args": { + "External id": 149404,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771487845.523, "dur": 4.140, + "args": { + "External id": 149405,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771487850.323, "dur": 9.230, + "args": { + "External id": 149406,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771487851.123, "dur": 7.670, + "args": { + "External id": 149407,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771487863.403, "dur": 14.180, + "args": { + "External id": 149408,"Record function id": 0, "Concrete Inputs": 
["[1]", "", "", "", "False"], "Input type": ["ScalarList", "", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771487864.153, "dur": 3.230, + "args": { + "External id": 149409,"Record function id": 0, "Concrete Inputs": ["[1]", "", "", "", "False", ""], "Input type": ["ScalarList", "", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771487868.013, "dur": 9.320, + "args": { + "External id": 149410,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[1]], "Ev Idx": 5453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771487869.863, "dur": 6.760, + "args": { + "External id": 149411,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 5454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771487886.703, "dur": 0.320, + "args": { + "External id": 149412,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "False", "False", ""], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[4096, 1], [], [], [], [], [], [], []], "Input Dims": [[8, 4096], [], [], [], [], [], [], []], "Ev Idx": 5455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unfold", "pid": 5714, "tid": 5714, + "ts": 6303771487892.243, "dur": 6.890, + "args": { + "External id": 149413,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "5", "1"], 
"Input type": ["long int", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 5456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487895.913, "dur": 1.560, + "args": { + "External id": 149414,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 4096], [], [], []], "Ev Idx": 5457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487904.123, "dur": 4.710, + "args": { + "External id": 149415,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487906.923, "dur": 0.570, + "args": { + "External id": 149416,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487910.003, "dur": 1.660, + "args": { + "External id": 149417,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5460 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487910.923, "dur": 0.270, + "args": { + "External id": 149418,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 5]", "[4096, 1, 1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487912.843, "dur": 1.860, + "args": { + "External id": 149419,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "1", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 5], [], [], [], []], "Ev Idx": 5462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487913.983, "dur": 0.250, + "args": { + "External id": 149420,"Record function id": 0, "Concrete Inputs": ["", "[8, 4092, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 5], [], [], []], "Ev Idx": 5463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487918.303, "dur": 2.320, + "args": { + "External id": 149421,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 4], [], [], [], []], "Ev Idx": 5464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487919.213, "dur": 0.890, + "args": { + "External id": 149422,"Record function id": 0, "Concrete 
Inputs": ["", "[8, 4092, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 4], [], [], []], "Ev Idx": 5465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487921.533, "dur": 1.760, + "args": { + "External id": 149423,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4092, 4], [], [], [], []], "Ev Idx": 5466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487922.593, "dur": 0.240, + "args": { + "External id": 149424,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4092, 4], [], [], []], "Ev Idx": 5467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487924.183, "dur": 1.900, + "args": { + "External id": 149425,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 2048, 4], [], [], [], []], "Ev Idx": 5468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487925.373, "dur": 0.290, + "args": { + "External id": 149426,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 2048, 
4], [], [], []], "Ev Idx": 5469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771487929.973, "dur": 5.460, + "args": { + "External id": 149427,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "2"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 2048, 4], [], []], "Ev Idx": 5470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487934.393, "dur": 0.310, + "args": { + "External id": 149428,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 2048, 4], [], [], []], "Ev Idx": 5471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487939.762, "dur": 2.260, + "args": { + "External id": 149429,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 5472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487941.242, "dur": 0.260, + "args": { + "External id": 149430,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771487944.353, "dur": 4.409, + "args": { + "External id": 149431,"Sequence number": 3058953, "Fwd thread 
id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 5474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487947.442, "dur": 0.331, + "args": { + "External id": 149432,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487949.813, "dur": 1.920, + "args": { + "External id": 149433,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 5476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487951.053, "dur": 0.229, + "args": { + "External id": 149434,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 5477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487956.493, "dur": 4.509, + "args": { + "External id": 149435,"Sequence number": 3058953, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], 
[], [], [], []], "Ev Idx": 5478 + } + }, + { + "ph": "s", "id": 29, "pid": 5714, "tid": 5714, "ts": 6303771487956.493, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487958.922, "dur": 0.471, + "args": { + "External id": 149436,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487961.973, "dur": 2.969, + "args": { + "External id": 149437,"Sequence number": 3058954, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5480 + } + }, + { + "ph": "s", "id": 28, "pid": 5714, "tid": 5714, "ts": 6303771487961.973, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487963.333, "dur": 0.969, + "args": { + "External id": 149438,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771487965.882, "dur": 4.180, + "args": { + "External id": 149439,"Sequence number": 3058955, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 5482 + } + }, + { + "ph": "s", "id": 27, "pid": 5714, "tid": 5714, "ts": 6303771487965.882, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487969.082, "dur": 0.320, + "args": { + "External id": 149440,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771487971.022, "dur": 3.491, + "args": { + "External id": 149441,"Sequence number": 3058956, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5484 + } + }, + { + "ph": "s", "id": 26, "pid": 5714, "tid": 5714, "ts": 6303771487971.022, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771487973.342, "dur": 0.431, + "args": { + "External id": 149442,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771487977.982, "dur": 30.900, + "args": { + "External id": 149443,"Sequence number": 3058957, "Fwd 
thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771487979.262, "dur": 29.320, + "args": { + "External id": 149444,"Sequence number": 3058957, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771487980.653, "dur": 8.669, + "args": { + "External id": 149445,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 5488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771487982.062, "dur": 6.780, + "args": { + "External id": 149446,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771487990.253, "dur": 17.789, + "args": { + "External id": 149447,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 5490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771488034.462, "dur": 4.420, + "args": { + "External id": 149448,"Sequence number": 
3058957, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5491 + } + }, + { + "ph": "s", "id": 25, "pid": 5714, "tid": 5714, "ts": 6303771488034.462, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771488041.092, "dur": 0.970, + "args": { + "External id": 149449,"Sequence number": 3058958, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6303771488062.852, "dur": 317054.036, + "args": { + "External id": 149450,"Sequence number": 3058958, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 5493 + } + }, + { + "ph": "s", "id": 24, "pid": 5714, "tid": 5714, "ts": 6303771488062.852, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771488073.812, "dur": 29.860, + "args": { + "External id": 149451,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771488074.592, "dur": 28.840, + "args": { + "External id": 149452,"Record function 
id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771488075.812, "dur": 8.920, + "args": { + "External id": 149453,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771488078.212, "dur": 5.920, + "args": { + "External id": 149454,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771488085.652, "dur": 17.350, + "args": { + "External id": 149455,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 5498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771488118.232, "dur": 24.670, + "args": { + "External id": 149456,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771488119.822, "dur": 8.160, + "args": { + 
"External id": 149457,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771488122.362, "dur": 5.270, + "args": { + "External id": 149458,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771488128.842, "dur": 13.860, + "args": { + "External id": 149459,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771488130.062, "dur": 11.850, + "args": { + "External id": 149460,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771488146.502, "dur": 16.960, + "args": { + "External id": 149461,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771488147.112, "dur": 6.580, + "args": { + 
"External id": 149462,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771488148.542, "dur": 4.790, + "args": { + "External id": 149463,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771488154.232, "dur": 9.020, + "args": { + "External id": 149464,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771488154.992, "dur": 7.510, + "args": { + "External id": 149465,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 5508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771488168.222, "dur": 15.580, + "args": { + "External id": 149466,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771488170.632, "dur": 4.010, + "args": { + "External id": 149467,"Record function id": 0, "Concrete 
Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771488175.252, "dur": 8.310, + "args": { + "External id": 149468,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 5511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771488176.092, "dur": 6.730, + "args": { + "External id": 149469,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6303771488187.872, "dur": 21.150, + "args": { + "External id": 149470,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771488212.642, "dur": 44.410, + "args": { + "External id": 149471,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771488215.642, "dur": 40.940, + "args": { + "External id": 149472,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771488222.242, "dur": 0.970, + "args": { + "External id": 149473,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771488224.152, "dur": 19.400, + "args": { + "External id": 149474,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771488225.182, "dur": 18.130, + "args": { + "External id": 149475,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 5518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771488226.742, "dur": 3.540, + "args": { + "External id": 149476,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771488231.172, "dur": 11.790, + "args": { + "External id": 149477,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 5520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + 
"ts": 6303771488263.032, "dur": 311470.477, + "args": { + "External id": 149478,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6303771488264.662, "dur": 311464.987, + "args": { + "External id": 149479,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771799759.200, "dur": 11.740, + "args": { + "External id": 149480,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771799765.960, "dur": 2.160, + "args": { + "External id": 149481,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771799777.660, "dur": 76.969, + "args": { + "External id": 149482,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771799779.829, "dur": 7.891, + "args": { + "External id": 149483,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771799782.109, "dur": 4.651, + "args": { + "External id": 149484,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771799785.209, "dur": 1.140, + "args": { + "External id": 149485,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771799789.360, "dur": 64.189, + "args": { + "External id": 149486,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771799791.509, "dur": 60.630, + "args": { + "External id": 149487,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771799861.729, "dur": 6.690, + "args": { + "External id": 149488,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": 
[[16384], [], [], [], []], "Ev Idx": 5531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771799865.299, "dur": 1.650, + "args": { + "External id": 149489,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771799879.069, "dur": 3.100, + "args": { + "External id": 149490,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771799894.419, "dur": 16.070, + "args": { + "External id": 149491,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771799897.359, "dur": 12.610, + "args": { + "External id": 149492,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771800041.529, "dur": 250.049, + "args": { + "External id": 149493,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], 
[], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771800045.879, "dur": 5.610, + "args": { + "External id": 149494,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771800054.259, "dur": 236.679, + "args": { + "External id": 149495,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771800057.429, "dur": 0.850, + "args": { + "External id": 149496,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771800060.319, "dur": 34.060, + "args": { + "External id": 149497,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771800097.089, "dur": 4.150, + "args": { + "External id": 149498,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771800099.579, "dur": 1.030, + "args": { + "External id": 149499,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771800102.659, "dur": 33.050, + "args": { + "External id": 149500,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771800105.019, "dur": 2.990, + "args": { + "External id": 149501,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771800110.089, "dur": 25.270, + "args": { + "External id": 149502,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771800117.569, "dur": 4.910, + "args": { + "External id": 149503,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771800137.739, "dur": 23.620, + "args": { + "External id": 149504,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], 
"Ev Idx": 5547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771800176.328, "dur": 18.791, + "args": { + "External id": 149505,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771800199.719, "dur": 16.180, + "args": { + "External id": 149506,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771800218.419, "dur": 13.480, + "args": { + "External id": 149507,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771800235.659, "dur": 28.539, + "args": { + "External id": 149508,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771800238.259, "dur": 2.800, + "args": { + "External id": 149509,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800248.559, "dur": 0.920, + "args": { + "External id": 
149510,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771800267.238, "dur": 12.970, + "args": { + "External id": 149511,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771800281.448, "dur": 8.110, + "args": { + "External id": 149512,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771800311.748, "dur": 3.320, + "args": { + "External id": 149513,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771800324.498, "dur": 4.410, + "args": { + "External id": 149514,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800327.098, "dur": 0.710, + "args": { + "External id": 149515,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], 
[], [], []], "Ev Idx": 5558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771800420.298, "dur": 63.140, + "args": { + "External id": 149516,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771800493.078, "dur": 8.690, + "args": { + "External id": 149517,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800498.608, "dur": 1.220, + "args": { + "External id": 149518,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771800503.418, "dur": 30.920, + "args": { + "External id": 149519,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771800543.288, "dur": 5.660, + "args": { + "External id": 149520,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5563 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771800544.828, "dur": 3.250, + "args": { + "External id": 149521,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800546.508, "dur": 1.210, + "args": { + "External id": 149522,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771800552.808, "dur": 42.010, + "args": { + "External id": 149523,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771800555.408, "dur": 38.450, + "args": { + "External id": 149524,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771800601.638, "dur": 18.520, + "args": { + "External id": 149525,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 
5714, + "ts": 6303771800629.798, "dur": 6.900, + "args": { + "External id": 149526,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800634.118, "dur": 1.049, + "args": { + "External id": 149527,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771800641.007, "dur": 44.791, + "args": { + "External id": 149528,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771800641.838, "dur": 5.980, + "args": { + "External id": 149529,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771800643.078, "dur": 4.149, + "args": { + "External id": 149530,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800645.247, "dur": 1.611, + 
"args": { + "External id": 149531,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771800648.838, "dur": 36.309, + "args": { + "External id": 149532,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771800649.738, "dur": 34.560, + "args": { + "External id": 149533,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771800693.307, "dur": 4.631, + "args": { + "External id": 149534,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800695.918, "dur": 0.580, + "args": { + "External id": 149535,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771800705.427, "dur": 1.990, + "args": { + "External id": 
149536,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771800715.487, "dur": 8.500, + "args": { + "External id": 149537,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771800716.937, "dur": 6.690, + "args": { + "External id": 149538,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771800813.537, "dur": 171.770, + "args": { + "External id": 149539,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771800815.897, "dur": 5.050, + "args": { + "External id": 149540,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771800822.457, "dur": 162.290, + 
"args": { + "External id": 149541,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771800823.737, "dur": 0.220, + "args": { + "External id": 149542,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771800827.717, "dur": 23.960, + "args": { + "External id": 149543,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771800853.487, "dur": 4.860, + "args": { + "External id": 149544,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800855.927, "dur": 1.860, + "args": { + "External id": 149545,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771800859.527, "dur": 22.330, + "args": { + "External id": 149546,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5589 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771800860.617, "dur": 3.830, + "args": { + "External id": 149547,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771800865.587, "dur": 15.930, + "args": { + "External id": 149548,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771800868.577, "dur": 3.300, + "args": { + "External id": 149549,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771800883.087, "dur": 19.410, + "args": { + "External id": 149550,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771800904.407, "dur": 10.120, + "args": { + "External id": 149551,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771800917.417, "dur": 12.230, + "args": { + "External id": 149552,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input 
type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771800930.987, "dur": 8.560, + "args": { + "External id": 149553,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771800942.357, "dur": 21.690, + "args": { + "External id": 149554,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771800944.477, "dur": 2.810, + "args": { + "External id": 149555,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771800949.667, "dur": 0.790, + "args": { + "External id": 149556,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771800965.837, "dur": 8.460, + "args": { + "External id": 149557,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, 
+ "ts": 6303771800975.407, "dur": 7.920, + "args": { + "External id": 149558,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771800994.867, "dur": 3.180, + "args": { + "External id": 149559,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801009.947, "dur": 4.510, + "args": { + "External id": 149560,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801012.737, "dur": 0.660, + "args": { + "External id": 149561,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771801090.577, "dur": 47.189, + "args": { + "External id": 149562,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801146.366, "dur": 7.400, + "args": { + "External id": 149563,"Record function id": 0, "Concrete Inputs": ["", 
"0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801150.737, "dur": 1.129, + "args": { + "External id": 149564,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771801155.156, "dur": 22.990, + "args": { + "External id": 149565,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771801185.966, "dur": 6.790, + "args": { + "External id": 149566,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771801187.466, "dur": 4.490, + "args": { + "External id": 149567,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801190.646, "dur": 0.970, + "args": { + "External id": 149568,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], 
"Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771801195.176, "dur": 35.870, + "args": { + "External id": 149569,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771801196.086, "dur": 33.910, + "args": { + "External id": 149570,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771801236.166, "dur": 16.250, + "args": { + "External id": 149571,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801261.056, "dur": 5.360, + "args": { + "External id": 149572,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801264.056, "dur": 0.930, + "args": { + "External id": 149573,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771801271.526, "dur": 53.220, + "args": { + "External id": 149574,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771801272.386, "dur": 4.520, + "args": { + "External id": 149575,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771801273.656, "dur": 2.700, + "args": { + "External id": 149576,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801275.406, "dur": 0.570, + "args": { + "External id": 149577,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771801277.766, "dur": 46.370, + "args": { + "External id": 149578,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 
768], [768, 32000]], "Ev Idx": 5621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771801278.676, "dur": 44.250, + "args": { + "External id": 149579,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801332.976, "dur": 5.530, + "args": { + "External id": 149580,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801336.116, "dur": 0.970, + "args": { + "External id": 149581,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771801347.426, "dur": 2.030, + "args": { + "External id": 149582,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771801357.376, "dur": 9.580, + "args": { + "External id": 149583,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], 
[], [], [], []], "Ev Idx": 5626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771801359.056, "dur": 7.540, + "args": { + "External id": 149584,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771801454.296, "dur": 166.999, + "args": { + "External id": 149585,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771801457.656, "dur": 4.900, + "args": { + "External id": 149586,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771801464.136, "dur": 156.619, + "args": { + "External id": 149587,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771801466.526, "dur": 0.220, + "args": { + "External id": 149588,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771801467.736, "dur": 23.160, + "args": { + "External id": 149589,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771801492.666, "dur": 4.600, + "args": { + "External id": 149590,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801495.766, "dur": 0.950, + "args": { + "External id": 149591,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771801498.006, "dur": 21.290, + "args": { + "External id": 149592,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771801499.066, "dur": 2.990, + "args": { + "External id": 149593,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771801503.185, "dur": 15.731, + "args": { + "External id": 149594,"Record function id": 0, "Concrete Inputs": ["", ""], "Input 
type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771801506.196, "dur": 3.709, + "args": { + "External id": 149595,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771801520.556, "dur": 17.360, + "args": { + "External id": 149596,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771801539.616, "dur": 10.909, + "args": { + "External id": 149597,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771801554.796, "dur": 11.500, + "args": { + "External id": 149598,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771801567.676, "dur": 8.589, + "args": { + "External id": 149599,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771801577.976, "dur": 20.889, + "args": { + "External id": 
149600,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771801580.085, "dur": 2.700, + "args": { + "External id": 149601,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801586.056, "dur": 1.040, + "args": { + "External id": 149602,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771801600.736, "dur": 8.429, + "args": { + "External id": 149603,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771801611.275, "dur": 7.940, + "args": { + "External id": 149604,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771801630.205, "dur": 3.720, + "args": { + "External id": 149605,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5648 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801644.295, "dur": 4.550, + "args": { + "External id": 149606,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801647.125, "dur": 0.690, + "args": { + "External id": 149607,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771801722.765, "dur": 47.350, + "args": { + "External id": 149608,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801777.805, "dur": 7.230, + "args": { + "External id": 149609,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801782.095, "dur": 1.110, + "args": { + "External id": 149610,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 
768], [], [], []], "Ev Idx": 5653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771801786.245, "dur": 21.760, + "args": { + "External id": 149611,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771801815.665, "dur": 8.630, + "args": { + "External id": 149612,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771801819.895, "dur": 3.600, + "args": { + "External id": 149613,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801822.115, "dur": 0.920, + "args": { + "External id": 149614,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771801826.755, "dur": 35.900, + "args": { + "External id": 149615,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", 
"pid": 5714, "tid": 5714, + "ts": 6303771801827.725, "dur": 34.110, + "args": { + "External id": 149616,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771801867.735, "dur": 15.970, + "args": { + "External id": 149617,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801892.625, "dur": 6.840, + "args": { + "External id": 149618,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801897.065, "dur": 0.960, + "args": { + "External id": 149619,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771801903.435, "dur": 41.360, + "args": { + "External id": 149620,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", 
"pid": 5714, "tid": 5714, + "ts": 6303771801904.355, "dur": 4.050, + "args": { + "External id": 149621,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771801905.585, "dur": 2.280, + "args": { + "External id": 149622,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801907.055, "dur": 0.480, + "args": { + "External id": 149623,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771801909.325, "dur": 34.840, + "args": { + "External id": 149624,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771801910.315, "dur": 32.920, + "args": { + "External id": 149625,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771801952.295, "dur": 5.589, + "args": { + "External id": 149626,"Record function id": 0, "Concrete 
Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771801954.895, "dur": 1.760, + "args": { + "External id": 149627,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771801966.504, "dur": 1.840, + "args": { + "External id": 149628,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771801975.115, "dur": 8.180, + "args": { + "External id": 149629,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771801976.695, "dur": 6.280, + "args": { + "External id": 149630,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771802069.334, "dur": 170.900, + "args": { + 
"External id": 149631,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771802072.944, "dur": 4.920, + "args": { + "External id": 149632,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771802079.404, "dur": 160.300, + "args": { + "External id": 149633,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771802080.724, "dur": 0.210, + "args": { + "External id": 149634,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771802081.924, "dur": 23.740, + "args": { + "External id": 149635,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771802108.934, "dur": 4.800, + "args": { + "External id": 149636,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 
1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802112.224, "dur": 0.950, + "args": { + "External id": 149637,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771802114.574, "dur": 24.920, + "args": { + "External id": 149638,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771802116.924, "dur": 2.930, + "args": { + "External id": 149639,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771802120.934, "dur": 18.170, + "args": { + "External id": 149640,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771802124.004, "dur": 4.690, + "args": { + "External id": 149641,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771802140.834, "dur": 18.900, + "args": { + "External 
id": 149642,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771802161.524, "dur": 10.140, + "args": { + "External id": 149643,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771802174.404, "dur": 12.110, + "args": { + "External id": 149644,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771802187.864, "dur": 8.430, + "args": { + "External id": 149645,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771802198.094, "dur": 21.270, + "args": { + "External id": 149646,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771802201.694, "dur": 2.520, + "args": { + "External id": 149647,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5690 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802207.454, "dur": 0.810, + "args": { + "External id": 149648,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771802221.074, "dur": 8.450, + "args": { + "External id": 149649,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771802230.664, "dur": 7.750, + "args": { + "External id": 149650,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771802249.774, "dur": 3.080, + "args": { + "External id": 149651,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771802263.024, "dur": 4.670, + "args": { + "External id": 149652,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802265.944, "dur": 0.680, + "args": { + "External id": 149653,"Record function id": 0, "Concrete Inputs": 
["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771802350.464, "dur": 48.290, + "args": { + "External id": 149654,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771802407.563, "dur": 9.151, + "args": { + "External id": 149655,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802413.583, "dur": 1.211, + "args": { + "External id": 149656,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771802417.963, "dur": 21.120, + "args": { + "External id": 149657,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771802446.543, "dur": 6.740, + "args": { + "External id": 149658,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771802448.034, "dur": 4.380, + "args": { + "External id": 149659,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802449.983, "dur": 2.040, + "args": { + "External id": 149660,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771802456.694, "dur": 37.120, + "args": { + "External id": 149661,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771802459.403, "dur": 33.371, + "args": { + "External id": 149662,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771802499.134, "dur": 16.409, + "args": { + "External id": 149663,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 
1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771802523.983, "dur": 5.460, + "args": { + "External id": 149664,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802526.993, "dur": 0.920, + "args": { + "External id": 149665,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771802533.493, "dur": 42.780, + "args": { + "External id": 149666,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771802534.333, "dur": 5.130, + "args": { + "External id": 149667,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771802535.503, "dur": 3.430, + "args": { + "External id": 149668,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], 
"Ev Idx": 5711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802538.203, "dur": 0.440, + "args": { + "External id": 149669,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771802540.293, "dur": 35.370, + "args": { + "External id": 149670,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771802541.183, "dur": 33.670, + "args": { + "External id": 149671,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771802583.263, "dur": 4.620, + "args": { + "External id": 149672,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802585.713, "dur": 0.820, + "args": { + "External id": 149673,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5716 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771802595.463, "dur": 1.790, + "args": { + "External id": 149674,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771802604.903, "dur": 10.420, + "args": { + "External id": 149675,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771802607.793, "dur": 7.210, + "args": { + "External id": 149676,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771802698.903, "dur": 169.270, + "args": { + "External id": 149677,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771802701.223, "dur": 4.950, + "args": { + "External id": 149678,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 
5721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771802708.883, "dur": 158.770, + "args": { + "External id": 149679,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771802710.243, "dur": 0.200, + "args": { + "External id": 149680,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771802711.493, "dur": 23.780, + "args": { + "External id": 149681,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771802737.103, "dur": 5.030, + "args": { + "External id": 149682,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802739.613, "dur": 2.030, + "args": { + "External id": 149683,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771802744.323, "dur": 21.500, + "args": { + "External id": 
149684,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771802745.323, "dur": 3.110, + "args": { + "External id": 149685,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771802749.513, "dur": 15.940, + "args": { + "External id": 149686,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771802752.543, "dur": 3.830, + "args": { + "External id": 149687,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771802767.033, "dur": 17.030, + "args": { + "External id": 149688,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771802787.413, "dur": 12.130, + "args": { + "External id": 149689,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 
5714, "tid": 5714, + "ts": 6303771802802.393, "dur": 12.030, + "args": { + "External id": 149690,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771802815.823, "dur": 8.140, + "args": { + "External id": 149691,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771802825.853, "dur": 21.460, + "args": { + "External id": 149692,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771802829.163, "dur": 2.460, + "args": { + "External id": 149693,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802833.873, "dur": 0.870, + "args": { + "External id": 149694,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771802849.013, "dur": 8.320, + "args": { + "External id": 149695,"Record function id": 0, "Concrete Inputs": [""], "Input type": 
["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771802858.442, "dur": 7.860, + "args": { + "External id": 149696,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771802877.153, "dur": 3.100, + "args": { + "External id": 149697,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771802890.342, "dur": 4.660, + "args": { + "External id": 149698,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771802893.313, "dur": 0.660, + "args": { + "External id": 149699,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771802970.042, "dur": 46.660, + "args": { + "External id": 149700,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803026.282, "dur": 7.590, + "args": { + "External id": 149701,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803030.702, "dur": 1.190, + "args": { + "External id": 149702,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771803035.132, "dur": 20.430, + "args": { + "External id": 149703,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771803063.542, "dur": 7.530, + "args": { + "External id": 149704,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771803066.212, "dur": 3.960, + "args": { + "External id": 149705,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
5714, + "ts": 6303771803068.832, "dur": 0.910, + "args": { + "External id": 149706,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771803075.012, "dur": 35.750, + "args": { + "External id": 149707,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771803076.022, "dur": 33.780, + "args": { + "External id": 149708,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771803116.332, "dur": 14.960, + "args": { + "External id": 149709,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803140.262, "dur": 5.630, + "args": { + "External id": 149710,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771803143.482, "dur": 0.940, + "args": { + "External id": 149711,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771803149.942, "dur": 42.270, + "args": { + "External id": 149712,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771803150.722, "dur": 5.880, + "args": { + "External id": 149713,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771803153.502, "dur": 2.490, + "args": { + "External id": 149714,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803155.082, "dur": 0.520, + "args": { + "External id": 149715,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771803157.442, "dur": 34.140, + "args": { + "External id": 
149716,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771803158.322, "dur": 32.370, + "args": { + "External id": 149717,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803198.752, "dur": 4.990, + "args": { + "External id": 149718,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803201.692, "dur": 0.790, + "args": { + "External id": 149719,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771803212.462, "dur": 1.860, + "args": { + "External id": 149720,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771803222.092, "dur": 9.300, + "args": { + "External id": 149721,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", 
"", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771803223.732, "dur": 7.290, + "args": { + "External id": 149722,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771803326.912, "dur": 164.239, + "args": { + "External id": 149723,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771803330.432, "dur": 5.409, + "args": { + "External id": 149724,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771803337.561, "dur": 153.040, + "args": { + "External id": 149725,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771803338.841, "dur": 0.200, + "args": { + "External id": 149726,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771803341.141, "dur": 23.000, + "args": { + "External id": 149727,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771803365.752, "dur": 4.709, + "args": { + "External id": 149728,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803368.881, "dur": 0.971, + "args": { + "External id": 149729,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771803371.221, "dur": 20.780, + "args": { + "External id": 149730,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771803372.161, "dur": 2.851, + "args": { + "External id": 149731,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771803376.032, "dur": 15.660, + "args": { + "External id": 149732,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771803379.201, "dur": 3.431, + "args": { + "External id": 149733,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771803393.192, "dur": 17.019, + "args": { + "External id": 149734,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771803411.911, "dur": 9.840, + "args": { + "External id": 149735,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771803424.531, "dur": 13.720, + "args": { + "External id": 149736,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771803439.671, "dur": 8.640, + "args": { + "External id": 149737,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 
5780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771803451.011, "dur": 19.590, + "args": { + "External id": 149738,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771803454.141, "dur": 2.500, + "args": { + "External id": 149739,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803458.831, "dur": 0.750, + "args": { + "External id": 149740,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771803472.351, "dur": 8.160, + "args": { + "External id": 149741,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771803481.611, "dur": 7.680, + "args": { + "External id": 149742,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771803500.551, "dur": 3.060, + "args": { + "External id": 149743,"Record 
function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5786 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803514.761, "dur": 4.470, + "args": { + "External id": 149744,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803517.531, "dur": 0.630, + "args": { + "External id": 149745,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5788 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771803593.121, "dur": 47.010, + "args": { + "External id": 149746,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803647.851, "dur": 7.080, + "args": { + "External id": 149747,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803651.971, "dur": 1.110, + "args": { + "External id": 149748,"Record function id": 0, "Concrete Inputs": ["", "[2048, 
768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771803656.131, "dur": 21.370, + "args": { + "External id": 149749,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771803685.051, "dur": 8.370, + "args": { + "External id": 149750,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771803686.541, "dur": 5.970, + "args": { + "External id": 149751,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803689.771, "dur": 2.340, + "args": { + "External id": 149752,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771803695.831, "dur": 35.130, + "args": { + "External id": 149753,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771803696.801, "dur": 33.150, + "args": { + "External id": 149754,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771803736.861, "dur": 15.170, + "args": { + "External id": 149755,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803760.331, "dur": 5.609, + "args": { + "External id": 149756,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803763.500, "dur": 0.931, + "args": { + "External id": 149757,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771803771.140, "dur": 42.220, + "args": { + "External id": 149758,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771803771.980, "dur": 4.340, + "args": { + "External id": 149759,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771803773.280, "dur": 2.480, + "args": { + "External id": 149760,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803774.820, "dur": 0.660, + "args": { + "External id": 149761,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771803777.140, "dur": 35.631, + "args": { + "External id": 149762,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771803778.071, "dur": 33.789, + "args": { + "External id": 149763,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5806 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771803821.080, "dur": 6.751, + "args": { + "External id": 149764,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5807 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803824.351, "dur": 0.760, + "args": { + "External id": 149765,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771803845.480, "dur": 1.880, + "args": { + "External id": 149766,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771803854.800, "dur": 8.440, + "args": { + "External id": 149767,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771803856.420, "dur": 6.490, + "args": { + "External id": 149768,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], []], "Ev Idx": 5811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771803947.550, "dur": 172.430, + "args": { + "External id": 149769,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771803949.740, "dur": 4.990, + "args": { + "External id": 149770,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771803957.580, "dur": 161.830, + "args": { + "External id": 149771,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771803960.590, "dur": 0.230, + "args": { + "External id": 149772,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771803961.910, "dur": 22.250, + "args": { + "External id": 149773,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 
6303771803985.850, "dur": 5.050, + "args": { + "External id": 149774,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771803987.990, "dur": 1.760, + "args": { + "External id": 149775,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771803993.150, "dur": 28.610, + "args": { + "External id": 149776,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771803996.550, "dur": 6.430, + "args": { + "External id": 149777,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771804004.330, "dur": 17.060, + "args": { + "External id": 149778,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771804007.260, "dur": 3.670, + "args": { + "External id": 149779,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input 
Dims": [[0], [], []], "Ev Idx": 5822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771804023.040, "dur": 18.540, + "args": { + "External id": 149780,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5823 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771804043.350, "dur": 10.020, + "args": { + "External id": 149781,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771804057.240, "dur": 11.520, + "args": { + "External id": 149782,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771804070.000, "dur": 8.200, + "args": { + "External id": 149783,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771804079.930, "dur": 18.460, + "args": { + "External id": 149784,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771804082.070, "dur": 2.500, + "args": { + "External id": 149785,"Record 
function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804086.700, "dur": 0.820, + "args": { + "External id": 149786,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771804100.040, "dur": 7.970, + "args": { + "External id": 149787,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771804110.150, "dur": 7.870, + "args": { + "External id": 149788,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771804129.720, "dur": 3.100, + "args": { + "External id": 149789,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771804142.870, "dur": 4.490, + "args": { + "External id": 149790,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5833 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804145.660, "dur": 0.630, + "args": { + "External id": 149791,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771804220.659, "dur": 45.920, + "args": { + "External id": 149792,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771804275.110, "dur": 8.249, + "args": { + "External id": 149793,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804280.159, "dur": 1.240, + "args": { + "External id": 149794,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771804284.659, "dur": 32.730, + "args": { + "External id": 149795,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev 
Idx": 5838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771804326.229, "dur": 7.980, + "args": { + "External id": 149796,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771804329.279, "dur": 3.960, + "args": { + "External id": 149797,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804331.579, "dur": 1.180, + "args": { + "External id": 149798,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771804336.789, "dur": 38.090, + "args": { + "External id": 149799,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771804337.789, "dur": 36.090, + "args": { + "External id": 149800,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 
6303771804379.989, "dur": 16.270, + "args": { + "External id": 149801,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771804404.749, "dur": 7.010, + "args": { + "External id": 149802,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804409.369, "dur": 0.950, + "args": { + "External id": 149803,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771804415.759, "dur": 41.350, + "args": { + "External id": 149804,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771804416.629, "dur": 4.030, + "args": { + "External id": 149805,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771804417.879, "dur": 2.260, + 
"args": { + "External id": 149806,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804419.359, "dur": 0.510, + "args": { + "External id": 149807,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5850 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771804421.499, "dur": 34.970, + "args": { + "External id": 149808,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771804422.329, "dur": 33.290, + "args": { + "External id": 149809,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771804465.149, "dur": 6.070, + "args": { + "External id": 149810,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804468.219, "dur": 1.600, + "args": { + "External id": 149811,"Record function 
id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771804478.679, "dur": 1.870, + "args": { + "External id": 149812,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771804487.439, "dur": 7.870, + "args": { + "External id": 149813,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771804489.109, "dur": 5.850, + "args": { + "External id": 149814,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771804581.799, "dur": 165.719, + "args": { + "External id": 149815,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771804584.259, "dur": 6.020, + "args": { + "External id": 
149816,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771804591.989, "dur": 154.940, + "args": { + "External id": 149817,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771804593.279, "dur": 0.200, + "args": { + "External id": 149818,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771804594.629, "dur": 24.550, + "args": { + "External id": 149819,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771804620.969, "dur": 3.650, + "args": { + "External id": 149820,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804623.149, "dur": 0.950, + "args": { + "External id": 149821,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], 
[], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771804625.379, "dur": 23.260, + "args": { + "External id": 149822,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771804626.529, "dur": 2.850, + "args": { + "External id": 149823,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771804630.419, "dur": 17.890, + "args": { + "External id": 149824,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771804635.919, "dur": 3.600, + "args": { + "External id": 149825,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771804649.879, "dur": 16.859, + "args": { + "External id": 149826,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771804668.358, "dur": 10.780, + "args": { + "External id": 149827,"Record function id": 0, "Concrete 
Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771804682.258, "dur": 12.400, + "args": { + "External id": 149828,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771804695.978, "dur": 8.480, + "args": { + "External id": 149829,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771804706.218, "dur": 19.000, + "args": { + "External id": 149830,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771804708.289, "dur": 2.529, + "args": { + "External id": 149831,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804713.289, "dur": 0.829, + "args": { + "External id": 149832,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5875 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771804728.109, "dur": 8.589, + "args": { + "External id": 149833,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771804737.849, "dur": 7.789, + "args": { + "External id": 149834,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771804756.938, "dur": 3.130, + "args": { + "External id": 149835,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771804769.538, "dur": 4.430, + "args": { + "External id": 149836,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804772.298, "dur": 0.640, + "args": { + "External id": 149837,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771804846.498, "dur": 46.620, + "args": { + "External id": 149838,"Record function id": 0, "Concrete 
Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771804901.678, "dur": 8.480, + "args": { + "External id": 149839,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804907.148, "dur": 1.160, + "args": { + "External id": 149840,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771804911.378, "dur": 23.890, + "args": { + "External id": 149841,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771804942.848, "dur": 5.720, + "args": { + "External id": 149842,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771804944.318, "dur": 3.390, + "args": { + "External id": 149843,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": 
["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771804946.368, "dur": 0.960, + "args": { + "External id": 149844,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771804950.908, "dur": 35.080, + "args": { + "External id": 149845,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771804951.868, "dur": 33.120, + "args": { + "External id": 149846,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771804992.348, "dur": 16.090, + "args": { + "External id": 149847,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771805014.178, "dur": 27.450, + "args": { + "External id": 149848,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], 
"Ev Idx": 5891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771805016.768, "dur": 24.330, + "args": { + "External id": 149849,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805023.808, "dur": 0.810, + "args": { + "External id": 149850,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771805048.798, "dur": 27.990, + "args": { + "External id": 149851,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771805050.518, "dur": 26.010, + "args": { + "External id": 149852,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 5895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805054.368, "dur": 6.220, + "args": { + "External id": 149853,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input 
Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771805061.948, "dur": 14.060, + "args": { + "External id": 149854,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771805088.388, "dur": 6.200, + "args": { + "External id": 149855,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771805090.298, "dur": 3.940, + "args": { + "External id": 149856,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771805095.778, "dur": 1.400, + "args": { + "External id": 149857,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771805096.218, "dur": 0.690, + "args": { + "External id": 149858,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771805135.268, "dur": 28.349, + "args": { + "External id": 149859,"Sequence number": 3058959, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 5902 + } + }, + { + "ph": "s", "id": 23, "pid": 5714, "tid": 5714, "ts": 6303771805135.268, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771805172.888, "dur": 7.669, + "args": { + "External id": 149860,"Sequence number": 3058960, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 5903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805177.257, "dur": 1.491, + "args": { + "External id": 149861,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771805183.228, "dur": 5.669, + "args": { + "External id": 149862,"Sequence number": 3058960, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "1"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 5905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805187.268, "dur": 0.409, + "args": { + "External id": 149863,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "2"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], 
"Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 5906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771805190.448, "dur": 2.369, + "args": { + "External id": 149864,"Sequence number": 3058960, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 5907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805191.948, "dur": 0.289, + "args": { + "External id": 149865,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "2"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 5908 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771805198.788, "dur": 5.669, + "args": { + "External id": 149866,"Sequence number": 3058960, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5909 + } + }, + { + "ph": "s", "id": 22, "pid": 5714, "tid": 5714, "ts": 6303771805198.788, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805202.617, "dur": 0.560, + "args": { + "External id": 149867,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 
2048, 4, 768], [], [], []], "Ev Idx": 5910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771805205.597, "dur": 3.220, + "args": { + "External id": 149868,"Sequence number": 3058961, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 5911 + } + }, + { + "ph": "s", "id": 21, "pid": 5714, "tid": 5714, "ts": 6303771805205.597, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805207.677, "dur": 0.310, + "args": { + "External id": 149869,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771805209.837, "dur": 7.210, + "args": { + "External id": 149870,"Sequence number": 3058962, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 5913 + } + }, + { + "ph": "s", "id": 20, "pid": 5714, "tid": 5714, "ts": 6303771805209.837, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805214.967, "dur": 1.270, + "args": { + "External id": 149871,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", 
"ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 5914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771805218.257, "dur": 3.570, + "args": { + "External id": 149872,"Sequence number": 3058963, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 5915 + } + }, + { + "ph": "s", "id": 19, "pid": 5714, "tid": 5714, "ts": 6303771805218.257, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805220.817, "dur": 0.310, + "args": { + "External id": 149873,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "768"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 5916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771805226.327, "dur": 35.430, + "args": { + "External id": 149874,"Sequence number": 3058964, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771805227.447, "dur": 33.940, + "args": { + "External id": 149875,"Sequence number": 3058964, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev 
Idx": 5918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771805229.807, "dur": 10.390, + "args": { + "External id": 149876,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 5919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771805231.577, "dur": 8.030, + "args": { + "External id": 149877,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771805241.267, "dur": 19.510, + "args": { + "External id": 149878,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 5921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771805294.877, "dur": 16.530, + "args": { + "External id": 149879,"Sequence number": 3058964, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 5922 + } + }, + { + "ph": "s", "id": 18, "pid": 5714, "tid": 5714, "ts": 6303771805294.877, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771805314.937, "dur": 1.190, + "args": { + "External id": 149880,"Sequence number": 3058965, "Fwd thread id": 0, 
"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 5923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6303771805343.297, "dur": 21341.793, + "args": { + "External id": 149881,"Sequence number": 3058965, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 5924 + } + }, + { + "ph": "s", "id": 17, "pid": 5714, "tid": 5714, "ts": 6303771805343.297, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771805358.277, "dur": 38.360, + "args": { + "External id": 149882,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771805359.107, "dur": 37.230, + "args": { + "External id": 149883,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771805362.087, "dur": 10.000, + "args": { + "External id": 149884,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], 
[], [], []], "Ev Idx": 5927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771805364.237, "dur": 7.020, + "args": { + "External id": 149885,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771805373.157, "dur": 22.610, + "args": { + "External id": 149886,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 5929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771805413.667, "dur": 30.430, + "args": { + "External id": 149887,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771805415.117, "dur": 10.850, + "args": { + "External id": 149888,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 5931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805418.027, "dur": 7.170, + "args": { + "External id": 149889,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771805427.427, "dur": 16.410, + "args": { + "External id": 149890,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 5933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771805429.397, "dur": 13.330, + "args": { + "External id": 149891,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 5934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771805448.707, "dur": 19.270, + "args": { + "External id": 149892,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771805449.377, "dur": 6.110, + "args": { + "External id": 149893,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 5936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805450.907, "dur": 4.230, + "args": { + "External id": 149894,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": 
["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771805456.117, "dur": 11.600, + "args": { + "External id": 149895,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771805457.047, "dur": 9.890, + "args": { + "External id": 149896,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 5939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771805474.957, "dur": 19.450, + "args": { + "External id": 149897,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 5940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771805476.867, "dur": 6.350, + "args": { + "External id": 149898,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771805483.977, "dur": 10.140, + "args": { + "External id": 149899,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 5942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771805485.207, "dur": 8.100, + "args": { + "External id": 149900,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6303771805500.357, "dur": 21.670, + "args": { + "External id": 149901,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771805526.037, "dur": 50.459, + "args": { + "External id": 149902,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 5945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771805527.957, "dur": 47.930, + "args": { + "External id": 149903,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805534.987, "dur": 0.990, + "args": { + "External id": 149904,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 5947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771805537.267, "dur": 23.720, + "args": { + "External id": 149905,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], 
"Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771805538.457, "dur": 22.279, + "args": { + "External id": 149906,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 5949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771805540.757, "dur": 5.040, + "args": { + "External id": 149907,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771805546.857, "dur": 13.459, + "args": { + "External id": 149908,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 5951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6303771805581.947, "dur": 16166.353, + "args": { + "External id": 149909,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6303771805583.676, "dur": 16162.555, + "args": { + "External id": 149910,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 5953 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771821764.320, "dur": 10.400, + "args": { + "External id": 149911,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771821770.891, "dur": 1.700, + "args": { + "External id": 149912,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771821779.660, "dur": 58.780, + "args": { + "External id": 149913,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 5956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771821780.671, "dur": 5.920, + "args": { + "External id": 149914,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 5957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771821782.360, "dur": 3.431, + "args": { + "External id": 149915,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 5958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771821784.200, "dur": 1.251, + "args": { + "External id": 149916,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 5959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771821787.591, "dur": 49.969, + "args": { + "External id": 149917,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771821790.011, "dur": 46.480, + "args": { + "External id": 149918,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 5961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771821843.780, "dur": 4.790, + "args": { + "External id": 149919,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771821846.580, "dur": 0.700, + "args": { + "External id": 149920,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771821856.030, "dur": 2.140, + 
"args": { + "External id": 149921,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 5964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771821865.430, "dur": 9.160, + "args": { + "External id": 149922,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 5965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771821867.050, "dur": 7.190, + "args": { + "External id": 149923,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771821969.620, "dur": 171.990, + "args": { + "External id": 149924,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771821973.210, "dur": 5.920, + "args": { + "External id": 149925,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 
6303771821980.890, "dur": 160.180, + "args": { + "External id": 149926,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 5969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771821982.160, "dur": 0.240, + "args": { + "External id": 149927,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771821983.440, "dur": 25.460, + "args": { + "External id": 149928,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 5971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771822010.700, "dur": 5.000, + "args": { + "External id": 149929,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 5972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822014.240, "dur": 0.970, + "args": { + "External id": 149930,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 5973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771822016.440, "dur": 22.470, + "args": { + "External id": 149931,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": 
[[2048]], "Ev Idx": 5974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771822018.580, "dur": 2.960, + "args": { + "External id": 149932,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 5975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771822022.520, "dur": 16.030, + "args": { + "External id": 149933,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 5976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771822025.700, "dur": 3.630, + "args": { + "External id": 149934,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771822040.190, "dur": 18.000, + "args": { + "External id": 149935,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771822059.940, "dur": 10.960, + "args": { + "External id": 149936,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771822074.040, "dur": 12.260, + "args": { + "External id": 149937,"Record function id": 0, 
"Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 5980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771822087.690, "dur": 8.790, + "args": { + "External id": 149938,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 5981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771822098.310, "dur": 22.010, + "args": { + "External id": 149939,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 5982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771822101.610, "dur": 2.780, + "args": { + "External id": 149940,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 5983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822107.980, "dur": 0.780, + "args": { + "External id": 149941,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 5984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771822122.170, "dur": 8.420, + "args": { + "External id": 149942,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 5985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771822131.750, "dur": 7.840, + "args": { + "External id": 149943,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 5986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771822150.580, "dur": 3.190, + "args": { + "External id": 149944,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 5987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771822162.210, "dur": 4.540, + "args": { + "External id": 149945,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 5988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822165.060, "dur": 0.630, + "args": { + "External id": 149946,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 5989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771822240.279, "dur": 45.991, + "args": { + "External id": 149947,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 5990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771822294.239, "dur": 17.040, + "args": { + "External id": 149948,"Record 
function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 5991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822307.749, "dur": 1.170, + "args": { + "External id": 149949,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 5992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771822312.769, "dur": 23.920, + "args": { + "External id": 149950,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 5993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771822344.779, "dur": 7.080, + "args": { + "External id": 149951,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 5994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771822346.219, "dur": 4.740, + "args": { + "External id": 149952,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 5995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822349.729, "dur": 0.890, + "args": { + "External id": 149953,"Record function id": 0, "Concrete Inputs": ["", 
"[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 5996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771822354.609, "dur": 36.880, + "args": { + "External id": 149954,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771822356.799, "dur": 33.720, + "args": { + "External id": 149955,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 5998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771822396.779, "dur": 15.080, + "args": { + "External id": 149956,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 5999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771822420.099, "dur": 5.450, + "args": { + "External id": 149957,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822423.259, "dur": 0.880, + "args": { + "External id": 149958,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", 
"[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771822429.779, "dur": 42.930, + "args": { + "External id": 149959,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771822430.599, "dur": 5.340, + "args": { + "External id": 149960,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771822431.839, "dur": 3.430, + "args": { + "External id": 149961,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822434.449, "dur": 0.540, + "args": { + "External id": 149962,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771822436.789, "dur": 35.300, + "args": { + "External id": 149963,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 
1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771822437.829, "dur": 33.480, + "args": { + "External id": 149964,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771822478.849, "dur": 4.650, + "args": { + "External id": 149965,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822481.369, "dur": 0.910, + "args": { + "External id": 149966,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771822491.189, "dur": 2.000, + "args": { + "External id": 149967,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771822500.979, "dur": 7.960, + "args": { + "External id": 149968,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], 
[]], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771822502.519, "dur": 6.090, + "args": { + "External id": 149969,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771822592.389, "dur": 164.899, + "args": { + "External id": 149970,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771822594.659, "dur": 6.110, + "args": { + "External id": 149971,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771822602.399, "dur": 154.339, + "args": { + "External id": 149972,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771822603.639, "dur": 0.230, + "args": { + "External id": 149973,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6016 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771822605.039, "dur": 22.520, + "args": { + "External id": 149974,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771822630.119, "dur": 4.820, + "args": { + "External id": 149975,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822633.379, "dur": 1.100, + "args": { + "External id": 149976,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771822635.749, "dur": 20.689, + "args": { + "External id": 149977,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771822636.779, "dur": 2.730, + "args": { + "External id": 149978,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771822640.759, "dur": 15.350, + "args": { + "External id": 149979,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771822643.799, "dur": 3.519, + "args": { + "External id": 149980,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771822657.658, "dur": 20.071, + "args": { + "External id": 149981,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6024 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771822679.409, "dur": 10.060, + "args": { + "External id": 149982,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771822692.878, "dur": 11.780, + "args": { + "External id": 149983,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771822706.029, "dur": 8.329, + "args": { + "External id": 149984,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771822717.489, "dur": 19.160, + 
"args": { + "External id": 149985,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771822719.838, "dur": 2.871, + "args": { + "External id": 149986,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822724.809, "dur": 0.860, + "args": { + "External id": 149987,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771822738.498, "dur": 8.130, + "args": { + "External id": 149988,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771822747.718, "dur": 7.540, + "args": { + "External id": 149989,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771822766.058, "dur": 3.110, + "args": { + "External id": 149990,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": 
[[2048], []], "Ev Idx": 6033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771822780.078, "dur": 4.330, + "args": { + "External id": 149991,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822782.788, "dur": 0.620, + "args": { + "External id": 149992,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771822856.488, "dur": 46.170, + "args": { + "External id": 149993,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771822910.278, "dur": 7.010, + "args": { + "External id": 149994,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822914.388, "dur": 1.120, + "args": { + "External id": 149995,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], 
[]], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771822918.968, "dur": 20.100, + "args": { + "External id": 149996,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771822946.318, "dur": 7.130, + "args": { + "External id": 149997,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771822947.858, "dur": 4.760, + "args": { + "External id": 149998,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771822951.238, "dur": 1.020, + "args": { + "External id": 149999,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771822955.948, "dur": 33.370, + "args": { + "External id": 150000,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6043 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771822956.918, "dur": 31.460, + "args": { + "External id": 150001,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771822994.078, "dur": 14.460, + "args": { + "External id": 150002,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823016.698, "dur": 5.510, + "args": { + "External id": 150003,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823019.908, "dur": 0.950, + "args": { + "External id": 150004,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771823027.538, "dur": 41.010, + "args": { + "External id": 150005,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6048 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771823028.338, "dur": 5.450, + "args": { + "External id": 150006,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771823029.508, "dur": 3.690, + "args": { + "External id": 150007,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823031.208, "dur": 1.650, + "args": { + "External id": 150008,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771823034.658, "dur": 33.270, + "args": { + "External id": 150009,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771823036.038, "dur": 30.990, + "args": { + "External id": 150010,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823074.978, "dur": 4.620, + "args": { + "External id": 
150011,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823077.728, "dur": 0.520, + "args": { + "External id": 150012,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771823088.138, "dur": 1.890, + "args": { + "External id": 150013,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771823096.617, "dur": 8.280, + "args": { + "External id": 150014,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771823098.348, "dur": 6.200, + "args": { + "External id": 150015,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 
6303771823187.157, "dur": 186.840, + "args": { + "External id": 150016,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771823203.317, "dur": 4.730, + "args": { + "External id": 150017,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771823210.447, "dur": 162.950, + "args": { + "External id": 150018,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771823212.937, "dur": 0.200, + "args": { + "External id": 150019,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771823215.547, "dur": 23.090, + "args": { + "External id": 150020,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771823240.237, "dur": 3.620, + "args": { + "External id": 150021,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": 
["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823242.457, "dur": 0.950, + "args": { + "External id": 150022,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771823244.627, "dur": 21.840, + "args": { + "External id": 150023,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771823245.647, "dur": 4.080, + "args": { + "External id": 150024,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771823250.877, "dur": 15.180, + "args": { + "External id": 150025,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771823253.817, "dur": 3.400, + "args": { + "External id": 150026,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 
6303771823267.677, "dur": 18.550, + "args": { + "External id": 150027,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771823287.937, "dur": 18.630, + "args": { + "External id": 150028,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771823309.717, "dur": 12.580, + "args": { + "External id": 150029,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771823325.167, "dur": 8.480, + "args": { + "External id": 150030,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771823335.437, "dur": 18.540, + "args": { + "External id": 150031,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771823337.507, "dur": 2.340, + "args": { + "External id": 150032,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input 
Dims": [[0], [], []], "Ev Idx": 6075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823341.897, "dur": 0.940, + "args": { + "External id": 150033,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771823355.587, "dur": 7.900, + "args": { + "External id": 150034,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771823364.607, "dur": 7.550, + "args": { + "External id": 150035,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771823384.117, "dur": 3.160, + "args": { + "External id": 150036,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823397.867, "dur": 7.980, + "args": { + "External id": 150037,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823404.087, "dur": 0.700, + "args": { + "External 
id": 150038,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771823478.587, "dur": 45.810, + "args": { + "External id": 150039,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823532.007, "dur": 7.310, + "args": { + "External id": 150040,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823536.367, "dur": 1.130, + "args": { + "External id": 150041,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771823540.557, "dur": 20.219, + "args": { + "External id": 150042,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771823569.276, "dur": 5.611, + 
"args": { + "External id": 150043,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771823570.816, "dur": 3.211, + "args": { + "External id": 150044,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823572.767, "dur": 0.940, + "args": { + "External id": 150045,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771823577.336, "dur": 33.731, + "args": { + "External id": 150046,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771823578.316, "dur": 31.820, + "args": { + "External id": 150047,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771823615.787, "dur": 14.889, + "args": { + "External id": 150048,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", 
"c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823638.516, "dur": 6.580, + "args": { + "External id": 150049,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823642.876, "dur": 0.860, + "args": { + "External id": 150050,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771823649.196, "dur": 41.710, + "args": { + "External id": 150051,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771823650.126, "dur": 5.330, + "args": { + "External id": 150052,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771823651.236, "dur": 3.650, + "args": { + "External id": 150053,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input 
Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823652.946, "dur": 1.600, + "args": { + "External id": 150054,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771823656.386, "dur": 33.950, + "args": { + "External id": 150055,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771823658.366, "dur": 31.200, + "args": { + "External id": 150056,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823696.896, "dur": 4.610, + "args": { + "External id": 150057,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823699.726, "dur": 0.580, + "args": { + "External id": 150058,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[16384], [], [], []], "Ev Idx": 6101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771823708.886, "dur": 1.890, + "args": { + "External id": 150059,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771823717.756, "dur": 9.710, + "args": { + "External id": 150060,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771823720.566, "dur": 6.540, + "args": { + "External id": 150061,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771823809.506, "dur": 161.560, + "args": { + "External id": 150062,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771823811.926, "dur": 5.430, + "args": { + "External id": 150063,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], 
[], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771823818.876, "dur": 151.650, + "args": { + "External id": 150064,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771823820.116, "dur": 0.210, + "args": { + "External id": 150065,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771823822.476, "dur": 21.860, + "args": { + "External id": 150066,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771823845.976, "dur": 3.570, + "args": { + "External id": 150067,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823848.096, "dur": 0.980, + "args": { + "External id": 150068,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 
6303771823850.446, "dur": 24.540, + "args": { + "External id": 150069,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771823852.506, "dur": 4.230, + "args": { + "External id": 150070,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771823858.016, "dur": 16.600, + "args": { + "External id": 150071,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771823861.076, "dur": 4.580, + "args": { + "External id": 150072,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771823876.286, "dur": 17.530, + "args": { + "External id": 150073,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771823895.486, "dur": 9.960, + "args": { + "External id": 150074,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6117 + } + }, + { 
+ "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771823908.276, "dur": 11.650, + "args": { + "External id": 150075,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771823921.246, "dur": 8.310, + "args": { + "External id": 150076,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771823931.276, "dur": 19.360, + "args": { + "External id": 150077,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771823933.626, "dur": 2.570, + "args": { + "External id": 150078,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823939.206, "dur": 0.840, + "args": { + "External id": 150079,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771823952.366, "dur": 8.060, + "args": { + "External id": 150080,"Record 
function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771823961.506, "dur": 7.660, + "args": { + "External id": 150081,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771823979.606, "dur": 3.070, + "args": { + "External id": 150082,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771823992.166, "dur": 4.569, + "args": { + "External id": 150083,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771823994.986, "dur": 0.680, + "args": { + "External id": 150084,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771824066.506, "dur": 45.059, + "args": { + "External id": 150085,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 
6128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824118.965, "dur": 9.290, + "args": { + "External id": 150086,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824124.155, "dur": 2.140, + "args": { + "External id": 150087,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771824129.475, "dur": 19.800, + "args": { + "External id": 150088,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771824156.625, "dur": 5.560, + "args": { + "External id": 150089,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771824158.035, "dur": 3.340, + "args": { + "External id": 150090,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6133 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824160.085, "dur": 0.950, + "args": { + "External id": 150091,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771824164.675, "dur": 34.490, + "args": { + "External id": 150092,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771824166.915, "dur": 31.410, + "args": { + "External id": 150093,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771824203.825, "dur": 14.380, + "args": { + "External id": 150094,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824226.195, "dur": 5.430, + "args": { + "External id": 150095,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824229.295, "dur": 0.930, + "args": { + "External id": 150096,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771824235.555, "dur": 39.590, + "args": { + "External id": 150097,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771824236.385, "dur": 4.830, + "args": { + "External id": 150098,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771824237.555, "dur": 3.100, + "args": { + "External id": 150099,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824239.885, "dur": 0.510, + "args": { + "External id": 150100,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 
6303771824242.035, "dur": 32.570, + "args": { + "External id": 150101,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771824243.045, "dur": 30.900, + "args": { + "External id": 150102,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824281.395, "dur": 4.290, + "args": { + "External id": 150103,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824283.745, "dur": 0.660, + "args": { + "External id": 150104,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771824292.885, "dur": 1.860, + "args": { + "External id": 150105,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771824309.735, "dur": 10.330, + "args": { + "External id": 150106,"Record 
function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771824312.195, "dur": 7.450, + "args": { + "External id": 150107,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771824402.395, "dur": 160.819, + "args": { + "External id": 150108,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771824404.625, "dur": 4.570, + "args": { + "External id": 150109,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771824410.645, "dur": 152.029, + "args": { + "External id": 150110,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771824412.075, "dur": 
0.210, + "args": { + "External id": 150111,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771824413.255, "dur": 23.070, + "args": { + "External id": 150112,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771824439.245, "dur": 3.740, + "args": { + "External id": 150113,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6156 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824441.405, "dur": 0.990, + "args": { + "External id": 150114,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771824443.715, "dur": 22.530, + "args": { + "External id": 150115,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771824445.874, "dur": 2.771, + "args": { + "External id": 150116,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 6159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771824449.754, "dur": 16.120, + "args": { + "External id": 150117,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771824453.874, "dur": 3.420, + "args": { + "External id": 150118,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771824467.494, "dur": 17.411, + "args": { + "External id": 150119,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771824486.614, "dur": 9.831, + "args": { + "External id": 150120,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771824499.505, "dur": 11.569, + "args": { + "External id": 150121,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771824512.365, "dur": 8.549, + "args": { + "External id": 150122,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], 
"Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771824522.654, "dur": 19.771, + "args": { + "External id": 150123,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771824524.725, "dur": 2.600, + "args": { + "External id": 150124,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824530.725, "dur": 0.780, + "args": { + "External id": 150125,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771824544.334, "dur": 8.190, + "args": { + "External id": 150126,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771824553.654, "dur": 7.770, + "args": { + "External id": 150127,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 
6303771824571.894, "dur": 3.060, + "args": { + "External id": 150128,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824584.544, "dur": 4.350, + "args": { + "External id": 150129,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824587.224, "dur": 0.620, + "args": { + "External id": 150130,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771824658.584, "dur": 44.900, + "args": { + "External id": 150131,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824712.164, "dur": 6.960, + "args": { + "External id": 150132,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824716.234, "dur": 1.070, + "args": { + 
"External id": 150133,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771824720.324, "dur": 20.170, + "args": { + "External id": 150134,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771824747.614, "dur": 7.190, + "args": { + "External id": 150135,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771824749.114, "dur": 4.770, + "args": { + "External id": 150136,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824752.554, "dur": 0.960, + "args": { + "External id": 150137,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771824758.434, "dur": 35.150, + "args": { + "External id": 150138,"Record function id": 
0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771824760.994, "dur": 31.640, + "args": { + "External id": 150139,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771824798.264, "dur": 14.570, + "args": { + "External id": 150140,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6183 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824820.774, "dur": 5.560, + "args": { + "External id": 150141,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824823.944, "dur": 0.960, + "args": { + "External id": 150142,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771824830.204, "dur": 39.580, + "args": { + "External id": 150143,"Record function id": 0, 
"Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771824831.014, "dur": 5.170, + "args": { + "External id": 150144,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771824833.384, "dur": 2.300, + "args": { + "External id": 150145,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824834.894, "dur": 0.500, + "args": { + "External id": 150146,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771824837.024, "dur": 32.130, + "args": { + "External id": 150147,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771824837.874, "dur": 30.490, + "args": { + "External id": 150148,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input 
Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771824875.664, "dur": 5.360, + "args": { + "External id": 150149,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771824878.134, "dur": 1.600, + "args": { + "External id": 150150,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771824889.244, "dur": 1.940, + "args": { + "External id": 150151,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771824898.153, "dur": 8.131, + "args": { + "External id": 150152,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771824899.804, "dur": 6.129, + "args": { + "External id": 150153,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", 
""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771824986.864, "dur": 160.369, + "args": { + "External id": 150154,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771824989.153, "dur": 5.510, + "args": { + "External id": 150155,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6198 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771824996.333, "dur": 150.360, + "args": { + "External id": 150156,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6199 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771824998.873, "dur": 0.200, + "args": { + "External id": 150157,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6200 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771825000.063, "dur": 22.720, + "args": { + "External id": 150158,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6201 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771825024.413, "dur": 3.550, + "args": { + "External id": 150159,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825026.513, "dur": 0.900, + "args": { + "External id": 150160,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771825028.743, "dur": 23.160, + "args": { + "External id": 150161,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771825029.673, "dur": 2.920, + "args": { + "External id": 150162,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771825033.593, "dur": 17.950, + "args": { + "External id": 150163,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771825037.663, "dur": 4.720, + "args": { + "External id": 150164,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": 
["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771825053.213, "dur": 16.860, + "args": { + "External id": 150165,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771825071.803, "dur": 9.980, + "args": { + "External id": 150166,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6209 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771825084.533, "dur": 11.160, + "args": { + "External id": 150167,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771825096.973, "dur": 8.170, + "args": { + "External id": 150168,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771825108.843, "dur": 18.160, + "args": { + "External id": 150169,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 
6303771825110.883, "dur": 2.230, + "args": { + "External id": 150170,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825115.233, "dur": 0.820, + "args": { + "External id": 150171,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771825128.693, "dur": 8.030, + "args": { + "External id": 150172,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771825137.833, "dur": 7.570, + "args": { + "External id": 150173,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771825156.273, "dur": 3.010, + "args": { + "External id": 150174,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771825169.313, "dur": 4.360, + "args": { + "External id": 150175,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], 
"Input Dims": [[16384], [], [], [], []], "Ev Idx": 6218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825172.043, "dur": 0.640, + "args": { + "External id": 150176,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771825245.363, "dur": 45.750, + "args": { + "External id": 150177,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771825307.463, "dur": 7.370, + "args": { + "External id": 150178,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825311.763, "dur": 1.150, + "args": { + "External id": 150179,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771825316.073, "dur": 21.220, + "args": { + "External id": 150180,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": 
[[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771825344.852, "dur": 6.631, + "args": { + "External id": 150181,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771825346.323, "dur": 4.369, + "args": { + "External id": 150182,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825349.432, "dur": 0.911, + "args": { + "External id": 150183,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771825353.912, "dur": 34.260, + "args": { + "External id": 150184,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771825354.883, "dur": 32.340, + "args": { + "External id": 150185,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6228 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771825392.932, "dur": 14.631, + "args": { + "External id": 150186,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771825415.943, "dur": 5.389, + "args": { + "External id": 150187,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825419.023, "dur": 0.940, + "args": { + "External id": 150188,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771825425.203, "dur": 41.469, + "args": { + "External id": 150189,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771825427.143, "dur": 5.829, + "args": { + "External id": 150190,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 
5714, "tid": 5714, + "ts": 6303771825428.332, "dur": 4.071, + "args": { + "External id": 150191,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825430.063, "dur": 1.989, + "args": { + "External id": 150192,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771825433.803, "dur": 32.249, + "args": { + "External id": 150193,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771825434.703, "dur": 30.549, + "args": { + "External id": 150194,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6237 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771825472.822, "dur": 4.490, + "args": { + "External id": 150195,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825475.472, "dur": 
0.500, + "args": { + "External id": 150196,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771825484.572, "dur": 1.930, + "args": { + "External id": 150197,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771825494.462, "dur": 8.490, + "args": { + "External id": 150198,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771825496.512, "dur": 6.090, + "args": { + "External id": 150199,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771825585.152, "dur": 163.780, + "args": { + "External id": 150200,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 
6303771825587.672, "dur": 4.620, + "args": { + "External id": 150201,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771825593.882, "dur": 154.440, + "args": { + "External id": 150202,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771825595.192, "dur": 0.210, + "args": { + "External id": 150203,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771825598.652, "dur": 23.300, + "args": { + "External id": 150204,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771825623.572, "dur": 3.460, + "args": { + "External id": 150205,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825625.652, "dur": 0.930, + "args": { + "External id": 150206,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", 
"ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771825627.842, "dur": 21.720, + "args": { + "External id": 150207,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771825628.882, "dur": 3.920, + "args": { + "External id": 150208,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771825633.962, "dur": 15.250, + "args": { + "External id": 150209,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771825636.972, "dur": 3.650, + "args": { + "External id": 150210,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771825650.802, "dur": 17.400, + "args": { + "External id": 150211,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771825669.902, "dur": 10.060, + "args": 
{ + "External id": 150212,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771825682.842, "dur": 14.410, + "args": { + "External id": 150213,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771825699.652, "dur": 8.910, + "args": { + "External id": 150214,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771825710.402, "dur": 18.370, + "args": { + "External id": 150215,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771825712.522, "dur": 2.460, + "args": { + "External id": 150216,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825717.112, "dur": 0.820, + "args": { + "External id": 150217,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], 
"Input Dims": [[2048], [], [], []], "Ev Idx": 6260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771825730.412, "dur": 7.930, + "args": { + "External id": 150218,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771825739.492, "dur": 7.540, + "args": { + "External id": 150219,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771825759.092, "dur": 3.100, + "args": { + "External id": 150220,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771825771.942, "dur": 4.930, + "args": { + "External id": 150221,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825775.162, "dur": 0.680, + "args": { + "External id": 150222,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771825847.471, "dur": 44.131, + "args": 
{ + "External id": 150223,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771825899.101, "dur": 7.140, + "args": { + "External id": 150224,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825903.191, "dur": 1.220, + "args": { + "External id": 150225,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771825907.421, "dur": 21.280, + "args": { + "External id": 150226,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771825935.851, "dur": 6.990, + "args": { + "External id": 150227,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771825938.731, "dur": 3.210, + "args": { + "External id": 150228,"Record function 
id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771825940.601, "dur": 0.910, + "args": { + "External id": 150229,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771825945.241, "dur": 32.830, + "args": { + "External id": 150230,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771825946.201, "dur": 30.980, + "args": { + "External id": 150231,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771825982.701, "dur": 14.420, + "args": { + "External id": 150232,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826004.931, "dur": 6.450, + "args": { + "External id": 150233,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input 
type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826009.181, "dur": 0.890, + "args": { + "External id": 150234,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771826015.591, "dur": 39.020, + "args": { + "External id": 150235,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771826016.461, "dur": 4.120, + "args": { + "External id": 150236,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771826017.711, "dur": 2.360, + "args": { + "External id": 150237,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826019.261, "dur": 0.510, + "args": { + "External id": 150238,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", 
"ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771826021.521, "dur": 32.590, + "args": { + "External id": 150239,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771826022.411, "dur": 30.870, + "args": { + "External id": 150240,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826061.521, "dur": 4.500, + "args": { + "External id": 150241,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826064.151, "dur": 0.550, + "args": { + "External id": 150242,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771826073.421, "dur": 1.810, + "args": { + "External id": 150243,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input 
Dims": [[2048, 32000], []], "Ev Idx": 6286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771826081.811, "dur": 7.820, + "args": { + "External id": 150244,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771826083.361, "dur": 5.930, + "args": { + "External id": 150245,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771826170.961, "dur": 174.559, + "args": { + "External id": 150246,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771826174.371, "dur": 5.600, + "args": { + "External id": 150247,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771826181.551, "dur": 163.449, + "args": { + "External id": 150248,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], 
"Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771826182.831, "dur": 0.200, + "args": { + "External id": 150249,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771826184.051, "dur": 21.820, + "args": { + "External id": 150250,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771826207.501, "dur": 4.610, + "args": { + "External id": 150251,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826210.741, "dur": 0.910, + "args": { + "External id": 150252,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771826212.831, "dur": 21.470, + "args": { + "External id": 150253,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771826213.911, "dur": 2.830, + "args": { + "External id": 150254,"Record 
function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771826217.671, "dur": 16.310, + "args": { + "External id": 150255,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771826221.831, "dur": 3.580, + "args": { + "External id": 150256,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771826235.481, "dur": 17.080, + "args": { + "External id": 150257,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771826254.241, "dur": 11.840, + "args": { + "External id": 150258,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771826269.090, "dur": 12.000, + "args": { + "External id": 150259,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6302 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771826282.390, "dur": 8.440, + "args": { + "External id": 150260,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771826292.710, "dur": 30.051, + "args": { + "External id": 150261,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6304 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771826296.070, "dur": 10.860, + "args": { + "External id": 150262,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826309.281, "dur": 1.020, + "args": { + "External id": 150263,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771826325.981, "dur": 8.500, + "args": { + "External id": 150264,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771826335.581, "dur": 8.020, + "args": { + "External id": 150265,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": 
["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771826354.650, "dur": 3.160, + "args": { + "External id": 150266,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826367.230, "dur": 4.380, + "args": { + "External id": 150267,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826370.010, "dur": 0.580, + "args": { + "External id": 150268,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771826442.110, "dur": 43.920, + "args": { + "External id": 150269,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826494.090, "dur": 8.000, + "args": { + "External id": 150270,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], 
"Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6313 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826499.300, "dur": 1.040, + "args": { + "External id": 150271,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771826503.300, "dur": 21.190, + "args": { + "External id": 150272,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771826531.610, "dur": 6.540, + "args": { + "External id": 150273,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771826533.060, "dur": 4.130, + "args": { + "External id": 150274,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826535.990, "dur": 0.860, + "args": { + "External id": 150275,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], 
[]], "Ev Idx": 6318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771826540.550, "dur": 32.830, + "args": { + "External id": 150276,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771826541.480, "dur": 31.020, + "args": { + "External id": 150277,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771826579.300, "dur": 14.420, + "args": { + "External id": 150278,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771826598.890, "dur": 23.460, + "args": { + "External id": 150279,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771826600.740, "dur": 21.140, + "args": { + "External id": 150280,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826607.450, "dur": 0.680, + 
"args": { + "External id": 150281,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771826627.590, "dur": 23.730, + "args": { + "External id": 150282,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771826628.690, "dur": 22.370, + "args": { + "External id": 150283,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 6326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826632.030, "dur": 5.700, + "args": { + "External id": 150284,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771826638.770, "dur": 11.760, + "args": { + "External id": 150285,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771826661.060, "dur": 5.420, + "args": { + "External id": 150286,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771826663.840, "dur": 2.290, + "args": { + "External id": 150287,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771826667.520, "dur": 1.210, + "args": { + "External id": 150288,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771826667.930, "dur": 0.550, + "args": { + "External id": 150289,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771826698.869, "dur": 21.591, + "args": { + "External id": 150290,"Sequence number": 3058966, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771826722.909, "dur": 11.280, + "args": { + "External id": 150291,"Sequence number": 3058967, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input 
Dims": [[1], [], []], "Ev Idx": 6334 + } + }, + { + "ph": "s", "id": 16, "pid": 5714, "tid": 5714, "ts": 6303771826722.909, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826740.929, "dur": 7.620, + "args": { + "External id": 150292,"Sequence number": 3058968, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 6335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826745.340, "dur": 1.489, + "args": { + "External id": 150293,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771826751.680, "dur": 4.629, + "args": { + "External id": 150294,"Sequence number": 3058968, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "2"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 6337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826754.809, "dur": 0.420, + "args": { + "External id": 150295,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "3"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 
5714, + "ts": 6303771826757.769, "dur": 2.271, + "args": { + "External id": 150296,"Sequence number": 3058968, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 6339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826759.249, "dur": 0.251, + "args": { + "External id": 150297,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "3"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 6340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826765.660, "dur": 5.189, + "args": { + "External id": 150298,"Sequence number": 3058968, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6341 + } + }, + { + "ph": "s", "id": 15, "pid": 5714, "tid": 5714, "ts": 6303771826765.660, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826768.949, "dur": 0.560, + "args": { + "External id": 150299,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 
6303771826771.969, "dur": 4.560, + "args": { + "External id": 150300,"Sequence number": 3058969, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6343 + } + }, + { + "ph": "s", "id": 14, "pid": 5714, "tid": 5714, "ts": 6303771826771.969, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826775.460, "dur": 0.349, + "args": { + "External id": 150301,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771826777.609, "dur": 5.080, + "args": { + "External id": 150302,"Sequence number": 3058970, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "2"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 6345 + } + }, + { + "ph": "s", "id": 13, "pid": 5714, "tid": 5714, "ts": 6303771826777.609, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826781.400, "dur": 0.460, + "args": { + "External id": 150303,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev 
Idx": 6346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771826783.860, "dur": 4.360, + "args": { + "External id": 150304,"Sequence number": 3058971, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 6347 + } + }, + { + "ph": "s", "id": 12, "pid": 5714, "tid": 5714, "ts": 6303771826783.860, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826786.189, "dur": 1.320, + "args": { + "External id": 150305,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "1536"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 6348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771826792.260, "dur": 31.839, + "args": { + "External id": 150306,"Sequence number": 3058972, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771826793.189, "dur": 30.570, + "args": { + "External id": 150307,"Sequence number": 3058972, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771826794.959, "dur": 
9.640, + "args": { + "External id": 150308,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 6351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771826796.999, "dur": 7.020, + "args": { + "External id": 150309,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771826805.599, "dur": 17.600, + "args": { + "External id": 150310,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 6353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771826850.459, "dur": 5.020, + "args": { + "External id": 150311,"Sequence number": 3058972, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 6354 + } + }, + { + "ph": "s", "id": 11, "pid": 5714, "tid": 5714, "ts": 6303771826850.459, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771826859.509, "dur": 2.040, + "args": { + "External id": 150312,"Sequence number": 3058973, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": 
[[8, 2048], []], "Ev Idx": 6355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6303771826883.049, "dur": 21336.843, + "args": { + "External id": 150313,"Sequence number": 3058973, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 6356 + } + }, + { + "ph": "s", "id": 10, "pid": 5714, "tid": 5714, "ts": 6303771826883.049, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771826896.169, "dur": 30.600, + "args": { + "External id": 150314,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771826896.989, "dur": 29.450, + "args": { + "External id": 150315,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771826898.519, "dur": 8.820, + "args": { + "External id": 150316,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771826900.129, 
"dur": 6.520, + "args": { + "External id": 150317,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771826908.279, "dur": 17.640, + "args": { + "External id": 150318,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 6361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771826941.939, "dur": 26.310, + "args": { + "External id": 150319,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771826942.959, "dur": 8.500, + "args": { + "External id": 150320,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826945.109, "dur": 5.920, + "args": { + "External id": 150321,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], 
"Ev Idx": 6364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771826952.459, "dur": 15.530, + "args": { + "External id": 150322,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771826955.419, "dur": 11.670, + "args": { + "External id": 150323,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771826972.389, "dur": 18.060, + "args": { + "External id": 150324,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771826973.119, "dur": 6.000, + "args": { + "External id": 150325,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771826974.469, "dur": 4.280, + "args": { + "External id": 150326,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev 
Idx": 6369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771826979.739, "dur": 10.490, + "args": { + "External id": 150327,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771826980.669, "dur": 8.740, + "args": { + "External id": 150328,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 6371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771826996.179, "dur": 17.320, + "args": { + "External id": 150329,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 6372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771826997.509, "dur": 5.500, + "args": { + "External id": 150330,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771827003.759, "dur": 9.470, + "args": { + "External id": 150331,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 6374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771827004.689, "dur": 7.680, + "args": { + "External id": 150332,"Record function id": 0, "Concrete 
Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6303771827017.749, "dur": 19.810, + "args": { + "External id": 150333,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771827041.639, "dur": 45.290, + "args": { + "External id": 150334,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771827043.459, "dur": 42.860, + "args": { + "External id": 150335,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771827049.719, "dur": 1.080, + "args": { + "External id": 150336,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771827051.809, "dur": 22.450, + "args": { + "External id": 150337,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771827052.889, "dur": 21.150, + "args": { + "External id": 150338,"Record function id": 0, "Concrete Inputs": ["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 6381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771827055.199, "dur": 5.380, + "args": { + "External id": 150339,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771827061.559, "dur": 12.070, + "args": { + "External id": 150340,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 6383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6303771827091.779, "dur": 16489.863, + "args": { + "External id": 150341,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6303771827093.159, "dur": 16486.523, + "args": { + "External id": 150342,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771843596.442, "dur": 10.480, + "args": { + "External id": 150343,"Record function 
id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771843602.522, "dur": 1.810, + "args": { + "External id": 150344,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771843612.642, "dur": 53.640, + "args": { + "External id": 150345,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771843613.692, "dur": 5.600, + "args": { + "External id": 150346,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771843615.372, "dur": 3.110, + "args": { + "External id": 150347,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771843617.062, "dur": 1.060, + "args": { + "External id": 150348,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", 
""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771843620.262, "dur": 45.250, + "args": { + "External id": 150349,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771843621.452, "dur": 42.990, + "args": { + "External id": 150350,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771843672.842, "dur": 4.620, + "args": { + "External id": 150351,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771843675.492, "dur": 0.620, + "args": { + "External id": 150352,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771843684.562, "dur": 2.160, + "args": { + "External id": 150353,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], 
"Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771843694.012, "dur": 8.370, + "args": { + "External id": 150354,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771843695.592, "dur": 6.470, + "args": { + "External id": 150355,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771843792.711, "dur": 172.940, + "args": { + "External id": 150356,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771843795.302, "dur": 4.729, + "args": { + "External id": 150357,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771843801.682, "dur": 163.419, + "args": { + "External id": 150358,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": 
["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771843804.231, "dur": 0.300, + "args": { + "External id": 150359,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771843805.742, "dur": 24.300, + "args": { + "External id": 150360,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771843831.802, "dur": 8.909, + "args": { + "External id": 150361,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771843838.382, "dur": 1.800, + "args": { + "External id": 150362,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771843841.842, "dur": 24.029, + "args": { + "External id": 150363,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771843844.411, "dur": 2.991, 
+ "args": { + "External id": 150364,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771843848.511, "dur": 17.031, + "args": { + "External id": 150365,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771843851.542, "dur": 3.929, + "args": { + "External id": 150366,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771843867.151, "dur": 17.411, + "args": { + "External id": 150367,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771843886.271, "dur": 10.620, + "args": { + "External id": 150368,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771843899.671, "dur": 11.970, + "args": { + "External id": 150369,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], 
"Ev Idx": 6412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771843912.951, "dur": 8.610, + "args": { + "External id": 150370,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771843923.341, "dur": 21.010, + "args": { + "External id": 150371,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771843925.461, "dur": 2.620, + "args": { + "External id": 150372,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771843931.371, "dur": 1.780, + "args": { + "External id": 150373,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771843946.091, "dur": 8.530, + "args": { + "External id": 150374,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771843955.831, "dur": 7.810, + "args": { + "External id": 150375,"Record function id": 0, "Concrete 
Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771843974.501, "dur": 3.130, + "args": { + "External id": 150376,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771843985.871, "dur": 4.270, + "args": { + "External id": 150377,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771843988.571, "dur": 0.550, + "args": { + "External id": 150378,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771844062.991, "dur": 45.700, + "args": { + "External id": 150379,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844117.681, "dur": 8.170, + "args": { + "External id": 150380,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": 
[[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844122.021, "dur": 1.110, + "args": { + "External id": 150381,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771844127.141, "dur": 21.330, + "args": { + "External id": 150382,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771844155.921, "dur": 5.860, + "args": { + "External id": 150383,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771844157.351, "dur": 3.450, + "args": { + "External id": 150384,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844159.541, "dur": 0.920, + "args": { + "External id": 150385,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": 
[[2048, 32000], [], [], []], "Ev Idx": 6428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771844165.581, "dur": 34.090, + "args": { + "External id": 150386,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6429 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771844166.601, "dur": 32.060, + "args": { + "External id": 150387,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771844204.691, "dur": 14.620, + "args": { + "External id": 150388,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844227.141, "dur": 6.310, + "args": { + "External id": 150389,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844231.191, "dur": 0.910, + "args": { + "External id": 150390,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 
768], [], [], []], "Ev Idx": 6433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771844237.581, "dur": 41.580, + "args": { + "External id": 150391,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771844238.381, "dur": 6.500, + "args": { + "External id": 150392,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771844240.910, "dur": 3.411, + "args": { + "External id": 150393,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844243.410, "dur": 0.631, + "args": { + "External id": 150394,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771844245.781, "dur": 32.780, + "args": { + "External id": 150395,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, 
+ "ts": 6303771844246.741, "dur": 31.020, + "args": { + "External id": 150396,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844285.361, "dur": 5.649, + "args": { + "External id": 150397,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844288.021, "dur": 1.749, + "args": { + "External id": 150398,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771844309.790, "dur": 2.060, + "args": { + "External id": 150399,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771844319.190, "dur": 8.771, + "args": { + "External id": 150400,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + 
"ts": 6303771844320.861, "dur": 6.760, + "args": { + "External id": 150401,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771844412.550, "dur": 167.700, + "args": { + "External id": 150402,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771844415.800, "dur": 4.830, + "args": { + "External id": 150403,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6446 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771844422.090, "dur": 157.610, + "args": { + "External id": 150404,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771844424.430, "dur": 0.320, + "args": { + "External id": 150405,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771844427.230, "dur": 22.360, + "args": { + "External id": 150406,"Record 
function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771844451.280, "dur": 3.700, + "args": { + "External id": 150407,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844453.530, "dur": 0.990, + "args": { + "External id": 150408,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771844455.790, "dur": 23.020, + "args": { + "External id": 150409,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771844458.190, "dur": 2.970, + "args": { + "External id": 150410,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771844462.250, "dur": 16.250, + "args": { + "External id": 150411,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6454 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771844465.340, "dur": 4.460, + "args": { + "External id": 150412,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771844480.050, "dur": 19.150, + "args": { + "External id": 150413,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771844500.920, "dur": 10.170, + "args": { + "External id": 150414,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771844515.230, "dur": 12.220, + "args": { + "External id": 150415,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771844529.900, "dur": 8.430, + "args": { + "External id": 150416,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771844540.040, "dur": 19.320, + "args": { + "External id": 150417,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", 
"float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771844542.090, "dur": 2.990, + "args": { + "External id": 150418,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844547.360, "dur": 0.860, + "args": { + "External id": 150419,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771844561.250, "dur": 8.110, + "args": { + "External id": 150420,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771844570.580, "dur": 7.670, + "args": { + "External id": 150421,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771844588.900, "dur": 3.040, + "args": { + "External id": 150422,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844603.200, "dur": 4.450, + 
"args": { + "External id": 150423,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844605.940, "dur": 0.670, + "args": { + "External id": 150424,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771844679.810, "dur": 45.199, + "args": { + "External id": 150425,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844732.469, "dur": 8.320, + "args": { + "External id": 150426,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844736.680, "dur": 2.169, + "args": { + "External id": 150427,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 
6303771844742.049, "dur": 20.071, + "args": { + "External id": 150428,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771844769.400, "dur": 6.680, + "args": { + "External id": 150429,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771844771.989, "dur": 3.311, + "args": { + "External id": 150430,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844773.949, "dur": 0.911, + "args": { + "External id": 150431,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6474 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771844778.520, "dur": 33.059, + "args": { + "External id": 150432,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771844779.500, "dur": 31.219, + "args": { + "External id": 150433,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771844816.299, "dur": 14.330, + "args": { + "External id": 150434,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844838.819, "dur": 6.540, + "args": { + "External id": 150435,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844843.019, "dur": 0.910, + "args": { + "External id": 150436,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771844849.399, "dur": 37.790, + "args": { + "External id": 150437,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771844850.219, "dur": 3.850, + "args": { + "External id": 150438,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771844851.389, "dur": 2.160, + "args": { + "External id": 150439,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844852.789, "dur": 0.470, + "args": { + "External id": 150440,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771844854.919, "dur": 31.690, + "args": { + "External id": 150441,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771844855.789, "dur": 30.080, + "args": { + "External id": 150442,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771844893.309, "dur": 4.540, + "args": { + "External id": 150443,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": 
[[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771844895.909, "dur": 0.640, + "args": { + "External id": 150444,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771844906.229, "dur": 1.900, + "args": { + "External id": 150445,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771844914.639, "dur": 9.290, + "args": { + "External id": 150446,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771844916.289, "dur": 7.280, + "args": { + "External id": 150447,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771845006.619, "dur": 161.680, + "args": { + "External id": 150448,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", 
"Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771845008.959, "dur": 4.620, + "args": { + "External id": 150449,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771845016.179, "dur": 151.549, + "args": { + "External id": 150450,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771845017.589, "dur": 0.210, + "args": { + "External id": 150451,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771845018.789, "dur": 22.510, + "args": { + "External id": 150452,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771845044.079, "dur": 3.700, + "args": { + "External id": 150453,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6303771845046.339, "dur": 0.960, + "args": { + "External id": 150454,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771845048.499, "dur": 21.980, + "args": { + "External id": 150455,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771845049.539, "dur": 2.720, + "args": { + "External id": 150456,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771845053.259, "dur": 16.850, + "args": { + "External id": 150457,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771845057.559, "dur": 3.620, + "args": { + "External id": 150458,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771845071.779, "dur": 17.540, + "args": { + "External id": 150459,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], 
[]], "Input Dims": [[2048], []], "Ev Idx": 6502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771845090.869, "dur": 10.240, + "args": { + "External id": 150460,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771845104.959, "dur": 11.800, + "args": { + "External id": 150461,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771845118.069, "dur": 8.730, + "args": { + "External id": 150462,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771845128.539, "dur": 18.180, + "args": { + "External id": 150463,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771845130.579, "dur": 2.270, + "args": { + "External id": 150464,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845135.029, "dur": 0.820, + "args": 
{ + "External id": 150465,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771845148.439, "dur": 8.400, + "args": { + "External id": 150466,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771845157.968, "dur": 7.680, + "args": { + "External id": 150467,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771845176.968, "dur": 3.011, + "args": { + "External id": 150468,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771845190.379, "dur": 4.420, + "args": { + "External id": 150469,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845193.139, "dur": 0.660, + "args": { + "External id": 150470,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], 
"Input Dims": [[16384], [], [], []], "Ev Idx": 6513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771845264.318, "dur": 53.790, + "args": { + "External id": 150471,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771845325.978, "dur": 7.250, + "args": { + "External id": 150472,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845330.218, "dur": 1.190, + "args": { + "External id": 150473,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771845334.478, "dur": 20.840, + "args": { + "External id": 150474,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771845362.748, "dur": 7.950, + "args": { + "External id": 150475,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 
6518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771845365.458, "dur": 4.320, + "args": { + "External id": 150476,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6519 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845368.478, "dur": 0.940, + "args": { + "External id": 150477,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771845373.208, "dur": 33.690, + "args": { + "External id": 150478,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771845374.198, "dur": 31.760, + "args": { + "External id": 150479,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771845411.758, "dur": 14.430, + "args": { + "External id": 150480,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771845434.278, "dur": 6.580, + "args": { + "External id": 150481,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845438.538, "dur": 0.980, + "args": { + "External id": 150482,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771845444.898, "dur": 38.250, + "args": { + "External id": 150483,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771845445.708, "dur": 3.970, + "args": { + "External id": 150484,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771845446.858, "dur": 2.270, + "args": { + "External id": 150485,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771845448.328, "dur": 0.520, + "args": { + "External id": 150486,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771845450.538, "dur": 32.060, + "args": { + "External id": 150487,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771845451.388, "dur": 30.460, + "args": { + "External id": 150488,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771845490.168, "dur": 4.330, + "args": { + "External id": 150489,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845492.648, "dur": 0.640, + "args": { + "External id": 150490,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771845502.118, "dur": 1.880, + 
"args": { + "External id": 150491,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771845510.668, "dur": 8.180, + "args": { + "External id": 150492,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771845512.278, "dur": 6.230, + "args": { + "External id": 150493,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6536 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771845610.947, "dur": 183.660, + "args": { + "External id": 150494,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771845613.687, "dur": 7.820, + "args": { + "External id": 150495,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 
6303771845623.487, "dur": 170.580, + "args": { + "External id": 150496,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771845624.967, "dur": 0.231, + "args": { + "External id": 150497,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771845626.338, "dur": 25.640, + "args": { + "External id": 150498,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771845653.858, "dur": 5.400, + "args": { + "External id": 150499,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845657.487, "dur": 1.211, + "args": { + "External id": 150500,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771845660.178, "dur": 26.080, + "args": { + "External id": 150501,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": 
[[2048]], "Ev Idx": 6544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771845661.367, "dur": 3.460, + "args": { + "External id": 150502,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771845667.658, "dur": 18.209, + "args": { + "External id": 150503,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771845671.247, "dur": 4.311, + "args": { + "External id": 150504,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771845687.687, "dur": 19.880, + "args": { + "External id": 150505,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771845709.627, "dur": 11.490, + "args": { + "External id": 150506,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771845725.177, "dur": 13.210, + "args": { + "External id": 150507,"Record function id": 0, 
"Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771845739.997, "dur": 10.210, + "args": { + "External id": 150508,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771845752.107, "dur": 20.110, + "args": { + "External id": 150509,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771845754.337, "dur": 2.580, + "args": { + "External id": 150510,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845759.127, "dur": 1.900, + "args": { + "External id": 150511,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771845775.157, "dur": 8.560, + "args": { + "External id": 150512,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6555 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771845784.897, "dur": 7.730, + "args": { + "External id": 150513,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771845803.637, "dur": 3.190, + "args": { + "External id": 150514,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771845816.627, "dur": 4.020, + "args": { + "External id": 150515,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845819.187, "dur": 0.620, + "args": { + "External id": 150516,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771845881.177, "dur": 38.720, + "args": { + "External id": 150517,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771845926.287, "dur": 7.310, + "args": { + "External id": 
150518,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845930.917, "dur": 1.030, + "args": { + "External id": 150519,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771845934.747, "dur": 17.730, + "args": { + "External id": 150520,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771845958.767, "dur": 4.750, + "args": { + "External id": 150521,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6564 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771845960.047, "dur": 2.750, + "args": { + "External id": 150522,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771845961.707, "dur": 0.810, + "args": { + "External id": 150523,"Record function id": 0, 
"Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771845965.667, "dur": 29.030, + "args": { + "External id": 150524,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771845966.527, "dur": 27.320, + "args": { + "External id": 150525,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771846000.177, "dur": 12.930, + "args": { + "External id": 150526,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846020.367, "dur": 4.700, + "args": { + "External id": 150527,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846023.027, "dur": 0.760, + "args": { + "External id": 150528,"Record function id": 0, "Concrete 
Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771846039.926, "dur": 38.640, + "args": { + "External id": 150529,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771846040.746, "dur": 7.791, + "args": { + "External id": 150530,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771846041.786, "dur": 6.200, + "args": { + "External id": 150531,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846047.106, "dur": 0.611, + "args": { + "External id": 150532,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771846049.337, "dur": 28.789, + "args": { + "External id": 150533,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", 
"c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771846050.126, "dur": 27.240, + "args": { + "External id": 150534,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846084.286, "dur": 4.031, + "args": { + "External id": 150535,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846086.646, "dur": 0.431, + "args": { + "External id": 150536,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771846094.677, "dur": 1.660, + "args": { + "External id": 150537,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771846102.206, "dur": 8.480, + "args": { + "External id": 150538,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], 
"Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771846104.946, "dur": 5.471, + "args": { + "External id": 150539,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771846182.026, "dur": 156.090, + "args": { + "External id": 150540,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771846184.176, "dur": 3.940, + "args": { + "External id": 150541,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771846189.456, "dur": 148.210, + "args": { + "External id": 150542,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771846191.616, "dur": 0.180, + "args": { + "External id": 150543,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input 
Dims": [[2048, 1]], "Ev Idx": 6586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771846192.696, "dur": 20.570, + "args": { + "External id": 150544,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771846214.726, "dur": 3.190, + "args": { + "External id": 150545,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846216.636, "dur": 0.860, + "args": { + "External id": 150546,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771846218.566, "dur": 20.610, + "args": { + "External id": 150547,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771846221.766, "dur": 2.620, + "args": { + "External id": 150548,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771846225.286, "dur": 13.510, + "args": { + 
"External id": 150549,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771846227.856, "dur": 3.020, + "args": { + "External id": 150550,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771846240.226, "dur": 16.830, + "args": { + "External id": 150551,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771846258.516, "dur": 8.820, + "args": { + "External id": 150552,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771846269.826, "dur": 10.460, + "args": { + "External id": 150553,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771846281.406, "dur": 7.550, + "args": { + "External id": 150554,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + 
"ts": 6303771846290.616, "dur": 27.450, + "args": { + "External id": 150555,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771846292.416, "dur": 2.200, + "args": { + "External id": 150556,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846305.896, "dur": 0.860, + "args": { + "External id": 150557,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771846319.946, "dur": 8.260, + "args": { + "External id": 150558,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771846329.186, "dur": 7.120, + "args": { + "External id": 150559,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771846346.046, "dur": 2.710, + "args": { + "External id": 150560,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input 
Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846357.366, "dur": 4.030, + "args": { + "External id": 150561,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846359.906, "dur": 0.590, + "args": { + "External id": 150562,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771846423.256, "dur": 39.490, + "args": { + "External id": 150563,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846470.376, "dur": 7.110, + "args": { + "External id": 150564,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846473.946, "dur": 1.910, + "args": { + "External id": 150565,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", 
"Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771846478.586, "dur": 17.519, + "args": { + "External id": 150566,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771846502.305, "dur": 4.700, + "args": { + "External id": 150567,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771846503.556, "dur": 2.749, + "args": { + "External id": 150568,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846505.256, "dur": 0.740, + "args": { + "External id": 150569,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771846510.556, "dur": 28.809, + "args": { + "External id": 150570,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev 
Idx": 6613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771846511.425, "dur": 27.111, + "args": { + "External id": 150571,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6614 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771846543.645, "dur": 12.620, + "args": { + "External id": 150572,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846563.216, "dur": 4.769, + "args": { + "External id": 150573,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6616 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846565.905, "dur": 0.831, + "args": { + "External id": 150574,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771846571.565, "dur": 35.710, + "args": { + "External id": 150575,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], 
"Ev Idx": 6618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771846572.245, "dur": 4.620, + "args": { + "External id": 150576,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771846574.445, "dur": 1.960, + "args": { + "External id": 150577,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6620 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846575.756, "dur": 0.389, + "args": { + "External id": 150578,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771846577.625, "dur": 29.170, + "args": { + "External id": 150579,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771846578.405, "dur": 27.610, + "args": { + "External id": 150580,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6623 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846612.525, "dur": 
3.910, + "args": { + "External id": 150581,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846614.845, "dur": 0.530, + "args": { + "External id": 150582,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771846623.945, "dur": 1.590, + "args": { + "External id": 150583,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771846631.355, "dur": 8.400, + "args": { + "External id": 150584,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771846632.735, "dur": 6.690, + "args": { + "External id": 150585,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", 
"pid": 5714, "tid": 5714, + "ts": 6303771846711.315, "dur": 141.570, + "args": { + "External id": 150586,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6629 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771846713.555, "dur": 4.790, + "args": { + "External id": 150587,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771846719.645, "dur": 132.720, + "args": { + "External id": 150588,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771846722.065, "dur": 0.200, + "args": { + "External id": 150589,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771846723.165, "dur": 19.370, + "args": { + "External id": 150590,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771846744.925, "dur": 3.210, + "args": { + "External id": 150591,"Record function id": 0, "Concrete 
Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846746.815, "dur": 0.810, + "args": { + "External id": 150592,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771846748.785, "dur": 19.190, + "args": { + "External id": 150593,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771846749.535, "dur": 2.460, + "args": { + "External id": 150594,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771846753.015, "dur": 14.650, + "args": { + "External id": 150595,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771846756.565, "dur": 3.240, + "args": { + "External id": 150596,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 
5714, "tid": 5714, + "ts": 6303771846769.075, "dur": 15.000, + "args": { + "External id": 150597,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771846785.585, "dur": 8.840, + "args": { + "External id": 150598,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771846796.975, "dur": 10.370, + "args": { + "External id": 150599,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771846808.475, "dur": 7.540, + "args": { + "External id": 150600,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771846818.635, "dur": 16.180, + "args": { + "External id": 150601,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771846820.425, "dur": 2.040, + "args": { + "External id": 150602,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input 
Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846824.345, "dur": 0.730, + "args": { + "External id": 150603,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771846836.295, "dur": 7.360, + "args": { + "External id": 150604,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771846844.625, "dur": 6.810, + "args": { + "External id": 150605,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6648 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771846860.565, "dur": 2.790, + "args": { + "External id": 150606,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6649 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846871.955, "dur": 3.730, + "args": { + "External id": 150607,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846874.195, 
"dur": 0.610, + "args": { + "External id": 150608,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771846938.175, "dur": 38.769, + "args": { + "External id": 150609,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771846983.464, "dur": 6.191, + "args": { + "External id": 150610,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771846987.004, "dur": 1.011, + "args": { + "External id": 150611,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771846990.764, "dur": 17.691, + "args": { + "External id": 150612,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 
6303771847014.804, "dur": 7.231, + "args": { + "External id": 150613,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771847017.475, "dur": 3.820, + "args": { + "External id": 150614,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847020.255, "dur": 0.729, + "args": { + "External id": 150615,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771847024.284, "dur": 28.830, + "args": { + "External id": 150616,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771847025.064, "dur": 27.330, + "args": { + "External id": 150617,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771847057.094, "dur": 13.090, + "args": { + "External id": 150618,"Record function id": 0, "Concrete Inputs": ["", 
"", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847077.284, "dur": 4.870, + "args": { + "External id": 150619,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847080.134, "dur": 0.790, + "args": { + "External id": 150620,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771847085.674, "dur": 34.640, + "args": { + "External id": 150621,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771847087.874, "dur": 3.290, + "args": { + "External id": 150622,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771847088.854, "dur": 1.780, + "args": { + "External id": 150623,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", 
"Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847090.004, "dur": 0.400, + "args": { + "External id": 150624,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771847091.914, "dur": 27.920, + "args": { + "External id": 150625,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771847092.624, "dur": 26.490, + "args": { + "External id": 150626,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847125.864, "dur": 3.880, + "args": { + "External id": 150627,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847128.054, "dur": 0.500, + "args": { + "External id": 150628,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], 
"Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771847136.154, "dur": 1.580, + "args": { + "External id": 150629,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771847144.524, "dur": 7.250, + "args": { + "External id": 150630,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771847145.884, "dur": 5.580, + "args": { + "External id": 150631,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771847222.044, "dur": 155.620, + "args": { + "External id": 150632,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771847225.174, "dur": 4.140, + "args": { + "External id": 150633,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", 
"", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771847230.554, "dur": 146.610, + "args": { + "External id": 150634,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771847231.664, "dur": 0.170, + "args": { + "External id": 150635,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771847233.804, "dur": 20.430, + "args": { + "External id": 150636,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771847255.644, "dur": 5.660, + "args": { + "External id": 150637,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847260.104, "dur": 0.800, + "args": { + "External id": 150638,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 
5714, "tid": 5714, + "ts": 6303771847261.984, "dur": 19.210, + "args": { + "External id": 150639,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771847262.934, "dur": 2.540, + "args": { + "External id": 150640,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771847266.374, "dur": 14.510, + "args": { + "External id": 150641,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771847269.054, "dur": 3.890, + "args": { + "External id": 150642,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771847282.354, "dur": 22.020, + "args": { + "External id": 150643,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771847306.144, "dur": 9.630, + "args": { + "External id": 150644,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], 
"Ev Idx": 6687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771847318.664, "dur": 10.860, + "args": { + "External id": 150645,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771847331.804, "dur": 7.630, + "args": { + "External id": 150646,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771847341.764, "dur": 17.940, + "args": { + "External id": 150647,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771847345.074, "dur": 2.380, + "args": { + "External id": 150648,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847349.244, "dur": 0.800, + "args": { + "External id": 150649,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771847361.244, "dur": 7.110, + "args": { + 
"External id": 150650,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771847369.274, "dur": 6.580, + "args": { + "External id": 150651,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771847386.584, "dur": 2.730, + "args": { + "External id": 150652,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847398.514, "dur": 4.500, + "args": { + "External id": 150653,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847401.423, "dur": 0.560, + "args": { + "External id": 150654,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771847464.763, "dur": 39.230, + "args": { + "External id": 150655,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 
32000], [32000, 768]], "Ev Idx": 6698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847510.663, "dur": 6.520, + "args": { + "External id": 150656,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847514.603, "dur": 1.050, + "args": { + "External id": 150657,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771847518.243, "dur": 17.650, + "args": { + "External id": 150658,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771847542.173, "dur": 7.160, + "args": { + "External id": 150659,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771847544.443, "dur": 4.090, + "args": { + "External id": 150660,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6703 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847547.343, "dur": 0.870, + "args": { + "External id": 150661,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771847551.573, "dur": 28.600, + "args": { + "External id": 150662,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771847552.463, "dur": 26.880, + "args": { + "External id": 150663,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771847584.303, "dur": 13.030, + "args": { + "External id": 150664,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847604.313, "dur": 5.870, + "args": { + "External id": 150665,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6708 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847608.063, "dur": 0.830, + "args": { + "External id": 150666,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771847613.753, "dur": 33.860, + "args": { + "External id": 150667,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771847614.493, "dur": 3.470, + "args": { + "External id": 150668,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771847615.553, "dur": 1.960, + "args": { + "External id": 150669,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847616.883, "dur": 0.410, + "args": { + "External id": 150670,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, 
"tid": 5714, + "ts": 6303771847618.703, "dur": 28.380, + "args": { + "External id": 150671,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771847619.493, "dur": 26.830, + "args": { + "External id": 150672,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847656.413, "dur": 5.060, + "args": { + "External id": 150673,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847658.823, "dur": 1.500, + "args": { + "External id": 150674,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771847667.853, "dur": 1.640, + "args": { + "External id": 150675,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771847675.343, "dur": 7.290, + "args": { + 
"External id": 150676,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771847676.773, "dur": 5.550, + "args": { + "External id": 150677,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771847755.413, "dur": 144.860, + "args": { + "External id": 150678,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771847757.473, "dur": 5.070, + "args": { + "External id": 150679,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771847764.033, "dur": 135.760, + "args": { + "External id": 150680,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 
6303771847765.103, "dur": 0.200, + "args": { + "External id": 150681,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771847766.253, "dur": 20.440, + "args": { + "External id": 150682,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771847788.163, "dur": 3.270, + "args": { + "External id": 150683,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847790.243, "dur": 0.790, + "args": { + "External id": 150684,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771847792.073, "dur": 21.630, + "args": { + "External id": 150685,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771847793.013, "dur": 2.660, + "args": { + "External id": 150686,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], 
[], [], [], []], "Ev Idx": 6729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771847796.573, "dur": 16.790, + "args": { + "External id": 150687,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771847801.423, "dur": 4.020, + "args": { + "External id": 150688,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771847814.843, "dur": 15.540, + "args": { + "External id": 150689,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771847831.883, "dur": 8.799, + "args": { + "External id": 150690,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771847843.182, "dur": 10.520, + "args": { + "External id": 150691,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771847854.873, "dur": 7.360, + "args": { + "External id": 150692,"Record function id": 0, "Concrete Inputs": [""], "Input 
type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771847863.733, "dur": 16.880, + "args": { + "External id": 150693,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771847865.533, "dur": 2.549, + "args": { + "External id": 150694,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847870.042, "dur": 0.720, + "args": { + "External id": 150695,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771847883.622, "dur": 7.260, + "args": { + "External id": 150696,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771847891.873, "dur": 6.800, + "args": { + "External id": 150697,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + 
"ts": 6303771847907.613, "dur": 2.560, + "args": { + "External id": 150698,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771847918.762, "dur": 4.011, + "args": { + "External id": 150699,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771847921.273, "dur": 0.600, + "args": { + "External id": 150700,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771847994.562, "dur": 39.090, + "args": { + "External id": 150701,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771848040.302, "dur": 7.380, + "args": { + "External id": 150702,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848045.082, "dur": 1.040, + "args": 
{ + "External id": 150703,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771848048.732, "dur": 18.700, + "args": { + "External id": 150704,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771848073.762, "dur": 5.210, + "args": { + "External id": 150705,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771848075.052, "dur": 3.130, + "args": { + "External id": 150706,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848077.062, "dur": 0.770, + "args": { + "External id": 150707,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771848081.102, "dur": 29.070, + "args": { + "External id": 150708,"Record function 
id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771848081.922, "dur": 27.380, + "args": { + "External id": 150709,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771848115.322, "dur": 13.110, + "args": { + "External id": 150710,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771848132.792, "dur": 21.520, + "args": { + "External id": 150711,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771848134.472, "dur": 19.400, + "args": { + "External id": 150712,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848140.892, "dur": 0.650, + "args": { + "External id": 150713,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], 
[], [], []], "Ev Idx": 6756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771848158.652, "dur": 21.840, + "args": { + "External id": 150714,"Record function id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771848159.652, "dur": 20.590, + "args": { + "External id": 150715,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 6758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848162.622, "dur": 5.170, + "args": { + "External id": 150716,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771848168.802, "dur": 10.940, + "args": { + "External id": 150717,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771848190.352, "dur": 4.550, + "args": { + "External id": 150718,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input 
Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771848191.452, "dur": 3.160, + "args": { + "External id": 150719,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771848195.812, "dur": 0.980, + "args": { + "External id": 150720,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771848196.162, "dur": 0.480, + "args": { + "External id": 150721,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771848232.572, "dur": 18.110, + "args": { + "External id": 150722,"Sequence number": 3058974, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771848252.892, "dur": 10.600, + "args": { + "External id": 150723,"Sequence number": 3058975, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 6766 + } + }, + { + "ph": "s", "id": 9, "pid": 5714, "tid": 5714, "ts": 6303771848252.892, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771848269.272, "dur": 7.190, + "args": { + "External id": 150724,"Sequence number": 3058976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], [], []], "Input Dims": [[8, 4, 2048], [], [], [], []], "Ev Idx": 6767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848273.482, "dur": 1.340, + "args": { + "External id": 150725,"Record function id": 0, "Concrete Inputs": ["", "[8, 4, 2048]", "[4096, 1, 1]", "1"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771848278.232, "dur": 4.280, + "args": { + "External id": 150726,"Sequence number": 3058976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "3"], "Input type": ["long int", "Scalar", "Scalar"], "Input Strides": [[4096, 1, 1], [], []], "Input Dims": [[8, 4, 2048], [], []], "Ev Idx": 6769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848281.272, "dur": 0.320, + "args": { + "External id": 150727,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "4"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1, 1], [], [], []], "Input Dims": [[8, 4, 2048], [], [], []], "Ev Idx": 6770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771848283.792, "dur": 2.100, + "args": { + "External id": 150728,"Sequence number": 3058976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", 
"9223372036854775807", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[4096, 1], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], []], "Ev Idx": 6771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848285.182, "dur": 0.240, + "args": { + "External id": 150729,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048]", "[4096, 1]", "4"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[4096, 1], [], [], []], "Input Dims": [[8, 2048], [], [], []], "Ev Idx": 6772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771848291.961, "dur": 3.960, + "args": { + "External id": 150730,"Sequence number": 3058976, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6773 + } + }, + { + "ph": "s", "id": 8, "pid": 5714, "tid": 5714, "ts": 6303771848291.961, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848294.252, "dur": 0.480, + "args": { + "External id": 150731,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771848304.032, "dur": 2.920, + "args": { + "External id": 150732,"Sequence number": 3058977, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "1", "0", 
"9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], [], []], "Ev Idx": 6775 + } + }, + { + "ph": "s", "id": 7, "pid": 5714, "tid": 5714, "ts": 6303771848304.032, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848305.832, "dur": 0.349, + "args": { + "External id": 150733,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 4, 768]", "[6291456, 3072, 768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::select", "pid": 5714, "tid": 5714, + "ts": 6303771848307.941, "dur": 4.811, + "args": { + "External id": 150734,"Sequence number": 3058978, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "3"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], []], "Input Dims": [[8, 2048, 4, 768], [], []], "Ev Idx": 6777 + } + }, + { + "ph": "s", "id": 6, "pid": 5714, "tid": 5714, "ts": 6303771848307.941, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848311.321, "dur": 0.751, + "args": { + "External id": 150735,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 768, 1], [], [], []], "Input Dims": [[8, 2048, 4, 768], [], [], []], "Ev Idx": 6778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771848313.881, "dur": 3.091, + "args": { + "External id": 
150736,"Sequence number": 3058979, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "2", "0", "9223372036854775807", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], [], []], "Input Dims": [[8, 2048, 768], [], [], [], []], "Ev Idx": 6779 + } + }, + { + "ph": "s", "id": 5, "pid": 5714, "tid": 5714, "ts": 6303771848313.881, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848315.981, "dur": 0.331, + "args": { + "External id": 150737,"Record function id": 0, "Concrete Inputs": ["", "[8, 2048, 768]", "[6291456, 3072, 1]", "2304"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[6291456, 3072, 1], [], [], []], "Input Dims": [[8, 2048, 768], [], [], []], "Ev Idx": 6780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771848321.081, "dur": 32.140, + "args": { + "External id": 150738,"Sequence number": 3058980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6781 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771848321.901, "dur": 31.031, + "args": { + "External id": 150739,"Sequence number": 3058980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["long int", "Scalar"], "Input Strides": [[4096, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6782 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771848323.552, "dur": 10.200, + "args": { + "External id": 150740,"Record function id": 0, "Concrete Inputs": ["", "4", "0", "", "", "0"], "Input type": ["long int", "Scalar", "Scalar", "", "", 
"Scalar"], "Input Strides": [[4096, 1], [], [], [], [], []], "Input Dims": [[8, 2048], [], [], [], [], []], "Ev Idx": 6783 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771848325.092, "dur": 8.080, + "args": { + "External id": 150741,"Record function id": 0, "Concrete Inputs": ["[8, 2048]", "4", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6784 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771848334.712, "dur": 17.740, + "args": { + "External id": 150742,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "long int", "Scalar"], "Input Strides": [[2048, 1], [4096, 1], []], "Input Dims": [[8, 2048], [8, 2048], []], "Ev Idx": 6785 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771848376.952, "dur": 4.489, + "args": { + "External id": 150743,"Sequence number": 3058980, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1, 768]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[6291456, 3072, 1], []], "Input Dims": [[8, 2048, 768], []], "Ev Idx": 6786 + } + }, + { + "ph": "s", "id": 4, "pid": 5714, "tid": 5714, "ts": 6303771848376.952, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771848383.701, "dur": 1.040, + "args": { + "External id": 150744,"Sequence number": 3058981, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["long int", "ScalarList"], "Input Strides": [[2048, 1], []], "Input Dims": [[8, 2048], []], "Ev Idx": 6787 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "FusedLinearCrossEntropyFunction", "pid": 5714, "tid": 5714, + "ts": 6303771848403.851, 
"dur": 22037.891, + "args": { + "External id": 150745,"Sequence number": 3058981, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "", "", "-100", "0.", "1.", "8"], "Input type": ["c10::BFloat16", "long int", "c10::BFloat16", "", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[3072, 1], [1], [768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [16384], [32000, 768], [], [], [], [], []], "Ev Idx": 6788 + } + }, + { + "ph": "s", "id": 3, "pid": 5714, "tid": 5714, "ts": 6303771848403.851, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::contiguous", "pid": 5714, "tid": 5714, + "ts": 6303771848417.021, "dur": 28.130, + "args": { + "External id": 150746,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6789 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771848417.751, "dur": 27.130, + "args": { + "External id": 150747,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[3072, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6790 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771848419.271, "dur": 7.940, + "args": { + "External id": 150748,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "", "0"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[3072, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6791 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771848420.731, "dur": 5.810, + "args": { + "External id": 150749,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "15", "0", "", "", "0"], "Input type": ["ScalarList", 
"Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6792 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771848428.071, "dur": 16.310, + "args": { + "External id": 150750,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [3072, 1], []], "Input Dims": [[16384, 768], [16384, 768], []], "Ev Idx": 6793 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771848458.121, "dur": 22.750, + "args": { + "External id": 150751,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", ""], "Input type": ["c10::BFloat16", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6794 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771848458.981, "dur": 8.410, + "args": { + "External id": 150752,"Record function id": 0, "Concrete Inputs": ["", "15", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], [], []], "Ev Idx": 6795 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848461.861, "dur": 5.140, + "args": { + "External id": 150753,"Record function id": 0, "Concrete Inputs": ["[16384, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6796 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771848468.281, "dur": 12.360, + "args": { + 
"External id": 150754,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 6797 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771848469.571, "dur": 10.320, + "args": { + "External id": 150755,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[16384, 768], []], "Ev Idx": 6798 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros_like", "pid": 5714, "tid": 5714, + "ts": 6303771848484.531, "dur": 15.840, + "args": { + "External id": 150756,"Record function id": 0, "Concrete Inputs": ["", "6", "", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6799 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771848485.261, "dur": 5.230, + "args": { + "External id": 150757,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "False", ""], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 6800 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848486.501, "dur": 3.670, + "args": { + "External id": 150758,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6801 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771848490.981, "dur": 9.190, + "args": { + 
"External id": 150759,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6802 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771848491.771, "dur": 7.720, + "args": { + "External id": 150760,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[768, 1], []], "Input Dims": [[32000, 768], []], "Ev Idx": 6803 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771848505.431, "dur": 17.540, + "args": { + "External id": 150761,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False"], "Input type": ["ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 6804 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771848508.041, "dur": 3.970, + "args": { + "External id": 150762,"Record function id": 0, "Concrete Inputs": ["[16384]", "6", "", "", "False", ""], "Input type": ["ScalarList", "Scalar", "", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6805 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771848512.631, "dur": 10.070, + "args": { + "External id": 150763,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[16384]], "Ev Idx": 6806 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771848514.881, "dur": 7.120, + "args": { + "External id": 150764,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6807 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6303771848526.491, "dur": 17.270, + "args": { + "External id": 150765,"Record function id": 0, "Concrete Inputs": ["", "-100"], "Input type": ["long int", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6808 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771848546.361, "dur": 40.720, + "args": { + "External id": 150766,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["bool", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 6809 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771848547.961, "dur": 38.580, + "args": { + "External id": 150767,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["bool", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6810 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848553.101, "dur": 0.940, + "args": { + "External id": 150768,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["long int", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 6811 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771848555.011, "dur": 20.710, + "args": { + "External id": 150769,"Record function id": 0, "Concrete Inputs": ["", "4", "False", "False", ""], "Input type": ["bool", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6812 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771848556.071, "dur": 19.450, + "args": { + "External id": 150770,"Record function id": 0, "Concrete Inputs": 
["", "4", "", "", "", "False", ""], "Input type": ["bool", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[1], [], [], [], [], [], []], "Input Dims": [[16384], [], [], [], [], [], []], "Ev Idx": 6813 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771848559.301, "dur": 3.910, + "args": { + "External id": 150771,"Record function id": 0, "Concrete Inputs": ["[16384]", "[1]", "4", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6814 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771848564.081, "dur": 11.060, + "args": { + "External id": 150772,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["long int", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[16384], [16384], []], "Ev Idx": 6815 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6303771848591.541, "dur": 16920.762, + "args": { + "External id": 150773,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6816 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6303771848592.891, "dur": 16918.062, + "args": { + "External id": 150774,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["long int"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 6817 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771865525.333, "dur": 8.550, + "args": { + "External id": 150775,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], 
[]], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6818 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771865530.083, "dur": 1.830, + "args": { + "External id": 150776,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6819 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771865538.313, "dur": 54.890, + "args": { + "External id": 150777,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6820 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771865539.253, "dur": 5.770, + "args": { + "External id": 150778,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6821 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771865540.823, "dur": 3.140, + "args": { + "External id": 150779,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6822 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771865542.533, "dur": 0.950, + "args": { + "External id": 150780,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6823 + } 
+ }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771865547.213, "dur": 45.120, + "args": { + "External id": 150781,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6824 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771865548.443, "dur": 42.810, + "args": { + "External id": 150782,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6825 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771865598.413, "dur": 4.480, + "args": { + "External id": 150783,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6826 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771865601.043, "dur": 0.580, + "args": { + "External id": 150784,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6827 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771865610.023, "dur": 2.130, + "args": { + "External id": 150785,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6828 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 
5714, + "ts": 6303771865623.033, "dur": 16.670, + "args": { + "External id": 150786,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6829 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771865632.863, "dur": 6.480, + "args": { + "External id": 150787,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6830 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771865728.043, "dur": 167.250, + "args": { + "External id": 150788,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6831 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771865730.353, "dur": 5.820, + "args": { + "External id": 150789,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6832 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771865737.843, "dur": 156.890, + "args": { + "External id": 150790,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6833 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771865739.193, "dur": 0.220, + "args": { + "External id": 150791,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6834 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771865740.453, "dur": 24.410, + "args": { + "External id": 150792,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6835 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771865766.673, "dur": 4.750, + "args": { + "External id": 150793,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6836 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771865769.963, "dur": 1.010, + "args": { + "External id": 150794,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6837 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771865772.233, "dur": 21.750, + "args": { + "External id": 150795,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6838 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771865774.313, "dur": 2.770, + "args": { + "External id": 150796,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], 
"Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6839 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771865778.213, "dur": 15.440, + "args": { + "External id": 150797,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6840 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771865781.323, "dur": 3.540, + "args": { + "External id": 150798,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6841 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771865795.293, "dur": 18.230, + "args": { + "External id": 150799,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6842 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771865815.303, "dur": 10.350, + "args": { + "External id": 150800,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6843 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771865828.713, "dur": 12.060, + "args": { + "External id": 150801,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6844 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771865842.122, "dur": 8.831, + "args": { + "External 
id": 150802,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6845 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771865852.642, "dur": 21.580, + "args": { + "External id": 150803,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6846 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771865854.753, "dur": 2.640, + "args": { + "External id": 150804,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6847 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771865860.893, "dur": 1.969, + "args": { + "External id": 150805,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6848 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771865876.002, "dur": 8.220, + "args": { + "External id": 150806,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6849 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771865885.482, "dur": 7.831, + "args": { + "External id": 150807,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6850 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771865903.913, "dur": 3.189, + "args": { + "External id": 150808,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6851 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771865915.093, "dur": 4.520, + "args": { + "External id": 150809,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6852 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771865917.782, "dur": 0.640, + "args": { + "External id": 150810,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "0"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6853 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771865991.352, "dur": 44.820, + "args": { + "External id": 150811,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6854 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866045.352, "dur": 8.070, + "args": { + "External id": 150812,"Record function id": 0, "Concrete Inputs": ["", "0", "0", "2048", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6855 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
5714, + "ts": 6303771866049.622, "dur": 1.030, + "args": { + "External id": 150813,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "0"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6856 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771866054.682, "dur": 22.830, + "args": { + "External id": 150814,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6857 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771866084.922, "dur": 5.660, + "args": { + "External id": 150815,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6858 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771866086.332, "dur": 3.290, + "args": { + "External id": 150816,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6859 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866088.362, "dur": 0.900, + "args": { + "External id": 150817,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6860 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771866094.252, "dur": 42.770, + 
"args": { + "External id": 150818,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6861 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771866095.222, "dur": 40.840, + "args": { + "External id": 150819,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6862 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771866141.802, "dur": 14.760, + "args": { + "External id": 150820,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6863 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866164.702, "dur": 5.440, + "args": { + "External id": 150821,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6864 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866167.772, "dur": 0.890, + "args": { + "External id": 150822,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6865 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771866174.152, "dur": 42.100, + "args": { + 
"External id": 150823,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6866 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771866174.962, "dur": 6.980, + "args": { + "External id": 150824,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6867 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771866177.632, "dur": 3.690, + "args": { + "External id": 150825,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6868 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866180.392, "dur": 0.640, + "args": { + "External id": 150826,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6869 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771866182.852, "dur": 32.840, + "args": { + "External id": 150827,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6870 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771866183.772, "dur": 31.010, + "args": { + "External id": 150828,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], 
"Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6871 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866222.232, "dur": 5.040, + "args": { + "External id": 150829,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6872 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866224.602, "dur": 1.500, + "args": { + "External id": 150830,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6873 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771866235.982, "dur": 1.810, + "args": { + "External id": 150831,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6874 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771866244.692, "dur": 8.120, + "args": { + "External id": 150832,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6875 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771866246.362, "dur": 6.110, + "args": { + "External id": 150833,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": 
["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6876 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771866346.492, "dur": 165.499, + "args": { + "External id": 150834,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6877 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771866349.292, "dur": 5.140, + "args": { + "External id": 150835,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6878 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771866355.921, "dur": 155.580, + "args": { + "External id": 150836,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6879 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771866358.241, "dur": 0.291, + "args": { + "External id": 150837,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6880 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771866360.692, "dur": 22.709, + "args": { + "External id": 150838,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], 
"Ev Idx": 6881 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771866385.041, "dur": 3.750, + "args": { + "External id": 150839,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6882 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866387.101, "dur": 1.050, + "args": { + "External id": 150840,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6883 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771866389.661, "dur": 22.090, + "args": { + "External id": 150841,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6884 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771866391.741, "dur": 2.870, + "args": { + "External id": 150842,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6885 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771866395.641, "dur": 15.760, + "args": { + "External id": 150843,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6886 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771866398.711, "dur": 3.720, + "args": { + "External id": 150844,"Record function id": 0, "Concrete 
Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6887 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771866412.961, "dur": 18.820, + "args": { + "External id": 150845,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6888 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771866433.521, "dur": 10.200, + "args": { + "External id": 150846,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6889 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771866446.831, "dur": 12.160, + "args": { + "External id": 150847,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6890 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771866461.521, "dur": 8.410, + "args": { + "External id": 150848,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6891 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771866471.711, "dur": 18.810, + "args": { + "External id": 150849,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6892 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", 
"pid": 5714, "tid": 5714, + "ts": 6303771866473.801, "dur": 2.670, + "args": { + "External id": 150850,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6893 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866478.581, "dur": 0.820, + "args": { + "External id": 150851,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6894 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771866492.261, "dur": 8.890, + "args": { + "External id": 150852,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6895 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771866502.251, "dur": 7.880, + "args": { + "External id": 150853,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6896 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771866520.921, "dur": 3.170, + "args": { + "External id": 150854,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6897 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866535.061, "dur": 4.330, + "args": { + "External id": 150855,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input 
Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6898 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866537.771, "dur": 0.620, + "args": { + "External id": 150856,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "2048"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6899 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771866611.491, "dur": 44.140, + "args": { + "External id": 150857,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6900 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866663.321, "dur": 8.240, + "args": { + "External id": 150858,"Record function id": 0, "Concrete Inputs": ["", "0", "2048", "4096", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6901 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866667.611, "dur": 2.080, + "args": { + "External id": 150859,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "1572864"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6902 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771866672.871, "dur": 20.580, + "args": { + "External id": 150860,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", 
"Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6903 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771866700.721, "dur": 6.720, + "args": { + "External id": 150861,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6904 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771866703.431, "dur": 3.180, + "args": { + "External id": 150862,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6905 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866705.321, "dur": 0.950, + "args": { + "External id": 150863,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6906 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771866709.861, "dur": 32.499, + "args": { + "External id": 150864,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6907 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771866710.831, "dur": 30.680, + "args": { + "External id": 150865,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6908 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771866747.131, "dur": 14.689, + "args": { + "External id": 150866,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6909 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866769.971, "dur": 6.569, + "args": { + "External id": 150867,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6910 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866774.251, "dur": 0.889, + "args": { + "External id": 150868,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6911 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771866780.531, "dur": 38.220, + "args": { + "External id": 150869,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6912 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771866781.340, "dur": 4.020, + "args": { + "External id": 150870,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6913 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771866782.540, "dur": 2.271, + "args": { + "External id": 150871,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6914 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866784.011, "dur": 0.489, + "args": { + "External id": 150872,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6915 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771866786.171, "dur": 32.000, + "args": { + "External id": 150873,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6916 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771866787.100, "dur": 30.311, + "args": { + "External id": 150874,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6917 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771866825.331, "dur": 4.429, + "args": { + "External id": 150875,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6918 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771866827.911, "dur": 0.589, + "args": { + "External id": 150876,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6919 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771866838.150, "dur": 1.770, + "args": { + "External id": 150877,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6920 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771866846.700, "dur": 9.380, + "args": { + "External id": 150878,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6921 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771866848.310, "dur": 7.400, + "args": { + "External id": 150879,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6922 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771866938.230, "dur": 165.850, + "args": { + "External id": 150880,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6923 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 
5714, + "ts": 6303771866941.860, "dur": 4.560, + "args": { + "External id": 150881,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6924 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771866949.310, "dur": 154.240, + "args": { + "External id": 150882,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6925 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771866951.720, "dur": 0.290, + "args": { + "External id": 150883,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6926 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771866953.140, "dur": 23.270, + "args": { + "External id": 150884,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6927 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771866978.110, "dur": 3.780, + "args": { + "External id": 150885,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6928 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771866980.490, "dur": 0.900, + "args": { + "External id": 150886,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input 
type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6929 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771866982.600, "dur": 21.930, + "args": { + "External id": 150887,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6930 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771866985.040, "dur": 2.910, + "args": { + "External id": 150888,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6931 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771866989.000, "dur": 15.180, + "args": { + "External id": 150889,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6932 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771866991.920, "dur": 3.110, + "args": { + "External id": 150890,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6933 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771867005.730, "dur": 18.380, + "args": { + "External id": 150891,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6934 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771867025.740, "dur": 
9.950, + "args": { + "External id": 150892,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6935 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771867040.470, "dur": 11.630, + "args": { + "External id": 150893,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 6936 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771867053.460, "dur": 8.450, + "args": { + "External id": 150894,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6937 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771867063.700, "dur": 18.960, + "args": { + "External id": 150895,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6938 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771867065.720, "dur": 2.910, + "args": { + "External id": 150896,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6939 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867070.880, "dur": 0.840, + "args": { + "External id": 150897,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": 
[[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6940 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771867084.340, "dur": 8.100, + "args": { + "External id": 150898,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6941 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771867094.680, "dur": 7.520, + "args": { + "External id": 150899,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6942 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771867112.530, "dur": 3.170, + "args": { + "External id": 150900,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6943 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867125.810, "dur": 4.670, + "args": { + "External id": 150901,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6944 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867128.730, "dur": 0.680, + "args": { + "External id": 150902,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "4096"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6945 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771867200.879, "dur": 
45.560, + "args": { + "External id": 150903,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6946 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867254.030, "dur": 7.109, + "args": { + "External id": 150904,"Record function id": 0, "Concrete Inputs": ["", "0", "4096", "6144", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6947 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867258.190, "dur": 1.169, + "args": { + "External id": 150905,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "3145728"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6948 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771867262.350, "dur": 20.089, + "args": { + "External id": 150906,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6949 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771867289.519, "dur": 15.110, + "args": { + "External id": 150907,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6950 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771867292.489, "dur": 3.240, + "args": { + "External id": 
150908,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6951 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867294.379, "dur": 0.930, + "args": { + "External id": 150909,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 6952 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771867307.289, "dur": 35.290, + "args": { + "External id": 150910,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6953 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771867308.279, "dur": 33.320, + "args": { + "External id": 150911,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6954 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771867347.419, "dur": 15.050, + "args": { + "External id": 150912,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 6955 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867370.919, "dur": 6.890, + "args": { + "External id": 150913,"Record function id": 0, "Concrete Inputs": ["", "0", 
"6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6956 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867375.609, "dur": 0.980, + "args": { + "External id": 150914,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6957 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771867381.819, "dur": 40.540, + "args": { + "External id": 150915,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 6958 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771867382.739, "dur": 5.360, + "args": { + "External id": 150916,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 6959 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771867383.939, "dur": 3.610, + "args": { + "External id": 150917,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 6960 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867385.479, "dur": 1.710, + "args": { + "External id": 150918,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": 
["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 6961 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771867388.959, "dur": 32.870, + "args": { + "External id": 150919,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6962 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771867389.899, "dur": 31.170, + "args": { + "External id": 150920,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 6963 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867429.229, "dur": 4.160, + "args": { + "External id": 150921,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6964 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867431.669, "dur": 0.510, + "args": { + "External id": 150922,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6965 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771867441.099, "dur": 1.900, + "args": { + "External id": 150923,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": 
[[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 6966 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771867449.689, "dur": 8.400, + "args": { + "External id": 150924,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 6967 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771867451.329, "dur": 6.410, + "args": { + "External id": 150925,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6968 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771867542.339, "dur": 160.799, + "args": { + "External id": 150926,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6969 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771867544.879, "dur": 5.540, + "args": { + "External id": 150927,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6970 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771867551.979, "dur": 150.590, + "args": { + "External id": 150928,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", 
"ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 6971 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771867553.229, "dur": 0.210, + "args": { + "External id": 150929,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6972 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771867555.549, "dur": 22.180, + "args": { + "External id": 150930,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 6973 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771867579.399, "dur": 3.710, + "args": { + "External id": 150931,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 6974 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867581.699, "dur": 0.930, + "args": { + "External id": 150932,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 6975 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771867583.969, "dur": 23.730, + "args": { + "External id": 150933,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6976 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771867585.079, "dur": 3.980, + "args": 
{ + "External id": 150934,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 6977 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771867591.459, "dur": 15.850, + "args": { + "External id": 150935,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 6978 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771867594.479, "dur": 3.630, + "args": { + "External id": 150936,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6979 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771867608.969, "dur": 17.040, + "args": { + "External id": 150937,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6980 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771867627.719, "dur": 9.990, + "args": { + "External id": 150938,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6981 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771867640.449, "dur": 11.629, + "args": { + "External id": 150939,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 
6982 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771867653.438, "dur": 8.220, + "args": { + "External id": 150940,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 6983 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771867663.269, "dur": 18.249, + "args": { + "External id": 150941,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 6984 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771867665.358, "dur": 2.460, + "args": { + "External id": 150942,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 6985 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867669.909, "dur": 0.840, + "args": { + "External id": 150943,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 6986 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771867684.318, "dur": 8.111, + "args": { + "External id": 150944,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 6987 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771867693.518, "dur": 7.771, + "args": { + "External id": 150945,"Record function id": 0, "Concrete Inputs": ["", 
"", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 6988 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771867711.589, "dur": 3.000, + "args": { + "External id": 150946,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 6989 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867724.218, "dur": 4.511, + "args": { + "External id": 150947,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 6990 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867727.078, "dur": 0.711, + "args": { + "External id": 150948,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "6144"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 6991 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771867797.728, "dur": 44.250, + "args": { + "External id": 150949,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 6992 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867849.628, "dur": 9.200, + "args": { + "External id": 150950,"Record function id": 0, "Concrete Inputs": ["", "0", "6144", "8192", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], 
[], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 6993 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867854.828, "dur": 2.060, + "args": { + "External id": 150951,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "4718592"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 6994 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771867860.118, "dur": 20.040, + "args": { + "External id": 150952,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 6995 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771867887.248, "dur": 5.800, + "args": { + "External id": 150953,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 6996 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771867888.808, "dur": 3.370, + "args": { + "External id": 150954,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 6997 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867890.908, "dur": 0.900, + "args": { + "External id": 150955,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 
32000], [], [], []], "Ev Idx": 6998 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771867895.468, "dur": 33.110, + "args": { + "External id": 150956,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 6999 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771867896.488, "dur": 31.210, + "args": { + "External id": 150957,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7000 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771867934.468, "dur": 14.360, + "args": { + "External id": 150958,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7001 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771867956.808, "dur": 5.460, + "args": { + "External id": 150959,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7002 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867959.908, "dur": 0.950, + "args": { + "External id": 150960,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], 
[], []], "Ev Idx": 7003 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771867966.258, "dur": 39.590, + "args": { + "External id": 150961,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7004 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771867967.048, "dur": 5.680, + "args": { + "External id": 150962,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7005 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771867968.288, "dur": 3.910, + "args": { + "External id": 150963,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7006 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771867971.458, "dur": 0.480, + "args": { + "External id": 150964,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7007 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771867973.588, "dur": 31.730, + "args": { + "External id": 150965,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7008 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 
6303771867974.498, "dur": 30.000, + "args": { + "External id": 150966,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7009 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771868012.528, "dur": 4.270, + "args": { + "External id": 150967,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7010 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868014.998, "dur": 0.550, + "args": { + "External id": 150968,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7011 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771868024.108, "dur": 1.880, + "args": { + "External id": 150969,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7012 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771868032.288, "dur": 10.740, + "args": { + "External id": 150970,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7013 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 
6303771868034.928, "dur": 7.740, + "args": { + "External id": 150971,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7014 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771868123.877, "dur": 159.860, + "args": { + "External id": 150972,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7015 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771868126.068, "dur": 4.640, + "args": { + "External id": 150973,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7016 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771868132.137, "dur": 151.080, + "args": { + "External id": 150974,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7017 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771868133.517, "dur": 0.200, + "args": { + "External id": 150975,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7018 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771868134.757, "dur": 23.291, + "args": { + "External id": 150976,"Record function 
id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7019 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771868160.737, "dur": 3.580, + "args": { + "External id": 150977,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7020 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868162.897, "dur": 0.960, + "args": { + "External id": 150978,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7021 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771868165.137, "dur": 22.690, + "args": { + "External id": 150979,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7022 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771868167.557, "dur": 2.791, + "args": { + "External id": 150980,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7023 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771868171.457, "dur": 15.970, + "args": { + "External id": 150981,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7024 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771868175.437, "dur": 3.131, + "args": { + "External id": 150982,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7025 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771868189.177, "dur": 17.520, + "args": { + "External id": 150983,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7026 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771868208.417, "dur": 9.840, + "args": { + "External id": 150984,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7027 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771868221.087, "dur": 10.980, + "args": { + "External id": 150985,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7028 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771868233.427, "dur": 8.260, + "args": { + "External id": 150986,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7029 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771868243.427, "dur": 18.800, + "args": { + "External id": 150987,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], 
"Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7030 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771868245.517, "dur": 2.820, + "args": { + "External id": 150988,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7031 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868250.427, "dur": 0.880, + "args": { + "External id": 150989,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7032 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771868264.997, "dur": 8.140, + "args": { + "External id": 150990,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7033 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771868274.237, "dur": 7.540, + "args": { + "External id": 150991,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7034 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771868292.167, "dur": 3.040, + "args": { + "External id": 150992,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7035 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771868320.557, "dur": 4.940, + "args": { + 
"External id": 150993,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7036 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868323.507, "dur": 0.800, + "args": { + "External id": 150994,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "8192"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7037 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771868400.757, "dur": 44.280, + "args": { + "External id": 150995,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7038 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771868452.387, "dur": 6.820, + "args": { + "External id": 150996,"Record function id": 0, "Concrete Inputs": ["", "0", "8192", "10240", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7039 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868456.407, "dur": 1.060, + "args": { + "External id": 150997,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "6291456"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7040 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 
6303771868461.597, "dur": 20.260, + "args": { + "External id": 150998,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7041 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771868489.107, "dur": 7.540, + "args": { + "External id": 150999,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7042 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771868491.667, "dur": 4.150, + "args": { + "External id": 151000,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7043 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868494.507, "dur": 0.900, + "args": { + "External id": 151001,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7044 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771868499.077, "dur": 32.730, + "args": { + "External id": 151002,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7045 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771868500.067, "dur": 30.880, + "args": { + "External id": 151003,"Record function id": 0, 
"Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7046 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771868536.456, "dur": 14.460, + "args": { + "External id": 151004,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7047 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771868560.167, "dur": 5.300, + "args": { + "External id": 151005,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7048 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868563.196, "dur": 0.911, + "args": { + "External id": 151006,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7049 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771868569.376, "dur": 39.580, + "args": { + "External id": 151007,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7050 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771868570.167, "dur": 3.929, + "args": { + "External id": 151008,"Record function id": 0, 
"Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7051 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771868571.327, "dur": 2.260, + "args": { + "External id": 151009,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7052 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868572.847, "dur": 0.480, + "args": { + "External id": 151010,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7053 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771868576.067, "dur": 32.329, + "args": { + "External id": 151011,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7054 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771868576.907, "dur": 30.740, + "args": { + "External id": 151012,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7055 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771868614.707, "dur": 4.269, + "args": { + "External id": 151013,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": 
[[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7056 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868617.227, "dur": 0.620, + "args": { + "External id": 151014,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7057 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771868626.216, "dur": 1.891, + "args": { + "External id": 151015,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7058 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771868634.676, "dur": 9.360, + "args": { + "External id": 151016,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7059 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771868637.406, "dur": 6.330, + "args": { + "External id": 151017,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7060 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771868724.266, "dur": 161.650, + "args": { + "External id": 151018,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", 
"Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7061 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771868727.746, "dur": 5.710, + "args": { + "External id": 151019,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7062 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771868734.976, "dur": 150.420, + "args": { + "External id": 151020,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7063 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771868736.246, "dur": 0.210, + "args": { + "External id": 151021,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7064 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771868737.526, "dur": 21.960, + "args": { + "External id": 151022,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7065 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771868761.206, "dur": 4.610, + "args": { + "External id": 151023,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7066 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", 
"pid": 5714, "tid": 5714, + "ts": 6303771868764.246, "dur": 1.010, + "args": { + "External id": 151024,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7067 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771868767.596, "dur": 20.650, + "args": { + "External id": 151025,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7068 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771868768.676, "dur": 2.790, + "args": { + "External id": 151026,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7069 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771868772.556, "dur": 15.350, + "args": { + "External id": 151027,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7070 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771868775.486, "dur": 3.420, + "args": { + "External id": 151028,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7071 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771868789.536, "dur": 16.900, + "args": { + "External id": 151029,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], 
[]], "Input Dims": [[2048], []], "Ev Idx": 7072 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771868808.046, "dur": 10.090, + "args": { + "External id": 151030,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7073 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771868820.806, "dur": 11.450, + "args": { + "External id": 151031,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7074 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771868833.616, "dur": 8.240, + "args": { + "External id": 151032,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7075 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771868843.566, "dur": 21.400, + "args": { + "External id": 151033,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7076 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771868848.066, "dur": 2.560, + "args": { + "External id": 151034,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7077 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868852.976, "dur": 0.860, + "args": 
{ + "External id": 151035,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7078 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771868866.666, "dur": 8.370, + "args": { + "External id": 151036,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7079 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771868876.116, "dur": 7.960, + "args": { + "External id": 151037,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7080 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771868894.436, "dur": 3.090, + "args": { + "External id": 151038,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7081 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771868907.446, "dur": 4.630, + "args": { + "External id": 151039,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7082 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771868910.296, "dur": 0.680, + "args": { + "External id": 151040,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "10240"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], 
[]], "Input Dims": [[16384], [], [], []], "Ev Idx": 7083 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771868996.206, "dur": 44.760, + "args": { + "External id": 151041,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7084 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869048.746, "dur": 8.289, + "args": { + "External id": 151042,"Record function id": 0, "Concrete Inputs": ["", "0", "10240", "12288", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7085 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869053.855, "dur": 1.200, + "args": { + "External id": 151043,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "7864320"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7086 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771869058.326, "dur": 20.100, + "args": { + "External id": 151044,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7087 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771869085.535, "dur": 6.610, + "args": { + "External id": 151045,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev 
Idx": 7088 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771869086.975, "dur": 4.370, + "args": { + "External id": 151046,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7089 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869090.055, "dur": 0.960, + "args": { + "External id": 151047,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7090 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771869094.575, "dur": 33.390, + "args": { + "External id": 151048,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7091 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771869095.515, "dur": 31.440, + "args": { + "External id": 151049,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7092 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771869133.015, "dur": 14.540, + "args": { + "External id": 151050,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7093 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869155.655, "dur": 5.370, + "args": { + "External id": 151051,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7094 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869158.685, "dur": 0.990, + "args": { + "External id": 151052,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7095 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771869164.895, "dur": 38.990, + "args": { + "External id": 151053,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7096 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771869166.825, "dur": 3.910, + "args": { + "External id": 151054,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7097 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771869167.925, "dur": 2.280, + "args": { + "External id": 151055,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7098 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 
6303771869169.445, "dur": 0.480, + "args": { + "External id": 151056,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7099 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771869171.595, "dur": 31.740, + "args": { + "External id": 151057,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7100 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771869172.475, "dur": 30.070, + "args": { + "External id": 151058,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7101 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869210.345, "dur": 5.540, + "args": { + "External id": 151059,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7102 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869212.865, "dur": 1.730, + "args": { + "External id": 151060,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7103 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771869223.235, "dur": 
1.850, + "args": { + "External id": 151061,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7104 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771869232.565, "dur": 8.250, + "args": { + "External id": 151062,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], [], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7105 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771869234.115, "dur": 6.350, + "args": { + "External id": 151063,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7106 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771869332.775, "dur": 161.899, + "args": { + "External id": 151064,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7107 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771869335.075, "dur": 5.240, + "args": { + "External id": 151065,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7108 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 
6303771869342.085, "dur": 152.040, + "args": { + "External id": 151066,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7109 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771869343.295, "dur": 0.230, + "args": { + "External id": 151067,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7110 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771869345.445, "dur": 24.530, + "args": { + "External id": 151068,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7111 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771869371.795, "dur": 3.760, + "args": { + "External id": 151069,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7112 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869373.905, "dur": 1.050, + "args": { + "External id": 151070,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7113 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771869376.285, "dur": 21.320, + "args": { + "External id": 151071,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": 
[[2048]], "Ev Idx": 7114 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771869377.345, "dur": 2.810, + "args": { + "External id": 151072,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7115 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771869381.175, "dur": 16.110, + "args": { + "External id": 151073,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7116 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771869385.275, "dur": 3.100, + "args": { + "External id": 151074,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7117 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771869398.955, "dur": 18.480, + "args": { + "External id": 151075,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7118 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771869419.075, "dur": 9.880, + "args": { + "External id": 151076,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7119 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771869431.725, "dur": 11.489, + "args": { + "External id": 151077,"Record function id": 0, 
"Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7120 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771869444.505, "dur": 8.300, + "args": { + "External id": 151078,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7121 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771869455.734, "dur": 18.811, + "args": { + "External id": 151079,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7122 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771869457.865, "dur": 2.729, + "args": { + "External id": 151080,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7123 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869462.645, "dur": 0.889, + "args": { + "External id": 151081,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7124 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771869476.285, "dur": 8.020, + "args": { + "External id": 151082,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7125 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771869485.425, "dur": 7.480, + "args": { + "External id": 151083,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7126 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771869503.285, "dur": 3.009, + "args": { + "External id": 151084,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7127 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869517.305, "dur": 4.520, + "args": { + "External id": 151085,"Record function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7128 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869520.174, "dur": 0.660, + "args": { + "External id": 151086,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "12288"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7129 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771869592.414, "dur": 44.320, + "args": { + "External id": 151087,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7130 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869644.404, "dur": 7.990, + "args": { + "External id": 151088,"Record 
function id": 0, "Concrete Inputs": ["", "0", "12288", "14336", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7131 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869648.454, "dur": 1.990, + "args": { + "External id": 151089,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "9437184"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7132 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771869653.674, "dur": 20.010, + "args": { + "External id": 151090,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7133 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771869680.824, "dur": 6.840, + "args": { + "External id": 151091,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7134 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771869682.234, "dur": 4.630, + "args": { + "External id": 151092,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7135 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869685.504, "dur": 1.020, + "args": { + "External id": 151093,"Record function id": 0, "Concrete Inputs": 
["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7136 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771869690.124, "dur": 33.370, + "args": { + "External id": 151094,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7137 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771869691.084, "dur": 31.460, + "args": { + "External id": 151095,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7138 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771869728.424, "dur": 14.440, + "args": { + "External id": 151096,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7139 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869750.484, "dur": 5.350, + "args": { + "External id": 151097,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7140 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869753.574, "dur": 0.900, + "args": { + "External id": 151098,"Record function id": 0, "Concrete Inputs": ["", "[2048, 
768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7141 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linear", "pid": 5714, "tid": 5714, + "ts": 6303771869759.724, "dur": 40.000, + "args": { + "External id": 151099,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["c10::BFloat16", "c10::BFloat16", ""], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [32000, 768], []], "Ev Idx": 7142 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771869761.724, "dur": 4.020, + "args": { + "External id": 151100,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7143 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771869762.904, "dur": 2.330, + "args": { + "External id": 151101,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], []], "Input Dims": [[32000, 768], [], []], "Ev Idx": 7144 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869764.434, "dur": 0.520, + "args": { + "External id": 151102,"Record function id": 0, "Concrete Inputs": ["", "[768, 32000]", "[1, 768]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[768, 1], [], [], []], "Input Dims": [[32000, 768], [], [], []], "Ev Idx": 7145 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771869766.564, "dur": 32.600, + "args": { + "External id": 151103,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": 
[[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7146 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771869767.434, "dur": 30.870, + "args": { + "External id": 151104,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[768, 1], [1, 768]], "Input Dims": [[2048, 768], [768, 32000]], "Ev Idx": 7147 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771869805.634, "dur": 4.480, + "args": { + "External id": 151105,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["long int", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7148 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869808.224, "dur": 0.590, + "args": { + "External id": 151106,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["long int", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7149 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771869818.664, "dur": 1.780, + "args": { + "External id": 151107,"Record function id": 0, "Concrete Inputs": ["", "[-1, 32000]"], "Input type": ["c10::BFloat16", "ScalarList"], "Input Strides": [[32000, 1], []], "Input Dims": [[2048, 32000], []], "Ev Idx": 7150 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::new_empty", "pid": 5714, "tid": 5714, + "ts": 6303771869827.004, "dur": 9.210, + "args": { + "External id": 151108,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "6", "", "", "False"], "Input type": ["c10::BFloat16", "ScalarList", "Scalar", "", "", "Scalar"], "Input Strides": [[32000, 1], [], [], 
[], [], []], "Input Dims": [[2048, 32000], [], [], [], [], []], "Ev Idx": 7151 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771869828.564, "dur": 7.270, + "args": { + "External id": 151109,"Record function id": 0, "Concrete Inputs": ["[2048, 1]", "6", "0", "", "False", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "Scalar", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7152 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771869916.473, "dur": 170.360, + "args": { + "External id": 151110,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7153 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771869918.764, "dur": 5.420, + "args": { + "External id": 151111,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7154 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::logsumexp", "pid": 5714, "tid": 5714, + "ts": 6303771869926.744, "dur": 159.609, + "args": { + "External id": 151112,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", "float"], "Input Strides": [[1, 1], [], [], [1]], "Input Dims": [[2048, 1], [], [], [0]], "Ev Idx": 7155 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::real", "pid": 5714, "tid": 5714, + "ts": 6303771869928.104, "dur": 0.209, + "args": { + "External id": 151113,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7156 + } + }, 
+ { + "ph": "X", "cat": "cpu_op", "name": "aten::amax", "pid": 5714, "tid": 5714, + "ts": 6303771869929.593, "dur": 22.331, + "args": { + "External id": 151114,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "True"], "Input type": ["float", "ScalarList", "Scalar"], "Input Strides": [[1, 1], [], []], "Input Dims": [[2048, 1], [], []], "Ev Idx": 7157 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::squeeze", "pid": 5714, "tid": 5714, + "ts": 6303771869954.433, "dur": 3.611, + "args": { + "External id": 151115,"Record function id": 0, "Concrete Inputs": ["", "[-1]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1, 1], []], "Input Dims": [[2048, 1], []], "Ev Idx": 7158 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771869956.573, "dur": 0.971, + "args": { + "External id": 151116,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1, 1], [], [], []], "Input Dims": [[2048, 1], [], [], []], "Ev Idx": 7159 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771869958.784, "dur": 22.029, + "args": { + "External id": 151117,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7160 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771869959.804, "dur": 2.800, + "args": { + "External id": 151118,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7161 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303771869963.753, "dur": 16.691, + "args": { + "External id": 151119,"Record function id": 
0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[1], [1]], "Input Dims": [[2048], [0]], "Ev Idx": 7162 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771869967.744, "dur": 3.720, + "args": { + "External id": 151120,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7163 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303771869983.553, "dur": 25.110, + "args": { + "External id": 151121,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[2048], []], "Ev Idx": 7164 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::masked_fill_", "pid": 5714, "tid": 5714, + "ts": 6303771870010.333, "dur": 9.860, + "args": { + "External id": 151122,"Record function id": 0, "Concrete Inputs": ["", "", "0"], "Input type": ["float", "bool", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7165 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sub", "pid": 5714, "tid": 5714, + "ts": 6303771870023.023, "dur": 11.120, + "args": { + "External id": 151123,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1, 1], [1, 1], []], "Input Dims": [[2048, 1], [2048, 1], []], "Ev Idx": 7166 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::exp_", "pid": 5714, "tid": 5714, + "ts": 6303771870035.443, "dur": 8.230, + "args": { + "External id": 151124,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1, 1]], "Input Dims": [[2048, 1]], "Ev Idx": 7167 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771870045.373, "dur": 19.820, + 
"args": { + "External id": 151125,"Record function id": 0, "Concrete Inputs": ["", "[-1]", "False", "", ""], "Input type": ["float", "ScalarList", "Scalar", "", "float"], "Input Strides": [[1, 1], [], [], [], [1]], "Input Dims": [[2048, 1], [], [], [], [0]], "Ev Idx": 7168 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303771870047.413, "dur": 2.630, + "args": { + "External id": 151126,"Record function id": 0, "Concrete Inputs": ["", "[2048]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7169 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870053.203, "dur": 0.810, + "args": { + "External id": 151127,"Record function id": 0, "Concrete Inputs": ["", "[2048, 1]", "[1, 0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[2048], [], [], []], "Ev Idx": 7170 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::log_", "pid": 5714, "tid": 5714, + "ts": 6303771870067.883, "dur": 8.300, + "args": { + "External id": 151128,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[2048]], "Ev Idx": 7171 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771870077.223, "dur": 7.700, + "args": { + "External id": 151129,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [1], []], "Input Dims": [[2048], [2048], []], "Ev Idx": 7172 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771870095.263, "dur": 3.070, + "args": { + "External id": 151130,"Record function id": 0, "Concrete Inputs": ["", "[2048]"], "Input type": ["float", "ScalarList"], "Input Strides": [[1], []], "Input Dims": 
[[2048], []], "Ev Idx": 7173 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771870108.033, "dur": 4.400, + "args": { + "External id": 151131,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["float", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], []], "Input Dims": [[16384], [], [], [], []], "Ev Idx": 7174 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870110.773, "dur": 0.680, + "args": { + "External id": 151132,"Record function id": 0, "Concrete Inputs": ["", "[2048]", "[1]", "14336"], "Input type": ["float", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7175 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771870182.563, "dur": 45.650, + "args": { + "External id": 151133,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[32000, 1], [768, 1]], "Input Dims": [[2048, 32000], [32000, 768]], "Ev Idx": 7176 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::slice", "pid": 5714, "tid": 5714, + "ts": 6303771870236.183, "dur": 8.350, + "args": { + "External id": 151134,"Record function id": 0, "Concrete Inputs": ["", "0", "14336", "16384", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar", "Scalar", "Scalar"], "Input Strides": [[768, 1], [], [], [], []], "Input Dims": [[16384, 768], [], [], [], []], "Ev Idx": 7177 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870241.423, "dur": 1.210, + "args": { + "External id": 151135,"Record function id": 0, "Concrete Inputs": ["", "[2048, 768]", "[768, 1]", "11010048"], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", "Scalar"], "Input Strides": [[768, 1], 
[], [], []], "Input Dims": [[16384, 768], [], [], []], "Ev Idx": 7178 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771870245.703, "dur": 21.040, + "args": { + "External id": 151136,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[2048, 768], [2048, 768], []], "Ev Idx": 7179 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::t", "pid": 5714, "tid": 5714, + "ts": 6303771870273.713, "dur": 5.750, + "args": { + "External id": 151137,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[32000, 1]], "Input Dims": [[2048, 32000]], "Ev Idx": 7180 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::transpose", "pid": 5714, "tid": 5714, + "ts": 6303771870275.163, "dur": 3.370, + "args": { + "External id": 151138,"Record function id": 0, "Concrete Inputs": ["", "0", "1"], "Input type": ["c10::BFloat16", "Scalar", "Scalar"], "Input Strides": [[32000, 1], [], []], "Input Dims": [[2048, 32000], [], []], "Ev Idx": 7181 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870277.283, "dur": 0.910, + "args": { + "External id": 151139,"Record function id": 0, "Concrete Inputs": ["", "[32000, 2048]", "[1, 32000]", ""], "Input type": ["c10::BFloat16", "ScalarList", "ScalarList", ""], "Input Strides": [[32000, 1], [], [], []], "Input Dims": [[2048, 32000], [], [], []], "Ev Idx": 7182 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::matmul", "pid": 5714, "tid": 5714, + "ts": 6303771870281.903, "dur": 44.130, + "args": { + "External id": 151140,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7183 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::mm", "pid": 5714, "tid": 5714, + "ts": 6303771870284.093, "dur": 40.780, + "args": { + "External id": 151141,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["c10::BFloat16", "c10::BFloat16"], "Input Strides": [[1, 32000], [768, 1]], "Input Dims": [[32000, 2048], [2048, 768]], "Ev Idx": 7184 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771870332.332, "dur": 15.151, + "args": { + "External id": 151142,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "c10::BFloat16", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7185 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771870352.683, "dur": 25.640, + "args": { + "External id": 151143,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[1], []], "Input Dims": [[16384], []], "Ev Idx": 7186 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::sum", "pid": 5714, "tid": 5714, + "ts": 6303771870354.752, "dur": 23.080, + "args": { + "External id": 151144,"Record function id": 0, "Concrete Inputs": ["", "[]", "False", ""], "Input type": ["float", "ScalarList", "Scalar", ""], "Input Strides": [[1], [], [], []], "Input Dims": [[16384], [], [], []], "Ev Idx": 7187 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870362.972, "dur": 0.700, + "args": { + "External id": 151145,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7188 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771870383.243, "dur": 24.840, + "args": { + "External id": 151146,"Record function 
id": 0, "Concrete Inputs": ["", "", "15", "False", "False", ""], "Input type": ["float", "", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], []], "Ev Idx": 7189 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_to_copy", "pid": 5714, "tid": 5714, + "ts": 6303771870384.383, "dur": 23.449, + "args": { + "External id": 151147,"Record function id": 0, "Concrete Inputs": ["", "15", "", "", "", "False", ""], "Input type": ["float", "Scalar", "", "", "", "Scalar", ""], "Input Strides": [[768, 1], [], [], [], [], [], []], "Input Dims": [[32000, 768], [], [], [], [], [], []], "Ev Idx": 7190 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870387.803, "dur": 6.360, + "args": { + "External id": 151148,"Record function id": 0, "Concrete Inputs": ["[32000, 768]", "[768, 1]", "15", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7191 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771870395.203, "dur": 12.120, + "args": { + "External id": 151149,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["c10::BFloat16", "float", "Scalar"], "Input Strides": [[768, 1], [768, 1], []], "Input Dims": [[32000, 768], [32000, 768], []], "Ev Idx": 7192 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771870418.092, "dur": 5.011, + "args": { + "External id": 151150,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 7193 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771870420.452, "dur": 2.331, + "args": { + "External 
id": 151151,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[16384, 768]], "Ev Idx": 7194 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::detach", "pid": 5714, "tid": 5714, + "ts": 6303771870424.072, "dur": 1.171, + "args": { + "External id": 151152,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7195 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "detach", "pid": 5714, "tid": 5714, + "ts": 6303771870424.472, "dur": 0.511, + "args": { + "External id": 151153,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["c10::BFloat16"], "Input Strides": [[768, 1]], "Input Dims": [[32000, 768]], "Ev Idx": 7196 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771870456.542, "dur": 20.780, + "args": { + "External id": 151154,"Sequence number": 3058982, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7197 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add_", "pid": 5714, "tid": 5714, + "ts": 6303771870479.712, "dur": 10.720, + "args": { + "External id": 151155,"Sequence number": 3058983, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7198 + } + }, + { + "ph": "s", "id": 2, "pid": 5714, "tid": 5714, "ts": 6303771870479.712, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "user_annotation", "name": "FSDP::post_forward", "pid": 5714, "tid": 5714, + "ts": 6303771870602.842, "dur": 50.520, + "args": { + "External id": 151156,"Record function id": 0, "Ev Idx": 7199 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::div", "pid": 5714, "tid": 5714, + "ts": 6303771870771.142, "dur": 37.640, + "args": { + "External id": 151157,"Sequence number": 3058984, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7200 + } + }, + { + "ph": "s", "id": 1, "pid": 5714, "tid": 5714, "ts": 6303771870771.142, + "cat": "fwdbwd", "name": "fwdbwd" + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ones_like", "pid": 5714, "tid": 5714, + "ts": 6303771870862.891, "dur": 27.170, + "args": { + "External id": 151158,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "1"], "Input type": ["float", "", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[1], [], [], [], [], []], "Ev Idx": 7201 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771870864.291, "dur": 10.391, + "args": { + "External id": 151159,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "False", "1"], "Input type": ["float", "", "", "", "Scalar", "Scalar"], "Input Strides": [[1], [], [], [], [], []], "Input Dims": [[1], [], [], [], [], []], "Ev Idx": 7202 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_strided", "pid": 5714, "tid": 5714, + "ts": 6303771870867.082, "dur": 6.889, + "args": { + "External id": 151160,"Record function id": 0, "Concrete Inputs": ["[1]", "[1]", "6", "0", "", "False"], "Input type": ["ScalarList", "ScalarList", "Scalar", "Scalar", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7203 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771870875.782, "dur": 13.949, + "args": { + "External id": 151161,"Record function id": 0, "Concrete Inputs": ["", "1."], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input 
Dims": [[1], []], "Ev Idx": 7204 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 5714, + "ts": 6303771938551.361, "dur": 52.350, + "args": { + "External id": 151162,"Sequence number": 3058985, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7205 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::div", "pid": 5714, "tid": 5714, + "ts": 6303771938618.351, "dur": 49.710, + "args": { + "External id": 151163,"Sequence number": 3058986, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "long int"], "Input Strides": [[1], []], "Input Dims": [[1], []], "Ev Idx": 7206 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771938679.421, "dur": 30.090, + "args": { + "External id": 151164,"Sequence number": 3058987, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7207 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771939086.120, "dur": 26.300, + "args": { + "External id": 151165,"Sequence number": 3058988, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7208 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771939121.630, "dur": 19.140, + "args": { + "External id": 151166,"Sequence number": 3058989, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "long int", "Scalar"], "Input Strides": [[1], [], []], "Input Dims": [[1], [], []], "Ev Idx": 7209 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::_foreach_norm", "pid": 5714, "tid": 5714, + "ts": 6303771940081.708, "dur": 3328.472, + "args": { + "External id": 151167,"Record function id": 0, "Concrete Inputs": ["", "2.", ""], "Input type": ["TensorList", "Scalar", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7210 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_norm", "pid": 5714, "tid": 5714, + "ts": 6303771940567.047, "dur": 629.758, + "args": { + "External id": 151168,"Record function id": 0, "Concrete Inputs": ["", "2.", ""], "Input type": ["TensorList", "Scalar", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7211 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zeros", "pid": 5714, "tid": 5714, + "ts": 6303771940583.827, "dur": 56.179, + "args": { + "External id": 151169,"Record function id": 0, "Concrete Inputs": ["[12032]", "6", "0", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7212 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771940586.556, "dur": 13.120, + "args": { + "External id": 151170,"Record function id": 0, "Concrete Inputs": ["[12032]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7213 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::zero_", "pid": 5714, "tid": 5714, + "ts": 6303771940602.507, "dur": 36.759, + "args": { + "External id": 151171,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[1]], "Input Dims": [[12032]], "Ev Idx": 7214 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::fill_", "pid": 5714, "tid": 5714, + "ts": 6303771940606.207, "dur": 31.449, + "args": { + "External id": 151172,"Record function id": 0, 
"Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[1], []], "Input Dims": [[12032], []], "Ev Idx": 7215 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943458.500, "dur": 6.430, + "args": { + "External id": 151173,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7216 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943471.430, "dur": 1.290, + "args": { + "External id": 151174,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7217 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943477.020, "dur": 0.790, + "args": { + "External id": 151175,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7218 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943482.000, "dur": 0.870, + "args": { + "External id": 151176,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7219 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 
5714, "tid": 5714, + "ts": 6303771943486.370, "dur": 0.790, + "args": { + "External id": 151177,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7220 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943491.400, "dur": 0.810, + "args": { + "External id": 151178,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7221 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943495.690, "dur": 0.850, + "args": { + "External id": 151179,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7222 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943502.110, "dur": 0.770, + "args": { + "External id": 151180,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7223 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943506.480, "dur": 0.770, + "args": { + "External id": 151181,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", 
"Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7224 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943511.140, "dur": 0.750, + "args": { + "External id": 151182,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7225 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943515.530, "dur": 0.760, + "args": { + "External id": 151183,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7226 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943519.900, "dur": 0.830, + "args": { + "External id": 151184,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7227 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943524.300, "dur": 0.870, + "args": { + "External id": 151185,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7228 + } + }, + { + "ph": "X", "cat": "cpu_op", 
"name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943528.800, "dur": 0.770, + "args": { + "External id": 151186,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7229 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943533.000, "dur": 0.780, + "args": { + "External id": 151187,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7230 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943538.880, "dur": 0.800, + "args": { + "External id": 151188,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7231 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943543.130, "dur": 0.740, + "args": { + "External id": 151189,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7232 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943547.670, "dur": 0.740, + "args": { + "External id": 151190,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", 
""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7233 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943551.920, "dur": 0.760, + "args": { + "External id": 151191,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7234 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943556.180, "dur": 0.760, + "args": { + "External id": 151192,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7235 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943560.540, "dur": 0.780, + "args": { + "External id": 151193,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7236 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943564.830, "dur": 0.750, + "args": { + "External id": 151194,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7237 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943569.060, "dur": 0.780, + "args": { + "External id": 151195,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7238 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943574.990, "dur": 0.770, + "args": { + "External id": 151196,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7239 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943579.260, "dur": 0.760, + "args": { + "External id": 151197,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7240 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943583.860, "dur": 0.760, + "args": { + "External id": 151198,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7241 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943588.340, "dur": 0.750, + "args": { + "External id": 151199,"Record function id": 0, "Concrete Inputs": ["", "6", "0", 
"", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7242 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943592.730, "dur": 0.750, + "args": { + "External id": 151200,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7243 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943596.980, "dur": 0.760, + "args": { + "External id": 151201,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7244 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943601.280, "dur": 0.740, + "args": { + "External id": 151202,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7245 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943605.450, "dur": 0.830, + "args": { + "External id": 151203,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev 
Idx": 7246 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943611.590, "dur": 0.810, + "args": { + "External id": 151204,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7247 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943615.980, "dur": 0.780, + "args": { + "External id": 151205,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7248 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943620.570, "dur": 0.800, + "args": { + "External id": 151206,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7249 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943624.990, "dur": 0.800, + "args": { + "External id": 151207,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7250 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943629.490, "dur": 0.800, + "args": { + "External id": 151208,"Record function id": 0, "Concrete 
Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7251 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943633.880, "dur": 0.760, + "args": { + "External id": 151209,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7252 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943638.120, "dur": 0.780, + "args": { + "External id": 151210,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7253 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943642.500, "dur": 0.750, + "args": { + "External id": 151211,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7254 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943648.540, "dur": 0.810, + "args": { + "External id": 151212,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], 
[], [], [], [], []], "Ev Idx": 7255 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943652.840, "dur": 0.790, + "args": { + "External id": 151213,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7256 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943657.540, "dur": 0.780, + "args": { + "External id": 151214,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7257 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943661.840, "dur": 0.769, + "args": { + "External id": 151215,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7258 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943666.089, "dur": 0.791, + "args": { + "External id": 151216,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7259 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943670.389, "dur": 0.791, + "args": { + "External id": 151217,"Record 
function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7260 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943674.749, "dur": 0.771, + "args": { + "External id": 151218,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7261 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943679.080, "dur": 0.829, + "args": { + "External id": 151219,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7262 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943685.289, "dur": 0.800, + "args": { + "External id": 151220,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7263 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943689.809, "dur": 0.831, + "args": { + "External id": 151221,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], 
"Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7264 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943694.309, "dur": 0.920, + "args": { + "External id": 151222,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7265 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943699.240, "dur": 0.940, + "args": { + "External id": 151223,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7266 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943704.360, "dur": 0.889, + "args": { + "External id": 151224,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7267 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943709.420, "dur": 0.929, + "args": { + "External id": 151225,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7268 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943715.229, "dur": 1.051, + "args": { + 
"External id": 151226,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7269 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943727.300, "dur": 1.029, + "args": { + "External id": 151227,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7270 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943735.020, "dur": 1.020, + "args": { + "External id": 151228,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7271 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943740.769, "dur": 1.000, + "args": { + "External id": 151229,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7272 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943746.209, "dur": 0.980, + "args": { + "External id": 151230,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], 
[], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7273 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943751.649, "dur": 0.991, + "args": { + "External id": 151231,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7274 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943757.309, "dur": 0.951, + "args": { + "External id": 151232,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7275 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943762.859, "dur": 0.950, + "args": { + "External id": 151233,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7276 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943768.319, "dur": 0.960, + "args": { + "External id": 151234,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7277 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943773.829, 
"dur": 0.980, + "args": { + "External id": 151235,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7278 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943783.149, "dur": 0.960, + "args": { + "External id": 151236,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7279 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943788.799, "dur": 0.960, + "args": { + "External id": 151237,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7280 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943794.679, "dur": 0.990, + "args": { + "External id": 151238,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7281 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943800.289, "dur": 1.020, + "args": { + "External id": 151239,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", 
""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7282 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943805.899, "dur": 1.010, + "args": { + "External id": 151240,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7283 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943811.499, "dur": 1.070, + "args": { + "External id": 151241,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7284 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943817.149, "dur": 1.060, + "args": { + "External id": 151242,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7285 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943822.869, "dur": 0.970, + "args": { + "External id": 151243,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7286 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + 
"ts": 6303771943830.429, "dur": 1.010, + "args": { + "External id": 151244,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7287 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943836.039, "dur": 1.010, + "args": { + "External id": 151245,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7288 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943841.589, "dur": 1.010, + "args": { + "External id": 151246,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7289 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943847.119, "dur": 1.050, + "args": { + "External id": 151247,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7290 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943852.749, "dur": 0.980, + "args": { + "External id": 151248,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", 
"", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7291 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943858.399, "dur": 0.990, + "args": { + "External id": 151249,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7292 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943865.149, "dur": 0.970, + "args": { + "External id": 151250,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7293 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943870.669, "dur": 0.990, + "args": { + "External id": 151251,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7294 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943878.029, "dur": 1.020, + "args": { + "External id": 151252,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7295 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", 
"pid": 5714, "tid": 5714, + "ts": 6303771943883.559, "dur": 0.980, + "args": { + "External id": 151253,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7296 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943889.079, "dur": 0.990, + "args": { + "External id": 151254,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7297 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943894.709, "dur": 0.960, + "args": { + "External id": 151255,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7298 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943900.169, "dur": 1.000, + "args": { + "External id": 151256,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7299 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943905.789, "dur": 0.980, + "args": { + "External id": 151257,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": 
["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7300 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943911.209, "dur": 0.970, + "args": { + "External id": 151258,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7301 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943916.709, "dur": 1.000, + "args": { + "External id": 151259,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7302 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943924.239, "dur": 0.990, + "args": { + "External id": 151260,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7303 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943929.779, "dur": 1.000, + "args": { + "External id": 151261,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7304 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943935.289, "dur": 0.980, + "args": { + "External id": 151262,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7305 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943940.769, "dur": 1.010, + "args": { + "External id": 151263,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7306 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943946.319, "dur": 0.920, + "args": { + "External id": 151264,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7307 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943951.779, "dur": 0.980, + "args": { + "External id": 151265,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7308 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943958.259, "dur": 1.000, + "args": { + "External id": 151266,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", 
"False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7309 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943963.799, "dur": 0.970, + "args": { + "External id": 151267,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7310 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943970.959, "dur": 0.980, + "args": { + "External id": 151268,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7311 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943976.389, "dur": 0.980, + "args": { + "External id": 151269,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7312 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943981.809, "dur": 0.950, + "args": { + "External id": 151270,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7313 + } + 
}, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943987.239, "dur": 1.010, + "args": { + "External id": 151271,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7314 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943992.739, "dur": 1.000, + "args": { + "External id": 151272,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7315 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771943998.259, "dur": 0.970, + "args": { + "External id": 151273,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7316 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944003.649, "dur": 0.980, + "args": { + "External id": 151274,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7317 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944009.109, "dur": 1.040, + "args": { + "External id": 151275,"Record function id": 0, "Concrete Inputs": ["", 
"6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7318 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944016.399, "dur": 0.990, + "args": { + "External id": 151276,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7319 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944021.849, "dur": 0.990, + "args": { + "External id": 151277,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7320 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944027.249, "dur": 0.980, + "args": { + "External id": 151278,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7321 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944032.649, "dur": 1.000, + "args": { + "External id": 151279,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], 
[]], "Ev Idx": 7322 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944038.169, "dur": 1.030, + "args": { + "External id": 151280,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7323 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944043.839, "dur": 0.940, + "args": { + "External id": 151281,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7324 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944050.319, "dur": 1.000, + "args": { + "External id": 151282,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7325 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944055.879, "dur": 1.000, + "args": { + "External id": 151283,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7326 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944063.199, "dur": 1.030, + "args": { + "External id": 151284,"Record function id": 0, 
"Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7327 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944068.649, "dur": 1.000, + "args": { + "External id": 151285,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7328 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944074.229, "dur": 0.990, + "args": { + "External id": 151286,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7329 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944079.739, "dur": 0.990, + "args": { + "External id": 151287,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7330 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944085.289, "dur": 0.980, + "args": { + "External id": 151288,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], 
[], [], [], [], [], [], []], "Ev Idx": 7331 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944090.869, "dur": 0.960, + "args": { + "External id": 151289,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7332 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944096.359, "dur": 0.960, + "args": { + "External id": 151290,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7333 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944101.799, "dur": 0.970, + "args": { + "External id": 151291,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7334 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944109.269, "dur": 0.960, + "args": { + "External id": 151292,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7335 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944114.859, "dur": 0.989, + "args": { + "External id": 
151293,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7336 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944120.448, "dur": 0.960, + "args": { + "External id": 151294,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7337 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944125.968, "dur": 0.971, + "args": { + "External id": 151295,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7338 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944131.408, "dur": 0.980, + "args": { + "External id": 151296,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7339 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944136.939, "dur": 0.969, + "args": { + "External id": 151297,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], 
[], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7340 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944142.439, "dur": 0.980, + "args": { + "External id": 151298,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7341 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944147.979, "dur": 0.980, + "args": { + "External id": 151299,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7342 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771944155.199, "dur": 1.020, + "args": { + "External id": 151300,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7343 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5714, "tid": 5714, + "ts": 6303771944277.948, "dur": 2846.994, + "args": { + "External id": 151301,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7344 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::stack", "pid": 5714, "tid": 5714, + "ts": 6303771945378.326, "dur": 1469.156, + "args": { + "External id": 151302,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": 
["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7345 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945392.966, "dur": 19.760, + "args": { + "External id": 151303,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7346 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945403.166, "dur": 6.590, + "args": { + "External id": 151304,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7347 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945415.596, "dur": 7.540, + "args": { + "External id": 151305,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7348 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945421.126, "dur": 1.340, + "args": { + "External id": 151306,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7349 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945425.466, "dur": 7.130, + "args": { + "External id": 151307,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7350 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945429.496, "dur": 2.470, + "args": { + 
"External id": 151308,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7351 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945434.766, "dur": 4.340, + "args": { + "External id": 151309,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7352 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945437.566, "dur": 0.910, + "args": { + "External id": 151310,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7353 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945440.926, "dur": 7.460, + "args": { + "External id": 151311,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7354 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945446.916, "dur": 0.810, + "args": { + "External id": 151312,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7355 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945450.156, "dur": 3.810, + "args": { + "External id": 151313,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7356 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945452.636, "dur": 0.750, + "args": { + "External id": 151314,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7357 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945456.126, "dur": 5.919, + "args": { + "External id": 151315,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7358 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945458.586, "dur": 2.790, + "args": { + "External id": 151316,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7359 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945466.005, "dur": 4.940, + "args": { + "External id": 151317,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7360 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945469.605, "dur": 0.760, + "args": { + "External id": 151318,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7361 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945472.825, "dur": 4.160, + "args": { + "External id": 151319,"Record function id": 0, "Concrete Inputs": 
["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7362 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945475.576, "dur": 0.789, + "args": { + "External id": 151320,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7363 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945478.796, "dur": 5.869, + "args": { + "External id": 151321,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7364 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945482.916, "dur": 1.129, + "args": { + "External id": 151322,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7365 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945486.685, "dur": 6.751, + "args": { + "External id": 151323,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7366 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945490.256, "dur": 2.549, + "args": { + "External id": 151324,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7367 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, 
"tid": 5714, + "ts": 6303771945495.245, "dur": 3.800, + "args": { + "External id": 151325,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7368 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945497.725, "dur": 0.700, + "args": { + "External id": 151326,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7369 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945500.896, "dur": 6.749, + "args": { + "External id": 151327,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7370 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945506.285, "dur": 0.700, + "args": { + "External id": 151328,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7371 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945509.496, "dur": 3.780, + "args": { + "External id": 151329,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7372 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945511.925, "dur": 0.740, + "args": { + "External id": 151330,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], 
"Input Dims": [[], [], [], []], "Ev Idx": 7373 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945515.085, "dur": 5.680, + "args": { + "External id": 151331,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7374 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945517.536, "dur": 2.620, + "args": { + "External id": 151332,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7375 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945524.496, "dur": 5.029, + "args": { + "External id": 151333,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7376 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945528.145, "dur": 0.731, + "args": { + "External id": 151334,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7377 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945531.385, "dur": 3.840, + "args": { + "External id": 151335,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7378 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945533.885, "dur": 0.740, + "args": { + "External id": 151336,"Record function id": 0, 
"Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7379 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945537.045, "dur": 5.420, + "args": { + "External id": 151337,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7380 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945540.996, "dur": 0.800, + "args": { + "External id": 151338,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7381 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945544.365, "dur": 6.760, + "args": { + "External id": 151339,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7382 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945547.965, "dur": 2.511, + "args": { + "External id": 151340,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7383 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945552.965, "dur": 3.871, + "args": { + "External id": 151341,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7384 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945555.416, "dur": 0.780, + "args": { + "External id": 151342,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7385 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945558.645, "dur": 6.770, + "args": { + "External id": 151343,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7386 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945564.075, "dur": 0.740, + "args": { + "External id": 151344,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7387 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945567.205, "dur": 3.660, + "args": { + "External id": 151345,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7388 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945569.505, "dur": 0.750, + "args": { + "External id": 151346,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7389 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945572.665, "dur": 8.120, + "args": { + "External id": 151347,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": 
["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7390 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945579.305, "dur": 0.800, + "args": { + "External id": 151348,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7391 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945584.695, "dur": 4.940, + "args": { + "External id": 151349,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7392 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945588.295, "dur": 0.730, + "args": { + "External id": 151350,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7393 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945591.415, "dur": 3.750, + "args": { + "External id": 151351,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7394 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945593.825, "dur": 0.720, + "args": { + "External id": 151352,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7395 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 
6303771945596.975, "dur": 5.540, + "args": { + "External id": 151353,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7396 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945601.135, "dur": 0.720, + "args": { + "External id": 151354,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7397 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945604.295, "dur": 6.140, + "args": { + "External id": 151355,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7398 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945607.675, "dur": 2.140, + "args": { + "External id": 151356,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7399 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945612.235, "dur": 5.840, + "args": { + "External id": 151357,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7400 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945614.675, "dur": 2.770, + "args": { + "External id": 151358,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], 
[], [], []], "Ev Idx": 7401 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945619.975, "dur": 4.640, + "args": { + "External id": 151359,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7402 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945623.295, "dur": 0.720, + "args": { + "External id": 151360,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7403 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945626.395, "dur": 3.950, + "args": { + "External id": 151361,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7404 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945629.045, "dur": 0.690, + "args": { + "External id": 151362,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7405 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945632.155, "dur": 5.200, + "args": { + "External id": 151363,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7406 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945635.975, "dur": 0.760, + "args": { + "External id": 151364,"Record function id": 0, "Concrete Inputs": ["", 
"[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7407 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945641.135, "dur": 4.940, + "args": { + "External id": 151365,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7408 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945644.715, "dur": 0.720, + "args": { + "External id": 151366,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7409 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945647.975, "dur": 3.790, + "args": { + "External id": 151367,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7410 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945650.415, "dur": 0.760, + "args": { + "External id": 151368,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7411 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945653.575, "dur": 5.770, + "args": { + "External id": 151369,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7412 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 
5714, + "ts": 6303771945657.965, "dur": 0.720, + "args": { + "External id": 151370,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7413 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945661.115, "dur": 6.750, + "args": { + "External id": 151371,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7414 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945664.535, "dur": 2.710, + "args": { + "External id": 151372,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7415 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945669.615, "dur": 6.040, + "args": { + "External id": 151373,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7416 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945672.065, "dur": 2.920, + "args": { + "External id": 151374,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7417 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945677.565, "dur": 4.830, + "args": { + "External id": 151375,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], 
"Input Dims": [[], []], "Ev Idx": 7418 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945680.985, "dur": 0.800, + "args": { + "External id": 151376,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7419 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945684.165, "dur": 3.810, + "args": { + "External id": 151377,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7420 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945686.615, "dur": 0.740, + "args": { + "External id": 151378,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7421 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945689.775, "dur": 5.940, + "args": { + "External id": 151379,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7422 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945694.195, "dur": 0.860, + "args": { + "External id": 151380,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7423 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945699.575, "dur": 5.130, + "args": { + 
"External id": 151381,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7424 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945703.255, "dur": 0.790, + "args": { + "External id": 151382,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7425 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945706.625, "dur": 4.030, + "args": { + "External id": 151383,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7426 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945709.255, "dur": 0.740, + "args": { + "External id": 151384,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7427 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945712.565, "dur": 5.910, + "args": { + "External id": 151385,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7428 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945717.065, "dur": 0.770, + "args": { + "External id": 151386,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7429 + } + }, + { + "ph": 
"X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945720.365, "dur": 6.590, + "args": { + "External id": 151387,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7430 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945723.875, "dur": 2.420, + "args": { + "External id": 151388,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7431 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945728.835, "dur": 6.440, + "args": { + "External id": 151389,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7432 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945731.415, "dur": 3.180, + "args": { + "External id": 151390,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7433 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945737.235, "dur": 5.070, + "args": { + "External id": 151391,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7434 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945740.815, "dur": 0.830, + "args": { + "External id": 151392,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", 
"ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7435 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945744.195, "dur": 3.980, + "args": { + "External id": 151393,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7436 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945746.765, "dur": 0.770, + "args": { + "External id": 151394,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7437 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945750.015, "dur": 5.480, + "args": { + "External id": 151395,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7438 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945754.005, "dur": 0.870, + "args": { + "External id": 151396,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7439 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945759.445, "dur": 4.450, + "args": { + "External id": 151397,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7440 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945762.445, "dur": 
0.790, + "args": { + "External id": 151398,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7441 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945765.815, "dur": 4.670, + "args": { + "External id": 151399,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7442 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945769.065, "dur": 0.760, + "args": { + "External id": 151400,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7443 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945772.385, "dur": 6.270, + "args": { + "External id": 151401,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7444 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945777.205, "dur": 0.760, + "args": { + "External id": 151402,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7445 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945780.565, "dur": 7.400, + "args": { + "External id": 151403,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7446 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945784.355, "dur": 2.880, + "args": { + "External id": 151404,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7447 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945789.825, "dur": 7.000, + "args": { + "External id": 151405,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7448 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945793.555, "dur": 2.550, + "args": { + "External id": 151406,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7449 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945798.855, "dur": 5.220, + "args": { + "External id": 151407,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7450 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945802.275, "dur": 1.160, + "args": { + "External id": 151408,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7451 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945805.995, "dur": 5.010, + "args": { + "External id": 151409,"Record function id": 0, 
"Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7452 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945809.585, "dur": 0.770, + "args": { + "External id": 151410,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7453 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945812.905, "dur": 6.600, + "args": { + "External id": 151411,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7454 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945818.025, "dur": 0.820, + "args": { + "External id": 151412,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7455 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945823.195, "dur": 4.790, + "args": { + "External id": 151413,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7456 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945826.545, "dur": 0.760, + "args": { + "External id": 151414,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7457 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945829.985, "dur": 4.750, + "args": { + "External id": 151415,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7458 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945833.355, "dur": 0.760, + "args": { + "External id": 151416,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7459 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945836.725, "dur": 7.010, + "args": { + "External id": 151417,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7460 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945842.315, "dur": 0.780, + "args": { + "External id": 151418,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7461 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945845.725, "dur": 6.870, + "args": { + "External id": 151419,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7462 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945849.285, "dur": 2.590, + "args": { + "External id": 151420,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], 
"Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7463 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945854.465, "dur": 6.450, + "args": { + "External id": 151421,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7464 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945857.665, "dur": 2.550, + "args": { + "External id": 151422,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7465 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945862.935, "dur": 4.650, + "args": { + "External id": 151423,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7466 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945866.195, "dur": 0.770, + "args": { + "External id": 151424,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7467 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945869.445, "dur": 4.710, + "args": { + "External id": 151425,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7468 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945872.725, "dur": 0.780, + "args": { + "External id": 
151426,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7469 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945876.075, "dur": 6.200, + "args": { + "External id": 151427,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7470 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945880.815, "dur": 0.800, + "args": { + "External id": 151428,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7471 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945886.165, "dur": 4.800, + "args": { + "External id": 151429,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7472 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945889.445, "dur": 0.780, + "args": { + "External id": 151430,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7473 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945892.835, "dur": 4.810, + "args": { + "External id": 151431,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7474 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945896.215, "dur": 0.760, + "args": { + "External id": 151432,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7475 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945899.565, "dur": 6.370, + "args": { + "External id": 151433,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7476 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945904.465, "dur": 0.800, + "args": { + "External id": 151434,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7477 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945907.895, "dur": 7.229, + "args": { + "External id": 151435,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7478 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945911.215, "dur": 3.149, + "args": { + "External id": 151436,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7479 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945917.035, "dur": 6.569, + "args": { + "External id": 151437,"Record function id": 0, "Concrete Inputs": ["", "0"], 
"Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7480 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945920.235, "dur": 2.709, + "args": { + "External id": 151438,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7481 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945925.635, "dur": 4.740, + "args": { + "External id": 151439,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7482 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945928.944, "dur": 0.731, + "args": { + "External id": 151440,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7483 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945932.324, "dur": 4.831, + "args": { + "External id": 151441,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7484 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945935.704, "dur": 0.780, + "args": { + "External id": 151442,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7485 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + 
"ts": 6303771945939.044, "dur": 6.891, + "args": { + "External id": 151443,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7486 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945944.435, "dur": 0.840, + "args": { + "External id": 151444,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7487 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945949.535, "dur": 5.060, + "args": { + "External id": 151445,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7488 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945953.064, "dur": 0.840, + "args": { + "External id": 151446,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7489 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945956.535, "dur": 5.049, + "args": { + "External id": 151447,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7490 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945960.144, "dur": 0.771, + "args": { + "External id": 151448,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": 
[[], [], [], []], "Ev Idx": 7491 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945963.624, "dur": 6.671, + "args": { + "External id": 151449,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7492 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945968.884, "dur": 0.771, + "args": { + "External id": 151450,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7493 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945972.124, "dur": 6.311, + "args": { + "External id": 151451,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7494 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945975.304, "dur": 2.491, + "args": { + "External id": 151452,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7495 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945980.375, "dur": 6.509, + "args": { + "External id": 151453,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7496 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945983.735, "dur": 2.480, + "args": { + "External id": 151454,"Record function id": 0, "Concrete Inputs": 
["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7497 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945988.864, "dur": 4.691, + "args": { + "External id": 151455,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7498 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945992.104, "dur": 0.760, + "args": { + "External id": 151456,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7499 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771945995.424, "dur": 4.751, + "args": { + "External id": 151457,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7500 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771945998.744, "dur": 0.780, + "args": { + "External id": 151458,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7501 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946002.075, "dur": 6.629, + "args": { + "External id": 151459,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7502 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 5714, + "ts": 6303771946007.204, "dur": 0.860, + "args": { + "External id": 151460,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7503 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946012.324, "dur": 4.750, + "args": { + "External id": 151461,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7504 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946015.664, "dur": 0.770, + "args": { + "External id": 151462,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7505 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946019.044, "dur": 4.690, + "args": { + "External id": 151463,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7506 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946022.324, "dur": 0.780, + "args": { + "External id": 151464,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7507 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946025.664, "dur": 6.990, + "args": { + "External id": 151465,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": 
[[], []], "Input Dims": [[], []], "Ev Idx": 7508 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946031.134, "dur": 0.840, + "args": { + "External id": 151466,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7509 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946034.564, "dur": 6.730, + "args": { + "External id": 151467,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7510 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946038.064, "dur": 2.550, + "args": { + "External id": 151468,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7511 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946043.304, "dur": 6.770, + "args": { + "External id": 151469,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7512 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946046.784, "dur": 2.610, + "args": { + "External id": 151470,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7513 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946052.064, "dur": 4.640, + "args": { 
+ "External id": 151471,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7514 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946055.294, "dur": 0.770, + "args": { + "External id": 151472,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7515 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946058.644, "dur": 4.910, + "args": { + "External id": 151473,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7516 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946062.184, "dur": 0.740, + "args": { + "External id": 151474,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7517 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946065.434, "dur": 6.210, + "args": { + "External id": 151475,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7518 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946070.174, "dur": 0.810, + "args": { + "External id": 151476,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7519 + } + }, + { + 
"ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946075.384, "dur": 4.780, + "args": { + "External id": 151477,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7520 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946078.744, "dur": 0.760, + "args": { + "External id": 151478,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7521 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946082.094, "dur": 4.960, + "args": { + "External id": 151479,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7522 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946085.594, "dur": 0.830, + "args": { + "External id": 151480,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7523 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946088.954, "dur": 6.650, + "args": { + "External id": 151481,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7524 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946094.204, "dur": 0.740, + "args": { + "External id": 151482,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", 
"ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7525 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946097.584, "dur": 6.590, + "args": { + "External id": 151483,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7526 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946100.864, "dur": 2.590, + "args": { + "External id": 151484,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7527 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946106.204, "dur": 6.480, + "args": { + "External id": 151485,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7528 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946109.374, "dur": 2.630, + "args": { + "External id": 151486,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7529 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946114.744, "dur": 4.700, + "args": { + "External id": 151487,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7530 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946117.954, "dur": 
0.800, + "args": { + "External id": 151488,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7531 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946121.354, "dur": 4.780, + "args": { + "External id": 151489,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7532 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946124.714, "dur": 0.760, + "args": { + "External id": 151490,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7533 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946128.064, "dur": 6.290, + "args": { + "External id": 151491,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7534 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946132.814, "dur": 0.840, + "args": { + "External id": 151492,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7535 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946137.914, "dur": 4.550, + "args": { + "External id": 151493,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7536 
+ } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946141.044, "dur": 0.770, + "args": { + "External id": 151494,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7537 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946144.364, "dur": 4.690, + "args": { + "External id": 151495,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7538 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946147.654, "dur": 0.780, + "args": { + "External id": 151496,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7539 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946150.934, "dur": 6.810, + "args": { + "External id": 151497,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7540 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946156.314, "dur": 0.780, + "args": { + "External id": 151498,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7541 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946159.634, "dur": 6.150, + "args": { + "External id": 151499,"Record function id": 0, 
"Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7542 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946162.914, "dur": 2.210, + "args": { + "External id": 151500,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7543 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946167.674, "dur": 6.150, + "args": { + "External id": 151501,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7544 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946170.924, "dur": 2.240, + "args": { + "External id": 151502,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7545 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946175.844, "dur": 4.820, + "args": { + "External id": 151503,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7546 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946179.254, "dur": 0.780, + "args": { + "External id": 151504,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7547 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946182.504, "dur": 4.840, + "args": { + "External id": 151505,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7548 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946185.794, "dur": 0.870, + "args": { + "External id": 151506,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7549 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946189.374, "dur": 6.580, + "args": { + "External id": 151507,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7550 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946194.434, "dur": 0.860, + "args": { + "External id": 151508,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7551 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946200.024, "dur": 4.670, + "args": { + "External id": 151509,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7552 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946203.314, "dur": 0.760, + "args": { + "External id": 151510,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], 
"Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7553 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946206.644, "dur": 4.610, + "args": { + "External id": 151511,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7554 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946209.904, "dur": 0.750, + "args": { + "External id": 151512,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7555 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946213.204, "dur": 6.380, + "args": { + "External id": 151513,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7556 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946218.184, "dur": 0.780, + "args": { + "External id": 151514,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7557 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946221.494, "dur": 6.830, + "args": { + "External id": 151515,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7558 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946224.774, "dur": 2.830, + "args": { + "External id": 
151516,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7559 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946230.234, "dur": 6.980, + "args": { + "External id": 151517,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7560 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946233.894, "dur": 2.630, + "args": { + "External id": 151518,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7561 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946239.154, "dur": 4.600, + "args": { + "External id": 151519,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7562 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946242.324, "dur": 0.770, + "args": { + "External id": 151520,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7563 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946245.644, "dur": 4.800, + "args": { + "External id": 151521,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7564 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946248.924, "dur": 0.850, + "args": { + "External id": 151522,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7565 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946252.394, "dur": 6.440, + "args": { + "External id": 151523,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7566 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946257.374, "dur": 0.810, + "args": { + "External id": 151524,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7567 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946262.534, "dur": 4.710, + "args": { + "External id": 151525,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7568 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946265.784, "dur": 0.770, + "args": { + "External id": 151526,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7569 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946269.144, "dur": 5.260, + "args": { + "External id": 151527,"Record function id": 0, "Concrete Inputs": ["", "0"], 
"Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7570 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946272.974, "dur": 0.750, + "args": { + "External id": 151528,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7571 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946276.594, "dur": 6.710, + "args": { + "External id": 151529,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7572 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946281.894, "dur": 0.740, + "args": { + "External id": 151530,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7573 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946285.204, "dur": 6.500, + "args": { + "External id": 151531,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7574 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946288.384, "dur": 2.640, + "args": { + "External id": 151532,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7575 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + 
"ts": 6303771946293.654, "dur": 29.710, + "args": { + "External id": 151533,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7576 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946319.524, "dur": 2.810, + "args": { + "External id": 151534,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7577 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946325.654, "dur": 5.060, + "args": { + "External id": 151535,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7578 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946329.214, "dur": 0.820, + "args": { + "External id": 151536,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7579 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946332.644, "dur": 4.630, + "args": { + "External id": 151537,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7580 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946335.914, "dur": 0.740, + "args": { + "External id": 151538,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": 
[[], [], [], []], "Ev Idx": 7581 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946339.214, "dur": 6.590, + "args": { + "External id": 151539,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7582 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946344.334, "dur": 0.830, + "args": { + "External id": 151540,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7583 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946349.914, "dur": 4.690, + "args": { + "External id": 151541,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7584 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946353.184, "dur": 0.770, + "args": { + "External id": 151542,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7585 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946356.584, "dur": 4.750, + "args": { + "External id": 151543,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7586 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946359.944, "dur": 0.779, + "args": { + "External id": 151544,"Record function id": 0, "Concrete Inputs": 
["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7587 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946363.223, "dur": 6.600, + "args": { + "External id": 151545,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7588 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946368.403, "dur": 0.760, + "args": { + "External id": 151546,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7589 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946371.774, "dur": 6.260, + "args": { + "External id": 151547,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7590 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946374.883, "dur": 2.471, + "args": { + "External id": 151548,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7591 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946380.034, "dur": 6.480, + "args": { + "External id": 151549,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7592 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, 
"tid": 5714, + "ts": 6303771946383.294, "dur": 2.560, + "args": { + "External id": 151550,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7593 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946388.454, "dur": 4.780, + "args": { + "External id": 151551,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7594 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946391.714, "dur": 0.840, + "args": { + "External id": 151552,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7595 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946429.734, "dur": 7.880, + "args": { + "External id": 155137,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7596 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946436.043, "dur": 0.911, + "args": { + "External id": 155138,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7597 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946439.783, "dur": 4.640, + "args": { + "External id": 155139,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": 
[[], []], "Input Dims": [[], []], "Ev Idx": 7598 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946443.023, "dur": 0.771, + "args": { + "External id": 155140,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7599 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::unsqueeze", "pid": 5714, "tid": 5714, + "ts": 6303771946448.463, "dur": 4.851, + "args": { + "External id": 155141,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7600 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771946451.894, "dur": 0.740, + "args": { + "External id": 155142,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[1]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7601 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::cat", "pid": 5714, "tid": 5714, + "ts": 6303771946491.003, "dur": 327.330, + "args": { + "External id": 155143,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7602 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linalg_vector_norm", "pid": 5714, "tid": 5714, + "ts": 6303771947174.362, "dur": 784.718, + "args": { + "External id": 155144,"Record function id": 0, "Concrete Inputs": ["", "2.", "", "False", ""], "Input type": ["float", "Scalar", "", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[128], [], [], [], []], "Ev Idx": 7603 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::linalg_vector_norm", "pid": 5714, "tid": 5714, + "ts": 
6303771947481.931, "dur": 261.360, + "args": { + "External id": 155145,"Record function id": 0, "Concrete Inputs": ["", "2.", "", "False", ""], "Input type": ["float", "Scalar", "", "Scalar", ""], "Input Strides": [[1], [], [], [], []], "Input Dims": [[128], [], [], [], []], "Ev Idx": 7604 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::as_strided", "pid": 5714, "tid": 5714, + "ts": 6303771947572.661, "dur": 7.490, + "args": { + "External id": 155146,"Record function id": 0, "Concrete Inputs": ["", "[1]", "[0]", ""], "Input type": ["float", "ScalarList", "ScalarList", ""], "Input Strides": [[], [], [], []], "Input Dims": [[], [], [], []], "Ev Idx": 7605 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "Redistribute", "pid": 5714, "tid": 5714, + "ts": 6303771949025.028, "dur": 3559.142, + "args": { + "External id": 155147,"Sequence number": 3058990, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", "False"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7606 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771949232.467, "dur": 313.340, + "args": { + "External id": 155148,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7607 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771949246.827, "dur": 4.660, + "args": { + "External id": 155149,"Record function id": 0, "Concrete Inputs": ["", "2."], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7608 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771949262.817, "dur": 2.210, + "args": { + "External id": 155150,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", 
"Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7609 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_c10d_functional::all_reduce", "pid": 5714, "tid": 5714, + "ts": 6303771949673.796, "dur": 1167.588, + "args": { + "External id": 155151,"Record function id": 0, "Concrete Inputs": ["", "", ""], "Input type": ["float", "", ""], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7610 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::clone", "pid": 5714, "tid": 5714, + "ts": 6303771949690.146, "dur": 359.040, + "args": { + "External id": 155152,"Record function id": 0, "Concrete Inputs": ["", "0"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7611 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty_like", "pid": 5714, "tid": 5714, + "ts": 6303771949704.716, "dur": 183.060, + "args": { + "External id": 155153,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "0"], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7612 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303771949835.896, "dur": 46.350, + "args": { + "External id": 155154,"Record function id": 0, "Concrete Inputs": ["[]", "6", "0", "", "", "0"], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", "Scalar"], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7613 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::copy_", "pid": 5714, "tid": 5714, + "ts": 6303771949896.766, "dur": 147.189, + "args": { + "External id": 155155,"Record function id": 0, "Concrete Inputs": ["", "", "False"], "Input type": ["float", "float", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7614 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "c10d::allreduce_", "pid": 5714, "tid": 5714, + "ts": 6303771950086.695, "dur": 739.089, + "args": { + "External id": 155156,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "-1"], "Input type": ["TensorList", "", "", "", "Scalar"], "Input Strides": [[[]], [], [], [], []], "Input Dims": [[[]], [], [], [], []], "Ev Idx": 7615 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771950224.445, "dur": 566.579, + "args": { + "External id": 155157,"Record function id": 0, "Collective name": "allreduce", "Process Group Description": "default_pg", "dtype": "Float", "Rank": 0, "Input Strides": [[[]], [], [], [], [], [], [], [], [], []], "Concrete Inputs": ["", "", "", "0", "", "[]", "[]", "0", "1", "4"], "Out msg nelems": 1, "Process Group Name": "0", "Input type": ["TensorList", "", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Global rank stride": 1, "Out split size": "[]", "In split size": "[]", "Process Group Ranks": "[0, 1, 2, 3]", "Group size": 4, "Input Dims": [[[]], [], [], [], [], [], [], [], [], []], "Global rank start": 0, "Ev Idx": 7616, "In msg nelems": 1 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "nccl:all_reduce", "pid": 5714, "tid": 5714, + "ts": 6303771950363.555, "dur": 392.239, + "args": { + "External id": 155158,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7617 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771951117.663, "dur": 1176.258, + "args": { + "External id": 155159,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7618 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "_c10d_functional::wait_tensor", "pid": 5714, "tid": 5714, + "ts": 6303771951548.172, "dur": 213.250, + 
"args": { + "External id": 155160,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7619 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "record_param_comms", "pid": 5714, "tid": 5714, + "ts": 6303771951638.472, "dur": 56.030, + "args": { + "External id": 155161,"Record function id": 0, "Collective name": "wait", "Process Group Description": "default_pg", "dtype": "Byte", "Input Dims": [[], [], [], [], [], [], [], [], []], "Input Strides": [[], [], [], [], [], [], [], [], []], "Input type": ["", "", "Scalar", "", "ScalarList", "ScalarList", "Scalar", "Scalar", "Scalar"], "Concrete Inputs": ["", "", "0", "", "[]", "[]", "-1", "-1", "1"], "Out msg nelems": 0, "Process Group Name": "0", "Process Group Ranks": "[]", "Group size": 1, "Ev Idx": 7620, "In msg nelems": 0, "Rank": 0, "In split size": "[]", "Out split size": "[]" + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::pow", "pid": 5714, "tid": 5714, + "ts": 6303771952006.251, "dur": 240.110, + "args": { + "External id": 155162,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7621 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303771952017.781, "dur": 4.670, + "args": { + "External id": 155163,"Record function id": 0, "Concrete Inputs": ["", "0.5"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7622 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771952031.021, "dur": 2.130, + "args": { + "External id": 155164,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7623 + } + }, + { + "ph": "X", "cat": 
"cpu_op", "name": "_ToTorchTensor", "pid": 5714, "tid": 5714, + "ts": 6303771952682.040, "dur": 114.439, + "args": { + "External id": 155165,"Sequence number": 3058991, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", ""], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7624 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view_as", "pid": 5714, "tid": 5714, + "ts": 6303771952728.420, "dur": 42.839, + "args": { + "External id": 155166,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7625 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::view", "pid": 5714, "tid": 5714, + "ts": 6303771952743.849, "dur": 25.040, + "args": { + "External id": 155167,"Record function id": 0, "Concrete Inputs": ["", "[]"], "Input type": ["float", "ScalarList"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7626 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::add", "pid": 5714, "tid": 5714, + "ts": 6303771953738.607, "dur": 213.600, + "args": { + "External id": 155168,"Record function id": 0, "Concrete Inputs": ["", "", "1"], "Input type": ["float", "double", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7627 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::reciprocal", "pid": 5714, "tid": 5714, + "ts": 6303771953998.217, "dur": 109.920, + "args": { + "External id": 155169,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7628 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::mul", "pid": 5714, "tid": 5714, + "ts": 6303771954150.276, "dur": 100.450, + "args": { + "External id": 155170,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "double"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7629 + } + }, + { + "ph": "X", 
"cat": "cpu_op", "name": "aten::clamp", "pid": 5714, "tid": 5714, + "ts": 6303771954365.756, "dur": 141.409, + "args": { + "External id": 155171,"Record function id": 0, "Concrete Inputs": ["", "", "1."], "Input type": ["float", "", "Scalar"], "Input Strides": [[], [], []], "Input Dims": [[], [], []], "Ev Idx": 7630 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771954379.176, "dur": 2.310, + "args": { + "External id": 155172,"Record function id": 0, "Concrete Inputs": ["", "6", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], []], "Input Dims": [[], [], [], [], []], "Ev Idx": 7631 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::to", "pid": 5714, "tid": 5714, + "ts": 6303771954600.105, "dur": 2.400, + "args": { + "External id": 155173,"Record function id": 0, "Concrete Inputs": ["", "6", "0", "", "", "False", "False", ""], "Input type": ["float", "Scalar", "Scalar", "", "", "Scalar", "Scalar", ""], "Input Strides": [[], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], []], "Ev Idx": 7632 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_mul_", "pid": 5714, "tid": 5714, + "ts": 6303771955053.624, "dur": 2687.654, + "args": { + "External id": 155174,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["TensorList", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7633 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_mul_", "pid": 5714, "tid": 5714, + "ts": 6303771957336.819, "dur": 243.570, + "args": { + "External id": 155175,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["TensorList", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7634 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::isnan", "pid": 5714, "tid": 5714, + "ts": 6303771957922.498, "dur": 154.979, + "args": { + "External id": 
155176,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7635 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::ne", "pid": 5714, "tid": 5714, + "ts": 6303771957934.158, "dur": 139.030, + "args": { + "External id": 155177,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7636 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 5714, + "ts": 6303771958100.237, "dur": 175935.549, + "args": { + "External id": 155178,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7637 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6303771958106.428, "dur": 175925.678, + "args": { + "External id": 155179,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7638 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6303771958114.297, "dur": 175900.740, + "args": { + "External id": 155180,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7639 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::isinf", "pid": 5714, "tid": 5714, + "ts": 6303772134125.656, "dur": 558.659, + "args": { + "External id": 155181,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7640 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": 
"aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303772134141.116, "dur": 369.199, + "args": { + "External id": 155182,"Record function id": 0, "Concrete Inputs": [""], "Input type": ["float"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7641 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::empty", "pid": 5714, "tid": 5714, + "ts": 6303772134176.986, "dur": 49.820, + "args": { + "External id": 155183,"Record function id": 0, "Concrete Inputs": ["[0]", "6", "0", "", "", ""], "Input type": ["ScalarList", "Scalar", "Scalar", "", "", ""], "Input Strides": [[], [], [], [], [], []], "Input Dims": [[], [], [], [], [], []], "Ev Idx": 7642 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::abs", "pid": 5714, "tid": 5714, + "ts": 6303772134237.576, "dur": 269.909, + "args": { + "External id": 155184,"Record function id": 0, "Concrete Inputs": ["", ""], "Input type": ["float", "float"], "Input Strides": [[], [1]], "Input Dims": [[], [0]], "Ev Idx": 7643 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::resize_", "pid": 5714, "tid": 5714, + "ts": 6303772134284.366, "dur": 64.370, + "args": { + "External id": 155185,"Record function id": 0, "Concrete Inputs": ["", "[]", ""], "Input type": ["float", "ScalarList", ""], "Input Strides": [[1], [], []], "Input Dims": [[0], [], []], "Ev Idx": 7644 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::eq", "pid": 5714, "tid": 5714, + "ts": 6303772134522.945, "dur": 151.780, + "args": { + "External id": 155186,"Record function id": 0, "Concrete Inputs": ["", "inf"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7645 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::is_nonzero", "pid": 5714, "tid": 5714, + "ts": 6303772134714.575, "dur": 237.969, + "args": { + "External id": 155187,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], 
"Ev Idx": 7646 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::item", "pid": 5714, "tid": 5714, + "ts": 6303772134721.805, "dur": 229.070, + "args": { + "External id": 155188,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7647 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_local_scalar_dense", "pid": 5714, "tid": 5714, + "ts": 6303772134730.195, "dur": 218.109, + "args": { + "External id": 155189,"Sequence number": 3058992, "Fwd thread id": 0, "Record function id": 0, "Concrete Inputs": [""], "Input type": ["bool"], "Input Strides": [[]], "Input Dims": [[]], "Ev Idx": 7648 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.step#OptimizersContainer.step", "pid": 5714, "tid": 5714, + "ts": 6303772135129.774, "dur": 6402.576, + "args": { + "External id": 155190,"Record function id": 0, "Ev Idx": 7649 + } + }, + { + "ph": "X", "cat": "user_annotation", "name": "Optimizer.step#AdamW.step", "pid": 5714, "tid": 5714, + "ts": 6303772135288.744, "dur": 6213.486, + "args": { + "External id": 155191,"Record function id": 0, "Ev Idx": 7650 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_foreach_add_", "pid": 5714, "tid": 5714, + "ts": 6303772137409.809, "dur": 343.789, + "args": { + "External id": 155192,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["TensorList", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7651 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137442.079, "dur": 2.510, + "args": { + "External id": 155193,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7652 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137447.889, "dur": 0.280, + "args": { + "External id": 155194,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7653 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137450.329, "dur": 0.200, + "args": { + "External id": 155195,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7654 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137452.349, "dur": 0.220, + "args": { + "External id": 155196,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7655 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137454.299, "dur": 0.210, + "args": { + "External id": 155197,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7656 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137456.219, "dur": 0.260, + "args": { + "External id": 155198,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7657 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137458.259, "dur": 0.260, + "args": { + "External id": 155199,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7658 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137460.169, "dur": 0.210, + "args": { + "External id": 155200,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7659 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137462.219, "dur": 0.210, + "args": { + "External id": 155201,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7660 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137464.269, "dur": 0.200, + "args": { + "External id": 155202,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7661 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137466.219, "dur": 0.220, + "args": { + "External id": 155203,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7662 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137468.199, "dur": 0.270, + "args": { + "External id": 155204,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7663 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137470.069, "dur": 0.220, + "args": { + "External id": 155205,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7664 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137471.919, "dur": 0.220, + "args": { + "External id": 155206,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7665 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137473.919, "dur": 0.220, + "args": { + "External id": 155207,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7666 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137475.829, "dur": 0.220, + "args": { + "External id": 155208,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7667 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137477.489, "dur": 0.190, + "args": { + "External id": 155209,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7668 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137479.429, "dur": 0.220, + "args": { + "External id": 155210,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7669 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137481.309, "dur": 0.210, + "args": { + "External id": 155211,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7670 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137483.219, "dur": 0.190, + "args": { + "External id": 155212,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7671 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137485.179, "dur": 0.220, + "args": { + "External id": 155213,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7672 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137487.009, "dur": 0.190, + "args": { + "External id": 155214,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7673 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137488.849, "dur": 0.210, + "args": { + "External id": 155215,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7674 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137490.629, "dur": 0.200, + "args": { + "External id": 155216,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7675 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137492.539, "dur": 0.200, + "args": { + "External id": 155217,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7676 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137494.299, "dur": 0.200, + "args": { + "External id": 155218,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7677 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137496.089, "dur": 0.210, + "args": { + "External id": 155219,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7678 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137497.909, "dur": 0.210, + "args": { + "External id": 155220,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7679 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137499.939, "dur": 0.220, + "args": { + "External id": 155221,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7680 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137501.809, "dur": 0.200, + "args": { + "External id": 155222,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7681 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137503.639, "dur": 0.220, + "args": { + "External id": 155223,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7682 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137505.459, "dur": 0.190, + "args": { + "External id": 155224,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7683 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137507.299, "dur": 0.190, + "args": { + "External id": 155225,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7684 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137509.089, "dur": 0.190, + "args": { + "External id": 155226,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7685 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137510.839, "dur": 0.230, + "args": { + "External id": 155227,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7686 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137512.759, "dur": 0.200, + "args": { + "External id": 155228,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7687 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137514.729, "dur": 0.200, + "args": { + "External id": 155229,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7688 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137516.539, "dur": 0.190, + "args": { + "External id": 155230,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7689 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137518.329, "dur": 0.200, + "args": { + "External id": 155231,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7690 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137520.109, "dur": 0.210, + "args": { + "External id": 155232,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7691 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137521.959, "dur": 0.200, + "args": { + "External id": 155233,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7692 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137523.919, "dur": 0.190, + "args": { + "External id": 155234,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7693 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137525.859, "dur": 0.200, + "args": { + "External id": 155235,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7694 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137527.799, "dur": 0.190, + "args": { + "External id": 155236,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7695 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137529.639, "dur": 0.210, + "args": { + "External id": 155237,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7696 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137531.489, "dur": 0.190, + "args": { + "External id": 155238,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7697 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137533.299, "dur": 0.210, + "args": { + "External id": 155239,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7698 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137535.019, "dur": 0.200, + "args": { + "External id": 155240,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7699 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137536.809, "dur": 0.210, + "args": { + "External id": 155241,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7700 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137538.769, "dur": 0.200, + "args": { + "External id": 155242,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7701 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137540.539, "dur": 0.190, + "args": { + "External id": 155243,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7702 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137542.629, "dur": 0.190, + "args": { + "External id": 155244,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7703 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137544.429, "dur": 0.190, + "args": { + "External id": 155245,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7704 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137546.339, "dur": 0.200, + "args": { + "External id": 155246,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7705 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137548.119, "dur": 0.200, + "args": { + "External id": 155247,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7706 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137550.089, "dur": 0.190, + "args": { + "External id": 155248,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7707 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137551.849, "dur": 0.210, + "args": { + "External id": 155249,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7708 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137553.549, "dur": 0.210, + "args": { + "External id": 155250,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7709 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137555.589, "dur": 0.200, + "args": { + "External id": 155251,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7710 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137557.558, "dur": 0.211, + "args": { + "External id": 155252,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7711 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137559.318, "dur": 0.191, + "args": { + "External id": 155253,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7712 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137561.269, "dur": 0.209, + "args": { + "External id": 155254,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7713 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137563.218, "dur": 0.200, + "args": { + "External id": 155255,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7714 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137564.969, "dur": 0.209, + "args": { + "External id": 155256,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7715 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137566.909, "dur": 0.200, + "args": { + "External id": 155257,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7716 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137568.898, "dur": 0.220, + "args": { + "External id": 155258,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7717 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137570.789, "dur": 0.189, + "args": { + "External id": 155259,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7718 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137572.569, "dur": 0.220, + "args": { + "External id": 155260,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7719 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137574.438, "dur": 0.211, + "args": { + "External id": 155261,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7720 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137576.249, "dur": 0.189, + "args": { + "External id": 155262,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7721 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137578.549, "dur": 0.200, + "args": { + "External id": 155263,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7722 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137580.349, "dur": 0.189, + "args": { + "External id": 155264,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7723 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137582.149, "dur": 0.209, + "args": { + "External id": 155265,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7724 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137584.038, "dur": 0.211, + "args": { + "External id": 155266,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7725 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137585.858, "dur": 0.200, + "args": { + "External id": 155267,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7726 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137587.658, "dur": 0.211, + "args": { + "External id": 155268,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7727 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137589.429, "dur": 0.200, + "args": { + "External id": 155269,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7728 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137591.418, "dur": 0.220, + "args": { + "External id": 155270,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7729 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137593.278, "dur": 0.211, + "args": { + "External id": 155271,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7730 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137595.089, "dur": 0.209, + "args": { + "External id": 155272,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7731 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137597.089, "dur": 0.189, + "args": { + "External id": 155273,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7732 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137598.849, "dur": 0.209, + "args": { + "External id": 155274,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7733 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137600.889, "dur": 0.209, + "args": { + "External id": 155275,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7734 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137602.678, "dur": 0.240, + "args": { + "External id": 155276,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7735 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137604.718, "dur": 0.191, + "args": { + "External id": 155277,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7736 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137606.669, "dur": 0.209, + "args": { + "External id": 155278,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7737 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137608.418, "dur": 0.220, + "args": { + "External id": 155279,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7738 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137610.238, "dur": 0.200, + "args": { + "External id": 155280,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7739 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137612.138, "dur": 0.211, + "args": { + "External id": 155281,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7740 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137613.778, "dur": 0.171, + "args": { + "External id": 155282,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7741 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137615.358, "dur": 0.200, + "args": { + "External id": 155283,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7742 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137616.989, "dur": 0.180, + "args": { + "External id": 155284,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7743 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137618.538, "dur": 0.180, + "args": { + "External id": 155285,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7744 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137620.218, "dur": 0.180, + "args": { + "External id": 155286,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7745 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137621.869, "dur": 0.180, + "args": { + "External id": 155287,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7746 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137623.529, "dur": 0.180, + "args": { + "External id": 155288,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7747 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137625.478, "dur": 0.171, + "args": { + "External id": 155289,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7748 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137627.258, "dur": 0.160, + "args": { + "External id": 155290,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7749 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137628.678, "dur": 0.160, + "args": { + "External id": 155291,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7750 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137630.109, "dur": 0.149, + "args": { + "External id": 155292,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7751 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137631.529, "dur": 0.169, + "args": { + "External id": 155293,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7752 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137633.058, "dur": 0.151, + "args": { + "External id": 155294,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7753 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137634.669, "dur": 0.160, + "args": { + "External id": 155295,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7754 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137636.049, "dur": 0.160, + "args": { + "External id": 155296,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7755 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137637.458, "dur": 0.160, + "args": { + "External id": 155297,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7756 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137638.869, "dur": 0.149, + "args": { + "External id": 155298,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7757 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137640.449, "dur": 0.180, + "args": { + "External id": 155299,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7758 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137641.909, "dur": 0.160, + "args": { + "External id": 155300,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7759 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137643.389, "dur": 0.169, + "args": { + "External id": 155301,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7760 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137644.978, "dur": 0.160, + "args": { + "External id": 155302,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7761 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137646.589, "dur": 0.149, + "args": { + "External id": 155303,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7762 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137647.969, "dur": 0.160, + "args": { + "External id": 155304,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7763 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137649.549, "dur": 0.169, + "args": { + "External id": 155305,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7764 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137650.938, "dur": 0.171, + "args": { + "External id": 155306,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7765 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137652.329, "dur": 0.160, + "args": { + "External id": 155307,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7766 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137653.658, "dur": 0.160, + "args": { + "External id": 155308,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7767 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137655.078, "dur": 0.151, + "args": { + "External id": 155309,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7768 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137656.469, "dur": 0.180, + "args": { + "External id": 155310,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7769 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137658.018, "dur": 0.160, + "args": { + "External id": 155311,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7770 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137659.408, "dur": 0.150, + "args": { + "External id": 155312,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7771 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137660.758, "dur": 0.150, + "args": { + "External id": 155313,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7772 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137662.138, "dur": 0.160, + "args": { + "External id": 155314,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7773 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137663.718, "dur": 0.160, + "args": { + "External id": 155315,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7774 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137665.258, "dur": 0.150, + "args": { + "External id": 155316,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7775 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137666.668, "dur": 0.160, + "args": { + "External id": 155317,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7776 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137668.078, "dur": 0.170, + "args": { + "External id": 155318,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7777 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 6303772137669.418, "dur": 0.170, + "args": { + "External id": 155319,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7778 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::result_type", "pid": 5714, "tid": 5714, + "ts": 
6303772137670.868, "dur": 0.150, + "args": { + "External id": 155320,"Record function id": 0, "Concrete Inputs": ["", "1"], "Input type": ["float", "Scalar"], "Input Strides": [[], []], "Input Dims": [[], []], "Ev Idx": 7779 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_fused_adamw_", "pid": 5714, "tid": 5714, + "ts": 6303772138344.237, "dur": 3081.733, + "args": { + "External id": 155321,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "0.00018995263597688472", "0.90000000000000002", "0.94999999999999996", "0.10000000000000001", "1.0000000000000001e-15", "False", "False", "", ""], "Input type": ["TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 7780 + } + }, + { + "ph": "X", "cat": "cpu_op", "name": "aten::_fused_adamw_", "pid": 5714, "tid": 5714, + "ts": 6303772141217.801, "dur": 129.239, + "args": { + "External id": 155322,"Record function id": 0, "Concrete Inputs": ["", "", "", "", "", "", "0.00018995263597688472", "0.90000000000000002", "0.94999999999999996", "0.10000000000000001", "1.0000000000000001e-15", "False", "False", "", ""], "Input type": ["TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "TensorList", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "Scalar", "", ""], "Input Strides": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Input Dims": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "Ev Idx": 7781 + } + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float 
const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303771452668.283, "dur": 259.940, + "args": { + "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972147, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289972147, "pid": 0, "tid": 7, "ts": 6303771452668.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303771452928.895, "dur": 260.803, + "args": { + "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972150, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289972150, "pid": 0, "tid": 7, "ts": 6303771452928.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303771453190.338, "dur": 258.786, + "args": { + "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972153, "registers per thread": 64, "shared memory": 0, "blocks per SM": 0.554688, "warps per SM": 8.875000, "grid": [71, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 18 + } + }, + { + "ph": "f", "id": 289972153, "pid": 0, "tid": 7, "ts": 6303771453190.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6303771454965.366, "dur": 6.496, + "args": { + "External id": 147483, "device": 0, "context": 1, "stream": 7, "correlation": 289972167, "bytes": 131072, "memory bandwidth (GB/s)": 20.177339901477833 + } + }, + { + "ph": "f", "id": 289972167, "pid": 0, "tid": 7, "ts": 6303771454965.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771454938.616, "dur": 26.860, + "args": { + "External id": 147483, "cbid": 41, "correlation": 289972167 + } + }, + { + "ph": "s", "id": 289972167, "pid": 5714, "tid": 5714, "ts": 6303771454938.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 
6303771454966.016, "dur": 8.740, + "args": { + "External id": 147483, "cbid": 131, "correlation": 289972168 + } + }, + { + "ph": "s", "id": 289972168, "pid": 5714, "tid": 5714, "ts": 6303771454966.016, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6303771455047.031, "dur": 11.456, + "args": { + "External id": 147487, "device": 0, "context": 1, "stream": 7, "correlation": 289972181, "bytes": 262144, "memory bandwidth (GB/s)": 22.88268156424581 + } + }, + { + "ph": "f", "id": 289972181, "pid": 0, "tid": 7, "ts": 6303771455047.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771455000.696, "dur": 45.980, + "args": { + "External id": 147487, "cbid": 41, "correlation": 289972181 + } + }, + { + "ph": "s", "id": 289972181, "pid": 5714, "tid": 5714, "ts": 6303771455000.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303771455046.976, "dur": 13.970, + "args": { + "External id": 147487, "cbid": 131, "correlation": 289972182 + } + }, + { + "ph": "s", "id": 289972182, "pid": 5714, "tid": 5714, "ts": 6303771455046.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void (anonymous namespace)::elementwise_kernel_with_index(int, at::native::arange_cuda_out(c10::Scalar const&, c10::Scalar const&, c10::Scalar const&, at::Tensor&)::{lambda()#1}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, function_traits::result_type*)", "pid": 0, "tid": 7, + "ts": 6303771455109.015, "dur": 1.088, + "args": { + "External id": 147490, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972198, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 0.500000, "grid": [32, 1, 1], 
"block": [64, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289972198, "pid": 0, "tid": 7, "ts": 6303771455109.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771455095.916, "dur": 13.899, + "args": { + "External id": 147490, "cbid": 211, "correlation": 289972198 + } + }, + { + "ph": "s", "id": 289972198, "pid": 5714, "tid": 5714, "ts": 6303771455095.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771455172.408, "dur": 1.472, + "args": { + "External id": 147504, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972211, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 289972211, "pid": 0, "tid": 7, "ts": 6303771455172.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771455162.575, "dur": 9.480, + "args": { + "External id": 147504, "cbid": 211, "correlation": 289972211 + } + }, + { + "ph": "s", "id": 289972211, "pid": 5714, "tid": 5714, "ts": 6303771455162.575, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#3}::operator()() const::{lambda(int)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771455204.825, "dur": 1.632, + "args": { + "External id": 147508, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972225, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289972225, "pid": 0, "tid": 7, "ts": 6303771455204.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771455197.326, "dur": 7.849, + "args": { + "External id": 147508, "cbid": 211, "correlation": 289972225 + } + }, + { + "ph": "s", "id": 289972225, "pid": 5714, "tid": 5714, "ts": 6303771455197.326, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771455327.685, "dur": 2.300, + "args": { + "cbid": 135, "correlation": 289972237 + } + }, + { + "ph": "f", "id": 289972237, "pid": 5714, "tid": 5714, "ts": 6303771455327.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771455332.435, "dur": 1.490, + "args": { + "cbid": 147, "correlation": 289972241 + } + }, + { + "ph": "s", "id": 289972241, "pid": 5714, "tid": 5714, "ts": 6303771455332.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771455341.165, "dur": 0.740, + "args": { + "cbid": 135, "correlation": 289972253 + } + }, + { + "ph": "f", "id": 289972253, "pid": 5714, "tid": 5714, "ts": 6303771455341.165, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771455343.405, "dur": 0.850, + "args": { + "cbid": 147, "correlation": 289972257 + } + }, + { + "ph": "s", "id": 289972257, "pid": 5714, "tid": 5714, "ts": 6303771455343.405, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous 
namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771455756.863, "dur": 67.105, + "args": { + "External id": 147554, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289972277, "registers per thread": 40, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 289972277, "pid": 0, "tid": 17, "ts": 6303771455756.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771455741.694, "dur": 15.450, + "args": { + "External id": 147554, "cbid": 211, "correlation": 289972277 + } + }, + { + "ph": "s", "id": 289972277, "pid": 5714, "tid": 5714, "ts": 6303771455741.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771455986.690, "dur": 33.088, + "args": { + "External id": 147599, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289972290, "registers per thread": 36, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 289972290, "pid": 0, "tid": 17, "ts": 6303771455986.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771455976.954, "dur": 9.880, + "args": { + "External id": 147599, "cbid": 211, "correlation": 289972290 + } + }, + { + "ph": "s", "id": 289972290, "pid": 5714, "tid": 5714, "ts": 6303771455976.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456020.544, "dur": 1.340, + "args": { + "cbid": 135, "correlation": 289972300 + } + }, + { + "ph": "f", "id": 289972300, "pid": 5714, "tid": 5714, "ts": 6303771456020.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771456023.804, "dur": 1.289, + "args": { + "cbid": 147, "correlation": 289972304 + } + }, + { + "ph": "s", "id": 289972304, "pid": 5714, "tid": 5714, "ts": 6303771456023.804, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771456089.304, "dur": 0.969, + "args": { + "External id": 147601, "cbid": 317, "correlation": 289972317 + } + }, + { + "ph": "f", "id": 289972317, "pid": 5714, "tid": 5714, "ts": 6303771456089.304, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456092.573, "dur": 1.231, + "args": { + "External id": 147601, "cbid": 135, "correlation": 289972319 + } + }, + { + "ph": "f", "id": 289972319, "pid": 5714, "tid": 5714, "ts": 6303771456092.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771456095.224, "dur": 1.260, 
+ "args": { + "External id": 147601, "cbid": 147, "correlation": 289972323 + } + }, + { + "ph": "s", "id": 289972323, "pid": 5714, "tid": 5714, "ts": 6303771456095.224, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771456117.913, "dur": 0.820, + "args": { + "External id": 147601, "cbid": 409, "correlation": 289972326 + } + }, + { + "ph": "f", "id": 289972326, "pid": 5714, "tid": 5714, "ts": 6303771456117.913, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456122.893, "dur": 0.810, + "args": { + "External id": 147601, "cbid": 135, "correlation": 289972329 + } + }, + { + "ph": "f", "id": 289972329, "pid": 5714, "tid": 5714, "ts": 6303771456122.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771456123.883, "dur": 0.920, + "args": { + "External id": 147601, "cbid": 147, "correlation": 289972330 + } + }, + { + "ph": "s", "id": 289972330, "pid": 5714, "tid": 5714, "ts": 6303771456123.883, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771456136.739, "dur": 276359.559, + "args": { + "External id": 147601, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289972332, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 13223616, "Out msg nelems": 52894464, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289972332, "pid": 0, "tid": 20, "ts": 6303771456136.739, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771456125.943, "dur": 9.980, + "args": { + "External id": 147601, "cbid": 430, "correlation": 289972332 + } + }, + { + "ph": "s", "id": 289972332, "pid": 5714, "tid": 5714, "ts": 6303771456125.943, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456136.963, "dur": 0.410, + "args": { + "External id": 147601, "cbid": 135, "correlation": 289972334 + } + }, + { + "ph": "f", "id": 289972334, "pid": 5714, "tid": 5714, "ts": 6303771456136.963, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771456137.493, "dur": 0.470, + "args": { + "External id": 147601, "cbid": 147, "correlation": 289972335 + } + }, + { + "ph": "s", "id": 289972335, "pid": 5714, "tid": 5714, "ts": 6303771456137.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456139.453, "dur": 0.880, + "args": { + "External id": 147601, "cbid": 135, "correlation": 289972338 + } + }, + { + "ph": "f", "id": 289972338, "pid": 5714, "tid": 5714, "ts": 6303771456139.453, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456148.823, "dur": 0.440, + "args": { + "External 
id": 147601, "cbid": 135, "correlation": 289972345 + } + }, + { + "ph": "f", "id": 289972345, "pid": 5714, "tid": 5714, "ts": 6303771456148.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771456179.663, "dur": 1.130, + "args": { + "External id": 147603, "cbid": 147, "correlation": 289972350 + } + }, + { + "ph": "s", "id": 289972350, "pid": 5714, "tid": 5714, "ts": 6303771456179.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456198.223, "dur": 0.910, + "args": { + "cbid": 135, "correlation": 289972365 + } + }, + { + "ph": "f", "id": 289972365, "pid": 5714, "tid": 5714, "ts": 6303771456198.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771456281.273, "dur": 2.490, + "args": { + "cbid": 147, "correlation": 289972372 + } + }, + { + "ph": "s", "id": 289972372, "pid": 5714, "tid": 5714, "ts": 6303771456281.273, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771456564.692, "dur": 1.300, + "args": { + "External id": 147645, "cbid": 317, "correlation": 289972529 + } + }, + { + "ph": "f", "id": 289972529, "pid": 5714, "tid": 5714, "ts": 6303771456564.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771732501.771, "dur": 2.496, + "args": { + "External id": 147650, "device": 0, "context": 1, "stream": 7, "correlation": 289972541, "bytes": 22000, "memory bandwidth (GB/s)": 8.814102564102564 + } + }, + { + "ph": "f", "id": 289972541, "pid": 0, "tid": 7, "ts": 6303771732501.771, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771456601.092, "dur": 13.650, + "args": { + "External id": 147650, "cbid": 41, "correlation": 289972541 + } + }, + { + "ph": "s", "id": 289972541, "pid": 5714, "tid": 5714, "ts": 6303771456601.092, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456619.792, "dur": 2.160, + "args": { + "External id": 147645, "cbid": 135, "correlation": 289972545 + } + }, + { + "ph": "f", "id": 289972545, "pid": 5714, "tid": 5714, "ts": 6303771456619.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771732506.283, "dur": 1067.500, + "args": { + "External id": 147645, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972549, "registers per thread": 38, "shared memory": 0, "blocks per SM": 20.289062, "warps per SM": 81.156250, "grid": [2597, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972549, "pid": 0, "tid": 7, "ts": 6303771732506.283, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771456625.062, "dur": 12.500, + "args": { + "External id": 147645, "cbid": 211, "correlation": 289972549 + } + }, + { + "ph": "s", "id": 289972549, "pid": 5714, "tid": 5714, "ts": 6303771456625.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771456795.622, "dur": 1.540, + "args": { + "cbid": 135, "correlation": 289972560 + } + }, + { + "ph": "f", "id": 289972560, "pid": 5714, "tid": 5714, "ts": 6303771456795.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_0", "pid": 0, "tid": 7, + "ts": 6303771733574.423, "dur": 306.852, + "args": { + "External id": 147654, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972573, "registers per thread": 32, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972573, "pid": 0, "tid": 7, "ts": 6303771733574.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771457009.091, "dur": 10.860, + "args": { + "External id": 147654, "cbid": 307, "correlation": 289972573 + } + }, + { + "ph": "s", "id": 289972573, "pid": 5714, "tid": 5714, "ts": 6303771457009.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771457413.426, "dur": 1149.550, + "args": { + "External id": 147669, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289972588, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289972588, "pid": 0, "tid": 17, "ts": 6303771457413.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771457280.221, "dur": 11.260, + "args": { + "External id": 147669, "cbid": 211, "correlation": 289972588 + } + }, + { + "ph": "s", "id": 289972588, "pid": 5714, "tid": 5714, "ts": 6303771457280.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771458565.440, "dur": 8.256, + "args": { + "External id": 147685, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289972601, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289972601, "pid": 0, "tid": 17, "ts": 6303771458565.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771457401.281, "dur": 9.969, + "args": { + "External id": 147685, "cbid": 211, "correlation": 289972601 + } + }, + { + "ph": "s", "id": 289972601, "pid": 5714, "tid": 5714, "ts": 6303771457401.281, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457434.321, "dur": 1.249, + "args": { + "cbid": 135, "correlation": 289972611 + } + }, + { + "ph": "f", "id": 289972611, "pid": 5714, "tid": 5714, "ts": 6303771457434.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457437.350, "dur": 1.251, + "args": { + "cbid": 147, "correlation": 289972615 + } + }, + { + "ph": "s", "id": 289972615, "pid": 5714, "tid": 5714, "ts": 6303771457437.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771457490.150, "dur": 0.840, + "args": { + "External id": 147687, "cbid": 317, "correlation": 289972628 + } + }, + { + "ph": "f", "id": 289972628, "pid": 5714, "tid": 5714, "ts": 6303771457490.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457492.720, "dur": 1.100, + "args": { + "External id": 147687, "cbid": 135, "correlation": 289972630 + } + }, + { + "ph": "f", "id": 289972630, "pid": 5714, "tid": 5714, "ts": 6303771457492.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457495.230, "dur": 1.110, + 
"args": { + "External id": 147687, "cbid": 147, "correlation": 289972634 + } + }, + { + "ph": "s", "id": 289972634, "pid": 5714, "tid": 5714, "ts": 6303771457495.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771457512.280, "dur": 0.640, + "args": { + "External id": 147687, "cbid": 409, "correlation": 289972637 + } + }, + { + "ph": "f", "id": 289972637, "pid": 5714, "tid": 5714, "ts": 6303771457512.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457516.740, "dur": 0.720, + "args": { + "External id": 147687, "cbid": 135, "correlation": 289972640 + } + }, + { + "ph": "f", "id": 289972640, "pid": 5714, "tid": 5714, "ts": 6303771457516.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457517.630, "dur": 0.860, + "args": { + "External id": 147687, "cbid": 147, "correlation": 289972641 + } + }, + { + "ph": "s", "id": 289972641, "pid": 5714, "tid": 5714, "ts": 6303771457517.630, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771732497.547, "dur": 3227.525, + "args": { + "External id": 147687, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289972643, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289972643, "pid": 0, "tid": 20, "ts": 6303771732497.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771457519.440, "dur": 9.300, + "args": { + "External id": 147687, "cbid": 430, "correlation": 289972643 + } + }, + { + "ph": "s", "id": 289972643, "pid": 5714, "tid": 5714, "ts": 6303771457519.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457529.730, "dur": 0.380, + "args": { + "External id": 147687, "cbid": 135, "correlation": 289972645 + } + }, + { + "ph": "f", "id": 289972645, "pid": 5714, "tid": 5714, "ts": 6303771457529.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457530.220, "dur": 0.640, + "args": { + "External id": 147687, "cbid": 147, "correlation": 289972646 + } + }, + { + "ph": "s", "id": 289972646, "pid": 5714, "tid": 5714, "ts": 6303771457530.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457532.380, "dur": 0.780, + "args": { + "External id": 147687, "cbid": 135, "correlation": 289972649 + } + }, + { + "ph": "f", "id": 289972649, "pid": 5714, "tid": 5714, "ts": 6303771457532.380, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457541.200, "dur": 0.380, + "args": { + "External id": 
147687, "cbid": 135, "correlation": 289972656 + } + }, + { + "ph": "f", "id": 289972656, "pid": 5714, "tid": 5714, "ts": 6303771457541.200, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457568.780, "dur": 0.940, + "args": { + "External id": 147689, "cbid": 147, "correlation": 289972661 + } + }, + { + "ph": "s", "id": 289972661, "pid": 5714, "tid": 5714, "ts": 6303771457568.780, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457585.290, "dur": 0.940, + "args": { + "cbid": 135, "correlation": 289972676 + } + }, + { + "ph": "f", "id": 289972676, "pid": 5714, "tid": 5714, "ts": 6303771457585.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457625.290, "dur": 1.010, + "args": { + "cbid": 147, "correlation": 289972681 + } + }, + { + "ph": "s", "id": 289972681, "pid": 5714, "tid": 5714, "ts": 6303771457625.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457628.140, "dur": 0.590, + "args": { + "cbid": 147, "correlation": 289972685 + } + }, + { + "ph": "s", "id": 289972685, "pid": 5714, "tid": 5714, "ts": 6303771457628.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771457664.410, "dur": 2.520, + "args": { + "cbid": 147, "correlation": 289972691 + } + }, + { + "ph": "s", "id": 289972691, "pid": 5714, "tid": 5714, "ts": 6303771457664.410, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771457760.770, "dur": 1.080, + "args": { + "External id": 147702, 
"cbid": 317, "correlation": 289972732 + } + }, + { + "ph": "f", "id": 289972732, "pid": 5714, "tid": 5714, "ts": 6303771457760.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771457773.070, "dur": 2.420, + "args": { + "External id": 147703, "cbid": 138, "correlation": 289972735 + } + }, + { + "ph": "f", "id": 289972735, "pid": 5714, "tid": 5714, "ts": 6303771457773.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771735729.584, "dur": 2.272, + "args": { + "External id": 147707, "device": 0, "context": 1, "stream": 7, "correlation": 289972746, "bytes": 7224, "memory bandwidth (GB/s)": 3.1795774647887325 + } + }, + { + "ph": "f", "id": 289972746, "pid": 0, "tid": 7, "ts": 6303771735729.584, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771457902.619, "dur": 11.110, + "args": { + "External id": 147707, "cbid": 41, "correlation": 289972746 + } + }, + { + "ph": "s", "id": 289972746, "pid": 5714, "tid": 5714, "ts": 6303771457902.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771457917.629, "dur": 1.580, + "args": { + "External id": 147702, "cbid": 135, "correlation": 289972750 + } + }, + { + "ph": "f", "id": 289972750, "pid": 5714, "tid": 5714, "ts": 6303771457917.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771735734.256, "dur": 13.153, + "args": { + "External id": 147702, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289972754, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972754, "pid": 0, "tid": 7, "ts": 6303771735734.256, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771457921.609, "dur": 9.860, + "args": { + "External id": 147702, "cbid": 211, "correlation": 289972754 + } + }, + { + "ph": "s", "id": 289972754, "pid": 5714, "tid": 5714, "ts": 6303771457921.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771458007.499, "dur": 1.330, + "args": { + "cbid": 135, "correlation": 289972765 + } + }, + { + "ph": "f", "id": 289972765, "pid": 5714, "tid": 5714, "ts": 6303771458007.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771735748.145, "dur": 40.224, + "args": { + "External id": 147714, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972791, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972791, "pid": 0, "tid": 7, "ts": 6303771735748.145, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771458229.489, "dur": 10.790, + "args": { + "External id": 147714, "cbid": 307, "correlation": 289972791 + } + }, + { + "ph": "s", "id": 289972791, "pid": 5714, "tid": 5714, "ts": 6303771458229.489, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771735789.009, "dur": 568.103, + "args": { + "External id": 147720, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972814, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289972814, "pid": 0, "tid": 7, "ts": 6303771735789.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771458395.988, "dur": 10.370, + "args": { + "External id": 147720, "cbid": 211, "correlation": 289972814 + } + }, + { + "ph": "s", "id": 289972814, "pid": 5714, "tid": 5714, "ts": 6303771458395.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771736357.720, "dur": 142.849, + "args": { + "External id": 147721, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972837, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289972837, "pid": 0, "tid": 7, "ts": 6303771736357.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771458431.148, "dur": 5.610, + "args": { + "External id": 147721, "cbid": 211, "correlation": 289972837 + } + }, + { + "ph": "s", "id": 289972837, "pid": 5714, "tid": 5714, "ts": 6303771458431.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771736501.177, "dur": 566.343, + "args": { + "External id": 147722, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972860, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289972860, "pid": 0, "tid": 7, "ts": 6303771736501.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771458456.638, "dur": 5.230, + "args": { + "External id": 147722, "cbid": 211, "correlation": 289972860 + } + }, + { + "ph": "s", "id": 289972860, "pid": 5714, "tid": 5714, "ts": 6303771458456.638, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771737068.224, "dur": 71.937, + "args": { + "External id": 147739, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972880, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972880, "pid": 0, "tid": 7, "ts": 6303771737068.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771458756.967, "dur": 9.880, + "args": { + "External id": 147739, "cbid": 307, "correlation": 289972880 + } + }, + { + "ph": "s", "id": 289972880, "pid": 5714, "tid": 5714, "ts": 6303771458756.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771737140.865, "dur": 182.754, + "args": { + "External id": 147755, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972898, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972898, "pid": 0, "tid": 7, "ts": 6303771737140.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771458958.887, "dur": 8.350, + "args": { + "External id": 147755, "cbid": 307, "correlation": 289972898 + } + }, + { + "ph": "s", "id": 289972898, "pid": 5714, "tid": 5714, "ts": 6303771458958.887, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771459118.267, "dur": 0.510, + "args": { + "External id": 147761, "cbid": 200, "correlation": 289972905 + } + }, + { + "ph": "f", "id": 289972905, "pid": 5714, "tid": 5714, "ts": 6303771459118.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771459118.897, "dur": 0.200, + "args": { + "External id": 147761, "cbid": 200, "correlation": 289972906 + } + }, + { + "ph": 
"f", "id": 289972906, "pid": 5714, "tid": 5714, "ts": 6303771459118.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771459143.587, "dur": 0.380, + "args": { + "External id": 147761, "cbid": 200, "correlation": 289972929 + } + }, + { + "ph": "f", "id": 289972929, "pid": 5714, "tid": 5714, "ts": 6303771459143.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771459150.357, "dur": 2.169, + "args": { + "External id": 147761, "cbid": 273, "correlation": 289972938 + } + }, + { + "ph": "f", "id": 289972938, "pid": 5714, "tid": 5714, "ts": 6303771459150.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771737324.291, "dur": 589.479, + "args": { + "External id": 147761, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972939, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289972939, "pid": 0, "tid": 7, "ts": 6303771737324.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459153.106, "dur": 10.760, + "args": { + "External id": 147761, "cbid": 211, "correlation": 289972939 + } + }, + { + "ph": "s", "id": 289972939, "pid": 5714, "tid": 5714, "ts": 6303771459153.106, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771737914.474, "dur": 145.441, + "args": { + "External id": 147767, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972962, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289972962, "pid": 0, "tid": 7, "ts": 6303771737914.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459227.937, "dur": 7.689, + "args": { + "External id": 147767, "cbid": 211, "correlation": 289972962 + } + }, + { + "ph": "s", "id": 289972962, "pid": 5714, "tid": 5714, "ts": 6303771459227.937, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771738060.619, "dur": 90.273, + "args": { + "External id": 147771, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289972988, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289972988, "pid": 0, "tid": 7, "ts": 6303771738060.619, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459420.086, "dur": 9.420, + "args": { + "External id": 147771, "cbid": 307, "correlation": 289972988 + } + }, + { + "ph": "s", "id": 289972988, "pid": 5714, "tid": 5714, "ts": 6303771459420.086, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771738151.564, "dur": 343.332, + "args": { + "External id": 147772, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973008, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973008, "pid": 0, "tid": 7, "ts": 6303771738151.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459461.986, "dur": 6.640, + "args": { + "External id": 147772, "cbid": 211, "correlation": 289973008 + } + }, + { + "ph": "s", "id": 289973008, "pid": 5714, "tid": 5714, "ts": 6303771459461.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771738495.600, "dur": 490.182, + "args": { + "External id": 147773, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973031, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973031, "pid": 0, "tid": 7, "ts": 6303771738495.600, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459490.916, "dur": 5.190, + "args": { + "External id": 147773, "cbid": 211, "correlation": 289973031 + } + }, + { + "ph": "s", "id": 289973031, "pid": 5714, "tid": 5714, "ts": 6303771459490.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771738986.486, "dur": 215.426, + "args": { + "External id": 147774, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973043, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973043, "pid": 0, "tid": 7, "ts": 6303771738986.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459530.596, "dur": 6.800, + "args": { + "External id": 147774, "cbid": 307, "correlation": 289973043 + } + }, + { + "ph": "s", "id": 289973043, "pid": 5714, "tid": 5714, "ts": 6303771459530.596, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771459562.146, "dur": 1.450, + "args": { + "External id": 147775, "cbid": 210, "correlation": 289973063 + } + }, + { + "ph": "f", "id": 289973063, "pid": 5714, "tid": 5714, "ts": 6303771459562.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771739202.552, "dur": 452.166, + "args": { + "External id": 
147775, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973064, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973064, "pid": 0, "tid": 7, "ts": 6303771739202.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459566.506, "dur": 6.250, + "args": { + "External id": 147775, "cbid": 211, "correlation": 289973064 + } + }, + { + "ph": "s", "id": 289973064, "pid": 5714, "tid": 5714, "ts": 6303771459566.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771739655.422, "dur": 130.881, + "args": { + "External id": 147776, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973071, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973071, "pid": 0, "tid": 7, "ts": 6303771739655.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459604.165, "dur": 5.720, + "args": { + "External id": 147776, "cbid": 307, "correlation": 289973071 + } + }, + { + "ph": "s", "id": 289973071, "pid": 5714, "tid": 5714, "ts": 6303771459604.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771733628.632, "dur": 157.634, + "args": { + "External id": 147792, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289973086, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289973086, "pid": 0, "tid": 17, "ts": 6303771733628.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771459972.505, "dur": 11.700, + "args": { + "External id": 147792, "cbid": 211, "correlation": 289973086 + } + }, + { + "ph": "s", "id": 289973086, "pid": 5714, "tid": 5714, "ts": 6303771459972.505, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771733804.666, "dur": 262.979, + "args": { + "External id": 147808, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289973099, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289973099, "pid": 0, "tid": 17, "ts": 6303771733804.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771460083.435, "dur": 9.149, + "args": { + "External id": 147808, "cbid": 211, "correlation": 289973099 + } + }, + { + "ph": "s", "id": 289973099, "pid": 5714, "tid": 5714, "ts": 6303771460083.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460116.515, "dur": 1.229, + "args": { + "cbid": 135, "correlation": 289973109 + } + }, + { + "ph": "f", "id": 289973109, "pid": 5714, "tid": 5714, "ts": 6303771460116.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460119.664, "dur": 1.291, + "args": { + "cbid": 147, "correlation": 289973113 + } + }, + { + "ph": "s", "id": 289973113, "pid": 5714, "tid": 5714, "ts": 6303771460119.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771460173.204, "dur": 0.880, + "args": { + "External id": 147810, "cbid": 317, "correlation": 289973126 + } + }, + { + "ph": "f", "id": 289973126, "pid": 5714, "tid": 5714, "ts": 6303771460173.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460175.844, "dur": 1.180, + "args": { + "External id": 147810, "cbid": 135, "correlation": 289973128 + } + }, + { + "ph": "f", "id": 289973128, "pid": 5714, "tid": 5714, "ts": 6303771460175.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460178.404, "dur": 1.100, + 
"args": { + "External id": 147810, "cbid": 147, "correlation": 289973132 + } + }, + { + "ph": "s", "id": 289973132, "pid": 5714, "tid": 5714, "ts": 6303771460178.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771460193.934, "dur": 0.690, + "args": { + "External id": 147810, "cbid": 409, "correlation": 289973135 + } + }, + { + "ph": "f", "id": 289973135, "pid": 5714, "tid": 5714, "ts": 6303771460193.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460198.454, "dur": 0.730, + "args": { + "External id": 147810, "cbid": 135, "correlation": 289973138 + } + }, + { + "ph": "f", "id": 289973138, "pid": 5714, "tid": 5714, "ts": 6303771460198.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460199.354, "dur": 0.900, + "args": { + "External id": 147810, "cbid": 147, "correlation": 289973139 + } + }, + { + "ph": "s", "id": 289973139, "pid": 5714, "tid": 5714, "ts": 6303771460199.354, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771735727.024, "dur": 4184.369, + "args": { + "External id": 147810, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289973141, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289973141, "pid": 0, "tid": 20, "ts": 6303771735727.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771460201.214, "dur": 9.320, + "args": { + "External id": 147810, "cbid": 430, "correlation": 289973141 + } + }, + { + "ph": "s", "id": 289973141, "pid": 5714, "tid": 5714, "ts": 6303771460201.214, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460211.504, "dur": 0.390, + "args": { + "External id": 147810, "cbid": 135, "correlation": 289973143 + } + }, + { + "ph": "f", "id": 289973143, "pid": 5714, "tid": 5714, "ts": 6303771460211.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460212.004, "dur": 0.510, + "args": { + "External id": 147810, "cbid": 147, "correlation": 289973144 + } + }, + { + "ph": "s", "id": 289973144, "pid": 5714, "tid": 5714, "ts": 6303771460212.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460213.924, "dur": 0.820, + "args": { + "External id": 147810, "cbid": 135, "correlation": 289973147 + } + }, + { + "ph": "f", "id": 289973147, "pid": 5714, "tid": 5714, "ts": 6303771460213.924, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460223.324, "dur": 0.430, + "args": { + "External id": 
147810, "cbid": 135, "correlation": 289973154 + } + }, + { + "ph": "f", "id": 289973154, "pid": 5714, "tid": 5714, "ts": 6303771460223.324, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460250.954, "dur": 0.950, + "args": { + "External id": 147812, "cbid": 147, "correlation": 289973159 + } + }, + { + "ph": "s", "id": 289973159, "pid": 5714, "tid": 5714, "ts": 6303771460250.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460267.464, "dur": 0.910, + "args": { + "cbid": 135, "correlation": 289973174 + } + }, + { + "ph": "f", "id": 289973174, "pid": 5714, "tid": 5714, "ts": 6303771460267.464, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460314.764, "dur": 1.110, + "args": { + "cbid": 147, "correlation": 289973179 + } + }, + { + "ph": "s", "id": 289973179, "pid": 5714, "tid": 5714, "ts": 6303771460314.764, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460317.674, "dur": 0.580, + "args": { + "cbid": 147, "correlation": 289973183 + } + }, + { + "ph": "s", "id": 289973183, "pid": 5714, "tid": 5714, "ts": 6303771460317.674, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771460355.804, "dur": 2.080, + "args": { + "cbid": 147, "correlation": 289973189 + } + }, + { + "ph": "s", "id": 289973189, "pid": 5714, "tid": 5714, "ts": 6303771460355.804, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771460462.124, "dur": 1.180, + "args": { + "External id": 147825, 
"cbid": 317, "correlation": 289973230 + } + }, + { + "ph": "f", "id": 289973230, "pid": 5714, "tid": 5714, "ts": 6303771460462.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771460471.764, "dur": 2.390, + "args": { + "External id": 147826, "cbid": 138, "correlation": 289973233 + } + }, + { + "ph": "f", "id": 289973233, "pid": 5714, "tid": 5714, "ts": 6303771460471.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771739916.769, "dur": 2.432, + "args": { + "External id": 147830, "device": 0, "context": 1, "stream": 7, "correlation": 289973244, "bytes": 7224, "memory bandwidth (GB/s)": 2.9703947368421053 + } + }, + { + "ph": "f", "id": 289973244, "pid": 0, "tid": 7, "ts": 6303771739916.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771460495.594, "dur": 11.249, + "args": { + "External id": 147830, "cbid": 41, "correlation": 289973244 + } + }, + { + "ph": "s", "id": 289973244, "pid": 5714, "tid": 5714, "ts": 6303771460495.594, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460511.734, "dur": 1.640, + "args": { + "External id": 147825, "cbid": 135, "correlation": 289973248 + } + }, + { + "ph": "f", "id": 289973248, "pid": 5714, "tid": 5714, "ts": 6303771460511.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771739921.569, "dur": 20.096, + "args": { + "External id": 147825, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289973252, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973252, "pid": 0, "tid": 7, "ts": 6303771739921.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771460515.794, "dur": 9.669, + "args": { + "External id": 147825, "cbid": 211, "correlation": 289973252 + } + }, + { + "ph": "s", "id": 289973252, "pid": 5714, "tid": 5714, "ts": 6303771460515.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771460601.763, "dur": 1.180, + "args": { + "cbid": 135, "correlation": 289973263 + } + }, + { + "ph": "f", "id": 289973263, "pid": 5714, "tid": 5714, "ts": 6303771460601.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771739942.337, "dur": 33.280, + "args": { + "External id": 147837, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973289, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973289, "pid": 0, "tid": 7, "ts": 6303771739942.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771460801.083, "dur": 10.670, + "args": { + "External id": 147837, "cbid": 307, "correlation": 289973289 + } + }, + { + "ph": "s", "id": 289973289, "pid": 5714, "tid": 5714, "ts": 6303771460801.083, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771739976.321, "dur": 562.471, + "args": { + "External id": 147843, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973312, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973312, "pid": 0, "tid": 7, "ts": 6303771739976.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771460937.553, "dur": 9.320, + "args": { + "External id": 147843, "cbid": 211, "correlation": 289973312 + } + }, + { + "ph": "s", "id": 289973312, "pid": 5714, "tid": 5714, "ts": 6303771460937.553, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771740539.400, "dur": 143.810, + "args": { + "External id": 147844, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973335, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973335, "pid": 0, "tid": 7, "ts": 6303771740539.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771460969.173, "dur": 5.229, + "args": { + "External id": 147844, "cbid": 211, "correlation": 289973335 + } + }, + { + "ph": "s", "id": 289973335, "pid": 5714, "tid": 5714, "ts": 6303771460969.173, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771740683.914, "dur": 584.390, + "args": { + "External id": 147845, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973358, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973358, "pid": 0, "tid": 7, "ts": 6303771740683.914, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771460994.722, "dur": 5.200, + "args": { + "External id": 147845, "cbid": 211, "correlation": 289973358 + } + }, + { + "ph": "s", "id": 289973358, "pid": 5714, "tid": 5714, "ts": 6303771460994.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771741268.944, "dur": 211.459, + "args": { + "External id": 147862, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973378, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973378, "pid": 0, "tid": 7, "ts": 6303771741268.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461244.372, "dur": 9.260, + "args": { + "External id": 147862, "cbid": 307, "correlation": 289973378 + } + }, + { + "ph": "s", "id": 289973378, "pid": 5714, "tid": 5714, "ts": 6303771461244.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771741481.043, "dur": 174.978, + "args": { + "External id": 147878, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973396, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973396, "pid": 0, "tid": 7, "ts": 6303771741481.043, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461444.312, "dur": 8.809, + "args": { + "External id": 147878, "cbid": 307, "correlation": 289973396 + } + }, + { + "ph": "s", "id": 289973396, "pid": 5714, "tid": 5714, "ts": 6303771461444.312, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771461582.721, "dur": 0.510, + "args": { + "External id": 147884, "cbid": 200, "correlation": 289973403 + } + }, + { + "ph": "f", "id": 289973403, "pid": 5714, "tid": 5714, "ts": 6303771461582.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771461583.361, "dur": 0.190, + "args": { + "External id": 147884, "cbid": 200, "correlation": 289973404 + } + }, + { + "ph": 
"f", "id": 289973404, "pid": 5714, "tid": 5714, "ts": 6303771461583.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771461608.211, "dur": 0.380, + "args": { + "External id": 147884, "cbid": 200, "correlation": 289973427 + } + }, + { + "ph": "f", "id": 289973427, "pid": 5714, "tid": 5714, "ts": 6303771461608.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771461615.371, "dur": 1.960, + "args": { + "External id": 147884, "cbid": 273, "correlation": 289973436 + } + }, + { + "ph": "f", "id": 289973436, "pid": 5714, "tid": 5714, "ts": 6303771461615.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771741656.757, "dur": 548.519, + "args": { + "External id": 147884, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973437, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973437, "pid": 0, "tid": 7, "ts": 6303771741656.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461617.901, "dur": 10.030, + "args": { + "External id": 147884, "cbid": 211, "correlation": 289973437 + } + }, + { + "ph": "s", "id": 289973437, "pid": 5714, "tid": 5714, "ts": 6303771461617.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771742205.916, "dur": 144.193, + "args": { + "External id": 147890, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973460, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973460, "pid": 0, "tid": 7, "ts": 6303771742205.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461690.421, "dur": 7.640, + "args": { + "External id": 147890, "cbid": 211, "correlation": 289973460 + } + }, + { + "ph": "s", "id": 289973460, "pid": 5714, "tid": 5714, "ts": 6303771461690.421, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771742350.717, "dur": 90.689, + "args": { + "External id": 147894, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973486, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973486, "pid": 0, "tid": 7, "ts": 6303771742350.717, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461850.200, "dur": 8.971, + "args": { + "External id": 147894, "cbid": 307, "correlation": 289973486 + } + }, + { + "ph": "s", "id": 289973486, "pid": 5714, "tid": 5714, "ts": 6303771461850.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771742442.014, "dur": 342.884, + "args": { + "External id": 147895, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973506, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973506, "pid": 0, "tid": 7, "ts": 6303771742442.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461889.720, "dur": 5.951, + "args": { + "External id": 147895, "cbid": 211, "correlation": 289973506 + } + }, + { + "ph": "s", "id": 289973506, "pid": 5714, "tid": 5714, "ts": 6303771461889.720, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771742785.506, "dur": 339.012, + "args": { + "External id": 147896, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973529, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973529, "pid": 0, "tid": 7, "ts": 6303771742785.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461915.600, "dur": 4.980, + "args": { + "External id": 147896, "cbid": 211, "correlation": 289973529 + } + }, + { + "ph": "s", "id": 289973529, "pid": 5714, "tid": 5714, "ts": 6303771461915.600, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771743125.254, "dur": 339.460, + "args": { + "External id": 147897, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973541, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973541, "pid": 0, "tid": 7, "ts": 6303771743125.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461952.440, "dur": 5.690, + "args": { + "External id": 147897, "cbid": 307, "correlation": 289973541 + } + }, + { + "ph": "s", "id": 289973541, "pid": 5714, "tid": 5714, "ts": 6303771461952.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771461981.410, "dur": 1.230, + "args": { + "External id": 147898, "cbid": 210, "correlation": 289973561 + } + }, + { + "ph": "f", "id": 289973561, "pid": 5714, "tid": 5714, "ts": 6303771461981.410, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771743465.418, "dur": 557.286, + "args": { + "External id": 
147898, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973562, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973562, "pid": 0, "tid": 7, "ts": 6303771743465.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771461984.360, "dur": 5.360, + "args": { + "External id": 147898, "cbid": 211, "correlation": 289973562 + } + }, + { + "ph": "s", "id": 289973562, "pid": 5714, "tid": 5714, "ts": 6303771461984.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771744023.440, "dur": 173.699, + "args": { + "External id": 147899, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973569, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973569, "pid": 0, "tid": 7, "ts": 6303771744023.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771462019.620, "dur": 5.070, + "args": { + "External id": 147899, "cbid": 307, "correlation": 289973569 + } + }, + { + "ph": "s", "id": 289973569, "pid": 5714, "tid": 5714, "ts": 6303771462019.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771735750.449, "dur": 45.888, + "args": { + "External id": 147915, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289973584, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289973584, "pid": 0, "tid": 17, "ts": 6303771735750.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771462385.010, "dur": 11.919, + "args": { + "External id": 147915, "cbid": 211, "correlation": 289973584 + } + }, + { + "ph": "s", "id": 289973584, "pid": 5714, "tid": 5714, "ts": 6303771462385.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771735818.513, "dur": 13.120, + "args": { + "External id": 147931, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289973597, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289973597, "pid": 0, "tid": 17, "ts": 6303771735818.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771462494.459, "dur": 9.260, + "args": { + "External id": 147931, "cbid": 211, "correlation": 289973597 + } + }, + { + "ph": "s", "id": 289973597, "pid": 5714, "tid": 5714, "ts": 6303771462494.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462527.119, "dur": 1.280, + "args": { + "cbid": 135, "correlation": 289973607 + } + }, + { + "ph": "f", "id": 289973607, "pid": 5714, "tid": 5714, "ts": 6303771462527.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462530.219, "dur": 1.270, + "args": { + "cbid": 147, "correlation": 289973611 + } + }, + { + "ph": "s", "id": 289973611, "pid": 5714, "tid": 5714, "ts": 6303771462530.219, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771462583.259, "dur": 0.840, + "args": { + "External id": 147933, "cbid": 317, "correlation": 289973624 + } + }, + { + "ph": "f", "id": 289973624, "pid": 5714, "tid": 5714, "ts": 6303771462583.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462585.879, "dur": 1.110, + "args": { + "External id": 147933, "cbid": 135, "correlation": 289973626 + } + }, + { + "ph": "f", "id": 289973626, "pid": 5714, "tid": 5714, "ts": 6303771462585.879, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462588.329, "dur": 1.130, + 
"args": { + "External id": 147933, "cbid": 147, "correlation": 289973630 + } + }, + { + "ph": "s", "id": 289973630, "pid": 5714, "tid": 5714, "ts": 6303771462588.329, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771462603.889, "dur": 0.700, + "args": { + "External id": 147933, "cbid": 409, "correlation": 289973633 + } + }, + { + "ph": "f", "id": 289973633, "pid": 5714, "tid": 5714, "ts": 6303771462603.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462608.409, "dur": 0.820, + "args": { + "External id": 147933, "cbid": 135, "correlation": 289973636 + } + }, + { + "ph": "f", "id": 289973636, "pid": 5714, "tid": 5714, "ts": 6303771462608.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462609.409, "dur": 0.850, + "args": { + "External id": 147933, "cbid": 147, "correlation": 289973637 + } + }, + { + "ph": "s", "id": 289973637, "pid": 5714, "tid": 5714, "ts": 6303771462609.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771739913.889, "dur": 4423.507, + "args": { + "External id": 147933, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289973639, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289973639, "pid": 0, "tid": 20, "ts": 6303771739913.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771462611.269, "dur": 9.260, + "args": { + "External id": 147933, "cbid": 430, "correlation": 289973639 + } + }, + { + "ph": "s", "id": 289973639, "pid": 5714, "tid": 5714, "ts": 6303771462611.269, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462621.499, "dur": 0.370, + "args": { + "External id": 147933, "cbid": 135, "correlation": 289973641 + } + }, + { + "ph": "f", "id": 289973641, "pid": 5714, "tid": 5714, "ts": 6303771462621.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462622.059, "dur": 0.510, + "args": { + "External id": 147933, "cbid": 147, "correlation": 289973642 + } + }, + { + "ph": "s", "id": 289973642, "pid": 5714, "tid": 5714, "ts": 6303771462622.059, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462623.959, "dur": 0.670, + "args": { + "External id": 147933, "cbid": 135, "correlation": 289973645 + } + }, + { + "ph": "f", "id": 289973645, "pid": 5714, "tid": 5714, "ts": 6303771462623.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462634.349, "dur": 0.450, + "args": { + "External id": 
147933, "cbid": 135, "correlation": 289973652 + } + }, + { + "ph": "f", "id": 289973652, "pid": 5714, "tid": 5714, "ts": 6303771462634.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462661.589, "dur": 0.980, + "args": { + "External id": 147935, "cbid": 147, "correlation": 289973657 + } + }, + { + "ph": "s", "id": 289973657, "pid": 5714, "tid": 5714, "ts": 6303771462661.589, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462678.779, "dur": 0.820, + "args": { + "cbid": 135, "correlation": 289973672 + } + }, + { + "ph": "f", "id": 289973672, "pid": 5714, "tid": 5714, "ts": 6303771462678.779, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462717.139, "dur": 1.020, + "args": { + "cbid": 147, "correlation": 289973677 + } + }, + { + "ph": "s", "id": 289973677, "pid": 5714, "tid": 5714, "ts": 6303771462717.139, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462719.959, "dur": 0.570, + "args": { + "cbid": 147, "correlation": 289973681 + } + }, + { + "ph": "s", "id": 289973681, "pid": 5714, "tid": 5714, "ts": 6303771462719.959, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771462756.738, "dur": 2.060, + "args": { + "cbid": 147, "correlation": 289973687 + } + }, + { + "ph": "s", "id": 289973687, "pid": 5714, "tid": 5714, "ts": 6303771462756.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771462853.628, "dur": 1.160, + "args": { + "External id": 147948, 
"cbid": 317, "correlation": 289973728 + } + }, + { + "ph": "f", "id": 289973728, "pid": 5714, "tid": 5714, "ts": 6303771462853.628, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771462862.068, "dur": 2.330, + "args": { + "External id": 147949, "cbid": 138, "correlation": 289973731 + } + }, + { + "ph": "f", "id": 289973731, "pid": 5714, "tid": 5714, "ts": 6303771462862.068, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771744339.476, "dur": 2.592, + "args": { + "External id": 147953, "device": 0, "context": 1, "stream": 7, "correlation": 289973742, "bytes": 7224, "memory bandwidth (GB/s)": 2.787037037037037 + } + }, + { + "ph": "f", "id": 289973742, "pid": 0, "tid": 7, "ts": 6303771744339.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771462886.308, "dur": 11.590, + "args": { + "External id": 147953, "cbid": 41, "correlation": 289973742 + } + }, + { + "ph": "s", "id": 289973742, "pid": 5714, "tid": 5714, "ts": 6303771462886.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462901.858, "dur": 1.610, + "args": { + "External id": 147948, "cbid": 135, "correlation": 289973746 + } + }, + { + "ph": "f", "id": 289973746, "pid": 5714, "tid": 5714, "ts": 6303771462901.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771744344.148, "dur": 18.721, + "args": { + "External id": 147948, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289973750, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973750, "pid": 0, "tid": 7, "ts": 6303771744344.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771462906.108, "dur": 9.600, + "args": { + "External id": 147948, "cbid": 211, "correlation": 289973750 + } + }, + { + "ph": "s", "id": 289973750, "pid": 5714, "tid": 5714, "ts": 6303771462906.108, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771462991.448, "dur": 1.260, + "args": { + "cbid": 135, "correlation": 289973761 + } + }, + { + "ph": "f", "id": 289973761, "pid": 5714, "tid": 5714, "ts": 6303771462991.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771744363.605, "dur": 35.424, + "args": { + "External id": 147960, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973787, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973787, "pid": 0, "tid": 7, "ts": 6303771744363.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771463193.408, "dur": 10.420, + "args": { + "External id": 147960, "cbid": 307, "correlation": 289973787 + } + }, + { + "ph": "s", "id": 289973787, "pid": 5714, "tid": 5714, "ts": 6303771463193.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771744399.733, "dur": 574.247, + "args": { + "External id": 147966, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973810, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973810, "pid": 0, "tid": 7, "ts": 6303771744399.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771463340.307, "dur": 10.030, + "args": { + "External id": 147966, "cbid": 211, "correlation": 289973810 + } + }, + { + "ph": "s", "id": 289973810, "pid": 5714, "tid": 5714, "ts": 6303771463340.307, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771744974.620, "dur": 587.398, + "args": { + "External id": 147967, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973833, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973833, "pid": 0, "tid": 7, "ts": 6303771744974.620, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771463373.087, "dur": 5.510, + "args": { + "External id": 147967, "cbid": 211, "correlation": 289973833 + } + }, + { + "ph": "s", "id": 289973833, "pid": 5714, "tid": 5714, "ts": 6303771463373.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771745562.626, "dur": 267.907, + "args": { + "External id": 147968, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973856, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973856, "pid": 0, "tid": 7, "ts": 6303771745562.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771463398.357, "dur": 5.340, + "args": { + "External id": 147968, "cbid": 211, "correlation": 289973856 + } + }, + { + "ph": "s", "id": 289973856, "pid": 5714, "tid": 5714, "ts": 6303771463398.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771745831.237, "dur": 222.883, + "args": { + "External id": 147985, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973876, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973876, "pid": 0, "tid": 7, "ts": 6303771745831.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771463666.967, "dur": 9.409, + "args": { + "External id": 147985, "cbid": 307, "correlation": 289973876 + } + }, + { + "ph": "s", "id": 289973876, "pid": 5714, "tid": 5714, "ts": 6303771463666.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771746054.824, "dur": 112.929, + "args": { + "External id": 148001, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973894, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973894, "pid": 0, "tid": 7, "ts": 6303771746054.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771463855.866, "dur": 8.490, + "args": { + "External id": 148001, "cbid": 307, "correlation": 289973894 + } + }, + { + "ph": "s", "id": 289973894, "pid": 5714, "tid": 5714, "ts": 6303771463855.866, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771463994.216, "dur": 0.480, + "args": { + "External id": 148007, "cbid": 200, "correlation": 289973901 + } + }, + { + "ph": "f", "id": 289973901, "pid": 5714, "tid": 5714, "ts": 6303771463994.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771463994.886, "dur": 0.200, + "args": { + "External id": 148007, "cbid": 200, "correlation": 289973902 + } + }, + { + "ph": 
"f", "id": 289973902, "pid": 5714, "tid": 5714, "ts": 6303771463994.886, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771464020.946, "dur": 0.360, + "args": { + "External id": 148007, "cbid": 200, "correlation": 289973925 + } + }, + { + "ph": "f", "id": 289973925, "pid": 5714, "tid": 5714, "ts": 6303771464020.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771464026.896, "dur": 1.920, + "args": { + "External id": 148007, "cbid": 273, "correlation": 289973934 + } + }, + { + "ph": "f", "id": 289973934, "pid": 5714, "tid": 5714, "ts": 6303771464026.896, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771746168.457, "dur": 457.414, + "args": { + "External id": 148007, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973935, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973935, "pid": 0, "tid": 7, "ts": 6303771746168.457, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464029.396, "dur": 10.020, + "args": { + "External id": 148007, "cbid": 211, "correlation": 289973935 + } + }, + { + "ph": "s", "id": 289973935, "pid": 5714, "tid": 5714, "ts": 6303771464029.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771746626.511, "dur": 144.929, + "args": { + "External id": 148013, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973958, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289973958, "pid": 0, "tid": 7, "ts": 6303771746626.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464101.775, "dur": 7.480, + "args": { + "External id": 148013, "cbid": 211, "correlation": 289973958 + } + }, + { + "ph": "s", "id": 289973958, "pid": 5714, "tid": 5714, "ts": 6303771464101.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771746772.144, "dur": 92.066, + "args": { + "External id": 148017, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289973984, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289973984, "pid": 0, "tid": 7, "ts": 6303771746772.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464263.235, "dur": 8.810, + "args": { + "External id": 148017, "cbid": 307, "correlation": 289973984 + } + }, + { + "ph": "s", "id": 289973984, "pid": 5714, "tid": 5714, "ts": 6303771464263.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771746864.818, "dur": 344.099, + "args": { + "External id": 148018, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974004, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974004, "pid": 0, "tid": 7, "ts": 6303771746864.818, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464319.055, "dur": 6.640, + "args": { + "External id": 148018, "cbid": 211, "correlation": 289974004 + } + }, + { + "ph": "s", "id": 289974004, "pid": 5714, "tid": 5714, "ts": 6303771464319.055, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771747209.589, "dur": 512.678, + "args": { + "External id": 148019, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974027, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974027, "pid": 0, "tid": 7, "ts": 6303771747209.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464350.785, "dur": 5.750, + "args": { + "External id": 148019, "cbid": 211, "correlation": 289974027 + } + }, + { + "ph": "s", "id": 289974027, "pid": 5714, "tid": 5714, "ts": 6303771464350.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771747722.907, "dur": 278.628, + "args": { + "External id": 148020, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974039, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974039, "pid": 0, "tid": 7, "ts": 6303771747722.907, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464391.675, "dur": 5.370, + "args": { + "External id": 148020, "cbid": 307, "correlation": 289974039 + } + }, + { + "ph": "s", "id": 289974039, "pid": 5714, "tid": 5714, "ts": 6303771464391.675, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771464421.115, "dur": 1.270, + "args": { + "External id": 148021, "cbid": 210, "correlation": 289974059 + } + }, + { + "ph": "f", "id": 289974059, "pid": 5714, "tid": 5714, "ts": 6303771464421.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771748002.239, "dur": 487.525, + "args": { + "External id": 
148021, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974060, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974060, "pid": 0, "tid": 7, "ts": 6303771748002.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464424.085, "dur": 5.440, + "args": { + "External id": 148021, "cbid": 211, "correlation": 289974060 + } + }, + { + "ph": "s", "id": 289974060, "pid": 5714, "tid": 5714, "ts": 6303771464424.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771748490.468, "dur": 67.809, + "args": { + "External id": 148022, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974067, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974067, "pid": 0, "tid": 7, "ts": 6303771748490.468, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464458.515, "dur": 4.720, + "args": { + "External id": 148022, "cbid": 307, "correlation": 289974067 + } + }, + { + "ph": "s", "id": 289974067, "pid": 5714, "tid": 5714, "ts": 6303771464458.515, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771739945.345, "dur": 42.497, + "args": { + "External id": 148038, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289974082, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289974082, "pid": 0, "tid": 17, "ts": 6303771739945.345, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464818.604, "dur": 11.980, + "args": { + "External id": 148038, "cbid": 211, "correlation": 289974082 + } + }, + { + "ph": "s", "id": 289974082, "pid": 5714, "tid": 5714, "ts": 6303771464818.604, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771740007.234, "dur": 114.881, + "args": { + "External id": 148054, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289974095, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289974095, "pid": 0, "tid": 17, "ts": 6303771740007.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771464931.024, "dur": 9.520, + "args": { + "External id": 148054, "cbid": 211, "correlation": 289974095 + } + }, + { + "ph": "s", "id": 289974095, "pid": 5714, "tid": 5714, "ts": 6303771464931.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771464963.364, "dur": 1.260, + "args": { + "cbid": 135, "correlation": 289974105 + } + }, + { + "ph": "f", "id": 289974105, "pid": 5714, "tid": 5714, "ts": 6303771464963.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771464966.514, "dur": 1.230, + "args": { + "cbid": 147, "correlation": 289974109 + } + }, + { + "ph": "s", "id": 289974109, "pid": 5714, "tid": 5714, "ts": 6303771464966.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771465021.133, "dur": 0.891, + "args": { + "External id": 148056, "cbid": 317, "correlation": 289974122 + } + }, + { + "ph": "f", "id": 289974122, "pid": 5714, "tid": 5714, "ts": 6303771465021.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465023.813, "dur": 1.091, + "args": { + "External id": 148056, "cbid": 135, "correlation": 289974124 + } + }, + { + "ph": "f", "id": 289974124, "pid": 5714, "tid": 5714, "ts": 6303771465023.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465026.244, "dur": 1.160, + 
"args": { + "External id": 148056, "cbid": 147, "correlation": 289974128 + } + }, + { + "ph": "s", "id": 289974128, "pid": 5714, "tid": 5714, "ts": 6303771465026.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771465041.933, "dur": 0.671, + "args": { + "External id": 148056, "cbid": 409, "correlation": 289974131 + } + }, + { + "ph": "f", "id": 289974131, "pid": 5714, "tid": 5714, "ts": 6303771465041.933, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465046.444, "dur": 0.709, + "args": { + "External id": 148056, "cbid": 135, "correlation": 289974134 + } + }, + { + "ph": "f", "id": 289974134, "pid": 5714, "tid": 5714, "ts": 6303771465046.444, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465047.324, "dur": 0.869, + "args": { + "External id": 148056, "cbid": 147, "correlation": 289974135 + } + }, + { + "ph": "s", "id": 289974135, "pid": 5714, "tid": 5714, "ts": 6303771465047.324, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771744339.476, "dur": 4310.258, + "args": { + "External id": 148056, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289974137, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289974137, "pid": 0, "tid": 20, "ts": 6303771744339.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771465049.444, "dur": 9.320, + "args": { + "External id": 148056, "cbid": 430, "correlation": 289974137 + } + }, + { + "ph": "s", "id": 289974137, "pid": 5714, "tid": 5714, "ts": 6303771465049.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465059.693, "dur": 0.391, + "args": { + "External id": 148056, "cbid": 135, "correlation": 289974139 + } + }, + { + "ph": "f", "id": 289974139, "pid": 5714, "tid": 5714, "ts": 6303771465059.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465060.193, "dur": 0.480, + "args": { + "External id": 148056, "cbid": 147, "correlation": 289974140 + } + }, + { + "ph": "s", "id": 289974140, "pid": 5714, "tid": 5714, "ts": 6303771465060.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465062.213, "dur": 0.771, + "args": { + "External id": 148056, "cbid": 135, "correlation": 289974143 + } + }, + { + "ph": "f", "id": 289974143, "pid": 5714, "tid": 5714, "ts": 6303771465062.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465072.024, "dur": 0.440, + "args": { + "External id": 
148056, "cbid": 135, "correlation": 289974150 + } + }, + { + "ph": "f", "id": 289974150, "pid": 5714, "tid": 5714, "ts": 6303771465072.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465101.443, "dur": 1.000, + "args": { + "External id": 148058, "cbid": 147, "correlation": 289974155 + } + }, + { + "ph": "s", "id": 289974155, "pid": 5714, "tid": 5714, "ts": 6303771465101.443, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465118.343, "dur": 0.980, + "args": { + "cbid": 135, "correlation": 289974170 + } + }, + { + "ph": "f", "id": 289974170, "pid": 5714, "tid": 5714, "ts": 6303771465118.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465158.433, "dur": 1.010, + "args": { + "cbid": 147, "correlation": 289974175 + } + }, + { + "ph": "s", "id": 289974175, "pid": 5714, "tid": 5714, "ts": 6303771465158.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465161.353, "dur": 0.620, + "args": { + "cbid": 147, "correlation": 289974179 + } + }, + { + "ph": "s", "id": 289974179, "pid": 5714, "tid": 5714, "ts": 6303771465161.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771465198.433, "dur": 2.090, + "args": { + "cbid": 147, "correlation": 289974185 + } + }, + { + "ph": "s", "id": 289974185, "pid": 5714, "tid": 5714, "ts": 6303771465198.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771465305.473, "dur": 1.200, + "args": { + "External id": 148071, 
"cbid": 317, "correlation": 289974226 + } + }, + { + "ph": "f", "id": 289974226, "pid": 5714, "tid": 5714, "ts": 6303771465305.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771465317.083, "dur": 2.190, + "args": { + "External id": 148072, "cbid": 138, "correlation": 289974229 + } + }, + { + "ph": "f", "id": 289974229, "pid": 5714, "tid": 5714, "ts": 6303771465317.083, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771748651.750, "dur": 2.432, + "args": { + "External id": 148076, "device": 0, "context": 1, "stream": 7, "correlation": 289974240, "bytes": 7224, "memory bandwidth (GB/s)": 2.9703947368421053 + } + }, + { + "ph": "f", "id": 289974240, "pid": 0, "tid": 7, "ts": 6303771748651.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771465339.843, "dur": 11.780, + "args": { + "External id": 148076, "cbid": 41, "correlation": 289974240 + } + }, + { + "ph": "s", "id": 289974240, "pid": 5714, "tid": 5714, "ts": 6303771465339.843, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465355.633, "dur": 1.660, + "args": { + "External id": 148071, "cbid": 135, "correlation": 289974244 + } + }, + { + "ph": "f", "id": 289974244, "pid": 5714, "tid": 5714, "ts": 6303771465355.633, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771748656.230, "dur": 20.064, + "args": { + "External id": 148071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289974248, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974248, "pid": 0, "tid": 7, "ts": 6303771748656.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771465359.633, "dur": 9.590, + "args": { + "External id": 148071, "cbid": 211, "correlation": 289974248 + } + }, + { + "ph": "s", "id": 289974248, "pid": 5714, "tid": 5714, "ts": 6303771465359.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771465446.983, "dur": 1.240, + "args": { + "cbid": 135, "correlation": 289974259 + } + }, + { + "ph": "f", "id": 289974259, "pid": 5714, "tid": 5714, "ts": 6303771465446.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771748676.934, "dur": 30.081, + "args": { + "External id": 148083, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974285, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974285, "pid": 0, "tid": 7, "ts": 6303771748676.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771465647.202, "dur": 10.100, + "args": { + "External id": 148083, "cbid": 307, "correlation": 289974285 + } + }, + { + "ph": "s", "id": 289974285, "pid": 5714, "tid": 5714, "ts": 6303771465647.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771748707.623, "dur": 149.154, + "args": { + "External id": 148089, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974308, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974308, "pid": 0, "tid": 7, "ts": 6303771748707.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771465783.212, "dur": 9.510, + "args": { + "External id": 148089, "cbid": 211, "correlation": 289974308 + } + }, + { + "ph": "s", "id": 289974308, "pid": 5714, "tid": 5714, "ts": 6303771465783.212, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771748857.417, "dur": 678.856, + "args": { + "External id": 148090, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974331, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974331, "pid": 0, "tid": 7, "ts": 6303771748857.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771465815.272, "dur": 5.710, + "args": { + "External id": 148090, "cbid": 211, "correlation": 289974331 + } + }, + { + "ph": "s", "id": 289974331, "pid": 5714, "tid": 5714, "ts": 6303771465815.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771749536.945, "dur": 569.638, + "args": { + "External id": 148091, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974354, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974354, "pid": 0, "tid": 7, "ts": 6303771749536.945, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771465842.412, "dur": 4.830, + "args": { + "External id": 148091, "cbid": 211, "correlation": 289974354 + } + }, + { + "ph": "s", "id": 289974354, "pid": 5714, "tid": 5714, "ts": 6303771465842.412, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771750107.191, "dur": 54.177, + "args": { + "External id": 148108, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974374, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974374, "pid": 0, "tid": 7, "ts": 6303771750107.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466099.901, "dur": 9.220, + "args": { + "External id": 148108, "cbid": 307, "correlation": 289974374 + } + }, + { + "ph": "s", "id": 289974374, "pid": 5714, "tid": 5714, "ts": 6303771466099.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771750161.976, "dur": 61.632, + "args": { + "External id": 148124, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974392, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974392, "pid": 0, "tid": 7, "ts": 6303771750161.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466288.331, "dur": 15.280, + "args": { + "External id": 148124, "cbid": 307, "correlation": 289974392 + } + }, + { + "ph": "s", "id": 289974392, "pid": 5714, "tid": 5714, "ts": 6303771466288.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771466437.541, "dur": 0.529, + "args": { + "External id": 148130, "cbid": 200, "correlation": 289974399 + } + }, + { + "ph": "f", "id": 289974399, "pid": 5714, "tid": 5714, "ts": 6303771466437.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771466438.181, "dur": 0.200, + "args": { + "External id": 148130, "cbid": 200, "correlation": 289974400 + } + }, + { + "ph": 
"f", "id": 289974400, "pid": 5714, "tid": 5714, "ts": 6303771466438.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771466463.800, "dur": 0.410, + "args": { + "External id": 148130, "cbid": 200, "correlation": 289974423 + } + }, + { + "ph": "f", "id": 289974423, "pid": 5714, "tid": 5714, "ts": 6303771466463.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771466469.670, "dur": 1.940, + "args": { + "External id": 148130, "cbid": 273, "correlation": 289974432 + } + }, + { + "ph": "f", "id": 289974432, "pid": 5714, "tid": 5714, "ts": 6303771466469.670, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771750224.376, "dur": 418.245, + "args": { + "External id": 148130, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974433, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974433, "pid": 0, "tid": 7, "ts": 6303771750224.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466472.220, "dur": 10.190, + "args": { + "External id": 148130, "cbid": 211, "correlation": 289974433 + } + }, + { + "ph": "s", "id": 289974433, "pid": 5714, "tid": 5714, "ts": 6303771466472.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771750643.325, "dur": 144.770, + "args": { + "External id": 148136, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974456, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974456, "pid": 0, "tid": 7, "ts": 6303771750643.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466545.150, "dur": 7.450, + "args": { + "External id": 148136, "cbid": 211, "correlation": 289974456 + } + }, + { + "ph": "s", "id": 289974456, "pid": 5714, "tid": 5714, "ts": 6303771466545.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771750788.703, "dur": 90.817, + "args": { + "External id": 148140, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974482, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974482, "pid": 0, "tid": 7, "ts": 6303771750788.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466705.590, "dur": 8.960, + "args": { + "External id": 148140, "cbid": 307, "correlation": 289974482 + } + }, + { + "ph": "s", "id": 289974482, "pid": 5714, "tid": 5714, "ts": 6303771466705.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771750880.192, "dur": 343.428, + "args": { + "External id": 148141, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974502, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974502, "pid": 0, "tid": 7, "ts": 6303771750880.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466745.870, "dur": 5.890, + "args": { + "External id": 148141, "cbid": 211, "correlation": 289974502 + } + }, + { + "ph": "s", "id": 289974502, "pid": 5714, "tid": 5714, "ts": 6303771466745.870, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771751224.324, "dur": 337.892, + "args": { + "External id": 148142, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974525, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974525, "pid": 0, "tid": 7, "ts": 6303771751224.324, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466774.200, "dur": 5.130, + "args": { + "External id": 148142, "cbid": 211, "correlation": 289974525 + } + }, + { + "ph": "s", "id": 289974525, "pid": 5714, "tid": 5714, "ts": 6303771466774.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771751562.856, "dur": 213.538, + "args": { + "External id": 148143, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974537, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974537, "pid": 0, "tid": 7, "ts": 6303771751562.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466810.480, "dur": 5.280, + "args": { + "External id": 148143, "cbid": 307, "correlation": 289974537 + } + }, + { + "ph": "s", "id": 289974537, "pid": 5714, "tid": 5714, "ts": 6303771466810.480, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771466839.709, "dur": 1.240, + "args": { + "External id": 148144, "cbid": 210, "correlation": 289974557 + } + }, + { + "ph": "f", "id": 289974557, "pid": 5714, "tid": 5714, "ts": 6303771466839.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771751777.098, "dur": 346.404, + "args": { + "External id": 
148144, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974558, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974558, "pid": 0, "tid": 7, "ts": 6303771751777.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466842.689, "dur": 5.240, + "args": { + "External id": 148144, "cbid": 211, "correlation": 289974558 + } + }, + { + "ph": "s", "id": 289974558, "pid": 5714, "tid": 5714, "ts": 6303771466842.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771752124.142, "dur": 41.056, + "args": { + "External id": 148145, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974565, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974565, "pid": 0, "tid": 7, "ts": 6303771752124.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771466876.840, "dur": 4.880, + "args": { + "External id": 148145, "cbid": 307, "correlation": 289974565 + } + }, + { + "ph": "s", "id": 289974565, "pid": 5714, "tid": 5714, "ts": 6303771466876.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771744365.781, "dur": 43.840, + "args": { + "External id": 148161, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289974580, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289974580, "pid": 0, "tid": 17, "ts": 6303771744365.781, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771467232.679, "dur": 11.800, + "args": { + "External id": 148161, "cbid": 211, "correlation": 289974580 + } + }, + { + "ph": "s", "id": 289974580, "pid": 5714, "tid": 5714, "ts": 6303771467232.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771744790.585, "dur": 92.161, + "args": { + "External id": 148177, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289974593, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289974593, "pid": 0, "tid": 17, "ts": 6303771744790.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771467351.818, "dur": 9.640, + "args": { + "External id": 148177, "cbid": 211, "correlation": 289974593 + } + }, + { + "ph": "s", "id": 289974593, "pid": 5714, "tid": 5714, "ts": 6303771467351.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467385.308, "dur": 1.290, + "args": { + "cbid": 135, "correlation": 289974603 + } + }, + { + "ph": "f", "id": 289974603, "pid": 5714, "tid": 5714, "ts": 6303771467385.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467388.448, "dur": 1.260, + "args": { + "cbid": 147, "correlation": 289974607 + } + }, + { + "ph": "s", "id": 289974607, "pid": 5714, "tid": 5714, "ts": 6303771467388.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771467443.288, "dur": 0.860, + "args": { + "External id": 148179, "cbid": 317, "correlation": 289974620 + } + }, + { + "ph": "f", "id": 289974620, "pid": 5714, "tid": 5714, "ts": 6303771467443.288, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467445.938, "dur": 1.190, + "args": { + "External id": 148179, "cbid": 135, "correlation": 289974622 + } + }, + { + "ph": "f", "id": 289974622, "pid": 5714, "tid": 5714, "ts": 6303771467445.938, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467448.418, "dur": 1.150, + 
"args": { + "External id": 148179, "cbid": 147, "correlation": 289974626 + } + }, + { + "ph": "s", "id": 289974626, "pid": 5714, "tid": 5714, "ts": 6303771467448.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771467466.138, "dur": 0.670, + "args": { + "External id": 148179, "cbid": 409, "correlation": 289974629 + } + }, + { + "ph": "f", "id": 289974629, "pid": 5714, "tid": 5714, "ts": 6303771467466.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467470.698, "dur": 0.780, + "args": { + "External id": 148179, "cbid": 135, "correlation": 289974632 + } + }, + { + "ph": "f", "id": 289974632, "pid": 5714, "tid": 5714, "ts": 6303771467470.698, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467471.648, "dur": 0.860, + "args": { + "External id": 148179, "cbid": 147, "correlation": 289974633 + } + }, + { + "ph": "s", "id": 289974633, "pid": 5714, "tid": 5714, "ts": 6303771467471.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771748651.814, "dur": 12559.922, + "args": { + "External id": 148179, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289974635, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289974635, "pid": 0, "tid": 20, "ts": 6303771748651.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771467473.568, "dur": 9.370, + "args": { + "External id": 148179, "cbid": 430, "correlation": 289974635 + } + }, + { + "ph": "s", "id": 289974635, "pid": 5714, "tid": 5714, "ts": 6303771467473.568, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467483.958, "dur": 0.410, + "args": { + "External id": 148179, "cbid": 135, "correlation": 289974637 + } + }, + { + "ph": "f", "id": 289974637, "pid": 5714, "tid": 5714, "ts": 6303771467483.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467484.488, "dur": 0.480, + "args": { + "External id": 148179, "cbid": 147, "correlation": 289974638 + } + }, + { + "ph": "s", "id": 289974638, "pid": 5714, "tid": 5714, "ts": 6303771467484.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467486.418, "dur": 0.790, + "args": { + "External id": 148179, "cbid": 135, "correlation": 289974641 + } + }, + { + "ph": "f", "id": 289974641, "pid": 5714, "tid": 5714, "ts": 6303771467486.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467495.608, "dur": 0.420, + "args": { + "External id": 
148179, "cbid": 135, "correlation": 289974648 + } + }, + { + "ph": "f", "id": 289974648, "pid": 5714, "tid": 5714, "ts": 6303771467495.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467524.468, "dur": 0.970, + "args": { + "External id": 148181, "cbid": 147, "correlation": 289974653 + } + }, + { + "ph": "s", "id": 289974653, "pid": 5714, "tid": 5714, "ts": 6303771467524.468, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467540.918, "dur": 0.820, + "args": { + "cbid": 135, "correlation": 289974668 + } + }, + { + "ph": "f", "id": 289974668, "pid": 5714, "tid": 5714, "ts": 6303771467540.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467579.878, "dur": 1.020, + "args": { + "cbid": 147, "correlation": 289974673 + } + }, + { + "ph": "s", "id": 289974673, "pid": 5714, "tid": 5714, "ts": 6303771467579.878, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467582.748, "dur": 0.580, + "args": { + "cbid": 147, "correlation": 289974677 + } + }, + { + "ph": "s", "id": 289974677, "pid": 5714, "tid": 5714, "ts": 6303771467582.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771467619.918, "dur": 2.090, + "args": { + "cbid": 147, "correlation": 289974683 + } + }, + { + "ph": "s", "id": 289974683, "pid": 5714, "tid": 5714, "ts": 6303771467619.918, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771467718.638, "dur": 1.060, + "args": { + "External id": 148194, 
"cbid": 317, "correlation": 289974724 + } + }, + { + "ph": "f", "id": 289974724, "pid": 5714, "tid": 5714, "ts": 6303771467718.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771467727.207, "dur": 2.331, + "args": { + "External id": 148195, "cbid": 138, "correlation": 289974727 + } + }, + { + "ph": "f", "id": 289974727, "pid": 5714, "tid": 5714, "ts": 6303771467727.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771761215.191, "dur": 2.048, + "args": { + "External id": 148199, "device": 0, "context": 1, "stream": 7, "correlation": 289974738, "bytes": 7224, "memory bandwidth (GB/s)": 3.52734375 + } + }, + { + "ph": "f", "id": 289974738, "pid": 0, "tid": 7, "ts": 6303771761215.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771467749.118, "dur": 11.789, + "args": { + "External id": 148199, "cbid": 41, "correlation": 289974738 + } + }, + { + "ph": "s", "id": 289974738, "pid": 5714, "tid": 5714, "ts": 6303771467749.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467764.987, "dur": 1.860, + "args": { + "External id": 148194, "cbid": 135, "correlation": 289974742 + } + }, + { + "ph": "f", "id": 289974742, "pid": 5714, "tid": 5714, "ts": 6303771467764.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771761219.255, "dur": 84.770, + "args": { + "External id": 148194, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974746, 
"registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974746, "pid": 0, "tid": 7, "ts": 6303771761219.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771467769.238, "dur": 9.609, + "args": { + "External id": 148194, "cbid": 211, "correlation": 289974746 + } + }, + { + "ph": "s", "id": 289974746, "pid": 5714, "tid": 5714, "ts": 6303771467769.238, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771467855.207, "dur": 1.210, + "args": { + "cbid": 135, "correlation": 289974757 + } + }, + { + "ph": "f", "id": 289974757, "pid": 5714, "tid": 5714, "ts": 6303771467855.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771761370.553, "dur": 612.231, + "args": { + "External id": 148206, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974783, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974783, "pid": 0, "tid": 7, "ts": 6303771761370.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468061.317, "dur": 10.300, + "args": { + "External id": 148206, "cbid": 307, "correlation": 289974783 + } + }, + { + "ph": "s", "id": 289974783, "pid": 5714, "tid": 5714, "ts": 6303771468061.317, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771762082.978, "dur": 191.330, + "args": { + "External id": 148212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974806, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974806, "pid": 0, "tid": 7, "ts": 6303771762082.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468197.706, "dur": 9.400, + "args": { + "External id": 148212, "cbid": 211, "correlation": 289974806 + } + }, + { + "ph": "s", "id": 289974806, "pid": 5714, "tid": 5714, "ts": 6303771468197.706, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771762274.980, "dur": 142.529, + "args": { + "External id": 148213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974829, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974829, "pid": 0, "tid": 7, "ts": 6303771762274.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468229.197, "dur": 5.580, + "args": { + "External id": 148213, "cbid": 211, "correlation": 289974829 + } + }, + { + "ph": "s", "id": 289974829, "pid": 5714, "tid": 5714, "ts": 6303771468229.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771762418.214, "dur": 144.161, + "args": { + "External id": 148214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974852, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974852, "pid": 0, "tid": 7, "ts": 6303771762418.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468254.866, "dur": 4.610, + "args": { + "External id": 148214, "cbid": 211, "correlation": 289974852 + } + }, + { + "ph": "s", "id": 289974852, "pid": 5714, "tid": 5714, "ts": 6303771468254.866, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771762563.015, "dur": 53.921, + "args": { + "External id": 148231, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974872, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974872, "pid": 0, "tid": 7, "ts": 6303771762563.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468514.016, "dur": 9.450, + "args": { + "External id": 148231, "cbid": 307, "correlation": 289974872 + } + }, + { + "ph": "s", "id": 289974872, "pid": 5714, "tid": 5714, "ts": 6303771468514.016, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771762617.608, "dur": 60.545, + "args": { + "External id": 148247, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974890, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974890, "pid": 0, "tid": 7, "ts": 6303771762617.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468704.005, "dur": 8.420, + "args": { + "External id": 148247, "cbid": 307, "correlation": 289974890 + } + }, + { + "ph": "s", "id": 289974890, "pid": 5714, "tid": 5714, "ts": 6303771468704.005, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771468842.875, "dur": 0.520, + "args": { + "External id": 148253, "cbid": 200, "correlation": 289974897 + } + }, + { + "ph": "f", "id": 289974897, "pid": 5714, "tid": 5714, "ts": 6303771468842.875, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771468843.505, "dur": 0.200, + "args": { + "External id": 148253, "cbid": 200, "correlation": 289974898 + } + }, + { + "ph": 
"f", "id": 289974898, "pid": 5714, "tid": 5714, "ts": 6303771468843.505, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771468869.385, "dur": 0.380, + "args": { + "External id": 148253, "cbid": 200, "correlation": 289974921 + } + }, + { + "ph": "f", "id": 289974921, "pid": 5714, "tid": 5714, "ts": 6303771468869.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771468875.915, "dur": 1.950, + "args": { + "External id": 148253, "cbid": 273, "correlation": 289974930 + } + }, + { + "ph": "f", "id": 289974930, "pid": 5714, "tid": 5714, "ts": 6303771468875.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771762678.761, "dur": 417.125, + "args": { + "External id": 148253, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974931, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974931, "pid": 0, "tid": 7, "ts": 6303771762678.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468878.455, "dur": 9.870, + "args": { + "External id": 148253, "cbid": 211, "correlation": 289974931 + } + }, + { + "ph": "s", "id": 289974931, "pid": 5714, "tid": 5714, "ts": 6303771468878.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771763096.526, "dur": 144.801, + "args": { + "External id": 148259, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974954, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289974954, "pid": 0, "tid": 7, "ts": 6303771763096.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771468951.425, "dur": 7.170, + "args": { + "External id": 148259, "cbid": 211, "correlation": 289974954 + } + }, + { + "ph": "s", "id": 289974954, "pid": 5714, "tid": 5714, "ts": 6303771468951.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771763242.031, "dur": 89.953, + "args": { + "External id": 148263, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289974980, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289974980, "pid": 0, "tid": 7, "ts": 6303771763242.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469107.775, "dur": 8.900, + "args": { + "External id": 148263, "cbid": 307, "correlation": 289974980 + } + }, + { + "ph": "s", "id": 289974980, "pid": 5714, "tid": 5714, "ts": 6303771469107.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771763332.656, "dur": 344.164, + "args": { + "External id": 148264, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975000, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975000, "pid": 0, "tid": 7, "ts": 6303771763332.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469148.164, "dur": 5.800, + "args": { + "External id": 148264, "cbid": 211, "correlation": 289975000 + } + }, + { + "ph": "s", "id": 289975000, "pid": 5714, "tid": 5714, "ts": 6303771469148.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771763677.524, "dur": 338.724, + "args": { + "External id": 148265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975023, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975023, "pid": 0, "tid": 7, "ts": 6303771763677.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469174.464, "dur": 4.910, + "args": { + "External id": 148265, "cbid": 211, "correlation": 289975023 + } + }, + { + "ph": "s", "id": 289975023, "pid": 5714, "tid": 5714, "ts": 6303771469174.464, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771764016.920, "dur": 215.331, + "args": { + "External id": 148266, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975035, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975035, "pid": 0, "tid": 7, "ts": 6303771764016.920, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469211.924, "dur": 5.190, + "args": { + "External id": 148266, "cbid": 307, "correlation": 289975035 + } + }, + { + "ph": "s", "id": 289975035, "pid": 5714, "tid": 5714, "ts": 6303771469211.924, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771469241.164, "dur": 1.290, + "args": { + "External id": 148267, "cbid": 210, "correlation": 289975055 + } + }, + { + "ph": "f", "id": 289975055, "pid": 5714, "tid": 5714, "ts": 6303771469241.164, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771764232.891, "dur": 348.900, + "args": { + "External id": 
148267, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975056, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975056, "pid": 0, "tid": 7, "ts": 6303771764232.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469244.154, "dur": 5.420, + "args": { + "External id": 148267, "cbid": 211, "correlation": 289975056 + } + }, + { + "ph": "s", "id": 289975056, "pid": 5714, "tid": 5714, "ts": 6303771469244.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771764582.463, "dur": 42.656, + "args": { + "External id": 148268, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975063, "pid": 0, "tid": 7, "ts": 6303771764582.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469280.004, "dur": 5.260, + "args": { + "External id": 148268, "cbid": 307, "correlation": 289975063 + } + }, + { + "ph": "s", "id": 289975063, "pid": 5714, "tid": 5714, "ts": 6303771469280.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771748679.718, "dur": 41.569, + "args": { + "External id": 148284, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289975078, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289975078, "pid": 0, "tid": 17, "ts": 6303771748679.718, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469641.173, "dur": 11.870, + "args": { + "External id": 148284, "cbid": 211, "correlation": 289975078 + } + }, + { + "ph": "s", "id": 289975078, "pid": 5714, "tid": 5714, "ts": 6303771469641.173, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771748738.311, "dur": 14.880, + "args": { + "External id": 148300, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289975091, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289975091, "pid": 0, "tid": 17, "ts": 6303771748738.311, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771469755.373, "dur": 9.420, + "args": { + "External id": 148300, "cbid": 211, "correlation": 289975091 + } + }, + { + "ph": "s", "id": 289975091, "pid": 5714, "tid": 5714, "ts": 6303771469755.373, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469788.063, "dur": 1.230, + "args": { + "cbid": 135, "correlation": 289975101 + } + }, + { + "ph": "f", "id": 289975101, "pid": 5714, "tid": 5714, "ts": 6303771469788.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469791.163, "dur": 1.290, + "args": { + "cbid": 147, "correlation": 289975105 + } + }, + { + "ph": "s", "id": 289975105, "pid": 5714, "tid": 5714, "ts": 6303771469791.163, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771469845.143, "dur": 0.920, + "args": { + "External id": 148302, "cbid": 317, "correlation": 289975118 + } + }, + { + "ph": "f", "id": 289975118, "pid": 5714, "tid": 5714, "ts": 6303771469845.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469847.783, "dur": 1.030, + "args": { + "External id": 148302, "cbid": 135, "correlation": 289975120 + } + }, + { + "ph": "f", "id": 289975120, "pid": 5714, "tid": 5714, "ts": 6303771469847.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469850.153, "dur": 1.100, + 
"args": { + "External id": 148302, "cbid": 147, "correlation": 289975124 + } + }, + { + "ph": "s", "id": 289975124, "pid": 5714, "tid": 5714, "ts": 6303771469850.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771469865.963, "dur": 0.680, + "args": { + "External id": 148302, "cbid": 409, "correlation": 289975127 + } + }, + { + "ph": "f", "id": 289975127, "pid": 5714, "tid": 5714, "ts": 6303771469865.963, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469870.403, "dur": 0.720, + "args": { + "External id": 148302, "cbid": 135, "correlation": 289975130 + } + }, + { + "ph": "f", "id": 289975130, "pid": 5714, "tid": 5714, "ts": 6303771469870.403, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469871.293, "dur": 0.830, + "args": { + "External id": 148302, "cbid": 147, "correlation": 289975131 + } + }, + { + "ph": "s", "id": 289975131, "pid": 5714, "tid": 5714, "ts": 6303771469871.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771761213.399, "dur": 5350.430, + "args": { + "External id": 148302, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289975133, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289975133, "pid": 0, "tid": 20, "ts": 6303771761213.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771469873.233, "dur": 9.320, + "args": { + "External id": 148302, "cbid": 430, "correlation": 289975133 + } + }, + { + "ph": "s", "id": 289975133, "pid": 5714, "tid": 5714, "ts": 6303771469873.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469883.523, "dur": 0.390, + "args": { + "External id": 148302, "cbid": 135, "correlation": 289975135 + } + }, + { + "ph": "f", "id": 289975135, "pid": 5714, "tid": 5714, "ts": 6303771469883.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469884.023, "dur": 0.490, + "args": { + "External id": 148302, "cbid": 147, "correlation": 289975136 + } + }, + { + "ph": "s", "id": 289975136, "pid": 5714, "tid": 5714, "ts": 6303771469884.023, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469886.023, "dur": 0.810, + "args": { + "External id": 148302, "cbid": 135, "correlation": 289975139 + } + }, + { + "ph": "f", "id": 289975139, "pid": 5714, "tid": 5714, "ts": 6303771469886.023, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469897.673, "dur": 0.450, + "args": { + "External id": 
148302, "cbid": 135, "correlation": 289975146 + } + }, + { + "ph": "f", "id": 289975146, "pid": 5714, "tid": 5714, "ts": 6303771469897.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469924.813, "dur": 1.000, + "args": { + "External id": 148304, "cbid": 147, "correlation": 289975151 + } + }, + { + "ph": "s", "id": 289975151, "pid": 5714, "tid": 5714, "ts": 6303771469924.813, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771469941.733, "dur": 0.900, + "args": { + "cbid": 135, "correlation": 289975166 + } + }, + { + "ph": "f", "id": 289975166, "pid": 5714, "tid": 5714, "ts": 6303771469941.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469981.102, "dur": 0.951, + "args": { + "cbid": 147, "correlation": 289975171 + } + }, + { + "ph": "s", "id": 289975171, "pid": 5714, "tid": 5714, "ts": 6303771469981.102, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771469983.833, "dur": 0.609, + "args": { + "cbid": 147, "correlation": 289975175 + } + }, + { + "ph": "s", "id": 289975175, "pid": 5714, "tid": 5714, "ts": 6303771469983.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771470021.253, "dur": 2.000, + "args": { + "cbid": 147, "correlation": 289975181 + } + }, + { + "ph": "s", "id": 289975181, "pid": 5714, "tid": 5714, "ts": 6303771470021.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771470119.142, "dur": 1.110, + "args": { + "External id": 148317, 
"cbid": 317, "correlation": 289975222 + } + }, + { + "ph": "f", "id": 289975222, "pid": 5714, "tid": 5714, "ts": 6303771470119.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771470128.982, "dur": 2.320, + "args": { + "External id": 148318, "cbid": 138, "correlation": 289975225 + } + }, + { + "ph": "f", "id": 289975225, "pid": 5714, "tid": 5714, "ts": 6303771470128.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771766568.726, "dur": 2.016, + "args": { + "External id": 148322, "device": 0, "context": 1, "stream": 7, "correlation": 289975236, "bytes": 7224, "memory bandwidth (GB/s)": 3.5833333333333335 + } + }, + { + "ph": "f", "id": 289975236, "pid": 0, "tid": 7, "ts": 6303771766568.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771470151.962, "dur": 11.860, + "args": { + "External id": 148322, "cbid": 41, "correlation": 289975236 + } + }, + { + "ph": "s", "id": 289975236, "pid": 5714, "tid": 5714, "ts": 6303771470151.962, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771470168.852, "dur": 1.610, + "args": { + "External id": 148317, "cbid": 135, "correlation": 289975240 + } + }, + { + "ph": "f", "id": 289975240, "pid": 5714, "tid": 5714, "ts": 6303771470168.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771766572.950, "dur": 310.499, + "args": { + "External id": 148317, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289975244, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975244, "pid": 0, "tid": 7, "ts": 6303771766572.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771470172.832, "dur": 9.550, + "args": { + "External id": 148317, "cbid": 211, "correlation": 289975244 + } + }, + { + "ph": "s", "id": 289975244, "pid": 5714, "tid": 5714, "ts": 6303771470172.832, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771470257.692, "dur": 1.110, + "args": { + "cbid": 135, "correlation": 289975255 + } + }, + { + "ph": "f", "id": 289975255, "pid": 5714, "tid": 5714, "ts": 6303771470257.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771766940.218, "dur": 542.406, + "args": { + "External id": 148329, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975281, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975281, "pid": 0, "tid": 7, "ts": 6303771766940.218, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771470468.412, "dur": 10.240, + "args": { + "External id": 148329, "cbid": 307, "correlation": 289975281 + } + }, + { + "ph": "s", "id": 289975281, "pid": 5714, "tid": 5714, "ts": 6303771470468.412, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771767483.264, "dur": 146.402, + "args": { + "External id": 148335, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975304, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975304, "pid": 0, "tid": 7, "ts": 6303771767483.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771470604.141, "dur": 9.740, + "args": { + "External id": 148335, "cbid": 211, "correlation": 289975304 + } + }, + { + "ph": "s", "id": 289975304, "pid": 5714, "tid": 5714, "ts": 6303771470604.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771767630.274, "dur": 143.906, + "args": { + "External id": 148336, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975327, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975327, "pid": 0, "tid": 7, "ts": 6303771767630.274, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771470636.391, "dur": 5.710, + "args": { + "External id": 148336, "cbid": 211, "correlation": 289975327 + } + }, + { + "ph": "s", "id": 289975327, "pid": 5714, "tid": 5714, "ts": 6303771470636.391, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771767774.788, "dur": 142.849, + "args": { + "External id": 148337, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975350, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975350, "pid": 0, "tid": 7, "ts": 6303771767774.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771470662.611, "dur": 4.850, + "args": { + "External id": 148337, "cbid": 211, "correlation": 289975350 + } + }, + { + "ph": "s", "id": 289975350, "pid": 5714, "tid": 5714, "ts": 6303771470662.611, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771767918.373, "dur": 54.721, + "args": { + "External id": 148354, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975370, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975370, "pid": 0, "tid": 7, "ts": 6303771767918.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771470913.780, "dur": 9.420, + "args": { + "External id": 148354, "cbid": 307, "correlation": 289975370 + } + }, + { + "ph": "s", "id": 289975370, "pid": 5714, "tid": 5714, "ts": 6303771470913.780, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771767973.702, "dur": 61.121, + "args": { + "External id": 148370, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975388, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975388, "pid": 0, "tid": 7, "ts": 6303771767973.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471106.310, "dur": 8.780, + "args": { + "External id": 148370, "cbid": 307, "correlation": 289975388 + } + }, + { + "ph": "s", "id": 289975388, "pid": 5714, "tid": 5714, "ts": 6303771471106.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771471246.870, "dur": 0.500, + "args": { + "External id": 148376, "cbid": 200, "correlation": 289975395 + } + }, + { + "ph": "f", "id": 289975395, "pid": 5714, "tid": 5714, "ts": 6303771471246.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771471247.500, "dur": 0.190, + "args": { + "External id": 148376, "cbid": 200, "correlation": 289975396 + } + }, + { + "ph": 
"f", "id": 289975396, "pid": 5714, "tid": 5714, "ts": 6303771471247.500, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771471273.470, "dur": 0.390, + "args": { + "External id": 148376, "cbid": 200, "correlation": 289975419 + } + }, + { + "ph": "f", "id": 289975419, "pid": 5714, "tid": 5714, "ts": 6303771471273.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771471279.150, "dur": 1.960, + "args": { + "External id": 148376, "cbid": 273, "correlation": 289975428 + } + }, + { + "ph": "f", "id": 289975428, "pid": 5714, "tid": 5714, "ts": 6303771471279.150, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771768035.495, "dur": 435.525, + "args": { + "External id": 148376, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975429, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975429, "pid": 0, "tid": 7, "ts": 6303771768035.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471281.690, "dur": 10.320, + "args": { + "External id": 148376, "cbid": 211, "correlation": 289975429 + } + }, + { + "ph": "s", "id": 289975429, "pid": 5714, "tid": 5714, "ts": 6303771471281.690, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771768471.756, "dur": 145.025, + "args": { + "External id": 148382, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975452, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975452, "pid": 0, "tid": 7, "ts": 6303771768471.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471376.519, "dur": 8.211, + "args": { + "External id": 148382, "cbid": 211, "correlation": 289975452 + } + }, + { + "ph": "s", "id": 289975452, "pid": 5714, "tid": 5714, "ts": 6303771471376.519, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771768617.517, "dur": 89.441, + "args": { + "External id": 148386, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975478, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975478, "pid": 0, "tid": 7, "ts": 6303771768617.517, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471537.689, "dur": 8.640, + "args": { + "External id": 148386, "cbid": 307, "correlation": 289975478 + } + }, + { + "ph": "s", "id": 289975478, "pid": 5714, "tid": 5714, "ts": 6303771471537.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771768707.534, "dur": 342.021, + "args": { + "External id": 148387, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975498, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975498, "pid": 0, "tid": 7, "ts": 6303771768707.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471577.819, "dur": 5.890, + "args": { + "External id": 148387, "cbid": 211, "correlation": 289975498 + } + }, + { + "ph": "s", "id": 289975498, "pid": 5714, "tid": 5714, "ts": 6303771471577.819, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771769050.227, "dur": 339.331, + "args": { + "External id": 148388, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975521, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975521, "pid": 0, "tid": 7, "ts": 6303771769050.227, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471605.199, "dur": 5.170, + "args": { + "External id": 148388, "cbid": 211, "correlation": 289975521 + } + }, + { + "ph": "s", "id": 289975521, "pid": 5714, "tid": 5714, "ts": 6303771471605.199, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771769390.262, "dur": 214.403, + "args": { + "External id": 148389, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975533, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975533, "pid": 0, "tid": 7, "ts": 6303771769390.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471644.699, "dur": 5.290, + "args": { + "External id": 148389, "cbid": 307, "correlation": 289975533 + } + }, + { + "ph": "s", "id": 289975533, "pid": 5714, "tid": 5714, "ts": 6303771471644.699, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771471673.799, "dur": 1.260, + "args": { + "External id": 148390, "cbid": 210, "correlation": 289975553 + } + }, + { + "ph": "f", "id": 289975553, "pid": 5714, "tid": 5714, "ts": 6303771471673.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771769605.337, "dur": 415.269, + "args": { + "External id": 
148390, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975554, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975554, "pid": 0, "tid": 7, "ts": 6303771769605.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471676.779, "dur": 5.580, + "args": { + "External id": 148390, "cbid": 211, "correlation": 289975554 + } + }, + { + "ph": "s", "id": 289975554, "pid": 5714, "tid": 5714, "ts": 6303771471676.779, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771770021.246, "dur": 186.498, + "args": { + "External id": 148391, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975561, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975561, "pid": 0, "tid": 7, "ts": 6303771770021.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771471711.849, "dur": 4.990, + "args": { + "External id": 148391, "cbid": 307, "correlation": 289975561 + } + }, + { + "ph": "s", "id": 289975561, "pid": 5714, "tid": 5714, "ts": 6303771471711.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771761530.043, "dur": 620.775, + "args": { + "External id": 148407, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289975576, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289975576, "pid": 0, "tid": 17, "ts": 6303771761530.043, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771472178.868, "dur": 11.570, + "args": { + "External id": 148407, "cbid": 211, "correlation": 289975576 + } + }, + { + "ph": "s", "id": 289975576, "pid": 5714, "tid": 5714, "ts": 6303771472178.868, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771762154.722, "dur": 14.977, + "args": { + "External id": 148423, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289975589, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289975589, "pid": 0, "tid": 17, "ts": 6303771762154.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771472288.017, "dur": 17.590, + "args": { + "External id": 148423, "cbid": 211, "correlation": 289975589 + } + }, + { + "ph": "s", "id": 289975589, "pid": 5714, "tid": 5714, "ts": 6303771472288.017, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472329.957, "dur": 1.330, + "args": { + "cbid": 135, "correlation": 289975599 + } + }, + { + "ph": "f", "id": 289975599, "pid": 5714, "tid": 5714, "ts": 6303771472329.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472333.007, "dur": 1.310, + "args": { + "cbid": 147, "correlation": 289975603 + } + }, + { + "ph": "s", "id": 289975603, "pid": 5714, "tid": 5714, "ts": 6303771472333.007, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771472393.007, "dur": 0.880, + "args": { + "External id": 148425, "cbid": 317, "correlation": 289975616 + } + }, + { + "ph": "f", "id": 289975616, "pid": 5714, "tid": 5714, "ts": 6303771472393.007, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472395.697, "dur": 1.090, + "args": { + "External id": 148425, "cbid": 135, "correlation": 289975618 + } + }, + { + "ph": "f", "id": 289975618, "pid": 5714, "tid": 5714, "ts": 6303771472395.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472398.117, "dur": 1.130, 
+ "args": { + "External id": 148425, "cbid": 147, "correlation": 289975622 + } + }, + { + "ph": "s", "id": 289975622, "pid": 5714, "tid": 5714, "ts": 6303771472398.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771472414.227, "dur": 0.620, + "args": { + "External id": 148425, "cbid": 409, "correlation": 289975625 + } + }, + { + "ph": "f", "id": 289975625, "pid": 5714, "tid": 5714, "ts": 6303771472414.227, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472418.657, "dur": 0.730, + "args": { + "External id": 148425, "cbid": 135, "correlation": 289975628 + } + }, + { + "ph": "f", "id": 289975628, "pid": 5714, "tid": 5714, "ts": 6303771472418.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472419.557, "dur": 0.880, + "args": { + "External id": 148425, "cbid": 147, "correlation": 289975629 + } + }, + { + "ph": "s", "id": 289975629, "pid": 5714, "tid": 5714, "ts": 6303771472419.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771766565.334, "dur": 4938.105, + "args": { + "External id": 148425, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289975631, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289975631, "pid": 0, "tid": 20, "ts": 6303771766565.334, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771472421.527, "dur": 9.700, + "args": { + "External id": 148425, "cbid": 430, "correlation": 289975631 + } + }, + { + "ph": "s", "id": 289975631, "pid": 5714, "tid": 5714, "ts": 6303771472421.527, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472432.197, "dur": 0.370, + "args": { + "External id": 148425, "cbid": 135, "correlation": 289975633 + } + }, + { + "ph": "f", "id": 289975633, "pid": 5714, "tid": 5714, "ts": 6303771472432.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472432.687, "dur": 0.470, + "args": { + "External id": 148425, "cbid": 147, "correlation": 289975634 + } + }, + { + "ph": "s", "id": 289975634, "pid": 5714, "tid": 5714, "ts": 6303771472432.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472434.597, "dur": 0.830, + "args": { + "External id": 148425, "cbid": 135, "correlation": 289975637 + } + }, + { + "ph": "f", "id": 289975637, "pid": 5714, "tid": 5714, "ts": 6303771472434.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472443.497, "dur": 0.440, + "args": { + "External id": 
148425, "cbid": 135, "correlation": 289975644 + } + }, + { + "ph": "f", "id": 289975644, "pid": 5714, "tid": 5714, "ts": 6303771472443.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472472.297, "dur": 1.000, + "args": { + "External id": 148427, "cbid": 147, "correlation": 289975649 + } + }, + { + "ph": "s", "id": 289975649, "pid": 5714, "tid": 5714, "ts": 6303771472472.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472488.737, "dur": 0.830, + "args": { + "cbid": 135, "correlation": 289975664 + } + }, + { + "ph": "f", "id": 289975664, "pid": 5714, "tid": 5714, "ts": 6303771472488.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472527.907, "dur": 0.990, + "args": { + "cbid": 147, "correlation": 289975669 + } + }, + { + "ph": "s", "id": 289975669, "pid": 5714, "tid": 5714, "ts": 6303771472527.907, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472531.347, "dur": 0.640, + "args": { + "cbid": 147, "correlation": 289975673 + } + }, + { + "ph": "s", "id": 289975673, "pid": 5714, "tid": 5714, "ts": 6303771472531.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771472568.847, "dur": 1.980, + "args": { + "cbid": 147, "correlation": 289975679 + } + }, + { + "ph": "s", "id": 289975679, "pid": 5714, "tid": 5714, "ts": 6303771472568.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771472664.867, "dur": 1.049, + "args": { + "External id": 148440, 
"cbid": 317, "correlation": 289975720 + } + }, + { + "ph": "f", "id": 289975720, "pid": 5714, "tid": 5714, "ts": 6303771472664.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771472674.767, "dur": 2.360, + "args": { + "External id": 148441, "cbid": 138, "correlation": 289975723 + } + }, + { + "ph": "f", "id": 289975723, "pid": 5714, "tid": 5714, "ts": 6303771472674.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771771507.535, "dur": 1.632, + "args": { + "External id": 148445, "device": 0, "context": 1, "stream": 7, "correlation": 289975734, "bytes": 7224, "memory bandwidth (GB/s)": 4.426470588235294 + } + }, + { + "ph": "f", "id": 289975734, "pid": 0, "tid": 7, "ts": 6303771771507.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771472697.927, "dur": 11.700, + "args": { + "External id": 148445, "cbid": 41, "correlation": 289975734 + } + }, + { + "ph": "s", "id": 289975734, "pid": 5714, "tid": 5714, "ts": 6303771472697.927, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472713.616, "dur": 1.660, + "args": { + "External id": 148440, "cbid": 135, "correlation": 289975738 + } + }, + { + "ph": "f", "id": 289975738, "pid": 5714, "tid": 5714, "ts": 6303771472713.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771771511.151, "dur": 210.594, + "args": { + "External id": 148440, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289975742, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975742, "pid": 0, "tid": 7, "ts": 6303771771511.151, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771472717.656, "dur": 9.571, + "args": { + "External id": 148440, "cbid": 211, "correlation": 289975742 + } + }, + { + "ph": "s", "id": 289975742, "pid": 5714, "tid": 5714, "ts": 6303771472717.656, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771472903.356, "dur": 1.220, + "args": { + "cbid": 135, "correlation": 289975753 + } + }, + { + "ph": "f", "id": 289975753, "pid": 5714, "tid": 5714, "ts": 6303771472903.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771771814.867, "dur": 591.654, + "args": { + "External id": 148452, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975779, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975779, "pid": 0, "tid": 7, "ts": 6303771771814.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473101.146, "dur": 10.020, + "args": { + "External id": 148452, "cbid": 307, "correlation": 289975779 + } + }, + { + "ph": "s", "id": 289975779, "pid": 5714, "tid": 5714, "ts": 6303771473101.146, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771772407.193, "dur": 145.154, + "args": { + "External id": 148458, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975802, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975802, "pid": 0, "tid": 7, "ts": 6303771772407.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473235.595, "dur": 9.720, + "args": { + "External id": 148458, "cbid": 211, "correlation": 289975802 + } + }, + { + "ph": "s", "id": 289975802, "pid": 5714, "tid": 5714, "ts": 6303771473235.595, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771772553.051, "dur": 143.202, + "args": { + "External id": 148459, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975825, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975825, "pid": 0, "tid": 7, "ts": 6303771772553.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473268.055, "dur": 5.570, + "args": { + "External id": 148459, "cbid": 211, "correlation": 289975825 + } + }, + { + "ph": "s", "id": 289975825, "pid": 5714, "tid": 5714, "ts": 6303771473268.055, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771772696.957, "dur": 143.426, + "args": { + "External id": 148460, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975848, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975848, "pid": 0, "tid": 7, "ts": 6303771772696.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473295.485, "dur": 11.420, + "args": { + "External id": 148460, "cbid": 211, "correlation": 289975848 + } + }, + { + "ph": "s", "id": 289975848, "pid": 5714, "tid": 5714, "ts": 6303771473295.485, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771772841.023, "dur": 52.096, + "args": { + "External id": 148477, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975868, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975868, "pid": 0, "tid": 7, "ts": 6303771772841.023, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473554.334, "dur": 9.291, + "args": { + "External id": 148477, "cbid": 307, "correlation": 289975868 + } + }, + { + "ph": "s", "id": 289975868, "pid": 5714, "tid": 5714, "ts": 6303771473554.334, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771772893.823, "dur": 62.913, + "args": { + "External id": 148493, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975886, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975886, "pid": 0, "tid": 7, "ts": 6303771772893.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473756.084, "dur": 8.850, + "args": { + "External id": 148493, "cbid": 307, "correlation": 289975886 + } + }, + { + "ph": "s", "id": 289975886, "pid": 5714, "tid": 5714, "ts": 6303771473756.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771473893.424, "dur": 0.510, + "args": { + "External id": 148499, "cbid": 200, "correlation": 289975893 + } + }, + { + "ph": "f", "id": 289975893, "pid": 5714, "tid": 5714, "ts": 6303771473893.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771473894.134, "dur": 0.190, + "args": { + "External id": 148499, "cbid": 200, "correlation": 289975894 + } + }, + { + "ph": 
"f", "id": 289975894, "pid": 5714, "tid": 5714, "ts": 6303771473894.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771473921.074, "dur": 0.370, + "args": { + "External id": 148499, "cbid": 200, "correlation": 289975917 + } + }, + { + "ph": "f", "id": 289975917, "pid": 5714, "tid": 5714, "ts": 6303771473921.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771473926.734, "dur": 1.920, + "args": { + "External id": 148499, "cbid": 273, "correlation": 289975926 + } + }, + { + "ph": "f", "id": 289975926, "pid": 5714, "tid": 5714, "ts": 6303771473926.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771772957.408, "dur": 422.597, + "args": { + "External id": 148499, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975927, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975927, "pid": 0, "tid": 7, "ts": 6303771772957.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771473929.244, "dur": 10.070, + "args": { + "External id": 148499, "cbid": 211, "correlation": 289975927 + } + }, + { + "ph": "s", "id": 289975927, "pid": 5714, "tid": 5714, "ts": 6303771473929.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771773380.677, "dur": 145.313, + "args": { + "External id": 148505, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975950, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975950, "pid": 0, "tid": 7, "ts": 6303771773380.677, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474002.773, "dur": 7.431, + "args": { + "External id": 148505, "cbid": 211, "correlation": 289975950 + } + }, + { + "ph": "s", "id": 289975950, "pid": 5714, "tid": 5714, "ts": 6303771474002.773, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771773526.630, "dur": 89.953, + "args": { + "External id": 148509, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975976, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289975976, "pid": 0, "tid": 7, "ts": 6303771773526.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474161.423, "dur": 8.840, + "args": { + "External id": 148509, "cbid": 307, "correlation": 289975976 + } + }, + { + "ph": "s", "id": 289975976, "pid": 5714, "tid": 5714, "ts": 6303771474161.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771773617.191, "dur": 346.629, + "args": { + "External id": 148510, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289975996, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289975996, "pid": 0, "tid": 7, "ts": 6303771773617.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474201.493, "dur": 6.010, + "args": { + "External id": 148510, "cbid": 211, "correlation": 289975996 + } + }, + { + "ph": "s", "id": 289975996, "pid": 5714, "tid": 5714, "ts": 6303771474201.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771773964.428, "dur": 357.252, + "args": { + "External id": 148511, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976019, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976019, "pid": 0, "tid": 7, "ts": 6303771773964.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474229.753, "dur": 5.390, + "args": { + "External id": 148511, "cbid": 211, "correlation": 289976019 + } + }, + { + "ph": "s", "id": 289976019, "pid": 5714, "tid": 5714, "ts": 6303771474229.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771774322.320, "dur": 341.092, + "args": { + "External id": 148512, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976031, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976031, "pid": 0, "tid": 7, "ts": 6303771774322.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474267.363, "dur": 5.590, + "args": { + "External id": 148512, "cbid": 307, "correlation": 289976031 + } + }, + { + "ph": "s", "id": 289976031, "pid": 5714, "tid": 5714, "ts": 6303771474267.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771474296.253, "dur": 8.710, + "args": { + "External id": 148513, "cbid": 210, "correlation": 289976051 + } + }, + { + "ph": "f", "id": 289976051, "pid": 5714, "tid": 5714, "ts": 6303771474296.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771774664.052, "dur": 552.966, + "args": { + "External id": 
148513, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976052, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976052, "pid": 0, "tid": 7, "ts": 6303771774664.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474306.753, "dur": 6.070, + "args": { + "External id": 148513, "cbid": 211, "correlation": 289976052 + } + }, + { + "ph": "s", "id": 289976052, "pid": 5714, "tid": 5714, "ts": 6303771474306.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771775217.626, "dur": 213.826, + "args": { + "External id": 148514, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976059, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976059, "pid": 0, "tid": 7, "ts": 6303771775217.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474344.873, "dur": 5.160, + "args": { + "External id": 148514, "cbid": 307, "correlation": 289976059 + } + }, + { + "ph": "s", "id": 289976059, "pid": 5714, "tid": 5714, "ts": 6303771474344.873, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771767089.916, "dur": 403.268, + "args": { + "External id": 148530, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289976074, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289976074, "pid": 0, "tid": 17, "ts": 6303771767089.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474699.682, "dur": 11.770, + "args": { + "External id": 148530, "cbid": 211, "correlation": 289976074 + } + }, + { + "ph": "s", "id": 289976074, "pid": 5714, "tid": 5714, "ts": 6303771474699.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771767509.761, "dur": 17.088, + "args": { + "External id": 148546, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289976087, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289976087, "pid": 0, "tid": 17, "ts": 6303771767509.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771474812.402, "dur": 8.970, + "args": { + "External id": 148546, "cbid": 211, "correlation": 289976087 + } + }, + { + "ph": "s", "id": 289976087, "pid": 5714, "tid": 5714, "ts": 6303771474812.402, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474844.892, "dur": 1.340, + "args": { + "cbid": 135, "correlation": 289976097 + } + }, + { + "ph": "f", "id": 289976097, "pid": 5714, "tid": 5714, "ts": 6303771474844.892, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771474848.112, "dur": 1.280, + "args": { + "cbid": 147, "correlation": 289976101 + } + }, + { + "ph": "s", "id": 289976101, "pid": 5714, "tid": 5714, "ts": 6303771474848.112, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771474900.562, "dur": 0.900, + "args": { + "External id": 148548, "cbid": 317, "correlation": 289976114 + } + }, + { + "ph": "f", "id": 289976114, "pid": 5714, "tid": 5714, "ts": 6303771474900.562, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474903.262, "dur": 1.040, + "args": { + "External id": 148548, "cbid": 135, "correlation": 289976116 + } + }, + { + "ph": "f", "id": 289976116, "pid": 5714, "tid": 5714, "ts": 6303771474903.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771474905.751, "dur": 1.080, + 
"args": { + "External id": 148548, "cbid": 147, "correlation": 289976120 + } + }, + { + "ph": "s", "id": 289976120, "pid": 5714, "tid": 5714, "ts": 6303771474905.751, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771474922.071, "dur": 0.691, + "args": { + "External id": 148548, "cbid": 409, "correlation": 289976123 + } + }, + { + "ph": "f", "id": 289976123, "pid": 5714, "tid": 5714, "ts": 6303771474922.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474926.631, "dur": 0.751, + "args": { + "External id": 148548, "cbid": 135, "correlation": 289976126 + } + }, + { + "ph": "f", "id": 289976126, "pid": 5714, "tid": 5714, "ts": 6303771474926.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771474927.562, "dur": 0.820, + "args": { + "External id": 148548, "cbid": 147, "correlation": 289976127 + } + }, + { + "ph": "s", "id": 289976127, "pid": 5714, "tid": 5714, "ts": 6303771474927.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771771504.623, "dur": 4691.446, + "args": { + "External id": 148548, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289976129, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289976129, "pid": 0, "tid": 20, "ts": 6303771771504.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771474929.482, "dur": 9.169, + "args": { + "External id": 148548, "cbid": 430, "correlation": 289976129 + } + }, + { + "ph": "s", "id": 289976129, "pid": 5714, "tid": 5714, "ts": 6303771474929.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474939.611, "dur": 0.380, + "args": { + "External id": 148548, "cbid": 135, "correlation": 289976131 + } + }, + { + "ph": "f", "id": 289976131, "pid": 5714, "tid": 5714, "ts": 6303771474939.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771474940.102, "dur": 0.440, + "args": { + "External id": 148548, "cbid": 147, "correlation": 289976132 + } + }, + { + "ph": "s", "id": 289976132, "pid": 5714, "tid": 5714, "ts": 6303771474940.102, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474941.911, "dur": 0.820, + "args": { + "External id": 148548, "cbid": 135, "correlation": 289976135 + } + }, + { + "ph": "f", "id": 289976135, "pid": 5714, "tid": 5714, "ts": 6303771474941.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474950.951, "dur": 0.411, + "args": { + "External id": 
148548, "cbid": 135, "correlation": 289976142 + } + }, + { + "ph": "f", "id": 289976142, "pid": 5714, "tid": 5714, "ts": 6303771474950.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771474978.111, "dur": 0.931, + "args": { + "External id": 148550, "cbid": 147, "correlation": 289976147 + } + }, + { + "ph": "s", "id": 289976147, "pid": 5714, "tid": 5714, "ts": 6303771474978.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771474994.691, "dur": 0.840, + "args": { + "cbid": 135, "correlation": 289976162 + } + }, + { + "ph": "f", "id": 289976162, "pid": 5714, "tid": 5714, "ts": 6303771474994.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771475033.781, "dur": 0.940, + "args": { + "cbid": 147, "correlation": 289976167 + } + }, + { + "ph": "s", "id": 289976167, "pid": 5714, "tid": 5714, "ts": 6303771475033.781, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771475037.081, "dur": 0.650, + "args": { + "cbid": 147, "correlation": 289976171 + } + }, + { + "ph": "s", "id": 289976171, "pid": 5714, "tid": 5714, "ts": 6303771475037.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771475074.201, "dur": 2.070, + "args": { + "cbid": 147, "correlation": 289976177 + } + }, + { + "ph": "s", "id": 289976177, "pid": 5714, "tid": 5714, "ts": 6303771475074.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771475171.931, "dur": 1.120, + "args": { + "External id": 148563, 
"cbid": 317, "correlation": 289976218 + } + }, + { + "ph": "f", "id": 289976218, "pid": 5714, "tid": 5714, "ts": 6303771475171.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771475181.671, "dur": 2.310, + "args": { + "External id": 148564, "cbid": 138, "correlation": 289976221 + } + }, + { + "ph": "f", "id": 289976221, "pid": 5714, "tid": 5714, "ts": 6303771475181.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771776197.381, "dur": 1.568, + "args": { + "External id": 148568, "device": 0, "context": 1, "stream": 7, "correlation": 289976232, "bytes": 7224, "memory bandwidth (GB/s)": 4.607142857142857 + } + }, + { + "ph": "f", "id": 289976232, "pid": 0, "tid": 7, "ts": 6303771776197.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771475204.151, "dur": 11.500, + "args": { + "External id": 148568, "cbid": 41, "correlation": 289976232 + } + }, + { + "ph": "s", "id": 289976232, "pid": 5714, "tid": 5714, "ts": 6303771475204.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771475220.831, "dur": 1.680, + "args": { + "External id": 148563, "cbid": 135, "correlation": 289976236 + } + }, + { + "ph": "f", "id": 289976236, "pid": 5714, "tid": 5714, "ts": 6303771475220.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771776201.029, "dur": 16.833, + "args": { + "External id": 148563, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289976240, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976240, "pid": 0, "tid": 7, "ts": 6303771776201.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771475224.931, "dur": 9.630, + "args": { + "External id": 148563, "cbid": 211, "correlation": 289976240 + } + }, + { + "ph": "s", "id": 289976240, "pid": 5714, "tid": 5714, "ts": 6303771475224.931, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771475319.811, "dur": 1.230, + "args": { + "cbid": 135, "correlation": 289976251 + } + }, + { + "ph": "f", "id": 289976251, "pid": 5714, "tid": 5714, "ts": 6303771475319.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771776218.470, "dur": 551.046, + "args": { + "External id": 148575, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976277, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976277, "pid": 0, "tid": 7, "ts": 6303771776218.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771475517.620, "dur": 10.140, + "args": { + "External id": 148575, "cbid": 307, "correlation": 289976277 + } + }, + { + "ph": "s", "id": 289976277, "pid": 5714, "tid": 5714, "ts": 6303771475517.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771776871.757, "dur": 380.901, + "args": { + "External id": 148581, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976300, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976300, "pid": 0, "tid": 7, "ts": 6303771776871.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771475656.170, "dur": 9.660, + "args": { + "External id": 148581, "cbid": 211, "correlation": 289976300 + } + }, + { + "ph": "s", "id": 289976300, "pid": 5714, "tid": 5714, "ts": 6303771475656.170, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771777253.330, "dur": 143.362, + "args": { + "External id": 148582, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976323, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976323, "pid": 0, "tid": 7, "ts": 6303771777253.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771475688.640, "dur": 5.620, + "args": { + "External id": 148582, "cbid": 211, "correlation": 289976323 + } + }, + { + "ph": "s", "id": 289976323, "pid": 5714, "tid": 5714, "ts": 6303771475688.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771777397.428, "dur": 143.841, + "args": { + "External id": 148583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976346, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976346, "pid": 0, "tid": 7, "ts": 6303771777397.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771475713.950, "dur": 4.750, + "args": { + "External id": 148583, "cbid": 211, "correlation": 289976346 + } + }, + { + "ph": "s", "id": 289976346, "pid": 5714, "tid": 5714, "ts": 6303771475713.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771777542.005, "dur": 53.153, + "args": { + "External id": 148600, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976366, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976366, "pid": 0, "tid": 7, "ts": 6303771777542.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771475958.999, "dur": 9.210, + "args": { + "External id": 148600, "cbid": 307, "correlation": 289976366 + } + }, + { + "ph": "s", "id": 289976366, "pid": 5714, "tid": 5714, "ts": 6303771475958.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771777595.766, "dur": 62.496, + "args": { + "External id": 148616, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976384, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976384, "pid": 0, "tid": 7, "ts": 6303771777595.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476141.619, "dur": 8.590, + "args": { + "External id": 148616, "cbid": 307, "correlation": 289976384 + } + }, + { + "ph": "s", "id": 289976384, "pid": 5714, "tid": 5714, "ts": 6303771476141.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771476277.188, "dur": 0.460, + "args": { + "External id": 148622, "cbid": 200, "correlation": 289976391 + } + }, + { + "ph": "f", "id": 289976391, "pid": 5714, "tid": 5714, "ts": 6303771476277.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771476277.779, "dur": 0.189, + "args": { + "External id": 148622, "cbid": 200, "correlation": 289976392 + } + }, + { + "ph": 
"f", "id": 289976392, "pid": 5714, "tid": 5714, "ts": 6303771476277.779, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771476310.768, "dur": 0.391, + "args": { + "External id": 148622, "cbid": 200, "correlation": 289976415 + } + }, + { + "ph": "f", "id": 289976415, "pid": 5714, "tid": 5714, "ts": 6303771476310.768, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771476317.759, "dur": 1.909, + "args": { + "External id": 148622, "cbid": 273, "correlation": 289976424 + } + }, + { + "ph": "f", "id": 289976424, "pid": 5714, "tid": 5714, "ts": 6303771476317.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771777658.966, "dur": 427.397, + "args": { + "External id": 148622, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976425, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976425, "pid": 0, "tid": 7, "ts": 6303771777658.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476320.248, "dur": 10.351, + "args": { + "External id": 148622, "cbid": 211, "correlation": 289976425 + } + }, + { + "ph": "s", "id": 289976425, "pid": 5714, "tid": 5714, "ts": 6303771476320.248, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771778087.035, "dur": 145.122, + "args": { + "External id": 148628, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976448, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976448, "pid": 0, "tid": 7, "ts": 6303771778087.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476393.888, "dur": 7.380, + "args": { + "External id": 148628, "cbid": 211, "correlation": 289976448 + } + }, + { + "ph": "s", "id": 289976448, "pid": 5714, "tid": 5714, "ts": 6303771476393.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771778232.893, "dur": 89.057, + "args": { + "External id": 148632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976474, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976474, "pid": 0, "tid": 7, "ts": 6303771778232.893, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476553.728, "dur": 8.740, + "args": { + "External id": 148632, "cbid": 307, "correlation": 289976474 + } + }, + { + "ph": "s", "id": 289976474, "pid": 5714, "tid": 5714, "ts": 6303771476553.728, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771778322.590, "dur": 342.020, + "args": { + "External id": 148633, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976494, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976494, "pid": 0, "tid": 7, "ts": 6303771778322.590, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476593.158, "dur": 5.840, + "args": { + "External id": 148633, "cbid": 211, "correlation": 289976494 + } + }, + { + "ph": "s", "id": 289976494, "pid": 5714, "tid": 5714, "ts": 6303771476593.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771778665.314, "dur": 371.653, + "args": { + "External id": 148634, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976517, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976517, "pid": 0, "tid": 7, "ts": 6303771778665.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476619.948, "dur": 5.170, + "args": { + "External id": 148634, "cbid": 211, "correlation": 289976517 + } + }, + { + "ph": "s", "id": 289976517, "pid": 5714, "tid": 5714, "ts": 6303771476619.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771779037.639, "dur": 339.747, + "args": { + "External id": 148635, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976529, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976529, "pid": 0, "tid": 7, "ts": 6303771779037.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476658.338, "dur": 5.440, + "args": { + "External id": 148635, "cbid": 307, "correlation": 289976529 + } + }, + { + "ph": "s", "id": 289976529, "pid": 5714, "tid": 5714, "ts": 6303771476658.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771476687.098, "dur": 1.280, + "args": { + "External id": 148636, "cbid": 210, "correlation": 289976549 + } + }, + { + "ph": "f", "id": 289976549, "pid": 5714, "tid": 5714, "ts": 6303771476687.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771779378.122, "dur": 608.135, + "args": { + "External id": 
148636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976550, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976550, "pid": 0, "tid": 7, "ts": 6303771779378.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476690.158, "dur": 5.400, + "args": { + "External id": 148636, "cbid": 211, "correlation": 289976550 + } + }, + { + "ph": "s", "id": 289976550, "pid": 5714, "tid": 5714, "ts": 6303771476690.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771779986.929, "dur": 187.523, + "args": { + "External id": 148637, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976557, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976557, "pid": 0, "tid": 7, "ts": 6303771779986.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771476723.578, "dur": 5.060, + "args": { + "External id": 148637, "cbid": 307, "correlation": 289976557 + } + }, + { + "ph": "s", "id": 289976557, "pid": 5714, "tid": 5714, "ts": 6303771476723.578, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771771905.428, "dur": 521.350, + "args": { + "External id": 148653, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289976572, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289976572, "pid": 0, "tid": 17, "ts": 6303771771905.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771477076.187, "dur": 11.570, + "args": { + "External id": 148653, "cbid": 211, "correlation": 289976572 + } + }, + { + "ph": "s", "id": 289976572, "pid": 5714, "tid": 5714, "ts": 6303771477076.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771772433.722, "dur": 16.032, + "args": { + "External id": 148669, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289976585, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289976585, "pid": 0, "tid": 17, "ts": 6303771772433.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771477184.306, "dur": 8.871, + "args": { + "External id": 148669, "cbid": 211, "correlation": 289976585 + } + }, + { + "ph": "s", "id": 289976585, "pid": 5714, "tid": 5714, "ts": 6303771477184.306, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477216.286, "dur": 1.291, + "args": { + "cbid": 135, "correlation": 289976595 + } + }, + { + "ph": "f", "id": 289976595, "pid": 5714, "tid": 5714, "ts": 6303771477216.286, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477219.366, "dur": 1.300, + "args": { + "cbid": 147, "correlation": 289976599 + } + }, + { + "ph": "s", "id": 289976599, "pid": 5714, "tid": 5714, "ts": 6303771477219.366, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771477272.526, "dur": 0.910, + "args": { + "External id": 148671, "cbid": 317, "correlation": 289976612 + } + }, + { + "ph": "f", "id": 289976612, "pid": 5714, "tid": 5714, "ts": 6303771477272.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477275.966, "dur": 1.060, + "args": { + "External id": 148671, "cbid": 135, "correlation": 289976614 + } + }, + { + "ph": "f", "id": 289976614, "pid": 5714, "tid": 5714, "ts": 6303771477275.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477278.386, "dur": 1.080, + 
"args": { + "External id": 148671, "cbid": 147, "correlation": 289976618 + } + }, + { + "ph": "s", "id": 289976618, "pid": 5714, "tid": 5714, "ts": 6303771477278.386, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771477294.656, "dur": 0.670, + "args": { + "External id": 148671, "cbid": 409, "correlation": 289976621 + } + }, + { + "ph": "f", "id": 289976621, "pid": 5714, "tid": 5714, "ts": 6303771477294.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477306.616, "dur": 0.780, + "args": { + "External id": 148671, "cbid": 135, "correlation": 289976624 + } + }, + { + "ph": "f", "id": 289976624, "pid": 5714, "tid": 5714, "ts": 6303771477306.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477307.566, "dur": 0.950, + "args": { + "External id": 148671, "cbid": 147, "correlation": 289976625 + } + }, + { + "ph": "s", "id": 289976625, "pid": 5714, "tid": 5714, "ts": 6303771477307.566, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771776197.189, "dur": 4644.630, + "args": { + "External id": 148671, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289976627, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289976627, "pid": 0, "tid": 20, "ts": 6303771776197.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771477309.576, "dur": 9.840, + "args": { + "External id": 148671, "cbid": 430, "correlation": 289976627 + } + }, + { + "ph": "s", "id": 289976627, "pid": 5714, "tid": 5714, "ts": 6303771477309.576, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477320.346, "dur": 0.410, + "args": { + "External id": 148671, "cbid": 135, "correlation": 289976629 + } + }, + { + "ph": "f", "id": 289976629, "pid": 5714, "tid": 5714, "ts": 6303771477320.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477320.876, "dur": 0.550, + "args": { + "External id": 148671, "cbid": 147, "correlation": 289976630 + } + }, + { + "ph": "s", "id": 289976630, "pid": 5714, "tid": 5714, "ts": 6303771477320.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477323.176, "dur": 0.910, + "args": { + "External id": 148671, "cbid": 135, "correlation": 289976633 + } + }, + { + "ph": "f", "id": 289976633, "pid": 5714, "tid": 5714, "ts": 6303771477323.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477332.146, "dur": 0.420, + "args": { + "External id": 
148671, "cbid": 135, "correlation": 289976640 + } + }, + { + "ph": "f", "id": 289976640, "pid": 5714, "tid": 5714, "ts": 6303771477332.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477360.676, "dur": 1.070, + "args": { + "External id": 148673, "cbid": 147, "correlation": 289976645 + } + }, + { + "ph": "s", "id": 289976645, "pid": 5714, "tid": 5714, "ts": 6303771477360.676, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477377.706, "dur": 0.850, + "args": { + "cbid": 135, "correlation": 289976660 + } + }, + { + "ph": "f", "id": 289976660, "pid": 5714, "tid": 5714, "ts": 6303771477377.706, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477416.166, "dur": 0.980, + "args": { + "cbid": 147, "correlation": 289976665 + } + }, + { + "ph": "s", "id": 289976665, "pid": 5714, "tid": 5714, "ts": 6303771477416.166, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477419.076, "dur": 0.610, + "args": { + "cbid": 147, "correlation": 289976669 + } + }, + { + "ph": "s", "id": 289976669, "pid": 5714, "tid": 5714, "ts": 6303771477419.076, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771477455.676, "dur": 2.100, + "args": { + "cbid": 147, "correlation": 289976675 + } + }, + { + "ph": "s", "id": 289976675, "pid": 5714, "tid": 5714, "ts": 6303771477455.676, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771477553.356, "dur": 1.100, + "args": { + "External id": 148686, 
"cbid": 317, "correlation": 289976716 + } + }, + { + "ph": "f", "id": 289976716, "pid": 5714, "tid": 5714, "ts": 6303771477553.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771477564.436, "dur": 2.280, + "args": { + "External id": 148687, "cbid": 138, "correlation": 289976719 + } + }, + { + "ph": "f", "id": 289976719, "pid": 5714, "tid": 5714, "ts": 6303771477564.436, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771780846.363, "dur": 1.600, + "args": { + "External id": 148691, "device": 0, "context": 1, "stream": 7, "correlation": 289976730, "bytes": 7224, "memory bandwidth (GB/s)": 4.515 + } + }, + { + "ph": "f", "id": 289976730, "pid": 0, "tid": 7, "ts": 6303771780846.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771477588.126, "dur": 11.779, + "args": { + "External id": 148691, "cbid": 41, "correlation": 289976730 + } + }, + { + "ph": "s", "id": 289976730, "pid": 5714, "tid": 5714, "ts": 6303771477588.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477603.905, "dur": 1.600, + "args": { + "External id": 148686, "cbid": 135, "correlation": 289976734 + } + }, + { + "ph": "f", "id": 289976734, "pid": 5714, "tid": 5714, "ts": 6303771477603.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771780852.571, "dur": 438.886, + "args": { + "External id": 148686, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976738, 
"registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976738, "pid": 0, "tid": 7, "ts": 6303771780852.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771477607.916, "dur": 9.940, + "args": { + "External id": 148686, "cbid": 211, "correlation": 289976738 + } + }, + { + "ph": "s", "id": 289976738, "pid": 5714, "tid": 5714, "ts": 6303771477607.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771477695.005, "dur": 1.330, + "args": { + "cbid": 135, "correlation": 289976749 + } + }, + { + "ph": "f", "id": 289976749, "pid": 5714, "tid": 5714, "ts": 6303771477695.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771781294.721, "dur": 469.189, + "args": { + "External id": 148698, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976775, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976775, "pid": 0, "tid": 7, "ts": 6303771781294.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771477894.365, "dur": 10.100, + "args": { + "External id": 148698, "cbid": 307, "correlation": 289976775 + } + }, + { + "ph": "s", "id": 289976775, "pid": 5714, "tid": 5714, "ts": 6303771477894.365, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771781764.582, "dur": 144.354, + "args": { + "External id": 148704, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976798, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976798, "pid": 0, "tid": 7, "ts": 6303771781764.582, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478037.285, "dur": 9.770, + "args": { + "External id": 148704, "cbid": 211, "correlation": 289976798 + } + }, + { + "ph": "s", "id": 289976798, "pid": 5714, "tid": 5714, "ts": 6303771478037.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771781909.544, "dur": 143.201, + "args": { + "External id": 148705, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976821, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976821, "pid": 0, "tid": 7, "ts": 6303771781909.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478069.944, "dur": 5.380, + "args": { + "External id": 148705, "cbid": 211, "correlation": 289976821 + } + }, + { + "ph": "s", "id": 289976821, "pid": 5714, "tid": 5714, "ts": 6303771478069.944, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771782053.449, "dur": 144.034, + "args": { + "External id": 148706, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976844, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976844, "pid": 0, "tid": 7, "ts": 6303771782053.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478094.355, "dur": 4.809, + "args": { + "External id": 148706, "cbid": 211, "correlation": 289976844 + } + }, + { + "ph": "s", "id": 289976844, "pid": 5714, "tid": 5714, "ts": 6303771478094.355, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771782198.219, "dur": 52.577, + "args": { + "External id": 148723, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976864, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976864, "pid": 0, "tid": 7, "ts": 6303771782198.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478353.854, "dur": 9.470, + "args": { + "External id": 148723, "cbid": 307, "correlation": 289976864 + } + }, + { + "ph": "s", "id": 289976864, "pid": 5714, "tid": 5714, "ts": 6303771478353.854, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771782251.404, "dur": 62.209, + "args": { + "External id": 148739, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976882, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976882, "pid": 0, "tid": 7, "ts": 6303771782251.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478537.394, "dur": 8.120, + "args": { + "External id": 148739, "cbid": 307, "correlation": 289976882 + } + }, + { + "ph": "s", "id": 289976882, "pid": 5714, "tid": 5714, "ts": 6303771478537.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771478674.723, "dur": 0.500, + "args": { + "External id": 148745, "cbid": 200, "correlation": 289976889 + } + }, + { + "ph": "f", "id": 289976889, "pid": 5714, "tid": 5714, "ts": 6303771478674.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771478675.343, "dur": 0.180, + "args": { + "External id": 148745, "cbid": 200, "correlation": 289976890 + } + }, + { + "ph": 
"f", "id": 289976890, "pid": 5714, "tid": 5714, "ts": 6303771478675.343, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771478702.223, "dur": 0.390, + "args": { + "External id": 148745, "cbid": 200, "correlation": 289976913 + } + }, + { + "ph": "f", "id": 289976913, "pid": 5714, "tid": 5714, "ts": 6303771478702.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771478707.553, "dur": 1.940, + "args": { + "External id": 148745, "cbid": 273, "correlation": 289976922 + } + }, + { + "ph": "f", "id": 289976922, "pid": 5714, "tid": 5714, "ts": 6303771478707.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771782314.317, "dur": 425.572, + "args": { + "External id": 148745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976923, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976923, "pid": 0, "tid": 7, "ts": 6303771782314.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478710.073, "dur": 9.800, + "args": { + "External id": 148745, "cbid": 211, "correlation": 289976923 + } + }, + { + "ph": "s", "id": 289976923, "pid": 5714, "tid": 5714, "ts": 6303771478710.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771782740.529, "dur": 145.410, + "args": { + "External id": 148751, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976946, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976946, "pid": 0, "tid": 7, "ts": 6303771782740.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478782.603, "dur": 7.450, + "args": { + "External id": 148751, "cbid": 211, "correlation": 289976946 + } + }, + { + "ph": "s", "id": 289976946, "pid": 5714, "tid": 5714, "ts": 6303771478782.603, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771782886.675, "dur": 91.457, + "args": { + "External id": 148755, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976972, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289976972, "pid": 0, "tid": 7, "ts": 6303771782886.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478941.913, "dur": 8.700, + "args": { + "External id": 148755, "cbid": 307, "correlation": 289976972 + } + }, + { + "ph": "s", "id": 289976972, "pid": 5714, "tid": 5714, "ts": 6303771478941.913, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771782978.836, "dur": 427.301, + "args": { + "External id": 148756, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289976992, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289976992, "pid": 0, "tid": 7, "ts": 6303771782978.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771478981.833, "dur": 6.369, + "args": { + "External id": 148756, "cbid": 211, "correlation": 289976992 + } + }, + { + "ph": "s", "id": 289976992, "pid": 5714, "tid": 5714, "ts": 6303771478981.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771783406.713, "dur": 526.918, + "args": { + "External id": 148757, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977015, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977015, "pid": 0, "tid": 7, "ts": 6303771783406.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771479009.193, "dur": 5.160, + "args": { + "External id": 148757, "cbid": 211, "correlation": 289977015 + } + }, + { + "ph": "s", "id": 289977015, "pid": 5714, "tid": 5714, "ts": 6303771479009.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771783934.271, "dur": 327.748, + "args": { + "External id": 148758, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977027, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977027, "pid": 0, "tid": 7, "ts": 6303771783934.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771479046.922, "dur": 5.370, + "args": { + "External id": 148758, "cbid": 307, "correlation": 289977027 + } + }, + { + "ph": "s", "id": 289977027, "pid": 5714, "tid": 5714, "ts": 6303771479046.922, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771479075.872, "dur": 1.170, + "args": { + "External id": 148759, "cbid": 210, "correlation": 289977047 + } + }, + { + "ph": "f", "id": 289977047, "pid": 5714, "tid": 5714, "ts": 6303771479075.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771784262.755, "dur": 507.014, + "args": { + "External id": 
148759, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977048, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977048, "pid": 0, "tid": 7, "ts": 6303771784262.755, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771479078.802, "dur": 5.640, + "args": { + "External id": 148759, "cbid": 211, "correlation": 289977048 + } + }, + { + "ph": "s", "id": 289977048, "pid": 5714, "tid": 5714, "ts": 6303771479078.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771784770.441, "dur": 69.185, + "args": { + "External id": 148760, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977055, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977055, "pid": 0, "tid": 7, "ts": 6303771784770.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771479112.522, "dur": 5.000, + "args": { + "External id": 148760, "cbid": 307, "correlation": 289977055 + } + }, + { + "ph": "s", "id": 289977055, "pid": 5714, "tid": 5714, "ts": 6303771479112.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771776265.158, "dur": 790.857, + "args": { + "External id": 148776, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289977070, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289977070, "pid": 0, "tid": 17, "ts": 6303771776265.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771479479.201, "dur": 11.860, + "args": { + "External id": 148776, "cbid": 211, "correlation": 289977070 + } + }, + { + "ph": "s", "id": 289977070, "pid": 5714, "tid": 5714, "ts": 6303771479479.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771777134.160, "dur": 14.401, + "args": { + "External id": 148792, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289977083, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289977083, "pid": 0, "tid": 17, "ts": 6303771777134.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771479593.321, "dur": 9.260, + "args": { + "External id": 148792, "cbid": 211, "correlation": 289977083 + } + }, + { + "ph": "s", "id": 289977083, "pid": 5714, "tid": 5714, "ts": 6303771479593.321, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479625.781, "dur": 1.330, + "args": { + "cbid": 135, "correlation": 289977093 + } + }, + { + "ph": "f", "id": 289977093, "pid": 5714, "tid": 5714, "ts": 6303771479625.781, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479628.831, "dur": 1.310, + "args": { + "cbid": 147, "correlation": 289977097 + } + }, + { + "ph": "s", "id": 289977097, "pid": 5714, "tid": 5714, "ts": 6303771479628.831, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771479682.101, "dur": 0.870, + "args": { + "External id": 148794, "cbid": 317, "correlation": 289977110 + } + }, + { + "ph": "f", "id": 289977110, "pid": 5714, "tid": 5714, "ts": 6303771479682.101, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479684.721, "dur": 1.090, + "args": { + "External id": 148794, "cbid": 135, "correlation": 289977112 + } + }, + { + "ph": "f", "id": 289977112, "pid": 5714, "tid": 5714, "ts": 6303771479684.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479687.181, "dur": 1.100, + 
"args": { + "External id": 148794, "cbid": 147, "correlation": 289977116 + } + }, + { + "ph": "s", "id": 289977116, "pid": 5714, "tid": 5714, "ts": 6303771479687.181, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771479703.521, "dur": 0.680, + "args": { + "External id": 148794, "cbid": 409, "correlation": 289977119 + } + }, + { + "ph": "f", "id": 289977119, "pid": 5714, "tid": 5714, "ts": 6303771479703.521, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479708.051, "dur": 0.760, + "args": { + "External id": 148794, "cbid": 135, "correlation": 289977122 + } + }, + { + "ph": "f", "id": 289977122, "pid": 5714, "tid": 5714, "ts": 6303771479708.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479708.981, "dur": 0.950, + "args": { + "External id": 148794, "cbid": 147, "correlation": 289977123 + } + }, + { + "ph": "s", "id": 289977123, "pid": 5714, "tid": 5714, "ts": 6303771479708.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771780844.059, "dur": 4039.439, + "args": { + "External id": 148794, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289977125, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289977125, "pid": 0, "tid": 20, "ts": 6303771780844.059, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771479710.931, "dur": 9.260, + "args": { + "External id": 148794, "cbid": 430, "correlation": 289977125 + } + }, + { + "ph": "s", "id": 289977125, "pid": 5714, "tid": 5714, "ts": 6303771479710.931, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479721.131, "dur": 0.390, + "args": { + "External id": 148794, "cbid": 135, "correlation": 289977127 + } + }, + { + "ph": "f", "id": 289977127, "pid": 5714, "tid": 5714, "ts": 6303771479721.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479721.631, "dur": 0.500, + "args": { + "External id": 148794, "cbid": 147, "correlation": 289977128 + } + }, + { + "ph": "s", "id": 289977128, "pid": 5714, "tid": 5714, "ts": 6303771479721.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479723.581, "dur": 0.830, + "args": { + "External id": 148794, "cbid": 135, "correlation": 289977131 + } + }, + { + "ph": "f", "id": 289977131, "pid": 5714, "tid": 5714, "ts": 6303771479723.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479732.991, "dur": 0.410, + "args": { + "External id": 
148794, "cbid": 135, "correlation": 289977138 + } + }, + { + "ph": "f", "id": 289977138, "pid": 5714, "tid": 5714, "ts": 6303771479732.991, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479760.991, "dur": 1.030, + "args": { + "External id": 148796, "cbid": 147, "correlation": 289977143 + } + }, + { + "ph": "s", "id": 289977143, "pid": 5714, "tid": 5714, "ts": 6303771479760.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771479777.601, "dur": 0.860, + "args": { + "cbid": 135, "correlation": 289977158 + } + }, + { + "ph": "f", "id": 289977158, "pid": 5714, "tid": 5714, "ts": 6303771479777.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479816.481, "dur": 0.930, + "args": { + "cbid": 147, "correlation": 289977163 + } + }, + { + "ph": "s", "id": 289977163, "pid": 5714, "tid": 5714, "ts": 6303771479816.481, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479819.151, "dur": 0.590, + "args": { + "cbid": 147, "correlation": 289977167 + } + }, + { + "ph": "s", "id": 289977167, "pid": 5714, "tid": 5714, "ts": 6303771479819.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771479855.131, "dur": 2.040, + "args": { + "cbid": 147, "correlation": 289977173 + } + }, + { + "ph": "s", "id": 289977173, "pid": 5714, "tid": 5714, "ts": 6303771479855.131, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771479954.820, "dur": 1.140, + "args": { + "External id": 148809, 
"cbid": 317, "correlation": 289977214 + } + }, + { + "ph": "f", "id": 289977214, "pid": 5714, "tid": 5714, "ts": 6303771479954.820, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771479964.590, "dur": 2.200, + "args": { + "External id": 148810, "cbid": 138, "correlation": 289977217 + } + }, + { + "ph": "f", "id": 289977217, "pid": 5714, "tid": 5714, "ts": 6303771479964.590, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771784888.554, "dur": 1.568, + "args": { + "External id": 148814, "device": 0, "context": 1, "stream": 7, "correlation": 289977228, "bytes": 7224, "memory bandwidth (GB/s)": 4.607142857142857 + } + }, + { + "ph": "f", "id": 289977228, "pid": 0, "tid": 7, "ts": 6303771784888.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771479987.500, "dur": 11.300, + "args": { + "External id": 148814, "cbid": 41, "correlation": 289977228 + } + }, + { + "ph": "s", "id": 289977228, "pid": 5714, "tid": 5714, "ts": 6303771479987.500, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771480002.750, "dur": 1.590, + "args": { + "External id": 148809, "cbid": 135, "correlation": 289977232 + } + }, + { + "ph": "f", "id": 289977232, "pid": 5714, "tid": 5714, "ts": 6303771480002.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771784897.418, "dur": 17.793, + "args": { + "External id": 148809, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 
289977236, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977236, "pid": 0, "tid": 7, "ts": 6303771784897.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480006.560, "dur": 9.480, + "args": { + "External id": 148809, "cbid": 211, "correlation": 289977236 + } + }, + { + "ph": "s", "id": 289977236, "pid": 5714, "tid": 5714, "ts": 6303771480006.560, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771480094.240, "dur": 1.220, + "args": { + "cbid": 135, "correlation": 289977247 + } + }, + { + "ph": "f", "id": 289977247, "pid": 5714, "tid": 5714, "ts": 6303771480094.240, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771784915.851, "dur": 20.416, + "args": { + "External id": 148821, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977273, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977273, "pid": 0, "tid": 7, "ts": 6303771784915.851, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480293.170, "dur": 20.560, + "args": { + "External id": 148821, "cbid": 307, "correlation": 289977273 + } + }, + { + "ph": "s", "id": 289977273, "pid": 5714, "tid": 5714, "ts": 6303771480293.170, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771784937.003, "dur": 125.153, + "args": { + "External id": 148827, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977296, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977296, "pid": 0, "tid": 7, "ts": 6303771784937.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480441.479, "dur": 9.550, + "args": { + "External id": 148827, "cbid": 211, "correlation": 289977296 + } + }, + { + "ph": "s", "id": 289977296, "pid": 5714, "tid": 5714, "ts": 6303771480441.479, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771785062.764, "dur": 122.914, + "args": { + "External id": 148828, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977319, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977319, "pid": 0, "tid": 7, "ts": 6303771785062.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480473.369, "dur": 5.370, + "args": { + "External id": 148828, "cbid": 211, "correlation": 289977319 + } + }, + { + "ph": "s", "id": 289977319, "pid": 5714, "tid": 5714, "ts": 6303771480473.369, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771785186.318, "dur": 122.433, + "args": { + "External id": 148829, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977342, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977342, "pid": 0, "tid": 7, "ts": 6303771785186.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480498.439, "dur": 4.840, + "args": { + "External id": 148829, "cbid": 211, "correlation": 289977342 + } + }, + { + "ph": "s", "id": 289977342, "pid": 5714, "tid": 5714, "ts": 6303771480498.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771785309.391, "dur": 52.353, + "args": { + "External id": 148846, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977362, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977362, "pid": 0, "tid": 7, "ts": 6303771785309.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480743.398, "dur": 9.260, + "args": { + "External id": 148846, "cbid": 307, "correlation": 289977362 + } + }, + { + "ph": "s", "id": 289977362, "pid": 5714, "tid": 5714, "ts": 6303771480743.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771785362.416, "dur": 61.792, + "args": { + "External id": 148862, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977380, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977380, "pid": 0, "tid": 7, "ts": 6303771785362.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771480931.078, "dur": 8.200, + "args": { + "External id": 148862, "cbid": 307, "correlation": 289977380 + } + }, + { + "ph": "s", "id": 289977380, "pid": 5714, "tid": 5714, "ts": 6303771480931.078, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771481067.148, "dur": 0.490, + "args": { + "External id": 148868, "cbid": 200, "correlation": 289977387 + } + }, + { + "ph": "f", "id": 289977387, "pid": 5714, "tid": 5714, "ts": 6303771481067.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771481067.758, "dur": 0.180, + "args": { + "External id": 148868, "cbid": 200, "correlation": 289977388 + } + }, + { + "ph": 
"f", "id": 289977388, "pid": 5714, "tid": 5714, "ts": 6303771481067.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771481092.808, "dur": 0.390, + "args": { + "External id": 148868, "cbid": 200, "correlation": 289977411 + } + }, + { + "ph": "f", "id": 289977411, "pid": 5714, "tid": 5714, "ts": 6303771481092.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771481098.648, "dur": 1.920, + "args": { + "External id": 148868, "cbid": 273, "correlation": 289977420 + } + }, + { + "ph": "f", "id": 289977420, "pid": 5714, "tid": 5714, "ts": 6303771481098.648, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771785424.848, "dur": 417.701, + "args": { + "External id": 148868, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977421, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977421, "pid": 0, "tid": 7, "ts": 6303771785424.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481101.138, "dur": 10.020, + "args": { + "External id": 148868, "cbid": 211, "correlation": 289977421 + } + }, + { + "ph": "s", "id": 289977421, "pid": 5714, "tid": 5714, "ts": 6303771481101.138, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771785843.317, "dur": 124.738, + "args": { + "External id": 148874, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977444, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977444, "pid": 0, "tid": 7, "ts": 6303771785843.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481174.418, "dur": 7.540, + "args": { + "External id": 148874, "cbid": 211, "correlation": 289977444 + } + }, + { + "ph": "s", "id": 289977444, "pid": 5714, "tid": 5714, "ts": 6303771481174.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771785968.695, "dur": 88.737, + "args": { + "External id": 148878, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977470, "registers per thread": 22, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977470, "pid": 0, "tid": 7, "ts": 6303771785968.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481342.407, "dur": 9.470, + "args": { + "External id": 148878, "cbid": 307, "correlation": 289977470 + } + }, + { + "ph": "s", "id": 289977470, "pid": 5714, "tid": 5714, "ts": 6303771481342.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771786058.136, "dur": 331.332, + "args": { + "External id": 148879, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977490, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977490, "pid": 0, "tid": 7, "ts": 6303771786058.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481382.657, "dur": 6.110, + "args": { + "External id": 148879, "cbid": 211, "correlation": 289977490 + } + }, + { + "ph": "s", "id": 289977490, "pid": 5714, "tid": 5714, "ts": 6303771481382.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771786390.172, "dur": 324.227, + "args": { + "External id": 148880, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977513, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977513, "pid": 0, "tid": 7, "ts": 6303771786390.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481410.087, "dur": 4.850, + "args": { + "External id": 148880, "cbid": 211, "correlation": 289977513 + } + }, + { + "ph": "s", "id": 289977513, "pid": 5714, "tid": 5714, "ts": 6303771481410.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_mul_silu_1", "pid": 0, "tid": 7, + "ts": 6303771786714.976, "dur": 214.850, + "args": { + "External id": 148881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977525, "registers per thread": 33, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977525, "pid": 0, "tid": 7, "ts": 6303771786714.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481446.667, "dur": 5.580, + "args": { + "External id": 148881, "cbid": 307, "correlation": 289977525 + } + }, + { + "ph": "s", "id": 289977525, "pid": 5714, "tid": 5714, "ts": 6303771481446.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771481475.627, "dur": 1.210, + "args": { + "External id": 148882, "cbid": 210, "correlation": 289977545 + } + }, + { + "ph": "f", "id": 289977545, "pid": 5714, "tid": 5714, "ts": 6303771481475.627, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771786930.498, "dur": 324.292, + "args": { + "External id": 
148882, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977546, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977546, "pid": 0, "tid": 7, "ts": 6303771786930.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481478.587, "dur": 5.470, + "args": { + "External id": 148882, "cbid": 211, "correlation": 289977546 + } + }, + { + "ph": "s", "id": 289977546, "pid": 5714, "tid": 5714, "ts": 6303771481478.587, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_2", "pid": 0, "tid": 7, + "ts": 6303771787255.494, "dur": 41.888, + "args": { + "External id": 148883, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977553, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977553, "pid": 0, "tid": 7, "ts": 6303771787255.494, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481512.627, "dur": 4.930, + "args": { + "External id": 148883, "cbid": 307, "correlation": 289977553 + } + }, + { + "ph": "s", "id": 289977553, "pid": 5714, "tid": 5714, "ts": 6303771481512.627, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771787298.086, "dur": 32.961, + "args": { + "External id": 148889, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977564, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977564, "pid": 0, "tid": 7, "ts": 6303771787298.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481694.787, "dur": 11.500, + "args": { + "External id": 148889, "cbid": 211, "correlation": 289977564 + } + }, + { + "ph": "s", "id": 289977564, "pid": 5714, "tid": 5714, "ts": 6303771481694.787, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771787331.751, "dur": 75.040, + "args": { + "External id": 148890, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977575, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977575, "pid": 0, "tid": 7, "ts": 6303771787331.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481725.827, "dur": 5.820, + "args": { + "External id": 148890, "cbid": 211, "correlation": 289977575 + } + }, + { + "ph": "s", "id": 289977575, "pid": 5714, "tid": 5714, "ts": 6303771481725.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771787407.463, "dur": 15.553, + "args": { + "External id": 148893, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977589, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977589, "pid": 0, "tid": 7, "ts": 6303771787407.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481751.046, "dur": 6.160, + "args": { + "External id": 148893, "cbid": 211, "correlation": 289977589 + } + }, + { + "ph": "s", "id": 289977589, "pid": 5714, "tid": 5714, "ts": 6303771481751.046, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771787423.752, "dur": 1.536, + "args": { + "External id": 148895, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977595, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289977595, "pid": 0, "tid": 7, "ts": 6303771787423.752, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481772.396, "dur": 5.400, + "args": { + "External id": 148895, "cbid": 211, "correlation": 289977595 + } + }, + { + "ph": "s", "id": 289977595, "pid": 5714, "tid": 5714, "ts": 6303771481772.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771787425.992, "dur": 1.024, + "args": { + "External id": 148896, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977605, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289977605, "pid": 0, "tid": 7, "ts": 6303771787425.992, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481793.966, "dur": 5.140, + "args": { + "External id": 148896, "cbid": 211, "correlation": 289977605 + } + }, + { + "ph": "s", "id": 289977605, "pid": 5714, "tid": 5714, "ts": 6303771481793.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771787427.624, "dur": 88.993, + "args": { + "External id": 148897, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977615, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977615, "pid": 0, "tid": 7, "ts": 6303771787427.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481816.186, "dur": 5.000, + "args": { + "External id": 148897, "cbid": 211, "correlation": 289977615 + } + }, + { + "ph": "s", "id": 289977615, "pid": 5714, "tid": 5714, "ts": 6303771481816.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771787517.449, "dur": 48.000, + "args": { + "External id": 148902, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977628, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977628, "pid": 0, "tid": 7, "ts": 6303771787517.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481841.356, "dur": 5.340, + "args": { + "External id": 148902, "cbid": 211, "correlation": 289977628 + } + }, + { + "ph": "s", "id": 289977628, "pid": 5714, "tid": 5714, "ts": 6303771481841.356, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771787566.089, "dur": 23.201, + "args": { + "External id": 148903, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977639, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977639, "pid": 0, "tid": 7, "ts": 6303771787566.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481859.246, "dur": 4.500, + "args": { + "External id": 148903, "cbid": 211, "correlation": 289977639 + } + }, + { + "ph": "s", "id": 289977639, "pid": 5714, "tid": 5714, "ts": 6303771481859.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771787589.898, "dur": 123.681, + "args": { + "External id": 148911, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977662, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977662, "pid": 0, "tid": 7, "ts": 6303771787589.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481936.466, "dur": 7.430, + "args": { + "External id": 148911, "cbid": 211, "correlation": 289977662 + } + }, + { + "ph": "s", "id": 289977662, "pid": 5714, "tid": 5714, "ts": 6303771481936.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771787714.219, "dur": 122.657, + "args": { + "External id": 148920, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977685, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977685, "pid": 0, "tid": 7, "ts": 6303771787714.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771481996.216, "dur": 6.910, + "args": { + "External id": 148920, "cbid": 211, "correlation": 289977685 + } + }, + { + "ph": "s", "id": 289977685, "pid": 5714, "tid": 5714, "ts": 6303771481996.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771787837.580, "dur": 121.986, + "args": { + "External id": 148929, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977708, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977708, "pid": 0, "tid": 7, "ts": 6303771787837.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482049.416, "dur": 6.720, + "args": { + "External id": 148929, "cbid": 211, "correlation": 289977708 + } + }, + { + "ph": "s", "id": 289977708, "pid": 5714, "tid": 5714, "ts": 6303771482049.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771787960.238, "dur": 51.393, + "args": { + "External id": 148937, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977727, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977727, "pid": 0, "tid": 7, "ts": 6303771787960.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482243.495, "dur": 8.960, + "args": { + "External id": 148937, "cbid": 307, "correlation": 289977727 + } + }, + { + "ph": "s", "id": 289977727, "pid": 5714, "tid": 5714, "ts": 6303771482243.495, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771788012.303, "dur": 60.960, + "args": { + "External id": 148940, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977744, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977744, "pid": 0, "tid": 7, "ts": 6303771788012.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482389.105, "dur": 8.350, + "args": { + "External id": 148940, "cbid": 307, "correlation": 289977744 + } + }, + { + "ph": "s", "id": 289977744, "pid": 5714, "tid": 5714, "ts": 6303771482389.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771482491.215, "dur": 0.450, + "args": { + "External id": 148944, "cbid": 200, "correlation": 289977748 + } + }, + { + "ph": "f", "id": 289977748, "pid": 5714, "tid": 5714, "ts": 6303771482491.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771482491.805, "dur": 0.190, + "args": { + "External id": 148944, "cbid": 200, "correlation": 289977749 + } + }, + { + "ph": 
"f", "id": 289977749, "pid": 5714, "tid": 5714, "ts": 6303771482491.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771482516.605, "dur": 0.340, + "args": { + "External id": 148944, "cbid": 200, "correlation": 289977772 + } + }, + { + "ph": "f", "id": 289977772, "pid": 5714, "tid": 5714, "ts": 6303771482516.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771482523.765, "dur": 1.950, + "args": { + "External id": 148944, "cbid": 273, "correlation": 289977781 + } + }, + { + "ph": "f", "id": 289977781, "pid": 5714, "tid": 5714, "ts": 6303771482523.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771788073.871, "dur": 423.045, + "args": { + "External id": 148944, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977782, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977782, "pid": 0, "tid": 7, "ts": 6303771788073.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482526.285, "dur": 9.500, + "args": { + "External id": 148944, "cbid": 211, "correlation": 289977782 + } + }, + { + "ph": "s", "id": 289977782, "pid": 5714, "tid": 5714, "ts": 6303771482526.285, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771788497.588, "dur": 124.450, + "args": { + "External id": 148960, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977808, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977808, "pid": 0, "tid": 7, "ts": 6303771788497.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482650.534, "dur": 9.200, + "args": { + "External id": 148960, "cbid": 211, "correlation": 289977808 + } + }, + { + "ph": "s", "id": 289977808, "pid": 5714, "tid": 5714, "ts": 6303771482650.534, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771788622.742, "dur": 62.528, + "args": { + "External id": 148962, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977818, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977818, "pid": 0, "tid": 7, "ts": 6303771788622.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482690.274, "dur": 6.930, + "args": { + "External id": 148962, "cbid": 211, "correlation": 289977818 + } + }, + { + "ph": "s", "id": 289977818, "pid": 5714, "tid": 5714, "ts": 6303771482690.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771788685.942, "dur": 48.993, + "args": { + "External id": 148967, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977831, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977831, "pid": 0, "tid": 7, "ts": 6303771788685.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482736.624, "dur": 6.900, + "args": { + "External id": 148967, "cbid": 211, "correlation": 289977831 + } + }, + { + "ph": "s", "id": 289977831, "pid": 5714, "tid": 5714, "ts": 6303771482736.624, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771788735.639, "dur": 68.417, + "args": { + "External id": 148968, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977842, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977842, "pid": 0, "tid": 7, "ts": 6303771788735.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482759.004, "dur": 5.370, + "args": { + "External id": 148968, "cbid": 211, "correlation": 289977842 + } + }, + { + "ph": "s", "id": 289977842, "pid": 5714, "tid": 5714, "ts": 6303771482759.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771788804.664, "dur": 15.168, + "args": { + "External id": 148971, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977856, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977856, "pid": 0, "tid": 7, "ts": 6303771788804.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482779.414, "dur": 4.940, + "args": { + "External id": 148971, "cbid": 211, "correlation": 289977856 + } + }, + { + "ph": "s", "id": 289977856, "pid": 5714, "tid": 5714, "ts": 6303771482779.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771788820.440, "dur": 1.440, + "args": { + "External id": 148973, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977862, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289977862, "pid": 0, "tid": 7, "ts": 6303771788820.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482798.604, "dur": 4.410, + "args": { + "External id": 148973, "cbid": 211, "correlation": 289977862 + } + }, + { + "ph": "s", "id": 289977862, "pid": 5714, "tid": 5714, "ts": 6303771482798.604, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771788822.552, "dur": 1.024, + "args": { + "External id": 148974, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977872, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289977872, "pid": 0, "tid": 7, "ts": 6303771788822.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482813.504, "dur": 4.160, + "args": { + "External id": 148974, "cbid": 211, "correlation": 289977872 + } + }, + { + "ph": "s", "id": 289977872, "pid": 5714, "tid": 5714, "ts": 6303771482813.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771788824.280, "dur": 91.201, + "args": { + "External id": 148975, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977882, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977882, "pid": 0, "tid": 7, "ts": 6303771788824.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482831.644, "dur": 4.130, + "args": { + "External id": 148975, "cbid": 211, "correlation": 289977882 + } + }, + { + "ph": "s", "id": 289977882, "pid": 5714, "tid": 5714, "ts": 6303771482831.644, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771788916.153, "dur": 48.096, + "args": { + "External id": 148980, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977895, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977895, "pid": 0, "tid": 7, "ts": 6303771788916.153, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482851.744, "dur": 4.530, + "args": { + "External id": 148980, "cbid": 211, "correlation": 289977895 + } + }, + { + "ph": "s", "id": 289977895, "pid": 5714, "tid": 5714, "ts": 6303771482851.744, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771788964.954, "dur": 23.040, + "args": { + "External id": 148981, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977906, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977906, "pid": 0, "tid": 7, "ts": 6303771788964.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482869.124, "dur": 4.990, + "args": { + "External id": 148981, "cbid": 211, "correlation": 289977906 + } + }, + { + "ph": "s", "id": 289977906, "pid": 5714, "tid": 5714, "ts": 6303771482869.124, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771788988.730, "dur": 323.587, + "args": { + "External id": 148989, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977929, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977929, "pid": 0, "tid": 7, "ts": 6303771788988.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771482933.194, "dur": 7.120, + "args": { + "External id": 148989, "cbid": 211, "correlation": 289977929 + } + }, + { + "ph": "s", "id": 289977929, "pid": 5714, "tid": 5714, "ts": 6303771482933.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771789312.989, "dur": 323.396, + "args": { + "External id": 148998, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977952, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977952, "pid": 0, "tid": 7, "ts": 6303771789312.989, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483000.244, "dur": 6.829, + "args": { + "External id": 148998, "cbid": 211, "correlation": 289977952 + } + }, + { + "ph": "s", "id": 289977952, "pid": 5714, "tid": 5714, "ts": 6303771483000.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771789637.089, "dur": 213.667, + "args": { + "External id": 149000, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977966, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977966, "pid": 0, "tid": 7, "ts": 6303771789637.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483084.484, "dur": 8.349, + "args": { + "External id": 149000, "cbid": 307, "correlation": 289977966 + } + }, + { + "ph": "s", "id": 289977966, "pid": 5714, "tid": 5714, "ts": 6303771483084.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771483134.773, "dur": 1.370, + "args": { + "External id": 149009, "cbid": 210, "correlation": 289977988 + } + }, + { + "ph": "f", "id": 289977988, "pid": 5714, "tid": 5714, "ts": 6303771483134.773, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771789851.364, "dur": 328.548, + "args": { + "External id": 149009, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977989, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289977989, "pid": 0, "tid": 7, "ts": 6303771789851.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483137.933, "dur": 6.570, + "args": { + "External id": 149009, "cbid": 211, "correlation": 289977989 + } + }, + { + "ph": "s", "id": 289977989, "pid": 5714, "tid": 5714, "ts": 6303771483137.933, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771790180.584, "dur": 52.256, + "args": { + "External id": 149011, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289977999, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289977999, "pid": 0, "tid": 7, "ts": 6303771790180.584, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483182.013, "dur": 6.700, + "args": { + "External id": 149011, "cbid": 211, "correlation": 289977999 + } + }, + { + "ph": "s", "id": 289977999, "pid": 5714, "tid": 5714, "ts": 6303771483182.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771790234.248, "dur": 64.289, + "args": { + "External id": 149016, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978012, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978012, "pid": 0, "tid": 7, "ts": 6303771790234.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483236.413, "dur": 7.300, + "args": { + "External id": 149016, "cbid": 211, "correlation": 289978012 + } + }, + { + "ph": "s", "id": 289978012, "pid": 5714, "tid": 5714, "ts": 6303771483236.413, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771790299.241, "dur": 64.225, + "args": { + "External id": 149017, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978023, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978023, "pid": 0, "tid": 7, "ts": 6303771790299.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483257.983, "dur": 4.790, + "args": { + "External id": 149017, "cbid": 211, "correlation": 289978023 + } + }, + { + "ph": "s", "id": 289978023, "pid": 5714, "tid": 5714, "ts": 6303771483257.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771790364.074, "dur": 15.744, + "args": { + "External id": 149020, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978037, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978037, "pid": 0, "tid": 7, "ts": 6303771790364.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483276.513, "dur": 4.700, + "args": { + "External id": 149020, "cbid": 211, "correlation": 289978037 + } + }, + { + "ph": "s", "id": 289978037, "pid": 5714, "tid": 5714, "ts": 6303771483276.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771790380.426, "dur": 1.824, + "args": { + "External id": 149022, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978043, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978043, "pid": 0, "tid": 7, "ts": 6303771790380.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483294.863, "dur": 11.460, + "args": { + "External id": 149022, "cbid": 211, "correlation": 289978043 + } + }, + { + "ph": "s", "id": 289978043, "pid": 5714, "tid": 5714, "ts": 6303771483294.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771790382.986, "dur": 1.024, + "args": { + "External id": 149023, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978053, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978053, "pid": 0, "tid": 7, "ts": 6303771790382.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483319.873, "dur": 4.630, + "args": { + "External id": 149023, "cbid": 211, "correlation": 289978053 + } + }, + { + "ph": "s", "id": 289978053, "pid": 5714, "tid": 5714, "ts": 6303771483319.873, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771790384.714, "dur": 89.441, + "args": { + "External id": 149024, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978063, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978063, "pid": 0, "tid": 7, "ts": 6303771790384.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483338.493, "dur": 4.870, + "args": { + "External id": 149024, "cbid": 211, "correlation": 289978063 + } + }, + { + "ph": "s", "id": 289978063, "pid": 5714, "tid": 5714, "ts": 6303771483338.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771790474.859, "dur": 49.089, + "args": { + "External id": 149029, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978076, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978076, "pid": 0, "tid": 7, "ts": 6303771790474.859, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483360.233, "dur": 4.550, + "args": { + "External id": 149029, "cbid": 211, "correlation": 289978076 + } + }, + { + "ph": "s", "id": 289978076, "pid": 5714, "tid": 5714, "ts": 6303771483360.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771790524.652, "dur": 22.176, + "args": { + "External id": 149030, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978087, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978087, "pid": 0, "tid": 7, "ts": 6303771790524.652, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483376.093, "dur": 4.580, + "args": { + "External id": 149030, "cbid": 211, "correlation": 289978087 + } + }, + { + "ph": "s", "id": 289978087, "pid": 5714, "tid": 5714, "ts": 6303771483376.093, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771790547.532, "dur": 123.458, + "args": { + "External id": 149038, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978110, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978110, "pid": 0, "tid": 7, "ts": 6303771790547.532, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483443.072, "dur": 7.231, + "args": { + "External id": 149038, "cbid": 211, "correlation": 289978110 + } + }, + { + "ph": "s", "id": 289978110, "pid": 5714, "tid": 5714, "ts": 6303771483443.072, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771790671.726, "dur": 122.369, + "args": { + "External id": 149047, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978133, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978133, "pid": 0, "tid": 7, "ts": 6303771790671.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483499.603, "dur": 6.780, + "args": { + "External id": 149047, "cbid": 211, "correlation": 289978133 + } + }, + { + "ph": "s", "id": 289978133, "pid": 5714, "tid": 5714, "ts": 6303771483499.603, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771790794.703, "dur": 122.689, + "args": { + "External id": 149056, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978156, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978156, "pid": 0, "tid": 7, "ts": 6303771790794.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483553.902, "dur": 6.580, + "args": { + "External id": 149056, "cbid": 211, "correlation": 289978156 + } + }, + { + "ph": "s", "id": 289978156, "pid": 5714, "tid": 5714, "ts": 6303771483553.902, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771790918.032, "dur": 52.289, + "args": { + "External id": 149064, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978175, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978175, "pid": 0, "tid": 7, "ts": 6303771790918.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483725.662, "dur": 8.650, + "args": { + "External id": 149064, "cbid": 307, "correlation": 289978175 + } + }, + { + "ph": "s", "id": 289978175, "pid": 5714, "tid": 5714, "ts": 6303771483725.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771790971.025, "dur": 60.449, + "args": { + "External id": 149067, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978192, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978192, "pid": 0, "tid": 7, "ts": 6303771790971.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483855.322, "dur": 7.910, + "args": { + "External id": 149067, "cbid": 307, "correlation": 289978192 + } + }, + { + "ph": "s", "id": 289978192, "pid": 5714, "tid": 5714, "ts": 6303771483855.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771483948.971, "dur": 0.471, + "args": { + "External id": 149071, "cbid": 200, "correlation": 289978196 + } + }, + { + "ph": "f", "id": 289978196, "pid": 5714, "tid": 5714, "ts": 6303771483948.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771483949.551, "dur": 0.191, + "args": { + "External id": 149071, "cbid": 200, "correlation": 289978197 + } + }, + { + "ph": 
"f", "id": 289978197, "pid": 5714, "tid": 5714, "ts": 6303771483949.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771483974.991, "dur": 0.360, + "args": { + "External id": 149071, "cbid": 200, "correlation": 289978220 + } + }, + { + "ph": "f", "id": 289978220, "pid": 5714, "tid": 5714, "ts": 6303771483974.991, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771483981.271, "dur": 1.940, + "args": { + "External id": 149071, "cbid": 273, "correlation": 289978229 + } + }, + { + "ph": "f", "id": 289978229, "pid": 5714, "tid": 5714, "ts": 6303771483981.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771791032.146, "dur": 415.109, + "args": { + "External id": 149071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978230, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978230, "pid": 0, "tid": 7, "ts": 6303771791032.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771483983.802, "dur": 9.299, + "args": { + "External id": 149071, "cbid": 211, "correlation": 289978230 + } + }, + { + "ph": "s", "id": 289978230, "pid": 5714, "tid": 5714, "ts": 6303771483983.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771791447.895, "dur": 124.321, + "args": { + "External id": 149087, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978256, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978256, "pid": 0, "tid": 7, "ts": 6303771791447.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484104.921, "dur": 9.760, + "args": { + "External id": 149087, "cbid": 211, "correlation": 289978256 + } + }, + { + "ph": "s", "id": 289978256, "pid": 5714, "tid": 5714, "ts": 6303771484104.921, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771791572.824, "dur": 61.793, + "args": { + "External id": 149089, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978266, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978266, "pid": 0, "tid": 7, "ts": 6303771791572.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484143.881, "dur": 6.370, + "args": { + "External id": 149089, "cbid": 211, "correlation": 289978266 + } + }, + { + "ph": "s", "id": 289978266, "pid": 5714, "tid": 5714, "ts": 6303771484143.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771791635.225, "dur": 51.488, + "args": { + "External id": 149094, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978279, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978279, "pid": 0, "tid": 7, "ts": 6303771791635.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484190.111, "dur": 7.210, + "args": { + "External id": 149094, "cbid": 211, "correlation": 289978279 + } + }, + { + "ph": "s", "id": 289978279, "pid": 5714, "tid": 5714, "ts": 6303771484190.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771791687.449, "dur": 69.793, + "args": { + "External id": 149095, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978290, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978290, "pid": 0, "tid": 7, "ts": 6303771791687.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484214.061, "dur": 4.870, + "args": { + "External id": 149095, "cbid": 211, "correlation": 289978290 + } + }, + { + "ph": "s", "id": 289978290, "pid": 5714, "tid": 5714, "ts": 6303771484214.061, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771791757.850, "dur": 14.784, + "args": { + "External id": 149098, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978304, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978304, "pid": 0, "tid": 7, "ts": 6303771791757.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484233.431, "dur": 5.430, + "args": { + "External id": 149098, "cbid": 211, "correlation": 289978304 + } + }, + { + "ph": "s", "id": 289978304, "pid": 5714, "tid": 5714, "ts": 6303771484233.431, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771791773.306, "dur": 1.632, + "args": { + "External id": 149100, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978310, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978310, "pid": 0, "tid": 7, "ts": 6303771791773.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484250.971, "dur": 4.350, + "args": { + "External id": 149100, "cbid": 211, "correlation": 289978310 + } + }, + { + "ph": "s", "id": 289978310, "pid": 5714, "tid": 5714, "ts": 6303771484250.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771791775.578, "dur": 1.024, + "args": { + "External id": 149101, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978320, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978320, "pid": 0, "tid": 7, "ts": 6303771791775.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484265.491, "dur": 4.220, + "args": { + "External id": 149101, "cbid": 211, "correlation": 289978320 + } + }, + { + "ph": "s", "id": 289978320, "pid": 5714, "tid": 5714, "ts": 6303771484265.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771791777.306, "dur": 89.793, + "args": { + "External id": 149102, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978330, "pid": 0, "tid": 7, "ts": 6303771791777.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484284.571, "dur": 4.550, + "args": { + "External id": 149102, "cbid": 211, "correlation": 289978330 + } + }, + { + "ph": "s", "id": 289978330, "pid": 5714, "tid": 5714, "ts": 6303771484284.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771791867.803, "dur": 46.337, + "args": { + "External id": 149107, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978343, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978343, "pid": 0, "tid": 7, "ts": 6303771791867.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484314.291, "dur": 5.410, + "args": { + "External id": 149107, "cbid": 211, "correlation": 289978343 + } + }, + { + "ph": "s", "id": 289978343, "pid": 5714, "tid": 5714, "ts": 6303771484314.291, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771791914.844, "dur": 22.304, + "args": { + "External id": 149108, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978354, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978354, "pid": 0, "tid": 7, "ts": 6303771791914.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484333.581, "dur": 4.380, + "args": { + "External id": 149108, "cbid": 211, "correlation": 289978354 + } + }, + { + "ph": "s", "id": 289978354, "pid": 5714, "tid": 5714, "ts": 6303771484333.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771791937.852, "dur": 324.516, + "args": { + "External id": 149116, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978377, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978377, "pid": 0, "tid": 7, "ts": 6303771791937.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484396.450, "dur": 6.840, + "args": { + "External id": 149116, "cbid": 211, "correlation": 289978377 + } + }, + { + "ph": "s", "id": 289978377, "pid": 5714, "tid": 5714, "ts": 6303771484396.450, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771792263.104, "dur": 323.972, + "args": { + "External id": 149125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978400, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978400, "pid": 0, "tid": 7, "ts": 6303771792263.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484451.140, "dur": 6.450, + "args": { + "External id": 149125, "cbid": 211, "correlation": 289978400 + } + }, + { + "ph": "s", "id": 289978400, "pid": 5714, "tid": 5714, "ts": 6303771484451.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771792587.812, "dur": 213.442, + "args": { + "External id": 149127, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978414, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978414, "pid": 0, "tid": 7, "ts": 6303771792587.812, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484525.080, "dur": 7.250, + "args": { + "External id": 149127, "cbid": 307, "correlation": 289978414 + } + }, + { + "ph": "s", "id": 289978414, "pid": 5714, "tid": 5714, "ts": 6303771484525.080, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771484575.220, "dur": 1.330, + "args": { + "External id": 149136, "cbid": 210, "correlation": 289978436 + } + }, + { + "ph": "f", "id": 289978436, "pid": 5714, "tid": 5714, "ts": 6303771484575.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771792801.894, "dur": 327.236, + "args": { + "External id": 149136, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978437, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978437, "pid": 0, "tid": 7, "ts": 6303771792801.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484578.380, "dur": 6.310, + "args": { + "External id": 149136, "cbid": 211, "correlation": 289978437 + } + }, + { + "ph": "s", "id": 289978437, "pid": 5714, "tid": 5714, "ts": 6303771484578.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771793129.834, "dur": 50.945, + "args": { + "External id": 149138, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978447, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978447, "pid": 0, "tid": 7, "ts": 6303771793129.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484619.390, "dur": 7.080, + "args": { + "External id": 149138, "cbid": 211, "correlation": 289978447 + } + }, + { + "ph": "s", "id": 289978447, "pid": 5714, "tid": 5714, "ts": 6303771484619.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771793181.515, "dur": 63.712, + "args": { + "External id": 149143, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978460, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978460, "pid": 0, "tid": 7, "ts": 6303771793181.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484673.020, "dur": 7.500, + "args": { + "External id": 149143, "cbid": 211, "correlation": 289978460 + } + }, + { + "ph": "s", "id": 289978460, "pid": 5714, "tid": 5714, "ts": 6303771484673.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771793245.931, "dur": 69.217, + "args": { + "External id": 149144, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978471, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978471, "pid": 0, "tid": 7, "ts": 6303771793245.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484695.590, "dur": 4.840, + "args": { + "External id": 149144, "cbid": 211, "correlation": 289978471 + } + }, + { + "ph": "s", "id": 289978471, "pid": 5714, "tid": 5714, "ts": 6303771484695.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771793315.756, "dur": 15.744, + "args": { + "External id": 149147, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978485, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978485, "pid": 0, "tid": 7, "ts": 6303771793315.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484715.110, "dur": 4.540, + "args": { + "External id": 149147, "cbid": 211, "correlation": 289978485 + } + }, + { + "ph": "s", "id": 289978485, "pid": 5714, "tid": 5714, "ts": 6303771484715.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771793332.076, "dur": 1.856, + "args": { + "External id": 149149, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978491, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978491, "pid": 0, "tid": 7, "ts": 6303771793332.076, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484731.460, "dur": 4.160, + "args": { + "External id": 149149, "cbid": 211, "correlation": 289978491 + } + }, + { + "ph": "s", "id": 289978491, "pid": 5714, "tid": 5714, "ts": 6303771484731.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771793334.636, "dur": 1.025, + "args": { + "External id": 149150, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978501, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978501, "pid": 0, "tid": 7, "ts": 6303771793334.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484745.840, "dur": 4.100, + "args": { + "External id": 149150, "cbid": 211, "correlation": 289978501 + } + }, + { + "ph": "s", "id": 289978501, "pid": 5714, "tid": 5714, "ts": 6303771484745.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771793336.397, "dur": 88.992, + "args": { + "External id": 149151, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978511, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978511, "pid": 0, "tid": 7, "ts": 6303771793336.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484764.100, "dur": 4.370, + "args": { + "External id": 149151, "cbid": 211, "correlation": 289978511 + } + }, + { + "ph": "s", "id": 289978511, "pid": 5714, "tid": 5714, "ts": 6303771484764.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771793426.029, "dur": 48.257, + "args": { + "External id": 149156, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978524, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978524, "pid": 0, "tid": 7, "ts": 6303771793426.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484786.260, "dur": 4.689, + "args": { + "External id": 149156, "cbid": 211, "correlation": 289978524 + } + }, + { + "ph": "s", "id": 289978524, "pid": 5714, "tid": 5714, "ts": 6303771484786.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771793474.926, "dur": 23.328, + "args": { + "External id": 149157, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978535, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978535, "pid": 0, "tid": 7, "ts": 6303771793474.926, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484802.580, "dur": 4.100, + "args": { + "External id": 149157, "cbid": 211, "correlation": 289978535 + } + }, + { + "ph": "s", "id": 289978535, "pid": 5714, "tid": 5714, "ts": 6303771484802.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771793498.894, "dur": 123.041, + "args": { + "External id": 149165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978558, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978558, "pid": 0, "tid": 7, "ts": 6303771793498.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484867.740, "dur": 7.029, + "args": { + "External id": 149165, "cbid": 211, "correlation": 289978558 + } + }, + { + "ph": "s", "id": 289978558, "pid": 5714, "tid": 5714, "ts": 6303771484867.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771793622.511, "dur": 122.562, + "args": { + "External id": 149174, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978581, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978581, "pid": 0, "tid": 7, "ts": 6303771793622.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484924.129, "dur": 7.100, + "args": { + "External id": 149174, "cbid": 211, "correlation": 289978581 + } + }, + { + "ph": "s", "id": 289978581, "pid": 5714, "tid": 5714, "ts": 6303771484924.129, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771793745.745, "dur": 122.401, + "args": { + "External id": 149183, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978604, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978604, "pid": 0, "tid": 7, "ts": 6303771793745.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771484976.149, "dur": 6.370, + "args": { + "External id": 149183, "cbid": 211, "correlation": 289978604 + } + }, + { + "ph": "s", "id": 289978604, "pid": 5714, "tid": 5714, "ts": 6303771484976.149, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771793868.754, "dur": 51.905, + "args": { + "External id": 149191, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978623, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978623, "pid": 0, "tid": 7, "ts": 6303771793868.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485148.249, "dur": 8.820, + "args": { + "External id": 149191, "cbid": 307, "correlation": 289978623 + } + }, + { + "ph": "s", "id": 289978623, "pid": 5714, "tid": 5714, "ts": 6303771485148.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771793921.267, "dur": 59.681, + "args": { + "External id": 149194, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978640, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978640, "pid": 0, "tid": 7, "ts": 6303771793921.267, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485277.119, "dur": 7.600, + "args": { + "External id": 149194, "cbid": 307, "correlation": 289978640 + } + }, + { + "ph": "s", "id": 289978640, "pid": 5714, "tid": 5714, "ts": 6303771485277.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771485379.358, "dur": 0.480, + "args": { + "External id": 149198, "cbid": 200, "correlation": 289978644 + } + }, + { + "ph": "f", "id": 289978644, "pid": 5714, "tid": 5714, "ts": 6303771485379.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771485379.978, "dur": 0.190, + "args": { + "External id": 149198, "cbid": 200, "correlation": 289978645 + } + }, + { + "ph": 
"f", "id": 289978645, "pid": 5714, "tid": 5714, "ts": 6303771485379.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771485403.338, "dur": 0.360, + "args": { + "External id": 149198, "cbid": 200, "correlation": 289978668 + } + }, + { + "ph": "f", "id": 289978668, "pid": 5714, "tid": 5714, "ts": 6303771485403.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771485409.388, "dur": 1.910, + "args": { + "External id": 149198, "cbid": 273, "correlation": 289978677 + } + }, + { + "ph": "f", "id": 289978677, "pid": 5714, "tid": 5714, "ts": 6303771485409.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771793981.588, "dur": 415.237, + "args": { + "External id": 149198, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978678, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978678, "pid": 0, "tid": 7, "ts": 6303771793981.588, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485411.858, "dur": 9.780, + "args": { + "External id": 149198, "cbid": 211, "correlation": 289978678 + } + }, + { + "ph": "s", "id": 289978678, "pid": 5714, "tid": 5714, "ts": 6303771485411.858, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771794397.529, "dur": 124.001, + "args": { + "External id": 149214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978704, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978704, "pid": 0, "tid": 7, "ts": 6303771794397.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485533.058, "dur": 9.140, + "args": { + "External id": 149214, "cbid": 211, "correlation": 289978704 + } + }, + { + "ph": "s", "id": 289978704, "pid": 5714, "tid": 5714, "ts": 6303771485533.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771794522.170, "dur": 62.241, + "args": { + "External id": 149216, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978714, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978714, "pid": 0, "tid": 7, "ts": 6303771794522.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485572.908, "dur": 6.200, + "args": { + "External id": 149216, "cbid": 211, "correlation": 289978714 + } + }, + { + "ph": "s", "id": 289978714, "pid": 5714, "tid": 5714, "ts": 6303771485572.908, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771794585.051, "dur": 49.536, + "args": { + "External id": 149221, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978727, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978727, "pid": 0, "tid": 7, "ts": 6303771794585.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485619.358, "dur": 7.040, + "args": { + "External id": 149221, "cbid": 211, "correlation": 289978727 + } + }, + { + "ph": "s", "id": 289978727, "pid": 5714, "tid": 5714, "ts": 6303771485619.358, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771794635.195, "dur": 69.089, + "args": { + "External id": 149222, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978738, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978738, "pid": 0, "tid": 7, "ts": 6303771794635.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485640.978, "dur": 4.720, + "args": { + "External id": 149222, "cbid": 211, "correlation": 289978738 + } + }, + { + "ph": "s", "id": 289978738, "pid": 5714, "tid": 5714, "ts": 6303771485640.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771794704.924, "dur": 14.881, + "args": { + "External id": 149225, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978752, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978752, "pid": 0, "tid": 7, "ts": 6303771794704.924, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485661.398, "dur": 4.780, + "args": { + "External id": 149225, "cbid": 211, "correlation": 289978752 + } + }, + { + "ph": "s", "id": 289978752, "pid": 5714, "tid": 5714, "ts": 6303771485661.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771794720.477, "dur": 1.600, + "args": { + "External id": 149227, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978758, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978758, "pid": 0, "tid": 7, "ts": 6303771794720.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485677.508, "dur": 4.720, + "args": { + "External id": 149227, "cbid": 211, "correlation": 289978758 + } + }, + { + "ph": "s", "id": 289978758, "pid": 5714, "tid": 5714, "ts": 6303771485677.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771794722.749, "dur": 1.024, + "args": { + "External id": 149228, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978768, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978768, "pid": 0, "tid": 7, "ts": 6303771794722.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485692.458, "dur": 4.109, + "args": { + "External id": 149228, "cbid": 211, "correlation": 289978768 + } + }, + { + "ph": "s", "id": 289978768, "pid": 5714, "tid": 5714, "ts": 6303771485692.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771794724.477, "dur": 89.665, + "args": { + "External id": 149229, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978778, "pid": 0, "tid": 7, "ts": 6303771794724.477, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485710.167, "dur": 4.371, + "args": { + "External id": 149229, "cbid": 211, "correlation": 289978778 + } + }, + { + "ph": "s", "id": 289978778, "pid": 5714, "tid": 5714, "ts": 6303771485710.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771794814.814, "dur": 47.648, + "args": { + "External id": 149234, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978791, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978791, "pid": 0, "tid": 7, "ts": 6303771794814.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485732.347, "dur": 5.011, + "args": { + "External id": 149234, "cbid": 211, "correlation": 289978791 + } + }, + { + "ph": "s", "id": 289978791, "pid": 5714, "tid": 5714, "ts": 6303771485732.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771794863.070, "dur": 23.456, + "args": { + "External id": 149235, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978802, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978802, "pid": 0, "tid": 7, "ts": 6303771794863.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485748.987, "dur": 4.360, + "args": { + "External id": 149235, "cbid": 211, "correlation": 289978802 + } + }, + { + "ph": "s", "id": 289978802, "pid": 5714, "tid": 5714, "ts": 6303771485748.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771794887.230, "dur": 325.476, + "args": { + "External id": 149243, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978825, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978825, "pid": 0, "tid": 7, "ts": 6303771794887.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485811.887, "dur": 6.820, + "args": { + "External id": 149243, "cbid": 211, "correlation": 289978825 + } + }, + { + "ph": "s", "id": 289978825, "pid": 5714, "tid": 5714, "ts": 6303771485811.887, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771795213.314, "dur": 324.580, + "args": { + "External id": 149252, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978848, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978848, "pid": 0, "tid": 7, "ts": 6303771795213.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485867.077, "dur": 6.880, + "args": { + "External id": 149252, "cbid": 211, "correlation": 289978848 + } + }, + { + "ph": "s", "id": 289978848, "pid": 5714, "tid": 5714, "ts": 6303771485867.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771795538.598, "dur": 212.515, + "args": { + "External id": 149254, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978862, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978862, "pid": 0, "tid": 7, "ts": 6303771795538.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485939.707, "dur": 7.070, + "args": { + "External id": 149254, "cbid": 307, "correlation": 289978862 + } + }, + { + "ph": "s", "id": 289978862, "pid": 5714, "tid": 5714, "ts": 6303771485939.707, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771485989.297, "dur": 1.450, + "args": { + "External id": 149263, "cbid": 210, "correlation": 289978884 + } + }, + { + "ph": "f", "id": 289978884, "pid": 5714, "tid": 5714, "ts": 6303771485989.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771795751.817, "dur": 326.851, + "args": { + "External id": 149263, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978885, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289978885, "pid": 0, "tid": 7, "ts": 6303771795751.817, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771485992.517, "dur": 6.590, + "args": { + "External id": 149263, "cbid": 211, "correlation": 289978885 + } + }, + { + "ph": "s", "id": 289978885, "pid": 5714, "tid": 5714, "ts": 6303771485992.517, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771796079.340, "dur": 51.649, + "args": { + "External id": 149265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978895, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978895, "pid": 0, "tid": 7, "ts": 6303771796079.340, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486035.047, "dur": 6.570, + "args": { + "External id": 149265, "cbid": 211, "correlation": 289978895 + } + }, + { + "ph": "s", "id": 289978895, "pid": 5714, "tid": 5714, "ts": 6303771486035.047, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771796131.693, "dur": 63.937, + "args": { + "External id": 149270, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978908, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978908, "pid": 0, "tid": 7, "ts": 6303771796131.693, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486086.647, "dur": 7.210, + "args": { + "External id": 149270, "cbid": 211, "correlation": 289978908 + } + }, + { + "ph": "s", "id": 289978908, "pid": 5714, "tid": 5714, "ts": 6303771486086.647, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771796196.238, "dur": 63.328, + "args": { + "External id": 149271, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978919, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978919, "pid": 0, "tid": 7, "ts": 6303771796196.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486108.197, "dur": 4.960, + "args": { + "External id": 149271, "cbid": 211, "correlation": 289978919 + } + }, + { + "ph": "s", "id": 289978919, "pid": 5714, "tid": 5714, "ts": 6303771486108.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771796260.238, "dur": 15.136, + "args": { + "External id": 149274, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978933, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978933, "pid": 0, "tid": 7, "ts": 6303771796260.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486128.227, "dur": 4.520, + "args": { + "External id": 149274, "cbid": 211, "correlation": 289978933 + } + }, + { + "ph": "s", "id": 289978933, "pid": 5714, "tid": 5714, "ts": 6303771486128.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771796276.014, "dur": 1.376, + "args": { + "External id": 149276, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978939, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978939, "pid": 0, "tid": 7, "ts": 6303771796276.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486143.917, "dur": 4.220, + "args": { + "External id": 149276, "cbid": 211, "correlation": 289978939 + } + }, + { + "ph": "s", "id": 289978939, "pid": 5714, "tid": 5714, "ts": 6303771486143.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771796277.998, "dur": 1.024, + "args": { + "External id": 149277, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978949, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289978949, "pid": 0, "tid": 7, "ts": 6303771796277.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486159.446, "dur": 4.120, + "args": { + "External id": 149277, "cbid": 211, "correlation": 289978949 + } + }, + { + "ph": "s", "id": 289978949, "pid": 5714, "tid": 5714, "ts": 6303771486159.446, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771796279.726, "dur": 88.610, + "args": { + "External id": 149278, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978959, "pid": 0, "tid": 7, "ts": 6303771796279.726, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486177.646, "dur": 4.300, + "args": { + "External id": 149278, "cbid": 211, "correlation": 289978959 + } + }, + { + "ph": "s", "id": 289978959, "pid": 5714, "tid": 5714, "ts": 6303771486177.646, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771796368.976, "dur": 48.224, + "args": { + "External id": 149283, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978972, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978972, "pid": 0, "tid": 7, "ts": 6303771796368.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486201.117, "dur": 4.660, + "args": { + "External id": 149283, "cbid": 211, "correlation": 289978972 + } + }, + { + "ph": "s", "id": 289978972, "pid": 5714, "tid": 5714, "ts": 6303771486201.117, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771796417.936, "dur": 23.360, + "args": { + "External id": 149284, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289978983, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289978983, "pid": 0, "tid": 7, "ts": 6303771796417.936, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486217.206, "dur": 4.060, + "args": { + "External id": 149284, "cbid": 211, "correlation": 289978983 + } + }, + { + "ph": "s", "id": 289978983, "pid": 5714, "tid": 5714, "ts": 6303771486217.206, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771796441.968, "dur": 123.682, + "args": { + "External id": 149292, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979006, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979006, "pid": 0, "tid": 7, "ts": 6303771796441.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486281.416, "dur": 7.160, + "args": { + "External id": 149292, "cbid": 211, "correlation": 289979006 + } + }, + { + "ph": "s", "id": 289979006, "pid": 5714, "tid": 5714, "ts": 6303771486281.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771796566.354, "dur": 122.625, + "args": { + "External id": 149301, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979029, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979029, "pid": 0, "tid": 7, "ts": 6303771796566.354, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486346.426, "dur": 7.410, + "args": { + "External id": 149301, "cbid": 211, "correlation": 289979029 + } + }, + { + "ph": "s", "id": 289979029, "pid": 5714, "tid": 5714, "ts": 6303771486346.426, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771796689.683, "dur": 122.370, + "args": { + "External id": 149310, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979052, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979052, "pid": 0, "tid": 7, "ts": 6303771796689.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486400.436, "dur": 6.780, + "args": { + "External id": 149310, "cbid": 211, "correlation": 289979052 + } + }, + { + "ph": "s", "id": 289979052, "pid": 5714, "tid": 5714, "ts": 6303771486400.436, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771796812.661, "dur": 52.160, + "args": { + "External id": 149318, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979071, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979071, "pid": 0, "tid": 7, "ts": 6303771796812.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486571.936, "dur": 18.769, + "args": { + "External id": 149318, "cbid": 307, "correlation": 289979071 + } + }, + { + "ph": "s", "id": 289979071, "pid": 5714, "tid": 5714, "ts": 6303771486571.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771796865.429, "dur": 61.057, + "args": { + "External id": 149321, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979088, "registers per thread": 22, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979088, "pid": 0, "tid": 7, "ts": 6303771796865.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486706.205, "dur": 7.560, + "args": { + "External id": 149321, "cbid": 307, "correlation": 289979088 + } + }, + { + "ph": "s", "id": 289979088, "pid": 5714, "tid": 5714, "ts": 6303771486706.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771486797.665, "dur": 0.470, + "args": { + "External id": 149325, "cbid": 200, "correlation": 289979092 + } + }, + { + "ph": "f", "id": 289979092, "pid": 5714, "tid": 5714, "ts": 6303771486797.665, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771486798.255, "dur": 0.180, + "args": { + "External id": 149325, "cbid": 200, "correlation": 289979093 + } + }, + { + "ph": 
"f", "id": 289979093, "pid": 5714, "tid": 5714, "ts": 6303771486798.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771486826.635, "dur": 0.360, + "args": { + "External id": 149325, "cbid": 200, "correlation": 289979116 + } + }, + { + "ph": "f", "id": 289979116, "pid": 5714, "tid": 5714, "ts": 6303771486826.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771486832.735, "dur": 1.860, + "args": { + "External id": 149325, "cbid": 273, "correlation": 289979125 + } + }, + { + "ph": "f", "id": 289979125, "pid": 5714, "tid": 5714, "ts": 6303771486832.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_fwd_kernel >, false, true, false, false, true, true, false, false>(flash::Flash_fwd_params)", "pid": 0, "tid": 7, + "ts": 6303771796927.222, "dur": 423.013, + "args": { + "External id": 149325, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979126, "registers per thread": 255, "shared memory": 49152, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [16, 8, 12], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979126, "pid": 0, "tid": 7, "ts": 6303771796927.222, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486835.145, "dur": 9.280, + "args": { + "External id": 149325, "cbid": 211, "correlation": 289979126 + } + }, + { + "ph": "s", "id": 289979126, "pid": 5714, "tid": 5714, "ts": 6303771486835.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_tn", "pid": 0, "tid": 7, + "ts": 6303771797350.907, "dur": 124.289, + "args": { + "External id": 149341, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979152, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979152, "pid": 0, "tid": 7, "ts": 6303771797350.907, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486954.035, "dur": 9.030, + "args": { + "External id": 149341, "cbid": 211, "correlation": 289979152 + } + }, + { + "ph": "s", "id": 289979152, "pid": 5714, "tid": 5714, "ts": 6303771486954.035, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771797475.804, "dur": 62.081, + "args": { + "External id": 149343, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979162, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979162, "pid": 0, "tid": 7, "ts": 6303771797475.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771486994.395, "dur": 6.450, + "args": { + "External id": 149343, "cbid": 211, "correlation": 289979162 + } + }, + { + "ph": "s", "id": 289979162, "pid": 5714, "tid": 5714, "ts": 6303771486994.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771797538.557, "dur": 49.089, + "args": { + "External id": 149348, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979175, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979175, "pid": 0, "tid": 7, "ts": 6303771797538.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487037.855, "dur": 7.149, + "args": { + "External id": 149348, "cbid": 211, "correlation": 289979175 + } + }, + { + "ph": "s", "id": 289979175, "pid": 5714, "tid": 5714, "ts": 6303771487037.855, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771797588.318, "dur": 68.672, + "args": { + "External id": 149349, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979186, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979186, "pid": 0, "tid": 7, "ts": 6303771797588.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487061.295, "dur": 4.929, + "args": { + "External id": 149349, "cbid": 211, "correlation": 289979186 + } + }, + { + "ph": "s", "id": 289979186, "pid": 5714, "tid": 5714, "ts": 6303771487061.295, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771797657.662, "dur": 15.456, + "args": { + "External id": 149352, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979200, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979200, "pid": 0, "tid": 7, "ts": 6303771797657.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487081.884, "dur": 4.691, + "args": { + "External id": 149352, "cbid": 211, "correlation": 289979200 + } + }, + { + "ph": "s", "id": 289979200, "pid": 5714, "tid": 5714, "ts": 6303771487081.884, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771797673.759, "dur": 1.728, + "args": { + "External id": 149354, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979206, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289979206, "pid": 0, "tid": 7, "ts": 6303771797673.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487098.435, "dur": 4.080, + "args": { + "External id": 149354, "cbid": 211, "correlation": 289979206 + } + }, + { + "ph": "s", "id": 289979206, "pid": 5714, "tid": 5714, "ts": 6303771487098.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::rsqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771797676.159, "dur": 1.024, + "args": { + "External id": 149355, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979216, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289979216, "pid": 0, "tid": 7, "ts": 6303771797676.159, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487113.435, "dur": 4.049, + "args": { + "External id": 149355, "cbid": 211, "correlation": 289979216 + } + }, + { + "ph": "s", "id": 289979216, "pid": 5714, "tid": 5714, "ts": 6303771487113.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771797677.887, "dur": 91.105, + "args": { + "External id": 149356, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979226, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979226, "pid": 0, "tid": 7, "ts": 6303771797677.887, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487132.244, "dur": 4.351, + "args": { + "External id": 149356, "cbid": 211, "correlation": 289979226 + } + }, + { + "ph": "s", "id": 289979226, "pid": 5714, "tid": 5714, "ts": 6303771487132.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771797769.664, "dur": 50.016, + "args": { + "External id": 149361, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979239, "pid": 0, "tid": 7, "ts": 6303771797769.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487152.954, "dur": 4.590, + "args": { + "External id": 149361, "cbid": 211, "correlation": 289979239 + } + }, + { + "ph": "s", "id": 289979239, "pid": 5714, "tid": 5714, "ts": 6303771487152.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771797820.320, "dur": 22.657, + "args": { + "External id": 149362, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979250, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979250, "pid": 0, "tid": 7, "ts": 6303771797820.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487170.314, "dur": 4.130, + "args": { + "External id": 149362, "cbid": 211, "correlation": 289979250 + } + }, + { + "ph": "s", "id": 289979250, "pid": 5714, "tid": 5714, "ts": 6303771487170.314, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771797843.681, "dur": 323.875, + "args": { + "External id": 149370, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979273, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979273, "pid": 0, "tid": 7, "ts": 6303771797843.681, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487232.254, "dur": 7.360, + "args": { + "External id": 149370, "cbid": 211, "correlation": 289979273 + } + }, + { + "ph": "s", "id": 289979273, "pid": 5714, "tid": 5714, "ts": 6303771487232.254, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771798168.292, "dur": 324.388, + "args": { + "External id": 149379, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979296, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979296, "pid": 0, "tid": 7, "ts": 6303771798168.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487287.124, "dur": 6.700, + "args": { + "External id": 149379, "cbid": 211, "correlation": 289979296 + } + }, + { + "ph": "s", "id": 289979296, "pid": 5714, "tid": 5714, "ts": 6303771487287.124, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771798493.384, "dur": 211.811, + "args": { + "External id": 149381, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979310, "registers per thread": 24, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979310, "pid": 0, "tid": 7, "ts": 6303771798493.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487371.614, "dur": 7.950, + "args": { + "External id": 149381, "cbid": 307, "correlation": 289979310 + } + }, + { + "ph": "s", "id": 289979310, "pid": 5714, "tid": 5714, "ts": 6303771487371.614, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771487421.894, "dur": 1.380, + "args": { + "External id": 149390, "cbid": 210, "correlation": 289979332 + } + }, + { + "ph": "f", "id": 289979332, "pid": 5714, "tid": 5714, "ts": 6303771487421.894, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_64x128_sliced1x2_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771798705.835, "dur": 327.396, + "args": { + "External id": 149390, 
"queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979333, "registers per thread": 230, "shared memory": 32768, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [12, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979333, "pid": 0, "tid": 7, "ts": 6303771798705.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487425.074, "dur": 7.190, + "args": { + "External id": 149390, "cbid": 211, "correlation": 289979333 + } + }, + { + "ph": "s", "id": 289979333, "pid": 5714, "tid": 5714, "ts": 6303771487425.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799033.871, "dur": 52.800, + "args": { + "External id": 149392, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979343, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979343, "pid": 0, "tid": 7, "ts": 6303771799033.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487466.394, "dur": 6.920, + "args": { + "External id": 149392, "cbid": 211, "correlation": 289979343 + } + }, + { + "ph": "s", "id": 289979343, "pid": 5714, "tid": 5714, "ts": 6303771487466.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy_contig, unsigned int, 3, 128, 1>(at::native::(anonymous namespace)::OpaqueType<2u>*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, unsigned int, 128, 1>, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 0, "tid": 7, + "ts": 6303771799087.247, "dur": 225.474, + "args": { + "External id": 149394, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979354, "registers per thread": 20, "shared memory": 0, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [256, 4, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979354, "pid": 0, "tid": 7, "ts": 6303771799087.247, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487507.634, "dur": 7.400, + "args": { + "External id": 149394, "cbid": 211, "correlation": 289979354 + } + }, + { + "ph": "s", "id": 289979354, "pid": 5714, "tid": 5714, "ts": 6303771487507.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_mean_mul_pow_rsqrt_0", "pid": 0, "tid": 7, + "ts": 6303771799313.330, "dur": 195.490, + "args": { + "External id": 149399, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979373, "registers per thread": 22, "shared memory": 32, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979373, "pid": 0, "tid": 7, "ts": 6303771799313.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487697.483, "dur": 9.230, + "args": { + "External id": 149399, "cbid": 307, "correlation": 289979373 + } + }, + { + "ph": "s", "id": 289979373, "pid": 5714, "tid": 5714, "ts": 6303771487697.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799509.524, "dur": 1.536, + "args": { + "External id": 149403, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979385, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979385, "pid": 0, "tid": 7, "ts": 6303771799509.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487825.663, "dur": 9.510, + "args": { + "External id": 149403, "cbid": 211, "correlation": 289979385 + } + }, + { + "ph": "s", "id": 289979385, "pid": 5714, "tid": 5714, "ts": 6303771487825.663, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799511.796, "dur": 1.088, + "args": { + "External id": 149407, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979401, "pid": 0, "tid": 7, "ts": 6303771799511.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487853.123, "dur": 4.740, + "args": { + "External id": 149407, "cbid": 211, "correlation": 289979401 + } + }, + { + "ph": "s", "id": 289979401, "pid": 5714, "tid": 5714, "ts": 6303771487853.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799513.524, "dur": 0.832, + "args": { + "External id": 149411, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979417, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979417, "pid": 0, "tid": 7, "ts": 6303771799513.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487871.643, "dur": 4.040, + "args": { + "External id": 149411, "cbid": 211, "correlation": 289979417 + } + }, + { + "ph": "s", "id": 289979417, "pid": 5714, "tid": 5714, "ts": 6303771487871.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771799514.964, "dur": 2.208, + "args": { + "External id": 149447, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979445, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 289979445, "pid": 0, "tid": 7, "ts": 6303771799514.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771487997.122, "dur": 8.920, + "args": { + "External id": 149447, "cbid": 211, "correlation": 289979445 + } + }, + { + "ph": "s", "id": 289979445, "pid": 5714, "tid": 5714, "ts": 6303771487997.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771799517.812, "dur": 49.984, + "args": { + "External id": 149455, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979463, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979463, "pid": 0, "tid": 7, "ts": 6303771799517.812, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488091.642, "dur": 9.450, + "args": { + "External id": 149455, "cbid": 211, "correlation": 289979463 + } + }, + { + "ph": "s", "id": 289979463, "pid": 5714, "tid": 5714, "ts": 6303771488091.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799568.404, "dur": 17.569, + "args": { + "External id": 149460, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979480, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979480, "pid": 0, "tid": 7, "ts": 6303771799568.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488134.042, "dur": 6.690, + "args": { + "External id": 149460, "cbid": 211, "correlation": 289979480 + } + }, + { + "ph": "s", "id": 289979480, "pid": 5714, "tid": 5714, "ts": 6303771488134.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799586.581, "dur": 101.121, + "args": { + "External id": 149465, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979496, "pid": 0, "tid": 7, "ts": 6303771799586.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488157.212, "dur": 4.370, + "args": { + "External id": 149465, "cbid": 211, "correlation": 289979496 + } + }, + { + "ph": "s", "id": 289979496, "pid": 5714, "tid": 5714, "ts": 6303771488157.212, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799688.406, "dur": 2.176, + "args": { + "External id": 149469, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979512, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289979512, "pid": 0, "tid": 7, "ts": 6303771799688.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488177.712, "dur": 4.230, + "args": { + "External id": 149469, "cbid": 211, "correlation": 289979512 + } + }, + { + "ph": "s", "id": 289979512, "pid": 5714, "tid": 5714, "ts": 6303771488177.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771799691.734, "dur": 1.664, + "args": { + "External id": 149470, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979524, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289979524, "pid": 0, "tid": 7, "ts": 6303771799691.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488201.732, "dur": 5.950, + "args": { + "External id": 149470, "cbid": 211, "correlation": 289979524 + } + }, + { + "ph": "s", "id": 289979524, "pid": 5714, "tid": 5714, "ts": 6303771488201.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771799694.134, "dur": 2.080, + "args": { + "External id": 149477, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979542, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289979542, "pid": 0, "tid": 7, "ts": 6303771799694.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488235.202, "dur": 6.400, + "args": { + "External id": 149477, "cbid": 211, "correlation": 289979542 + } + }, + { + "ph": "s", "id": 289979542, "pid": 5714, "tid": 5714, "ts": 6303771488235.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6303771799696.886, "dur": 3.808, + "args": { + "External id": 149472, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979551, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979551, "pid": 0, "tid": 7, "ts": 6303771799696.886, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771488248.782, "dur": 5.090, + "args": { + "External id": 149472, "cbid": 211, "correlation": 289979551 + } + }, + { + "ph": "s", "id": 289979551, "pid": 5714, "tid": 5714, "ts": 6303771488248.782, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771488267.152, "dur": 2.300, + "args": { + "External id": 149479, "cbid": 138, "correlation": 289979556 + } + }, + { + "ph": "f", "id": 289979556, "pid": 5714, "tid": 5714, "ts": 6303771488267.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771799707.670, "dur": 1.184, + "args": { + "External id": 149479, "device": 0, "context": 1, "stream": 7, "correlation": 289979559, "bytes": 8, "memory bandwidth (GB/s)": 0.006756756756756757 + } + }, + { + "ph": "f", "id": 289979559, "pid": 0, "tid": 7, "ts": 6303771799707.670, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771488271.612, "dur": 9.380, + "args": { + "External id": 149479, "cbid": 41, "correlation": 289979559 + } + }, + { + "ph": "s", "id": 289979559, "pid": 5714, "tid": 5714, "ts": 6303771488271.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494344.268, "dur": 4.850, + "args": { + "cbid": 138, "correlation": 289979561 + } + }, + { + "ph": "f", "id": 289979561, "pid": 5714, "tid": 1822426688, "ts": 6303771494344.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": 
"cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494349.568, "dur": 0.780, + "args": { + "cbid": 138, "correlation": 289979562 + } + }, + { + "ph": "f", "id": 289979562, "pid": 5714, "tid": 1822426688, "ts": 6303771494349.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494353.278, "dur": 0.540, + "args": { + "cbid": 138, "correlation": 289979563 + } + }, + { + "ph": "f", "id": 289979563, "pid": 5714, "tid": 1822426688, "ts": 6303771494353.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494357.308, "dur": 0.840, + "args": { + "cbid": 138, "correlation": 289979564 + } + }, + { + "ph": "f", "id": 289979564, "pid": 5714, "tid": 1822426688, "ts": 6303771494357.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494358.378, "dur": 0.390, + "args": { + "cbid": 138, "correlation": 289979565 + } + }, + { + "ph": "f", "id": 289979565, "pid": 5714, "tid": 1822426688, "ts": 6303771494358.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494359.798, "dur": 0.410, + "args": { + "cbid": 138, "correlation": 289979566 + } + }, + { + "ph": "f", "id": 289979566, "pid": 5714, "tid": 1822426688, "ts": 6303771494359.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494362.478, "dur": 1.490, + "args": { + "cbid": 138, "correlation": 289979567 + } + }, + { + "ph": "f", "id": 289979567, "pid": 5714, "tid": 1822426688, "ts": 6303771494362.478, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494365.818, "dur": 1.280, + "args": { + "cbid": 138, "correlation": 289979569 + } + }, + { + "ph": "f", "id": 289979569, "pid": 5714, "tid": 1822426688, "ts": 6303771494365.818, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494368.628, "dur": 1.070, + "args": { + "cbid": 138, "correlation": 289979571 + } + }, + { + "ph": "f", "id": 289979571, "pid": 5714, "tid": 1822426688, "ts": 6303771494368.628, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494371.118, "dur": 1.170, + "args": { + "cbid": 138, "correlation": 289979573 + } + }, + { + "ph": "f", "id": 289979573, "pid": 5714, "tid": 1822426688, "ts": 6303771494371.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494373.438, "dur": 0.920, + "args": { + "cbid": 138, "correlation": 289979575 + } + }, + { + "ph": "f", "id": 289979575, "pid": 5714, "tid": 1822426688, "ts": 6303771494373.438, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494375.608, "dur": 0.970, + "args": { + "cbid": 138, "correlation": 289979577 + } + }, + { + "ph": "f", "id": 289979577, "pid": 5714, "tid": 1822426688, "ts": 6303771494375.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494377.998, "dur": 0.910, + "args": { + "cbid": 138, "correlation": 289979579 + } + }, + { + "ph": "f", "id": 289979579, "pid": 5714, "tid": 1822426688, "ts": 
6303771494377.998, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494380.298, "dur": 0.900, + "args": { + "cbid": 138, "correlation": 289979581 + } + }, + { + "ph": "f", "id": 289979581, "pid": 5714, "tid": 1822426688, "ts": 6303771494380.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494382.578, "dur": 0.790, + "args": { + "cbid": 138, "correlation": 289979583 + } + }, + { + "ph": "f", "id": 289979583, "pid": 5714, "tid": 1822426688, "ts": 6303771494382.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494385.078, "dur": 0.630, + "args": { + "cbid": 138, "correlation": 289979585 + } + }, + { + "ph": "f", "id": 289979585, "pid": 5714, "tid": 1822426688, "ts": 6303771494385.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771494387.248, "dur": 0.680, + "args": { + "cbid": 138, "correlation": 289979587 + } + }, + { + "ph": "f", "id": 289979587, "pid": 5714, "tid": 1822426688, "ts": 6303771494387.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594469.136, "dur": 4.330, + "args": { + "cbid": 138, "correlation": 289979589 + } + }, + { + "ph": "f", "id": 289979589, "pid": 5714, "tid": 1822426688, "ts": 6303771594469.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594475.256, "dur": 0.890, + "args": { + "cbid": 138, "correlation": 289979591 + } + }, + { + "ph": "f", "id": 289979591, 
"pid": 5714, "tid": 1822426688, "ts": 6303771594475.256, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594477.116, "dur": 0.670, + "args": { + "cbid": 138, "correlation": 289979593 + } + }, + { + "ph": "f", "id": 289979593, "pid": 5714, "tid": 1822426688, "ts": 6303771594477.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594478.826, "dur": 0.470, + "args": { + "cbid": 138, "correlation": 289979595 + } + }, + { + "ph": "f", "id": 289979595, "pid": 5714, "tid": 1822426688, "ts": 6303771594478.826, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594480.086, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 289979597 + } + }, + { + "ph": "f", "id": 289979597, "pid": 5714, "tid": 1822426688, "ts": 6303771594480.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594481.316, "dur": 0.470, + "args": { + "cbid": 138, "correlation": 289979599 + } + }, + { + "ph": "f", "id": 289979599, "pid": 5714, "tid": 1822426688, "ts": 6303771594481.316, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594482.716, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 289979601 + } + }, + { + "ph": "f", "id": 289979601, "pid": 5714, "tid": 1822426688, "ts": 6303771594482.716, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594484.216, "dur": 0.740, + "args": { + "cbid": 138, "correlation": 289979603 + } + }, 
+ { + "ph": "f", "id": 289979603, "pid": 5714, "tid": 1822426688, "ts": 6303771594484.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594485.736, "dur": 0.550, + "args": { + "cbid": 138, "correlation": 289979605 + } + }, + { + "ph": "f", "id": 289979605, "pid": 5714, "tid": 1822426688, "ts": 6303771594485.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594487.086, "dur": 0.530, + "args": { + "cbid": 138, "correlation": 289979607 + } + }, + { + "ph": "f", "id": 289979607, "pid": 5714, "tid": 1822426688, "ts": 6303771594487.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771594488.696, "dur": 0.430, + "args": { + "cbid": 138, "correlation": 289979609 + } + }, + { + "ph": "f", "id": 289979609, "pid": 5714, "tid": 1822426688, "ts": 6303771594488.696, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694570.653, "dur": 4.830, + "args": { + "cbid": 138, "correlation": 289979611 + } + }, + { + "ph": "f", "id": 289979611, "pid": 5714, "tid": 1822426688, "ts": 6303771694570.653, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694577.333, "dur": 0.960, + "args": { + "cbid": 138, "correlation": 289979613 + } + }, + { + "ph": "f", "id": 289979613, "pid": 5714, "tid": 1822426688, "ts": 6303771694577.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694579.233, "dur": 0.980, + "args": { + "cbid": 138, 
"correlation": 289979615 + } + }, + { + "ph": "f", "id": 289979615, "pid": 5714, "tid": 1822426688, "ts": 6303771694579.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694581.193, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 289979617 + } + }, + { + "ph": "f", "id": 289979617, "pid": 5714, "tid": 1822426688, "ts": 6303771694581.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694582.713, "dur": 0.530, + "args": { + "cbid": 138, "correlation": 289979619 + } + }, + { + "ph": "f", "id": 289979619, "pid": 5714, "tid": 1822426688, "ts": 6303771694582.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694584.073, "dur": 0.470, + "args": { + "cbid": 138, "correlation": 289979621 + } + }, + { + "ph": "f", "id": 289979621, "pid": 5714, "tid": 1822426688, "ts": 6303771694584.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694585.483, "dur": 0.740, + "args": { + "cbid": 138, "correlation": 289979623 + } + }, + { + "ph": "f", "id": 289979623, "pid": 5714, "tid": 1822426688, "ts": 6303771694585.483, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694587.073, "dur": 0.860, + "args": { + "cbid": 138, "correlation": 289979625 + } + }, + { + "ph": "f", "id": 289979625, "pid": 5714, "tid": 1822426688, "ts": 6303771694587.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694588.773, "dur": 
0.440, + "args": { + "cbid": 138, "correlation": 289979627 + } + }, + { + "ph": "f", "id": 289979627, "pid": 5714, "tid": 1822426688, "ts": 6303771694588.773, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694590.173, "dur": 0.540, + "args": { + "cbid": 138, "correlation": 289979629 + } + }, + { + "ph": "f", "id": 289979629, "pid": 5714, "tid": 1822426688, "ts": 6303771694590.173, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771694591.843, "dur": 0.530, + "args": { + "cbid": 138, "correlation": 289979631 + } + }, + { + "ph": "f", "id": 289979631, "pid": 5714, "tid": 1822426688, "ts": 6303771694591.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794657.891, "dur": 4.830, + "args": { + "cbid": 138, "correlation": 289979633 + } + }, + { + "ph": "f", "id": 289979633, "pid": 5714, "tid": 1822426688, "ts": 6303771794657.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794663.121, "dur": 0.850, + "args": { + "cbid": 138, "correlation": 289979634 + } + }, + { + "ph": "f", "id": 289979634, "pid": 5714, "tid": 1822426688, "ts": 6303771794663.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794671.561, "dur": 0.600, + "args": { + "cbid": 138, "correlation": 289979635 + } + }, + { + "ph": "f", "id": 289979635, "pid": 5714, "tid": 1822426688, "ts": 6303771794671.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, 
+ "ts": 6303771794677.931, "dur": 0.890, + "args": { + "cbid": 138, "correlation": 289979636 + } + }, + { + "ph": "f", "id": 289979636, "pid": 5714, "tid": 1822426688, "ts": 6303771794677.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794678.971, "dur": 0.460, + "args": { + "cbid": 138, "correlation": 289979637 + } + }, + { + "ph": "f", "id": 289979637, "pid": 5714, "tid": 1822426688, "ts": 6303771794678.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794680.581, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 289979638 + } + }, + { + "ph": "f", "id": 289979638, "pid": 5714, "tid": 1822426688, "ts": 6303771794680.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794682.321, "dur": 0.870, + "args": { + "cbid": 138, "correlation": 289979639 + } + }, + { + "ph": "f", "id": 289979639, "pid": 5714, "tid": 1822426688, "ts": 6303771794682.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794683.331, "dur": 0.470, + "args": { + "cbid": 138, "correlation": 289979640 + } + }, + { + "ph": "f", "id": 289979640, "pid": 5714, "tid": 1822426688, "ts": 6303771794683.331, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794684.611, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 289979641 + } + }, + { + "ph": "f", "id": 289979641, "pid": 5714, "tid": 1822426688, "ts": 6303771794684.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794686.611, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 289979642 + } + }, + { + "ph": "f", "id": 289979642, "pid": 5714, "tid": 1822426688, "ts": 6303771794686.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794687.541, "dur": 0.420, + "args": { + "cbid": 138, "correlation": 289979643 + } + }, + { + "ph": "f", "id": 289979643, "pid": 5714, "tid": 1822426688, "ts": 6303771794687.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794688.861, "dur": 0.470, + "args": { + "cbid": 138, "correlation": 289979644 + } + }, + { + "ph": "f", "id": 289979644, "pid": 5714, "tid": 1822426688, "ts": 6303771794688.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794690.811, "dur": 0.710, + "args": { + "cbid": 138, "correlation": 289979645 + } + }, + { + "ph": "f", "id": 289979645, "pid": 5714, "tid": 1822426688, "ts": 6303771794690.811, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794691.741, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 289979646 + } + }, + { + "ph": "f", "id": 289979646, "pid": 5714, "tid": 1822426688, "ts": 6303771794691.741, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794692.921, "dur": 0.460, + "args": { + "cbid": 138, "correlation": 289979647 + } + }, + { + "ph": "f", "id": 289979647, "pid": 5714, "tid": 1822426688, "ts": 6303771794692.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794694.871, "dur": 0.710, + "args": { + "cbid": 138, "correlation": 289979648 + } + }, + { + "ph": "f", "id": 289979648, "pid": 5714, "tid": 1822426688, "ts": 6303771794694.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794696.001, "dur": 0.420, + "args": { + "cbid": 138, "correlation": 289979649 + } + }, + { + "ph": "f", "id": 289979649, "pid": 5714, "tid": 1822426688, "ts": 6303771794696.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794697.161, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 289979650 + } + }, + { + "ph": "f", "id": 289979650, "pid": 5714, "tid": 1822426688, "ts": 6303771794697.161, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794698.931, "dur": 0.670, + "args": { + "cbid": 138, "correlation": 289979651 + } + }, + { + "ph": "f", "id": 289979651, "pid": 5714, "tid": 1822426688, "ts": 6303771794698.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794699.751, "dur": 0.410, + "args": { + "cbid": 138, "correlation": 289979652 + } + }, + { + "ph": "f", "id": 289979652, "pid": 5714, "tid": 1822426688, "ts": 6303771794699.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794701.211, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 289979653 + } + }, + { + "ph": "f", "id": 289979653, "pid": 5714, "tid": 1822426688, "ts": 6303771794701.211, + "cat": "ac2g", "name": 
"ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794703.001, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 289979654 + } + }, + { + "ph": "f", "id": 289979654, "pid": 5714, "tid": 1822426688, "ts": 6303771794703.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794703.841, "dur": 0.420, + "args": { + "cbid": 138, "correlation": 289979655 + } + }, + { + "ph": "f", "id": 289979655, "pid": 5714, "tid": 1822426688, "ts": 6303771794703.841, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794705.261, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 289979656 + } + }, + { + "ph": "f", "id": 289979656, "pid": 5714, "tid": 1822426688, "ts": 6303771794705.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794706.941, "dur": 0.570, + "args": { + "cbid": 138, "correlation": 289979657 + } + }, + { + "ph": "f", "id": 289979657, "pid": 5714, "tid": 1822426688, "ts": 6303771794706.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794707.651, "dur": 0.410, + "args": { + "cbid": 138, "correlation": 289979658 + } + }, + { + "ph": "f", "id": 289979658, "pid": 5714, "tid": 1822426688, "ts": 6303771794707.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794708.921, "dur": 0.450, + "args": { + "cbid": 138, "correlation": 289979659 + } + }, + { + "ph": "f", "id": 289979659, "pid": 5714, "tid": 1822426688, "ts": 
6303771794708.921, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794710.561, "dur": 0.620, + "args": { + "cbid": 138, "correlation": 289979660 + } + }, + { + "ph": "f", "id": 289979660, "pid": 5714, "tid": 1822426688, "ts": 6303771794710.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794711.321, "dur": 0.410, + "args": { + "cbid": 138, "correlation": 289979661 + } + }, + { + "ph": "f", "id": 289979661, "pid": 5714, "tid": 1822426688, "ts": 6303771794711.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794712.581, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 289979662 + } + }, + { + "ph": "f", "id": 289979662, "pid": 5714, "tid": 1822426688, "ts": 6303771794712.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794714.401, "dur": 0.700, + "args": { + "cbid": 138, "correlation": 289979663 + } + }, + { + "ph": "f", "id": 289979663, "pid": 5714, "tid": 1822426688, "ts": 6303771794714.401, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794715.291, "dur": 0.440, + "args": { + "cbid": 138, "correlation": 289979664 + } + }, + { + "ph": "f", "id": 289979664, "pid": 5714, "tid": 1822426688, "ts": 6303771794715.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771794716.371, "dur": 0.480, + "args": { + "cbid": 138, "correlation": 289979665 + } + }, + { + "ph": "f", "id": 289979665, 
"pid": 5714, "tid": 1822426688, "ts": 6303771794716.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303771488281.302, "dur": 311435.678, + "args": { + "External id": 149479, "cbid": 131, "correlation": 289979560 + } + }, + { + "ph": "s", "id": 289979560, "pid": 5714, "tid": 5714, "ts": 6303771488281.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771799824.709, "dur": 2.450, + "args": { + "External id": 149487, "cbid": 210, "correlation": 289979690 + } + }, + { + "ph": "f", "id": 289979690, "pid": 5714, "tid": 5714, "ts": 6303771799824.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771799848.536, "dur": 636.519, + "args": { + "External id": 149487, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979691, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979691, "pid": 0, "tid": 7, "ts": 6303771799848.536, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771799831.669, "dur": 17.680, + "args": { + "External id": 149487, "cbid": 211, "correlation": 289979691 + } + }, + { + "ph": "s", "id": 289979691, "pid": 5714, "tid": 5714, "ts": 6303771799831.669, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771800485.727, "dur": 171.362, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979710, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289979710, "pid": 0, "tid": 7, "ts": 6303771800485.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800019.069, "dur": 11.720, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289979710 + } + }, + { + "ph": "s", "id": 289979710, "pid": 5714, "tid": 5714, "ts": 6303771800019.069, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771800657.697, "dur": 4.224, + "args": { + "External id": 149497, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979727, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979727, "pid": 0, "tid": 7, "ts": 6303771800657.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800082.289, "dur": 10.090, + "args": { + "External id": 149497, "cbid": 211, "correlation": 289979727 + } + }, + { + "ph": "s", "id": 289979727, "pid": 5714, "tid": 5714, "ts": 6303771800082.289, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800662.626, "dur": 1.184, + "args": { + "External id": 149502, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979744, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979744, "pid": 0, "tid": 7, "ts": 6303771800662.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800126.529, "dur": 7.570, + "args": { + "External id": 149502, "cbid": 211, "correlation": 289979744 + } + }, + { + "ph": "s", "id": 289979744, "pid": 5714, "tid": 5714, "ts": 6303771800126.529, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800664.450, "dur": 1.024, + "args": { + "External id": 149504, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979754, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 
1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979754, "pid": 0, "tid": 7, "ts": 6303771800664.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800153.488, "dur": 6.431, + "args": { + "External id": 149504, "cbid": 211, "correlation": 289979754 + } + }, + { + "ph": "s", "id": 289979754, "pid": 5714, "tid": 5714, "ts": 6303771800153.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800666.178, "dur": 1.024, + "args": { + "External id": 149505, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979760, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979760, "pid": 0, "tid": 7, "ts": 6303771800666.178, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800184.979, "dur": 7.120, + "args": { + "External id": 149505, "cbid": 211, "correlation": 289979760 + } + }, + { + "ph": "s", "id": 289979760, "pid": 5714, "tid": 5714, "ts": 6303771800184.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800667.874, "dur": 1.056, + "args": { + "External id": 149506, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979770, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979770, "pid": 0, "tid": 7, "ts": 6303771800667.874, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800208.979, "dur": 5.589, + "args": { + "External id": 149506, "cbid": 211, "correlation": 289979770 + } + }, + { + "ph": "s", "id": 289979770, "pid": 5714, "tid": 5714, "ts": 6303771800208.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800669.602, "dur": 1.024, + "args": { + "External id": 149507, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979776, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979776, "pid": 0, "tid": 7, "ts": 6303771800669.602, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800225.148, "dur": 5.591, + "args": { + "External id": 149507, "cbid": 211, "correlation": 289979776 + } + }, + { + "ph": "s", "id": 289979776, "pid": 5714, "tid": 5714, "ts": 6303771800225.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771800671.362, "dur": 3.296, + "args": { + "External id": 149508, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979789, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979789, "pid": 0, "tid": 7, "ts": 6303771800671.362, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800255.448, "dur": 7.010, + "args": { + "External id": 149508, "cbid": 211, "correlation": 289979789 + } + }, + { + "ph": "s", "id": 289979789, "pid": 5714, "tid": 5714, "ts": 6303771800255.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800675.298, "dur": 1.088, + "args": { + "External id": 149511, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979795, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979795, "pid": 0, "tid": 7, "ts": 6303771800675.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800272.648, "dur": 6.270, + "args": { + "External id": 149511, "cbid": 211, "correlation": 289979795 + } + }, + { + "ph": "s", "id": 289979795, "pid": 5714, "tid": 5714, "ts": 6303771800272.648, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771800676.994, "dur": 0.992, + "args": { + "External id": 149512, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979801, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979801, "pid": 0, "tid": 7, "ts": 6303771800676.994, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800284.718, "dur": 3.800, + "args": { + "External id": 149512, "cbid": 211, "correlation": 289979801 + } + }, + { + "ph": "s", "id": 289979801, "pid": 5714, "tid": 5714, "ts": 6303771800284.718, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771800678.722, "dur": 232.994, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979815, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289979815, "pid": 0, "tid": 7, "ts": 6303771800678.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800401.188, "dur": 11.310, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289979815 + } + }, + { + "ph": "s", "id": 289979815, "pid": 5714, "tid": 5714, "ts": 6303771800401.188, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771800458.118, "dur": 1.180, + "args": { + "External id": 149516, "cbid": 200, "correlation": 289979838 + } + }, + { + "ph": "f", "id": 289979838, "pid": 5714, "tid": 5714, "ts": 6303771800458.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771800912.644, "dur": 1.024, + "args": { + "External id": 149516, "device": 0, "context": 1, "stream": 7, "correlation": 289979841, "bytes": 1536, "memory bandwidth (GB/s)": 1.5 + } + }, + { + "ph": "f", "id": 289979841, "pid": 0, "tid": 7, "ts": 6303771800912.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771800462.308, "dur": 12.150, + "args": { + "External id": 149516, "cbid": 51, "correlation": 289979841 + } + }, + { + "ph": "s", "id": 289979841, "pid": 5714, "tid": 5714, "ts": 6303771800462.308, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771800914.852, "dur": 691.368, + "args": { + "External id": 149516, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979842, "registers per thread": 94, "shared memory": 49152, "blocks per 
SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979842, "pid": 0, "tid": 7, "ts": 6303771800914.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800474.718, "dur": 6.560, + "args": { + "External id": 149516, "cbid": 307, "correlation": 289979842 + } + }, + { + "ph": "s", "id": 289979842, "pid": 5714, "tid": 5714, "ts": 6303771800474.718, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771801606.828, "dur": 2.976, + "args": { + "External id": 149519, "device": 0, "context": 1, "stream": 7, "correlation": 289979847, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289979847, "pid": 0, "tid": 7, "ts": 6303771801606.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771800510.668, "dur": 20.750, + "args": { + "External id": 149519, "cbid": 41, "correlation": 289979847 + } + }, + { + "ph": "s", "id": 289979847, "pid": 5714, "tid": 5714, "ts": 6303771800510.668, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771800581.568, "dur": 0.550, + "args": { + "External id": 149524, "cbid": 200, "correlation": 289979875 + } + }, + { + "ph": "f", "id": 289979875, "pid": 5714, "tid": 5714, "ts": 6303771800581.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771801610.444, "dur": 694.953, + "args": { + "External id": 149524, "queued": 0, "device": 
0, "context": 1, "stream": 7, "correlation": 289979878, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979878, "pid": 0, "tid": 7, "ts": 6303771801610.444, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800583.858, "dur": 8.330, + "args": { + "External id": 149524, "cbid": 307, "correlation": 289979878 + } + }, + { + "ph": "s", "id": 289979878, "pid": 5714, "tid": 5714, "ts": 6303771800583.858, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771802306.133, "dur": 221.186, + "args": { + "External id": 149525, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979883, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289979883, "pid": 0, "tid": 7, "ts": 6303771802306.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800610.407, "dur": 8.020, + "args": { + "External id": 149525, "cbid": 211, "correlation": 289979883 + } + }, + { + "ph": "s", "id": 289979883, "pid": 5714, "tid": 5714, "ts": 6303771800610.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771800668.787, "dur": 1.471, + "args": { + "External id": 149533, "cbid": 210, "correlation": 289979909 + } + }, + { + "ph": "f", "id": 289979909, "pid": 5714, "tid": 5714, "ts": 6303771800668.787, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771802527.927, "dur": 638.023, + "args": { + "External id": 149533, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979910, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289979910, "pid": 0, "tid": 7, "ts": 6303771802527.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800673.987, "dur": 8.251, + "args": { + "External id": 149533, "cbid": 211, "correlation": 289979910 + } + }, + { + "ph": "s", "id": 289979910, "pid": 5714, "tid": 5714, "ts": 6303771800673.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771803166.558, "dur": 170.659, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979929, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289979929, "pid": 0, "tid": 7, "ts": 6303771803166.558, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800794.797, "dur": 9.940, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289979929 + } + }, + { + "ph": "s", "id": 289979929, "pid": 5714, "tid": 5714, "ts": 6303771800794.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771803337.825, "dur": 4.032, + "args": { + "External id": 149543, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979946, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979946, "pid": 0, "tid": 7, "ts": 6303771803337.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800841.017, "dur": 7.680, + "args": { + "External id": 149543, "cbid": 211, "correlation": 289979946 + } + }, + { + "ph": "s", "id": 289979946, "pid": 5714, "tid": 5714, "ts": 6303771800841.017, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803342.465, "dur": 1.216, + "args": { + "External id": 149548, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979963, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979963, "pid": 0, "tid": 7, "ts": 6303771803342.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800874.207, "dur": 5.950, + "args": { + "External id": 149548, "cbid": 211, "correlation": 289979963 + } + }, + { + "ph": "s", "id": 289979963, "pid": 5714, "tid": 5714, "ts": 6303771800874.207, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803344.289, "dur": 1.024, + "args": { + "External id": 149550, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979973, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979973, "pid": 0, "tid": 7, "ts": 6303771803344.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800895.717, "dur": 5.360, + "args": { + "External id": 149550, "cbid": 211, "correlation": 289979973 + } + }, + { + "ph": "s", "id": 289979973, "pid": 5714, "tid": 5714, "ts": 6303771800895.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803346.017, "dur": 1.056, + "args": { + "External id": 149551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979979, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979979, "pid": 0, "tid": 7, "ts": 6303771803346.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800908.507, "dur": 4.420, + "args": { + "External id": 149551, "cbid": 211, "correlation": 289979979 + } + }, + { + "ph": "s", "id": 289979979, "pid": 5714, "tid": 5714, "ts": 6303771800908.507, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803347.745, "dur": 1.024, + "args": { + "External id": 149552, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979989, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979989, "pid": 0, "tid": 7, "ts": 6303771803347.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800923.547, "dur": 4.490, + "args": { + "External id": 149552, "cbid": 211, "correlation": 289979989 + } + }, + { + "ph": "s", "id": 289979989, "pid": 5714, "tid": 5714, "ts": 6303771800923.547, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803349.441, "dur": 1.023, + "args": { + "External id": 149553, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289979995, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289979995, "pid": 0, "tid": 7, "ts": 6303771803349.441, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800934.067, "dur": 4.250, + "args": { + "External id": 149553, "cbid": 211, "correlation": 289979995 + } + }, + { + "ph": "s", "id": 289979995, "pid": 5714, "tid": 5714, "ts": 6303771800934.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771803351.200, "dur": 3.296, + "args": { + "External id": 149554, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980008, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980008, "pid": 0, "tid": 7, "ts": 6303771803351.200, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800956.597, "dur": 5.720, + "args": { + "External id": 149554, "cbid": 211, "correlation": 289980008 + } + }, + { + "ph": "s", "id": 289980008, "pid": 5714, "tid": 5714, "ts": 6303771800956.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803355.168, "dur": 1.056, + "args": { + "External id": 149557, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980014, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980014, "pid": 0, "tid": 7, "ts": 6303771803355.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800968.757, "dur": 4.370, + "args": { + "External id": 149557, "cbid": 211, "correlation": 289980014 + } + }, + { + "ph": "s", "id": 289980014, "pid": 5714, "tid": 5714, "ts": 6303771800968.757, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771803356.864, "dur": 0.992, + "args": { + "External id": 149558, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980020, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980020, "pid": 0, "tid": 7, "ts": 6303771803356.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771800978.217, "dur": 3.910, + "args": { + "External id": 149558, "cbid": 211, "correlation": 289980020 + } + }, + { + "ph": "s", "id": 289980020, "pid": 5714, "tid": 5714, "ts": 6303771800978.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771803358.560, "dur": 233.924, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980034, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289980034, "pid": 0, "tid": 7, "ts": 6303771803358.560, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801075.077, "dur": 8.860, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980034 + } + }, + { + "ph": "s", "id": 289980034, "pid": 5714, "tid": 5714, "ts": 6303771801075.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771801119.666, "dur": 0.591, + "args": { + "External id": 149562, "cbid": 200, "correlation": 289980057 + } + }, + { + "ph": "f", "id": 289980057, "pid": 5714, "tid": 5714, "ts": 6303771801119.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771803593.284, "dur": 0.832, + "args": { + "External id": 149562, "device": 0, "context": 1, "stream": 7, "correlation": 289980060, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289980060, "pid": 0, "tid": 7, "ts": 6303771803593.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771801122.097, "dur": 6.989, + "args": { + "External id": 149562, "cbid": 51, "correlation": 289980060 + } + }, + { + "ph": "s", "id": 289980060, "pid": 5714, "tid": 5714, "ts": 6303771801122.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771803594.852, "dur": 695.975, + "args": { + "External id": 149562, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980061, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980061, "pid": 0, "tid": 7, "ts": 6303771803594.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801129.326, "dur": 6.071, + "args": { + "External id": 149562, "cbid": 307, "correlation": 289980061 + } + }, + { + "ph": "s", "id": 289980061, "pid": 5714, "tid": 5714, "ts": 6303771801129.326, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771804291.467, "dur": 2.945, + "args": { + "External id": 149565, "device": 0, "context": 1, "stream": 7, "correlation": 289980066, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.1589134125636 + } + }, + { + "ph": "f", "id": 289980066, "pid": 0, "tid": 7, "ts": 6303771804291.467, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771801160.846, "dur": 13.760, + "args": { + "External id": 149565, "cbid": 41, "correlation": 289980066 + } + }, + { + "ph": "s", "id": 289980066, "pid": 5714, "tid": 5714, "ts": 6303771801160.846, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771801218.856, "dur": 0.480, + "args": { + "External id": 149570, "cbid": 200, "correlation": 289980094 + } + }, + { + "ph": "f", "id": 289980094, "pid": 5714, "tid": 5714, "ts": 6303771801218.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771804295.020, "dur": 690.280, + "args": { + "External id": 149570, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289980097, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980097, "pid": 0, "tid": 7, "ts": 6303771804295.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801220.826, "dur": 7.510, + "args": { + "External id": 149570, "cbid": 307, "correlation": 289980097 + } + }, + { + "ph": "s", "id": 289980097, "pid": 5714, "tid": 5714, "ts": 6303771801220.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771804985.940, "dur": 221.666, + "args": { + "External id": 149571, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980102, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289980102, "pid": 0, "tid": 7, "ts": 6303771804985.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801243.236, "dur": 7.500, + "args": { + "External id": 149571, "cbid": 211, "correlation": 289980102 + } + }, + { + "ph": "s", "id": 289980102, "pid": 5714, "tid": 5714, "ts": 6303771801243.236, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771801305.046, "dur": 1.870, + "args": { + "External id": 149579, "cbid": 210, "correlation": 289980128 + } + }, + { + "ph": "f", "id": 289980128, "pid": 5714, "tid": 5714, "ts": 6303771801305.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771805208.278, "dur": 640.040, + "args": { + "External id": 149579, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980129, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980129, "pid": 0, "tid": 7, "ts": 6303771805208.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801311.486, "dur": 9.010, + "args": { + "External id": 149579, "cbid": 211, "correlation": 289980129 + } + }, + { + "ph": "s", "id": 289980129, "pid": 5714, "tid": 5714, "ts": 6303771801311.486, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771805849.054, "dur": 171.105, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980148, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289980148, "pid": 0, "tid": 7, "ts": 6303771805849.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801435.786, "dur": 9.290, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980148 + } + }, + { + "ph": "s", "id": 289980148, "pid": 5714, "tid": 5714, "ts": 6303771801435.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771806020.895, "dur": 4.032, + "args": { + "External id": 149589, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980165, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980165, "pid": 0, "tid": 7, "ts": 6303771806020.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801481.476, "dur": 7.510, + "args": { + "External id": 149589, "cbid": 211, "correlation": 289980165 + } + }, + { + "ph": "s", "id": 289980165, "pid": 5714, "tid": 5714, "ts": 6303771801481.476, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806025.535, "dur": 1.184, + "args": { + "External id": 149594, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980182, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980182, "pid": 0, "tid": 7, "ts": 6303771806025.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801512.225, "dur": 5.471, + "args": { + "External id": 149594, "cbid": 211, "correlation": 289980182 + } + }, + { + "ph": "s", "id": 289980182, "pid": 5714, "tid": 5714, "ts": 6303771801512.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806027.327, "dur": 1.024, + "args": { + "External id": 149596, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980192, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980192, "pid": 0, "tid": 7, "ts": 6303771806027.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801531.685, "dur": 4.880, + "args": { + "External id": 149596, "cbid": 211, "correlation": 289980192 + } + }, + { + "ph": "s", "id": 289980192, "pid": 5714, "tid": 5714, "ts": 6303771801531.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806029.055, "dur": 1.057, + "args": { + "External id": 149597, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980198, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980198, "pid": 0, "tid": 7, "ts": 6303771806029.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801544.756, "dur": 4.240, + "args": { + "External id": 149597, "cbid": 211, "correlation": 289980198 + } + }, + { + "ph": "s", "id": 289980198, "pid": 5714, "tid": 5714, "ts": 6303771801544.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806030.784, "dur": 1.056, + "args": { + "External id": 149598, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980208, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980208, "pid": 0, "tid": 7, "ts": 6303771806030.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801560.696, "dur": 4.369, + "args": { + "External id": 149598, "cbid": 211, "correlation": 289980208 + } + }, + { + "ph": "s", "id": 289980208, "pid": 5714, "tid": 5714, "ts": 6303771801560.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806032.480, "dur": 1.024, + "args": { + "External id": 149599, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980214, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980214, "pid": 0, "tid": 7, "ts": 6303771806032.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801570.796, "dur": 4.320, + "args": { + "External id": 149599, "cbid": 211, "correlation": 289980214 + } + }, + { + "ph": "s", "id": 289980214, "pid": 5714, "tid": 5714, "ts": 6303771801570.796, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771806034.240, "dur": 3.296, + "args": { + "External id": 149600, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980227, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980227, "pid": 0, "tid": 7, "ts": 6303771806034.240, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801592.105, "dur": 5.151, + "args": { + "External id": 149600, "cbid": 211, "correlation": 289980227 + } + }, + { + "ph": "s", "id": 289980227, "pid": 5714, "tid": 5714, "ts": 6303771801592.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806038.176, "dur": 1.088, + "args": { + "External id": 149603, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980233, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980233, "pid": 0, "tid": 7, "ts": 6303771806038.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801603.535, "dur": 4.470, + "args": { + "External id": 149603, "cbid": 211, "correlation": 289980233 + } + }, + { + "ph": "s", "id": 289980233, "pid": 5714, "tid": 5714, "ts": 6303771801603.535, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771806039.904, "dur": 0.992, + "args": { + "External id": 149604, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980239, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980239, "pid": 0, "tid": 7, "ts": 6303771806039.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801614.075, "dur": 4.100, + "args": { + "External id": 149604, "cbid": 211, "correlation": 289980239 + } + }, + { + "ph": "s", "id": 289980239, "pid": 5714, "tid": 5714, "ts": 6303771801614.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771806041.600, "dur": 233.186, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980253, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289980253, "pid": 0, "tid": 7, "ts": 6303771806041.600, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801707.715, "dur": 8.530, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980253 + } + }, + { + "ph": "s", "id": 289980253, "pid": 5714, "tid": 5714, "ts": 6303771801707.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771801751.195, "dur": 0.560, + "args": { + "External id": 149608, "cbid": 200, "correlation": 289980276 + } + }, + { + "ph": "f", "id": 289980276, "pid": 5714, "tid": 5714, "ts": 6303771801751.195, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771806275.682, "dur": 0.800, + "args": { + "External id": 149608, "device": 0, "context": 1, "stream": 7, "correlation": 289980279, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289980279, "pid": 0, "tid": 7, "ts": 6303771806275.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771801754.165, "dur": 7.310, + "args": { + "External id": 149608, "cbid": 51, "correlation": 289980279 + } + }, + { + "ph": "s", "id": 289980279, "pid": 5714, "tid": 5714, "ts": 6303771801754.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771806277.666, "dur": 692.553, + "args": { + "External id": 149608, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980280, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980280, "pid": 0, "tid": 7, "ts": 6303771806277.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801761.725, "dur": 6.010, + "args": { + "External id": 149608, "cbid": 307, "correlation": 289980280 + } + }, + { + "ph": "s", "id": 289980280, "pid": 5714, "tid": 5714, "ts": 6303771801761.725, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771806970.891, "dur": 2.944, + "args": { + "External id": 149611, "device": 0, "context": 1, "stream": 7, "correlation": 289980285, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289980285, "pid": 0, "tid": 7, "ts": 6303771806970.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771801792.865, "dur": 12.650, + "args": { + "External id": 149611, "cbid": 41, "correlation": 289980285 + } + }, + { + "ph": "s", "id": 289980285, "pid": 5714, "tid": 5714, "ts": 6303771801792.865, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771801849.615, "dur": 0.470, + "args": { + "External id": 149616, "cbid": 200, "correlation": 289980313 + } + }, + { + "ph": "f", "id": 289980313, "pid": 5714, "tid": 5714, "ts": 6303771801849.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771806974.667, "dur": 693.351, + "args": { + "External id": 149616, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289980316, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980316, "pid": 0, "tid": 7, "ts": 6303771806974.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801851.595, "dur": 8.610, + "args": { + "External id": 149616, "cbid": 307, "correlation": 289980316 + } + }, + { + "ph": "s", "id": 289980316, "pid": 5714, "tid": 5714, "ts": 6303771801851.595, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771807668.691, "dur": 221.154, + "args": { + "External id": 149617, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980321, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289980321, "pid": 0, "tid": 7, "ts": 6303771807668.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801875.775, "dur": 6.270, + "args": { + "External id": 149617, "cbid": 211, "correlation": 289980321 + } + }, + { + "ph": "s", "id": 289980321, "pid": 5714, "tid": 5714, "ts": 6303771801875.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771801927.825, "dur": 1.170, + "args": { + "External id": 149625, "cbid": 210, "correlation": 289980347 + } + }, + { + "ph": "f", "id": 289980347, "pid": 5714, "tid": 5714, "ts": 6303771801927.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771807890.517, "dur": 641.128, + "args": { + "External id": 149625, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980348, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980348, "pid": 0, "tid": 7, "ts": 6303771807890.517, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771801932.625, "dur": 8.630, + "args": { + "External id": 149625, "cbid": 211, "correlation": 289980348 + } + }, + { + "ph": "s", "id": 289980348, "pid": 5714, "tid": 5714, "ts": 6303771801932.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771808532.285, "dur": 171.074, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980367, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289980367, "pid": 0, "tid": 7, "ts": 6303771808532.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802051.314, "dur": 9.260, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980367 + } + }, + { + "ph": "s", "id": 289980367, "pid": 5714, "tid": 5714, "ts": 6303771802051.314, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771808703.967, "dur": 4.096, + "args": { + "External id": 149635, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980384, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980384, "pid": 0, "tid": 7, "ts": 6303771808703.967, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802095.874, "dur": 7.800, + "args": { + "External id": 149635, "cbid": 211, "correlation": 289980384 + } + }, + { + "ph": "s", "id": 289980384, "pid": 5714, "tid": 5714, "ts": 6303771802095.874, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808708.735, "dur": 1.248, + "args": { + "External id": 149640, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980401, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980401, "pid": 0, "tid": 7, "ts": 6303771808708.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802130.904, "dur": 6.940, + "args": { + "External id": 149640, "cbid": 211, "correlation": 289980401 + } + }, + { + "ph": "s", "id": 289980401, "pid": 5714, "tid": 5714, "ts": 6303771802130.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808710.559, "dur": 1.024, + "args": { + "External id": 149642, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980411, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980411, "pid": 0, "tid": 7, "ts": 6303771808710.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802153.194, "dur": 5.180, + "args": { + "External id": 149642, "cbid": 211, "correlation": 289980411 + } + }, + { + "ph": "s", "id": 289980411, "pid": 5714, "tid": 5714, "ts": 6303771802153.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808712.287, "dur": 1.056, + "args": { + "External id": 149643, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980417, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980417, "pid": 0, "tid": 7, "ts": 6303771808712.287, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802165.614, "dur": 4.490, + "args": { + "External id": 149643, "cbid": 211, "correlation": 289980417 + } + }, + { + "ph": "s", "id": 289980417, "pid": 5714, "tid": 5714, "ts": 6303771802165.614, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808714.015, "dur": 1.024, + "args": { + "External id": 149644, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980427, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980427, "pid": 0, "tid": 7, "ts": 6303771808714.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802180.794, "dur": 4.380, + "args": { + "External id": 149644, "cbid": 211, "correlation": 289980427 + } + }, + { + "ph": "s", "id": 289980427, "pid": 5714, "tid": 5714, "ts": 6303771802180.794, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808715.711, "dur": 1.056, + "args": { + "External id": 149645, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980433, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980433, "pid": 0, "tid": 7, "ts": 6303771808715.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802190.934, "dur": 4.270, + "args": { + "External id": 149645, "cbid": 211, "correlation": 289980433 + } + }, + { + "ph": "s", "id": 289980433, "pid": 5714, "tid": 5714, "ts": 6303771802190.934, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771808717.471, "dur": 3.296, + "args": { + "External id": 149646, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980446, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980446, "pid": 0, "tid": 7, "ts": 6303771808717.471, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802212.644, "dur": 5.160, + "args": { + "External id": 149646, "cbid": 211, "correlation": 289980446 + } + }, + { + "ph": "s", "id": 289980446, "pid": 5714, "tid": 5714, "ts": 6303771802212.644, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808721.439, "dur": 1.088, + "args": { + "External id": 149649, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980452, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980452, "pid": 0, "tid": 7, "ts": 6303771808721.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802223.904, "dur": 4.500, + "args": { + "External id": 149649, "cbid": 211, "correlation": 289980452 + } + }, + { + "ph": "s", "id": 289980452, "pid": 5714, "tid": 5714, "ts": 6303771802223.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771808723.135, "dur": 0.992, + "args": { + "External id": 149650, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980458, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980458, "pid": 0, "tid": 7, "ts": 6303771808723.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802233.374, "dur": 3.820, + "args": { + "External id": 149650, "cbid": 211, "correlation": 289980458 + } + }, + { + "ph": "s", "id": 289980458, "pid": 5714, "tid": 5714, "ts": 6303771802233.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771808724.863, "dur": 233.731, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980472, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289980472, "pid": 0, "tid": 7, "ts": 6303771808724.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802333.504, "dur": 9.860, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980472 + } + }, + { + "ph": "s", "id": 289980472, "pid": 5714, "tid": 5714, "ts": 6303771802333.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771802380.684, "dur": 0.610, + "args": { + "External id": 149654, "cbid": 200, "correlation": 289980495 + } + }, + { + "ph": "f", "id": 289980495, "pid": 5714, "tid": 5714, "ts": 6303771802380.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771808959.490, "dur": 0.832, + "args": { + "External id": 149654, "device": 0, "context": 1, "stream": 7, "correlation": 289980498, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289980498, "pid": 0, "tid": 7, "ts": 6303771808959.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771802383.184, "dur": 6.970, + "args": { + "External id": 149654, "cbid": 51, "correlation": 289980498 + } + }, + { + "ph": "s", "id": 289980498, "pid": 5714, "tid": 5714, "ts": 6303771802383.184, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771808961.090, "dur": 691.272, + "args": { + "External id": 149654, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980499, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980499, "pid": 0, "tid": 7, "ts": 6303771808961.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802390.404, "dur": 6.050, + "args": { + "External id": 149654, "cbid": 307, "correlation": 289980499 + } + }, + { + "ph": "s", "id": 289980499, "pid": 5714, "tid": 5714, "ts": 6303771802390.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771809653.066, "dur": 2.944, + "args": { + "External id": 149657, "device": 0, "context": 1, "stream": 7, "correlation": 289980504, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289980504, "pid": 0, "tid": 7, "ts": 6303771809653.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771802423.694, "dur": 12.860, + "args": { + "External id": 149657, "cbid": 41, "correlation": 289980504 + } + }, + { + "ph": "s", "id": 289980504, "pid": 5714, "tid": 5714, "ts": 6303771802423.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771802480.883, "dur": 0.480, + "args": { + "External id": 149662, "cbid": 200, "correlation": 289980532 + } + }, + { + "ph": "f", "id": 289980532, "pid": 5714, "tid": 5714, "ts": 6303771802480.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771809656.746, "dur": 686.568, + "args": { + "External id": 149662, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289980535, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980535, "pid": 0, "tid": 7, "ts": 6303771809656.746, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802483.463, "dur": 7.640, + "args": { + "External id": 149662, "cbid": 307, "correlation": 289980535 + } + }, + { + "ph": "s", "id": 289980535, "pid": 5714, "tid": 5714, "ts": 6303771802483.463, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771810344.018, "dur": 220.514, + "args": { + "External id": 149663, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980540, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289980540, "pid": 0, "tid": 7, "ts": 6303771810344.018, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802507.293, "dur": 6.520, + "args": { + "External id": 149663, "cbid": 211, "correlation": 289980540 + } + }, + { + "ph": "s", "id": 289980540, "pid": 5714, "tid": 5714, "ts": 6303771802507.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771802559.443, "dur": 1.190, + "args": { + "External id": 149671, "cbid": 210, "correlation": 289980566 + } + }, + { + "ph": "f", "id": 289980566, "pid": 5714, "tid": 5714, "ts": 6303771802559.443, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771810565.172, "dur": 637.191, + "args": { + "External id": 149671, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980567, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980567, "pid": 0, "tid": 7, "ts": 6303771810565.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802564.213, "dur": 8.630, + "args": { + "External id": 149671, "cbid": 211, "correlation": 289980567 + } + }, + { + "ph": "s", "id": 289980567, "pid": 5714, "tid": 5714, "ts": 6303771802564.213, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771811203.035, "dur": 171.523, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980586, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289980586, "pid": 0, "tid": 7, "ts": 6303771811203.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802681.213, "dur": 9.190, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980586 + } + }, + { + "ph": "s", "id": 289980586, "pid": 5714, "tid": 5714, "ts": 6303771802681.213, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771811375.134, "dur": 4.064, + "args": { + "External id": 149681, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980603, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980603, "pid": 0, "tid": 7, "ts": 6303771811375.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802725.753, "dur": 7.700, + "args": { + "External id": 149681, "cbid": 211, "correlation": 289980603 + } + }, + { + "ph": "s", "id": 289980603, "pid": 5714, "tid": 5714, "ts": 6303771802725.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811379.806, "dur": 1.184, + "args": { + "External id": 149686, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980620, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980620, "pid": 0, "tid": 7, "ts": 6303771811379.806, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802758.523, "dur": 5.690, + "args": { + "External id": 149686, "cbid": 211, "correlation": 289980620 + } + }, + { + "ph": "s", "id": 289980620, "pid": 5714, "tid": 5714, "ts": 6303771802758.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811381.630, "dur": 1.024, + "args": { + "External id": 149688, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980630, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980630, "pid": 0, "tid": 7, "ts": 6303771811381.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802777.843, "dur": 4.860, + "args": { + "External id": 149688, "cbid": 211, "correlation": 289980630 + } + }, + { + "ph": "s", "id": 289980630, "pid": 5714, "tid": 5714, "ts": 6303771802777.843, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811383.358, "dur": 1.056, + "args": { + "External id": 149689, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980636, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980636, "pid": 0, "tid": 7, "ts": 6303771811383.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802793.233, "dur": 4.620, + "args": { + "External id": 149689, "cbid": 211, "correlation": 289980636 + } + }, + { + "ph": "s", "id": 289980636, "pid": 5714, "tid": 5714, "ts": 6303771802793.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811385.086, "dur": 1.024, + "args": { + "External id": 149690, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980646, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980646, "pid": 0, "tid": 7, "ts": 6303771811385.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802808.513, "dur": 4.550, + "args": { + "External id": 149690, "cbid": 211, "correlation": 289980646 + } + }, + { + "ph": "s", "id": 289980646, "pid": 5714, "tid": 5714, "ts": 6303771802808.513, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811386.782, "dur": 1.024, + "args": { + "External id": 149691, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980652, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980652, "pid": 0, "tid": 7, "ts": 6303771811386.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802818.893, "dur": 3.990, + "args": { + "External id": 149691, "cbid": 211, "correlation": 289980652 + } + }, + { + "ph": "s", "id": 289980652, "pid": 5714, "tid": 5714, "ts": 6303771802818.893, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771811388.542, "dur": 3.296, + "args": { + "External id": 149692, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980665, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980665, "pid": 0, "tid": 7, "ts": 6303771811388.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802839.233, "dur": 6.320, + "args": { + "External id": 149692, "cbid": 211, "correlation": 289980665 + } + }, + { + "ph": "s", "id": 289980665, "pid": 5714, "tid": 5714, "ts": 6303771802839.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811392.478, "dur": 1.088, + "args": { + "External id": 149695, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980671, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980671, "pid": 0, "tid": 7, "ts": 6303771811392.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802851.753, "dur": 4.409, + "args": { + "External id": 149695, "cbid": 211, "correlation": 289980671 + } + }, + { + "ph": "s", "id": 289980671, "pid": 5714, "tid": 5714, "ts": 6303771802851.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771811394.174, "dur": 1.024, + "args": { + "External id": 149696, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980677, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980677, "pid": 0, "tid": 7, "ts": 6303771811394.174, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802861.213, "dur": 3.960, + "args": { + "External id": 149696, "cbid": 211, "correlation": 289980677 + } + }, + { + "ph": "s", "id": 289980677, "pid": 5714, "tid": 5714, "ts": 6303771802861.213, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771811395.902, "dur": 233.539, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980691, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289980691, "pid": 0, "tid": 7, "ts": 6303771811395.902, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771802954.352, "dur": 9.180, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980691 + } + }, + { + "ph": "s", "id": 289980691, "pid": 5714, "tid": 5714, "ts": 6303771802954.352, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771802998.522, "dur": 0.580, + "args": { + "External id": 149700, "cbid": 200, "correlation": 289980714 + } + }, + { + "ph": "f", "id": 289980714, "pid": 5714, "tid": 5714, "ts": 6303771802998.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771811736.834, "dur": 0.928, + "args": { + "External id": 149700, "device": 0, "context": 1, "stream": 7, "correlation": 289980717, "bytes": 1536, "memory bandwidth (GB/s)": 1.6551724137931034 + } + }, + { + "ph": "f", "id": 289980717, "pid": 0, "tid": 7, "ts": 6303771811736.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771803001.002, "dur": 7.030, + "args": { + "External id": 149700, "cbid": 51, "correlation": 289980717 + } + }, + { + "ph": "s", "id": 289980717, "pid": 5714, "tid": 5714, "ts": 6303771803001.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771811739.202, "dur": 688.072, + "args": { + "External id": 149700, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980718, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980718, "pid": 0, "tid": 7, "ts": 6303771811739.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803008.272, "dur": 6.180, + "args": { + "External id": 149700, "cbid": 307, "correlation": 289980718 + } + }, + { + "ph": "s", "id": 289980718, "pid": 5714, "tid": 5714, "ts": 6303771803008.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771812427.914, "dur": 3.008, + "args": { + "External id": 149703, "device": 0, "context": 1, "stream": 7, "correlation": 289980723, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 289980723, "pid": 0, "tid": 7, "ts": 6303771812427.914, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771803040.832, "dur": 12.250, + "args": { + "External id": 149703, "cbid": 41, "correlation": 289980723 + } + }, + { + "ph": "s", "id": 289980723, "pid": 5714, "tid": 5714, "ts": 6303771803040.832, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771803098.722, "dur": 0.510, + "args": { + "External id": 149708, "cbid": 200, "correlation": 289980751 + } + }, + { + "ph": "f", "id": 289980751, "pid": 5714, "tid": 5714, "ts": 6303771803098.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771812431.626, "dur": 692.680, + "args": { + "External id": 149708, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289980754, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980754, "pid": 0, "tid": 7, "ts": 6303771812431.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803100.742, "dur": 7.440, + "args": { + "External id": 149708, "cbid": 307, "correlation": 289980754 + } + }, + { + "ph": "s", "id": 289980754, "pid": 5714, "tid": 5714, "ts": 6303771803100.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771813124.978, "dur": 221.667, + "args": { + "External id": 149709, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980759, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289980759, "pid": 0, "tid": 7, "ts": 6303771813124.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803123.392, "dur": 6.250, + "args": { + "External id": 149709, "cbid": 211, "correlation": 289980759 + } + }, + { + "ph": "s", "id": 289980759, "pid": 5714, "tid": 5714, "ts": 6303771803123.392, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771803176.062, "dur": 1.420, + "args": { + "External id": 149717, "cbid": 210, "correlation": 289980785 + } + }, + { + "ph": "f", "id": 289980785, "pid": 5714, "tid": 5714, "ts": 6303771803176.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771813347.253, "dur": 636.071, + "args": { + "External id": 149717, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980786, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980786, "pid": 0, "tid": 7, "ts": 6303771813347.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803181.032, "dur": 7.690, + "args": { + "External id": 149717, "cbid": 211, "correlation": 289980786 + } + }, + { + "ph": "s", "id": 289980786, "pid": 5714, "tid": 5714, "ts": 6303771803181.032, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771813983.964, "dur": 170.658, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980805, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289980805, "pid": 0, "tid": 7, "ts": 6303771813983.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803307.301, "dur": 9.551, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980805 + } + }, + { + "ph": "s", "id": 289980805, "pid": 5714, "tid": 5714, "ts": 6303771803307.301, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771814155.294, "dur": 4.192, + "args": { + "External id": 149727, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980822, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980822, "pid": 0, "tid": 7, "ts": 6303771814155.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803354.332, "dur": 7.940, + "args": { + "External id": 149727, "cbid": 211, "correlation": 289980822 + } + }, + { + "ph": "s", "id": 289980822, "pid": 5714, "tid": 5714, "ts": 6303771803354.332, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814160.094, "dur": 1.216, + "args": { + "External id": 149732, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980839, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980839, "pid": 0, "tid": 7, "ts": 6303771814160.094, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803384.852, "dur": 5.600, + "args": { + "External id": 149732, "cbid": 211, "correlation": 289980839 + } + }, + { + "ph": "s", "id": 289980839, "pid": 5714, "tid": 5714, "ts": 6303771803384.852, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814161.918, "dur": 1.024, + "args": { + "External id": 149734, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980849, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980849, "pid": 0, "tid": 7, "ts": 6303771814161.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803403.841, "dur": 5.010, + "args": { + "External id": 149734, "cbid": 211, "correlation": 289980849 + } + }, + { + "ph": "s", "id": 289980849, "pid": 5714, "tid": 5714, "ts": 6303771803403.841, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814163.646, "dur": 1.056, + "args": { + "External id": 149735, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980855, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980855, "pid": 0, "tid": 7, "ts": 6303771814163.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803415.861, "dur": 4.360, + "args": { + "External id": 149735, "cbid": 211, "correlation": 289980855 + } + }, + { + "ph": "s", "id": 289980855, "pid": 5714, "tid": 5714, "ts": 6303771803415.861, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814165.342, "dur": 1.056, + "args": { + "External id": 149736, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980865, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980865, "pid": 0, "tid": 7, "ts": 6303771814165.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803431.511, "dur": 5.370, + "args": { + "External id": 149736, "cbid": 211, "correlation": 289980865 + } + }, + { + "ph": "s", "id": 289980865, "pid": 5714, "tid": 5714, "ts": 6303771803431.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814167.070, "dur": 1.024, + "args": { + "External id": 149737, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980871, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980871, "pid": 0, "tid": 7, "ts": 6303771814167.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803442.831, "dur": 4.260, + "args": { + "External id": 149737, "cbid": 211, "correlation": 289980871 + } + }, + { + "ph": "s", "id": 289980871, "pid": 5714, "tid": 5714, "ts": 6303771803442.831, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771814168.830, "dur": 3.296, + "args": { + "External id": 149738, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980884, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980884, "pid": 0, "tid": 7, "ts": 6303771814168.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803464.061, "dur": 4.820, + "args": { + "External id": 149738, "cbid": 211, "correlation": 289980884 + } + }, + { + "ph": "s", "id": 289980884, "pid": 5714, "tid": 5714, "ts": 6303771803464.061, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814172.766, "dur": 1.088, + "args": { + "External id": 149741, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980890, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980890, "pid": 0, "tid": 7, "ts": 6303771814172.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803475.231, "dur": 4.110, + "args": { + "External id": 149741, "cbid": 211, "correlation": 289980890 + } + }, + { + "ph": "s", "id": 289980890, "pid": 5714, "tid": 5714, "ts": 6303771803475.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771814174.462, "dur": 1.024, + "args": { + "External id": 149742, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980896, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289980896, "pid": 0, "tid": 7, "ts": 6303771814174.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803484.331, "dur": 3.900, + "args": { + "External id": 149742, "cbid": 211, "correlation": 289980896 + } + }, + { + "ph": "s", "id": 289980896, "pid": 5714, "tid": 5714, "ts": 6303771803484.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771814176.190, "dur": 234.563, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980910, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289980910, "pid": 0, "tid": 7, "ts": 6303771814176.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803577.931, "dur": 8.870, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289980910 + } + }, + { + "ph": "s", "id": 289980910, "pid": 5714, "tid": 5714, "ts": 6303771803577.931, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771803622.071, "dur": 0.570, + "args": { + "External id": 149746, "cbid": 200, "correlation": 289980933 + } + }, + { + "ph": "f", "id": 289980933, "pid": 5714, "tid": 5714, "ts": 6303771803622.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771814411.553, "dur": 0.800, + "args": { + "External id": 149746, "device": 0, "context": 1, "stream": 7, "correlation": 289980936, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289980936, "pid": 0, "tid": 7, "ts": 6303771814411.553, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771803624.461, "dur": 7.110, + "args": { + "External id": 149746, "cbid": 51, "correlation": 289980936 + } + }, + { + "ph": "s", "id": 289980936, "pid": 5714, "tid": 5714, "ts": 6303771803624.461, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771814413.537, "dur": 690.984, + "args": { + "External id": 149746, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980937, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980937, "pid": 0, "tid": 7, "ts": 6303771814413.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803631.791, "dur": 6.070, + "args": { + "External id": 149746, "cbid": 307, "correlation": 289980937 + } + }, + { + "ph": "s", "id": 289980937, "pid": 5714, "tid": 5714, "ts": 6303771803631.791, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771815105.193, "dur": 2.944, + "args": { + "External id": 149749, "device": 0, "context": 1, "stream": 7, "correlation": 289980942, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289980942, "pid": 0, "tid": 7, "ts": 6303771815105.193, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771803662.661, "dur": 12.420, + "args": { + "External id": 149749, "cbid": 41, "correlation": 289980942 + } + }, + { + "ph": "s", "id": 289980942, "pid": 5714, "tid": 5714, "ts": 6303771803662.661, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771803719.061, "dur": 0.480, + "args": { + "External id": 149754, "cbid": 200, "correlation": 289980970 + } + }, + { + "ph": "f", "id": 289980970, "pid": 5714, "tid": 5714, "ts": 6303771803719.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771815109.289, "dur": 691.176, + "args": { + "External id": 149754, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289980973, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289980973, "pid": 0, "tid": 7, "ts": 6303771815109.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803721.051, "dur": 7.270, + "args": { + "External id": 149754, "cbid": 307, "correlation": 289980973 + } + }, + { + "ph": "s", "id": 289980973, "pid": 5714, "tid": 5714, "ts": 6303771803721.051, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771815801.169, "dur": 221.122, + "args": { + "External id": 149755, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289980978, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289980978, "pid": 0, "tid": 7, "ts": 6303771815801.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803744.111, "dur": 6.229, + "args": { + "External id": 149755, "cbid": 211, "correlation": 289980978 + } + }, + { + "ph": "s", "id": 289980978, "pid": 5714, "tid": 5714, "ts": 6303771803744.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771803797.660, "dur": 1.240, + "args": { + "External id": 149763, "cbid": 210, "correlation": 289981004 + } + }, + { + "ph": "f", "id": 289981004, "pid": 5714, "tid": 5714, "ts": 6303771803797.660, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771816023.027, "dur": 639.688, + "args": { + "External id": 149763, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981005, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981005, "pid": 0, "tid": 7, "ts": 6303771816023.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803802.440, "dur": 7.611, + "args": { + "External id": 149763, "cbid": 211, "correlation": 289981005 + } + }, + { + "ph": "s", "id": 289981005, "pid": 5714, "tid": 5714, "ts": 6303771803802.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771816663.387, "dur": 171.202, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981024, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289981024, "pid": 0, "tid": 7, "ts": 6303771816663.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803930.010, "dur": 9.290, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289981024 + } + }, + { + "ph": "s", "id": 289981024, "pid": 5714, "tid": 5714, "ts": 6303771803930.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771816835.229, "dur": 4.064, + "args": { + "External id": 149773, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981041, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981041, "pid": 0, "tid": 7, "ts": 6303771816835.229, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771803975.050, "dur": 7.290, + "args": { + "External id": 149773, "cbid": 211, "correlation": 289981041 + } + }, + { + "ph": "s", "id": 289981041, "pid": 5714, "tid": 5714, "ts": 6303771803975.050, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816840.029, "dur": 1.216, + "args": { + "External id": 149778, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981058, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981058, "pid": 0, "tid": 7, "ts": 6303771816840.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804013.110, "dur": 6.770, + "args": { + "External id": 149778, "cbid": 211, "correlation": 289981058 + } + }, + { + "ph": "s", "id": 289981058, "pid": 5714, "tid": 5714, "ts": 6303771804013.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816841.853, "dur": 0.992, + "args": { + "External id": 149780, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981068, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981068, "pid": 0, "tid": 7, "ts": 6303771816841.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804034.950, "dur": 5.230, + "args": { + "External id": 149780, "cbid": 211, "correlation": 289981068 + } + }, + { + "ph": "s", "id": 289981068, "pid": 5714, "tid": 5714, "ts": 6303771804034.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816843.549, "dur": 1.056, + "args": { + "External id": 149781, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981074, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981074, "pid": 0, "tid": 7, "ts": 6303771816843.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804047.440, "dur": 4.440, + "args": { + "External id": 149781, "cbid": 211, "correlation": 289981074 + } + }, + { + "ph": "s", "id": 289981074, "pid": 5714, "tid": 5714, "ts": 6303771804047.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816845.277, "dur": 1.056, + "args": { + "External id": 149782, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981084, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981084, "pid": 0, "tid": 7, "ts": 6303771816845.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804063.250, "dur": 4.300, + "args": { + "External id": 149782, "cbid": 211, "correlation": 289981084 + } + }, + { + "ph": "s", "id": 289981084, "pid": 5714, "tid": 5714, "ts": 6303771804063.250, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816846.973, "dur": 1.024, + "args": { + "External id": 149783, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981090, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981090, "pid": 0, "tid": 7, "ts": 6303771816846.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804073.010, "dur": 4.080, + "args": { + "External id": 149783, "cbid": 211, "correlation": 289981090 + } + }, + { + "ph": "s", "id": 289981090, "pid": 5714, "tid": 5714, "ts": 6303771804073.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771816848.733, "dur": 3.296, + "args": { + "External id": 149784, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981103, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981103, "pid": 0, "tid": 7, "ts": 6303771816848.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804092.010, "dur": 4.920, + "args": { + "External id": 149784, "cbid": 211, "correlation": 289981103 + } + }, + { + "ph": "s", "id": 289981103, "pid": 5714, "tid": 5714, "ts": 6303771804092.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816852.669, "dur": 1.088, + "args": { + "External id": 149787, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981109, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981109, "pid": 0, "tid": 7, "ts": 6303771816852.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804102.820, "dur": 4.050, + "args": { + "External id": 149787, "cbid": 211, "correlation": 289981109 + } + }, + { + "ph": "s", "id": 289981109, "pid": 5714, "tid": 5714, "ts": 6303771804102.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771816854.397, "dur": 0.992, + "args": { + "External id": 149788, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981115, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981115, "pid": 0, "tid": 7, "ts": 6303771816854.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804112.840, "dur": 4.060, + "args": { + "External id": 149788, "cbid": 211, "correlation": 289981115 + } + }, + { + "ph": "s", "id": 289981115, "pid": 5714, "tid": 5714, "ts": 6303771804112.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771816856.093, "dur": 233.475, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981129, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289981129, "pid": 0, "tid": 7, "ts": 6303771816856.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804205.059, "dur": 9.120, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289981129 + } + }, + { + "ph": "s", "id": 289981129, "pid": 5714, "tid": 5714, "ts": 6303771804205.059, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771804248.759, "dur": 0.560, + "args": { + "External id": 149792, "cbid": 200, "correlation": 289981152 + } + }, + { + "ph": "f", "id": 289981152, "pid": 5714, "tid": 5714, "ts": 6303771804248.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771817090.432, "dur": 1.120, + "args": { + "External id": 149792, "device": 0, "context": 1, "stream": 7, "correlation": 289981155, "bytes": 1536, "memory bandwidth (GB/s)": 1.3714285714285714 + } + }, + { + "ph": "f", "id": 289981155, "pid": 0, "tid": 7, "ts": 6303771817090.432, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771804251.190, "dur": 6.889, + "args": { + "External id": 149792, "cbid": 51, "correlation": 289981155 + } + }, + { + "ph": "s", "id": 289981155, "pid": 5714, "tid": 5714, "ts": 6303771804251.190, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771817092.736, "dur": 688.552, + "args": { + "External id": 149792, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981156, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981156, "pid": 0, "tid": 7, "ts": 6303771817092.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804258.310, "dur": 6.040, + "args": { + "External id": 149792, "cbid": 307, "correlation": 289981156 + } + }, + { + "ph": "s", "id": 289981156, "pid": 5714, "tid": 5714, "ts": 6303771804258.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771817781.992, "dur": 2.944, + "args": { + "External id": 149795, "device": 0, "context": 1, "stream": 7, "correlation": 289981161, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289981161, "pid": 0, "tid": 7, "ts": 6303771817781.992, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771804290.410, "dur": 24.069, + "args": { + "External id": 149795, "cbid": 41, "correlation": 289981161 + } + }, + { + "ph": "s", "id": 289981161, "pid": 5714, "tid": 5714, "ts": 6303771804290.410, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771804361.549, "dur": 0.480, + "args": { + "External id": 149800, "cbid": 200, "correlation": 289981189 + } + }, + { + "ph": "f", "id": 289981189, "pid": 5714, "tid": 5714, "ts": 6303771804361.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771817785.576, "dur": 688.232, + "args": { + "External id": 149800, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289981192, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981192, "pid": 0, "tid": 7, "ts": 6303771817785.576, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804363.679, "dur": 8.480, + "args": { + "External id": 149800, "cbid": 307, "correlation": 289981192 + } + }, + { + "ph": "s", "id": 289981192, "pid": 5714, "tid": 5714, "ts": 6303771804363.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771818474.480, "dur": 220.995, + "args": { + "External id": 149801, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981197, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981197, "pid": 0, "tid": 7, "ts": 6303771818474.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804388.249, "dur": 6.290, + "args": { + "External id": 149801, "cbid": 211, "correlation": 289981197 + } + }, + { + "ph": "s", "id": 289981197, "pid": 5714, "tid": 5714, "ts": 6303771804388.249, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771804441.349, "dur": 1.190, + "args": { + "External id": 149809, "cbid": 210, "correlation": 289981223 + } + }, + { + "ph": "f", "id": 289981223, "pid": 5714, "tid": 5714, "ts": 6303771804441.349, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771818696.147, "dur": 636.487, + "args": { + "External id": 149809, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981224, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981224, "pid": 0, "tid": 7, "ts": 6303771818696.147, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804446.059, "dur": 7.530, + "args": { + "External id": 149809, "cbid": 211, "correlation": 289981224 + } + }, + { + "ph": "s", "id": 289981224, "pid": 5714, "tid": 5714, "ts": 6303771804446.059, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771819333.306, "dur": 170.882, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981243, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289981243, "pid": 0, "tid": 7, "ts": 6303771819333.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804563.629, "dur": 9.720, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289981243 + } + }, + { + "ph": "s", "id": 289981243, "pid": 5714, "tid": 5714, "ts": 6303771804563.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771819504.892, "dur": 4.064, + "args": { + "External id": 149819, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981260, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981260, "pid": 0, "tid": 7, "ts": 6303771819504.892, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804608.869, "dur": 8.420, + "args": { + "External id": 149819, "cbid": 211, "correlation": 289981260 + } + }, + { + "ph": "s", "id": 289981260, "pid": 5714, "tid": 5714, "ts": 6303771804608.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819509.532, "dur": 1.248, + "args": { + "External id": 149824, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981277, "pid": 0, "tid": 7, "ts": 6303771819509.532, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804641.729, "dur": 5.380, + "args": { + "External id": 149824, "cbid": 211, "correlation": 289981277 + } + }, + { + "ph": "s", "id": 289981277, "pid": 5714, "tid": 5714, "ts": 6303771804641.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819511.484, "dur": 1.024, + "args": { + "External id": 149826, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981287, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981287, "pid": 0, "tid": 7, "ts": 6303771819511.484, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804660.409, "dur": 4.800, + "args": { + "External id": 149826, "cbid": 211, "correlation": 289981287 + } + }, + { + "ph": "s", "id": 289981287, "pid": 5714, "tid": 5714, "ts": 6303771804660.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819513.212, "dur": 1.024, + "args": { + "External id": 149827, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981293, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981293, "pid": 0, "tid": 7, "ts": 6303771819513.212, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804672.429, "dur": 4.209, + "args": { + "External id": 149827, "cbid": 211, "correlation": 289981293 + } + }, + { + "ph": "s", "id": 289981293, "pid": 5714, "tid": 5714, "ts": 6303771804672.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819514.908, "dur": 1.024, + "args": { + "External id": 149828, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981303, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981303, "pid": 0, "tid": 7, "ts": 6303771819514.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804688.658, "dur": 4.660, + "args": { + "External id": 149828, "cbid": 211, "correlation": 289981303 + } + }, + { + "ph": "s", "id": 289981303, "pid": 5714, "tid": 5714, "ts": 6303771804688.658, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819516.636, "dur": 1.024, + "args": { + "External id": 149829, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981309, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981309, "pid": 0, "tid": 7, "ts": 6303771819516.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804699.098, "dur": 4.291, + "args": { + "External id": 149829, "cbid": 211, "correlation": 289981309 + } + }, + { + "ph": "s", "id": 289981309, "pid": 5714, "tid": 5714, "ts": 6303771804699.098, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771819518.364, "dur": 3.456, + "args": { + "External id": 149830, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981322, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981322, "pid": 0, "tid": 7, "ts": 6303771819518.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804718.578, "dur": 5.100, + "args": { + "External id": 149830, "cbid": 211, "correlation": 289981322 + } + }, + { + "ph": "s", "id": 289981322, "pid": 5714, "tid": 5714, "ts": 6303771804718.578, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819522.460, "dur": 1.088, + "args": { + "External id": 149833, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981328, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981328, "pid": 0, "tid": 7, "ts": 6303771819522.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804730.838, "dur": 4.400, + "args": { + "External id": 149833, "cbid": 211, "correlation": 289981328 + } + }, + { + "ph": "s", "id": 289981328, "pid": 5714, "tid": 5714, "ts": 6303771804730.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771819524.156, "dur": 1.024, + "args": { + "External id": 149834, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981334, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981334, "pid": 0, "tid": 7, "ts": 6303771819524.156, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804740.529, "dur": 3.940, + "args": { + "External id": 149834, "cbid": 211, "correlation": 289981334 + } + }, + { + "ph": "s", "id": 289981334, "pid": 5714, "tid": 5714, "ts": 6303771804740.529, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771819525.884, "dur": 233.859, + "args": { + "External id": 149450, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981348, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289981348, "pid": 0, "tid": 7, "ts": 6303771819525.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804831.888, "dur": 8.360, + "args": { + "External id": 149450, "cbid": 307, "correlation": 289981348 + } + }, + { + "ph": "s", "id": 289981348, "pid": 5714, "tid": 5714, "ts": 6303771804831.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771804875.478, "dur": 0.580, + "args": { + "External id": 149838, "cbid": 200, "correlation": 289981371 + } + }, + { + "ph": "f", "id": 289981371, "pid": 5714, "tid": 5714, "ts": 6303771804875.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771819760.735, "dur": 0.800, + "args": { + "External id": 149838, "device": 0, "context": 1, "stream": 7, "correlation": 289981374, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289981374, "pid": 0, "tid": 7, "ts": 6303771819760.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771804877.838, "dur": 6.930, + "args": { + "External id": 149838, "cbid": 51, "correlation": 289981374 + } + }, + { + "ph": "s", "id": 289981374, "pid": 5714, "tid": 5714, "ts": 6303771804877.838, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771819762.303, "dur": 691.208, + "args": { + "External id": 149838, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981375, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981375, "pid": 0, "tid": 7, "ts": 6303771819762.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804885.008, "dur": 5.880, + "args": { + "External id": 149838, "cbid": 307, "correlation": 289981375 + } + }, + { + "ph": "s", "id": 289981375, "pid": 5714, "tid": 5714, "ts": 6303771804885.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771820454.183, "dur": 2.944, + "args": { + "External id": 149841, "device": 0, "context": 1, "stream": 7, "correlation": 289981380, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289981380, "pid": 0, "tid": 7, "ts": 6303771820454.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771804916.888, "dur": 14.890, + "args": { + "External id": 149841, "cbid": 41, "correlation": 289981380 + } + }, + { + "ph": "s", "id": 289981380, "pid": 5714, "tid": 5714, "ts": 6303771804916.888, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771804974.098, "dur": 0.450, + "args": { + "External id": 149846, "cbid": 200, "correlation": 289981408 + } + }, + { + "ph": "f", "id": 289981408, "pid": 5714, "tid": 5714, "ts": 6303771804974.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771820457.927, "dur": 690.280, + "args": { + "External id": 149846, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289981411, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981411, "pid": 0, "tid": 7, "ts": 6303771820457.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771804976.048, "dur": 7.250, + "args": { + "External id": 149846, "cbid": 307, "correlation": 289981411 + } + }, + { + "ph": "s", "id": 289981411, "pid": 5714, "tid": 5714, "ts": 6303771804976.048, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771821148.815, "dur": 220.995, + "args": { + "External id": 149847, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981416, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981416, "pid": 0, "tid": 7, "ts": 6303771821148.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805000.338, "dur": 6.410, + "args": { + "External id": 149847, "cbid": 211, "correlation": 289981416 + } + }, + { + "ph": "s", "id": 289981416, "pid": 5714, "tid": 5714, "ts": 6303771805000.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771821370.418, "dur": 5.216, + "args": { + "External id": 149849, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981429, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981429, "pid": 0, "tid": 7, "ts": 6303771821370.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805031.508, "dur": 7.710, + "args": { + "External id": 149849, "cbid": 211, "correlation": 289981429 + } + }, + { + "ph": "s", "id": 289981429, "pid": 5714, "tid": 5714, "ts": 6303771805031.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771821376.338, "dur": 157.154, + "args": { + "External id": 149854, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981442, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981442, "pid": 0, "tid": 7, "ts": 6303771821376.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805067.818, "dur": 6.460, + "args": { + "External id": 149854, "cbid": 211, "correlation": 289981442 + } + }, + { + "ph": "s", "id": 289981442, "pid": 5714, "tid": 5714, "ts": 6303771805067.818, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771821534.164, "dur": 1.696, + "args": { + "External id": 149859, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981450, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981450, "pid": 0, "tid": 7, "ts": 6303771821534.164, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805143.917, "dur": 12.200, + "args": { + "External id": 149859, "cbid": 211, "correlation": 289981450 + } + }, + { + "ph": "s", "id": 289981450, "pid": 5714, "tid": 5714, "ts": 6303771805143.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771821536.468, "dur": 2.240, + "args": { + "External id": 149878, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981470, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 289981470, "pid": 0, "tid": 7, "ts": 6303771821536.468, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805248.187, "dur": 10.290, + "args": { + "External id": 149878, "cbid": 211, "correlation": 289981470 + } + }, + { + "ph": "s", "id": 289981470, "pid": 5714, "tid": 5714, "ts": 6303771805248.187, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771821539.412, "dur": 58.944, + "args": { + "External id": 149886, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981488, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981488, "pid": 0, "tid": 7, "ts": 6303771821539.412, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805381.847, "dur": 11.700, + "args": { + "External id": 149886, "cbid": 211, "correlation": 289981488 + } + }, + { + "ph": "s", "id": 289981488, "pid": 5714, "tid": 5714, "ts": 6303771805381.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771821599.508, "dur": 15.072, + "args": { + "External id": 149891, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981505, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981505, "pid": 0, "tid": 7, "ts": 6303771821599.508, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805434.037, "dur": 7.220, + "args": { + "External id": 149891, "cbid": 211, "correlation": 289981505 + } + }, + { + "ph": "s", "id": 289981505, "pid": 5714, "tid": 5714, "ts": 6303771805434.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771821615.252, "dur": 100.354, + "args": { + "External id": 149896, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981521, "pid": 0, "tid": 7, "ts": 6303771821615.252, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805459.897, "dur": 5.950, + "args": { + "External id": 149896, "cbid": 211, "correlation": 289981521 + } + }, + { + "ph": "s", "id": 289981521, "pid": 5714, "tid": 5714, "ts": 6303771805459.897, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771821716.246, "dur": 2.336, + "args": { + "External id": 149900, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981537, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289981537, "pid": 0, "tid": 7, "ts": 6303771821716.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805487.167, "dur": 4.820, + "args": { + "External id": 149900, "cbid": 211, "correlation": 289981537 + } + }, + { + "ph": "s", "id": 289981537, "pid": 5714, "tid": 5714, "ts": 6303771805487.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771821719.190, "dur": 1.664, + "args": { + "External id": 149901, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981549, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289981549, "pid": 0, "tid": 7, "ts": 6303771821719.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805513.857, "dur": 6.680, + "args": { + "External id": 149901, "cbid": 211, "correlation": 289981549 + } + }, + { + "ph": "s", "id": 289981549, "pid": 5714, "tid": 5714, "ts": 6303771805513.857, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771821721.462, "dur": 2.080, + "args": { + "External id": 149908, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981567, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289981567, "pid": 0, "tid": 7, "ts": 6303771821721.462, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805551.336, "dur": 7.391, + "args": { + "External id": 149908, "cbid": 211, "correlation": 289981567 + } + }, + { + "ph": "s", "id": 289981567, "pid": 5714, "tid": 5714, "ts": 6303771805551.336, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6303771821724.214, "dur": 3.840, + "args": { + "External id": 149903, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981576, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981576, "pid": 0, "tid": 7, "ts": 6303771821724.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771805567.376, "dur": 5.191, + "args": { + "External id": 149903, "cbid": 211, "correlation": 289981576 + } + }, + { + "ph": "s", "id": 289981576, "pid": 5714, "tid": 5714, "ts": 6303771805567.376, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805586.736, "dur": 2.920, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981581 + } + }, + { + "ph": "f", "id": 289981581, "pid": 5714, "tid": 5714, "ts": 6303771805586.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805590.416, "dur": 0.611, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981582 + } + }, + { + "ph": "f", "id": 289981582, "pid": 5714, "tid": 5714, "ts": 6303771805590.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805591.367, "dur": 0.589, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981583 + } + }, + { + "ph": "f", "id": 289981583, "pid": 5714, "tid": 5714, "ts": 6303771805591.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805592.516, "dur": 0.791, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981584 + } + }, + { + "ph": "f", "id": 289981584, "pid": 5714, "tid": 5714, "ts": 6303771805592.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + 
"ts": 6303771805593.596, "dur": 0.551, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981585 + } + }, + { + "ph": "f", "id": 289981585, "pid": 5714, "tid": 5714, "ts": 6303771805593.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805594.456, "dur": 0.471, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981586 + } + }, + { + "ph": "f", "id": 289981586, "pid": 5714, "tid": 5714, "ts": 6303771805594.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805595.156, "dur": 0.480, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981587 + } + }, + { + "ph": "f", "id": 289981587, "pid": 5714, "tid": 5714, "ts": 6303771805595.156, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805595.936, "dur": 0.620, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981588 + } + }, + { + "ph": "f", "id": 289981588, "pid": 5714, "tid": 5714, "ts": 6303771805595.936, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805596.767, "dur": 0.449, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981589 + } + }, + { + "ph": "f", "id": 289981589, "pid": 5714, "tid": 5714, "ts": 6303771805596.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805597.507, "dur": 0.560, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981590 + } + }, + { + "ph": "f", "id": 289981590, "pid": 5714, "tid": 5714, "ts": 6303771805597.507, + "cat": "ac2g", "name": "ac2g", "bp": 
"e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771805598.367, "dur": 0.469, + "args": { + "External id": 149910, "cbid": 138, "correlation": 289981591 + } + }, + { + "ph": "f", "id": 289981591, "pid": 5714, "tid": 5714, "ts": 6303771805598.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771821733.142, "dur": 0.992, + "args": { + "External id": 149910, "device": 0, "context": 1, "stream": 7, "correlation": 289981593, "bytes": 8, "memory bandwidth (GB/s)": 0.008064516129032258 + } + }, + { + "ph": "f", "id": 289981593, "pid": 0, "tid": 7, "ts": 6303771821733.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771805600.756, "dur": 10.771, + "args": { + "External id": 149910, "cbid": 41, "correlation": 289981593 + } + }, + { + "ph": "s", "id": 289981593, "pid": 5714, "tid": 5714, "ts": 6303771805600.756, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303771805611.967, "dur": 16127.424, + "args": { + "External id": 149910, "cbid": 131, "correlation": 289981594 + } + }, + { + "ph": "s", "id": 289981594, "pid": 5714, "tid": 5714, "ts": 6303771805611.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771821814.480, "dur": 1.920, + "args": { + "External id": 149918, "cbid": 210, "correlation": 289981619 + } + }, + { + "ph": "f", "id": 289981619, "pid": 5714, "tid": 5714, "ts": 6303771821814.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 
6303771821833.655, "dur": 634.439, + "args": { + "External id": 149918, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981620, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981620, "pid": 0, "tid": 7, "ts": 6303771821833.655, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771821820.260, "dur": 13.580, + "args": { + "External id": 149918, "cbid": 211, "correlation": 289981620 + } + }, + { + "ph": "s", "id": 289981620, "pid": 5714, "tid": 5714, "ts": 6303771821820.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771822468.702, "dur": 170.690, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981639, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289981639, "pid": 0, "tid": 7, "ts": 6303771822468.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771821949.820, "dur": 9.910, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289981639 + } + }, + { + "ph": "s", "id": 289981639, "pid": 5714, "tid": 5714, "ts": 6303771821949.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771822640.096, "dur": 4.064, + "args": { + "External id": 149928, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981656, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981656, "pid": 0, "tid": 7, "ts": 6303771822640.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771821998.820, "dur": 8.100, + "args": { + "External id": 149928, "cbid": 211, "correlation": 289981656 + } + }, + { + "ph": "s", "id": 289981656, "pid": 5714, "tid": 5714, "ts": 6303771821998.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822644.864, "dur": 1.217, + "args": { + "External id": 149933, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981673, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981673, "pid": 0, "tid": 7, "ts": 6303771822644.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822031.740, "dur": 5.660, + "args": { + "External id": 149933, "cbid": 211, "correlation": 289981673 + } + }, + { + "ph": "s", "id": 289981673, "pid": 5714, "tid": 5714, "ts": 6303771822031.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822646.721, "dur": 1.024, + "args": { + "External id": 149935, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981683, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981683, "pid": 0, "tid": 7, "ts": 6303771822646.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822051.640, "dur": 5.160, + "args": { + "External id": 149935, "cbid": 211, "correlation": 289981683 + } + }, + { + "ph": "s", "id": 289981683, "pid": 5714, "tid": 5714, "ts": 6303771822051.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822648.385, "dur": 1.056, + "args": { + "External id": 149936, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981689, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981689, "pid": 0, "tid": 7, "ts": 6303771822648.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822064.440, "dur": 4.810, + "args": { + "External id": 149936, "cbid": 211, "correlation": 289981689 + } + }, + { + "ph": "s", "id": 289981689, "pid": 5714, "tid": 5714, "ts": 6303771822064.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822650.113, "dur": 1.024, + "args": { + "External id": 149937, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981699, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981699, "pid": 0, "tid": 7, "ts": 6303771822650.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822080.200, "dur": 4.820, + "args": { + "External id": 149937, "cbid": 211, "correlation": 289981699 + } + }, + { + "ph": "s", "id": 289981699, "pid": 5714, "tid": 5714, "ts": 6303771822080.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822651.809, "dur": 1.024, + "args": { + "External id": 149938, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981705, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981705, "pid": 0, "tid": 7, "ts": 6303771822651.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822091.050, "dur": 4.290, + "args": { + "External id": 149938, "cbid": 211, "correlation": 289981705 + } + }, + { + "ph": "s", "id": 289981705, "pid": 5714, "tid": 5714, "ts": 6303771822091.050, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771822653.569, "dur": 3.360, + "args": { + "External id": 149939, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981718, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981718, "pid": 0, "tid": 7, "ts": 6303771822653.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822113.490, "dur": 5.400, + "args": { + "External id": 149939, "cbid": 211, "correlation": 289981718 + } + }, + { + "ph": "s", "id": 289981718, "pid": 5714, "tid": 5714, "ts": 6303771822113.490, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822657.569, "dur": 1.120, + "args": { + "External id": 149942, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981724, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981724, "pid": 0, "tid": 7, "ts": 6303771822657.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822125.060, "dur": 4.360, + "args": { + "External id": 149942, "cbid": 211, "correlation": 289981724 + } + }, + { + "ph": "s", "id": 289981724, "pid": 5714, "tid": 5714, "ts": 6303771822125.060, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771822659.329, "dur": 0.992, + "args": { + "External id": 149943, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981730, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981730, "pid": 0, "tid": 7, "ts": 6303771822659.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822134.620, "dur": 3.920, + "args": { + "External id": 149943, "cbid": 211, "correlation": 289981730 + } + }, + { + "ph": "s", "id": 289981730, "pid": 5714, "tid": 5714, "ts": 6303771822134.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771822661.025, "dur": 233.506, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981744, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289981744, "pid": 0, "tid": 7, "ts": 6303771822661.025, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822225.350, "dur": 8.280, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289981744 + } + }, + { + "ph": "s", "id": 289981744, "pid": 5714, "tid": 5714, "ts": 6303771822225.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771822268.319, "dur": 0.611, + "args": { + "External id": 149947, "cbid": 200, "correlation": 289981767 + } + }, + { + "ph": "f", "id": 289981767, "pid": 5714, "tid": 5714, "ts": 6303771822268.319, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771822895.715, "dur": 0.800, + "args": { + "External id": 149947, "device": 0, "context": 1, "stream": 7, "correlation": 289981770, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289981770, "pid": 0, "tid": 7, "ts": 6303771822895.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771822270.770, "dur": 6.940, + "args": { + "External id": 149947, "cbid": 51, "correlation": 289981770 + } + }, + { + "ph": "s", "id": 289981770, "pid": 5714, "tid": 5714, "ts": 6303771822270.770, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771822898.211, "dur": 690.793, + "args": { + "External id": 149947, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981771, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981771, "pid": 0, "tid": 7, "ts": 6303771822898.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822277.979, "dur": 6.031, + "args": { + "External id": 149947, "cbid": 307, "correlation": 289981771 + } + }, + { + "ph": "s", "id": 289981771, "pid": 5714, "tid": 5714, "ts": 6303771822277.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771823589.612, "dur": 2.976, + "args": { + "External id": 149950, "device": 0, "context": 1, "stream": 7, "correlation": 289981776, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289981776, "pid": 0, "tid": 7, "ts": 6303771823589.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771822318.569, "dur": 15.610, + "args": { + "External id": 149950, "cbid": 41, "correlation": 289981776 + } + }, + { + "ph": "s", "id": 289981776, "pid": 5714, "tid": 5714, "ts": 6303771822318.569, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771822379.259, "dur": 0.480, + "args": { + "External id": 149955, "cbid": 200, "correlation": 289981804 + } + }, + { + "ph": "f", "id": 289981804, "pid": 5714, "tid": 5714, "ts": 6303771822379.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771823593.196, "dur": 694.792, + "args": { + "External id": 149955, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289981807, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981807, "pid": 0, "tid": 7, "ts": 6303771823593.196, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822381.269, "dur": 7.680, + "args": { + "External id": 149955, "cbid": 307, "correlation": 289981807 + } + }, + { + "ph": "s", "id": 289981807, "pid": 5714, "tid": 5714, "ts": 6303771822381.269, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771824288.692, "dur": 220.290, + "args": { + "External id": 149956, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981812, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289981812, "pid": 0, "tid": 7, "ts": 6303771824288.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822403.999, "dur": 6.280, + "args": { + "External id": 149956, "cbid": 211, "correlation": 289981812 + } + }, + { + "ph": "s", "id": 289981812, "pid": 5714, "tid": 5714, "ts": 6303771822403.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771822456.829, "dur": 1.200, + "args": { + "External id": 149964, "cbid": 210, "correlation": 289981838 + } + }, + { + "ph": "f", "id": 289981838, "pid": 5714, "tid": 5714, "ts": 6303771822456.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771824509.622, "dur": 634.631, + "args": { + "External id": 149964, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981839, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981839, "pid": 0, "tid": 7, "ts": 6303771824509.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822461.689, "dur": 7.690, + "args": { + "External id": 149964, "cbid": 211, "correlation": 289981839 + } + }, + { + "ph": "s", "id": 289981839, "pid": 5714, "tid": 5714, "ts": 6303771822461.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771825144.957, "dur": 171.171, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981858, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289981858, "pid": 0, "tid": 7, "ts": 6303771825144.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822575.949, "dur": 8.580, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289981858 + } + }, + { + "ph": "s", "id": 289981858, "pid": 5714, "tid": 5714, "ts": 6303771822575.949, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771825316.768, "dur": 4.032, + "args": { + "External id": 149974, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981875, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981875, "pid": 0, "tid": 7, "ts": 6303771825316.768, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822618.509, "dur": 7.300, + "args": { + "External id": 149974, "cbid": 211, "correlation": 289981875 + } + }, + { + "ph": "s", "id": 289981875, "pid": 5714, "tid": 5714, "ts": 6303771822618.509, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825321.440, "dur": 1.184, + "args": { + "External id": 149979, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981892, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981892, "pid": 0, "tid": 7, "ts": 6303771825321.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822649.498, "dur": 5.371, + "args": { + "External id": 149979, "cbid": 211, "correlation": 289981892 + } + }, + { + "ph": "s", "id": 289981892, "pid": 5714, "tid": 5714, "ts": 6303771822649.498, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825323.296, "dur": 0.992, + "args": { + "External id": 149981, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981902, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981902, "pid": 0, "tid": 7, "ts": 6303771825323.296, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822671.269, "dur": 5.089, + "args": { + "External id": 149981, "cbid": 211, "correlation": 289981902 + } + }, + { + "ph": "s", "id": 289981902, "pid": 5714, "tid": 5714, "ts": 6303771822671.269, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825324.960, "dur": 1.056, + "args": { + "External id": 149982, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981908, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981908, "pid": 0, "tid": 7, "ts": 6303771825324.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822683.538, "dur": 4.380, + "args": { + "External id": 149982, "cbid": 211, "correlation": 289981908 + } + }, + { + "ph": "s", "id": 289981908, "pid": 5714, "tid": 5714, "ts": 6303771822683.538, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825326.656, "dur": 1.056, + "args": { + "External id": 149983, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981918, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981918, "pid": 0, "tid": 7, "ts": 6303771825326.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822698.998, "dur": 4.440, + "args": { + "External id": 149983, "cbid": 211, "correlation": 289981918 + } + }, + { + "ph": "s", "id": 289981918, "pid": 5714, "tid": 5714, "ts": 6303771822698.998, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825328.384, "dur": 1.024, + "args": { + "External id": 149984, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981924, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981924, "pid": 0, "tid": 7, "ts": 6303771825328.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822709.058, "dur": 4.211, + "args": { + "External id": 149984, "cbid": 211, "correlation": 289981924 + } + }, + { + "ph": "s", "id": 289981924, "pid": 5714, "tid": 5714, "ts": 6303771822709.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771825330.112, "dur": 3.392, + "args": { + "External id": 149985, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981937, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981937, "pid": 0, "tid": 7, "ts": 6303771825330.112, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822730.118, "dur": 4.900, + "args": { + "External id": 149985, "cbid": 211, "correlation": 289981937 + } + }, + { + "ph": "s", "id": 289981937, "pid": 5714, "tid": 5714, "ts": 6303771822730.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825334.080, "dur": 1.088, + "args": { + "External id": 149988, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981943, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981943, "pid": 0, "tid": 7, "ts": 6303771825334.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822741.309, "dur": 4.209, + "args": { + "External id": 149988, "cbid": 211, "correlation": 289981943 + } + }, + { + "ph": "s", "id": 289981943, "pid": 5714, "tid": 5714, "ts": 6303771822741.309, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771825335.808, "dur": 0.992, + "args": { + "External id": 149989, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981949, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289981949, "pid": 0, "tid": 7, "ts": 6303771825335.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822750.418, "dur": 3.810, + "args": { + "External id": 149989, "cbid": 211, "correlation": 289981949 + } + }, + { + "ph": "s", "id": 289981949, "pid": 5714, "tid": 5714, "ts": 6303771822750.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771825337.504, "dur": 232.803, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981963, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289981963, "pid": 0, "tid": 7, "ts": 6303771825337.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822842.128, "dur": 8.180, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289981963 + } + }, + { + "ph": "s", "id": 289981963, "pid": 5714, "tid": 5714, "ts": 6303771822842.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771822884.118, "dur": 0.590, + "args": { + "External id": 149993, "cbid": 200, "correlation": 289981986 + } + }, + { + "ph": "f", "id": 289981986, "pid": 5714, "tid": 5714, "ts": 6303771822884.118, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771825571.427, "dur": 0.832, + "args": { + "External id": 149993, "device": 0, "context": 1, "stream": 7, "correlation": 289981989, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289981989, "pid": 0, "tid": 7, "ts": 6303771825571.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771822886.618, "dur": 7.740, + "args": { + "External id": 149993, "cbid": 51, "correlation": 289981989 + } + }, + { + "ph": "s", "id": 289981989, "pid": 5714, "tid": 5714, "ts": 6303771822886.618, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771825573.955, "dur": 690.439, + "args": { + "External id": 149993, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289981990, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289981990, "pid": 0, "tid": 7, "ts": 6303771825573.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822894.598, "dur": 5.870, + "args": { + "External id": 149993, "cbid": 307, "correlation": 289981990 + } + }, + { + "ph": "s", "id": 289981990, "pid": 5714, "tid": 5714, "ts": 6303771822894.598, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771826265.034, "dur": 2.944, + "args": { + "External id": 149996, "device": 0, "context": 1, "stream": 7, "correlation": 289981995, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289981995, "pid": 0, "tid": 7, "ts": 6303771826265.034, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771822924.518, "dur": 12.160, + "args": { + "External id": 149996, "cbid": 41, "correlation": 289981995 + } + }, + { + "ph": "s", "id": 289981995, "pid": 5714, "tid": 5714, "ts": 6303771822924.518, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771822978.008, "dur": 0.490, + "args": { + "External id": 150001, "cbid": 200, "correlation": 289982023 + } + }, + { + "ph": "f", "id": 289982023, "pid": 5714, "tid": 5714, "ts": 6303771822978.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771826268.618, "dur": 689.993, + "args": { + "External id": 150001, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289982026, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982026, "pid": 0, "tid": 7, "ts": 6303771826268.618, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771822979.978, "dur": 6.870, + "args": { + "External id": 150001, "cbid": 307, "correlation": 289982026 + } + }, + { + "ph": "s", "id": 289982026, "pid": 5714, "tid": 5714, "ts": 6303771822979.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771826959.219, "dur": 220.962, + "args": { + "External id": 150002, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982031, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289982031, "pid": 0, "tid": 7, "ts": 6303771826959.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823000.978, "dur": 5.890, + "args": { + "External id": 150002, "cbid": 211, "correlation": 289982031 + } + }, + { + "ph": "s", "id": 289982031, "pid": 5714, "tid": 5714, "ts": 6303771823000.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771823052.638, "dur": 1.250, + "args": { + "External id": 150010, "cbid": 210, "correlation": 289982057 + } + }, + { + "ph": "f", "id": 289982057, "pid": 5714, "tid": 5714, "ts": 6303771823052.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771827180.917, "dur": 634.247, + "args": { + "External id": 150010, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982058, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982058, "pid": 0, "tid": 7, "ts": 6303771827180.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823057.468, "dur": 7.600, + "args": { + "External id": 150010, "cbid": 211, "correlation": 289982058 + } + }, + { + "ph": "s", "id": 289982058, "pid": 5714, "tid": 5714, "ts": 6303771823057.468, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771827815.772, "dur": 171.459, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982077, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289982077, "pid": 0, "tid": 7, "ts": 6303771827815.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823170.128, "dur": 8.820, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982077 + } + }, + { + "ph": "s", "id": 289982077, "pid": 5714, "tid": 5714, "ts": 6303771823170.128, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771827987.967, "dur": 4.064, + "args": { + "External id": 150020, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982094, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982094, "pid": 0, "tid": 7, "ts": 6303771827987.967, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823229.687, "dur": 7.190, + "args": { + "External id": 150020, "cbid": 211, "correlation": 289982094 + } + }, + { + "ph": "s", "id": 289982094, "pid": 5714, "tid": 5714, "ts": 6303771823229.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771827992.767, "dur": 1.184, + "args": { + "External id": 150025, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982111, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982111, "pid": 0, "tid": 7, "ts": 6303771827992.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823259.457, "dur": 5.330, + "args": { + "External id": 150025, "cbid": 211, "correlation": 289982111 + } + }, + { + "ph": "s", "id": 289982111, "pid": 5714, "tid": 5714, "ts": 6303771823259.457, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771827994.623, "dur": 0.992, + "args": { + "External id": 150027, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982121, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982121, "pid": 0, "tid": 7, "ts": 6303771827994.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823279.667, "dur": 4.990, + "args": { + "External id": 150027, "cbid": 211, "correlation": 289982121 + } + }, + { + "ph": "s", "id": 289982121, "pid": 5714, "tid": 5714, "ts": 6303771823279.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771827996.287, "dur": 1.024, + "args": { + "External id": 150028, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982127, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982127, "pid": 0, "tid": 7, "ts": 6303771827996.287, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823292.067, "dur": 12.770, + "args": { + "External id": 150028, "cbid": 211, "correlation": 289982127 + } + }, + { + "ph": "s", "id": 289982127, "pid": 5714, "tid": 5714, "ts": 6303771823292.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771827997.983, "dur": 1.056, + "args": { + "External id": 150029, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982137, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982137, "pid": 0, "tid": 7, "ts": 6303771827997.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823316.177, "dur": 4.830, + "args": { + "External id": 150029, "cbid": 211, "correlation": 289982137 + } + }, + { + "ph": "s", "id": 289982137, "pid": 5714, "tid": 5714, "ts": 6303771823316.177, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771827999.679, "dur": 1.056, + "args": { + "External id": 150030, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982143, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982143, "pid": 0, "tid": 7, "ts": 6303771827999.679, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823328.337, "dur": 4.210, + "args": { + "External id": 150030, "cbid": 211, "correlation": 289982143 + } + }, + { + "ph": "s", "id": 289982143, "pid": 5714, "tid": 5714, "ts": 6303771823328.337, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771828001.439, "dur": 3.360, + "args": { + "External id": 150031, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982156, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982156, "pid": 0, "tid": 7, "ts": 6303771828001.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823347.307, "dur": 4.990, + "args": { + "External id": 150031, "cbid": 211, "correlation": 289982156 + } + }, + { + "ph": "s", "id": 289982156, "pid": 5714, "tid": 5714, "ts": 6303771823347.307, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771828005.407, "dur": 1.088, + "args": { + "External id": 150034, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982162, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982162, "pid": 0, "tid": 7, "ts": 6303771828005.407, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823358.447, "dur": 3.960, + "args": { + "External id": 150034, "cbid": 211, "correlation": 289982162 + } + }, + { + "ph": "s", "id": 289982162, "pid": 5714, "tid": 5714, "ts": 6303771823358.447, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771828007.135, "dur": 0.992, + "args": { + "External id": 150035, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982168, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982168, "pid": 0, "tid": 7, "ts": 6303771828007.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823367.347, "dur": 3.770, + "args": { + "External id": 150035, "cbid": 211, "correlation": 289982168 + } + }, + { + "ph": "s", "id": 289982168, "pid": 5714, "tid": 5714, "ts": 6303771823367.347, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771828008.831, "dur": 233.955, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982182, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289982182, "pid": 0, "tid": 7, "ts": 6303771828008.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823464.097, "dur": 8.580, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982182 + } + }, + { + "ph": "s", "id": 289982182, "pid": 5714, "tid": 5714, "ts": 6303771823464.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771823506.017, "dur": 0.580, + "args": { + "External id": 150039, "cbid": 200, "correlation": 289982205 + } + }, + { + "ph": "f", "id": 289982205, "pid": 5714, "tid": 5714, "ts": 6303771823506.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771828243.682, "dur": 0.800, + "args": { + "External id": 150039, "device": 0, "context": 1, "stream": 7, "correlation": 289982208, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289982208, "pid": 0, "tid": 7, "ts": 6303771828243.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771823508.407, "dur": 7.720, + "args": { + "External id": 150039, "cbid": 51, "correlation": 289982208 + } + }, + { + "ph": "s", "id": 289982208, "pid": 5714, "tid": 5714, "ts": 6303771823508.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771828246.178, "dur": 689.639, + "args": { + "External id": 150039, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982209, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982209, "pid": 0, "tid": 7, "ts": 6303771828246.178, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823516.377, "dur": 5.800, + "args": { + "External id": 150039, "cbid": 307, "correlation": 289982209 + } + }, + { + "ph": "s", "id": 289982209, "pid": 5714, "tid": 5714, "ts": 6303771823516.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771828936.425, "dur": 2.976, + "args": { + "External id": 150042, "device": 0, "context": 1, "stream": 7, "correlation": 289982214, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289982214, "pid": 0, "tid": 7, "ts": 6303771828936.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771823546.196, "dur": 12.220, + "args": { + "External id": 150042, "cbid": 41, "correlation": 289982214 + } + }, + { + "ph": "s", "id": 289982214, "pid": 5714, "tid": 5714, "ts": 6303771823546.196, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771823599.236, "dur": 0.460, + "args": { + "External id": 150047, "cbid": 200, "correlation": 289982242 + } + }, + { + "ph": "f", "id": 289982242, "pid": 5714, "tid": 5714, "ts": 6303771823599.236, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771828939.977, "dur": 687.081, + "args": { + "External id": 150047, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289982245, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982245, "pid": 0, "tid": 7, "ts": 6303771828939.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823601.227, "dur": 7.280, + "args": { + "External id": 150047, "cbid": 307, "correlation": 289982245 + } + }, + { + "ph": "s", "id": 289982245, "pid": 5714, "tid": 5714, "ts": 6303771823601.227, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771829627.762, "dur": 221.442, + "args": { + "External id": 150048, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982250, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289982250, "pid": 0, "tid": 7, "ts": 6303771829627.762, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823622.827, "dur": 6.240, + "args": { + "External id": 150048, "cbid": 211, "correlation": 289982250 + } + }, + { + "ph": "s", "id": 289982250, "pid": 5714, "tid": 5714, "ts": 6303771823622.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771823675.496, "dur": 1.230, + "args": { + "External id": 150056, "cbid": 210, "correlation": 289982276 + } + }, + { + "ph": "f", "id": 289982276, "pid": 5714, "tid": 5714, "ts": 6303771823675.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771829849.812, "dur": 635.239, + "args": { + "External id": 150056, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982277, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982277, "pid": 0, "tid": 7, "ts": 6303771829849.812, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823680.306, "dur": 7.350, + "args": { + "External id": 150056, "cbid": 211, "correlation": 289982277 + } + }, + { + "ph": "s", "id": 289982277, "pid": 5714, "tid": 5714, "ts": 6303771823680.306, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771830485.659, "dur": 171.075, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982296, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289982296, "pid": 0, "tid": 7, "ts": 6303771830485.659, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823792.656, "dur": 8.890, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982296 + } + }, + { + "ph": "s", "id": 289982296, "pid": 5714, "tid": 5714, "ts": 6303771823792.656, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771830657.342, "dur": 4.096, + "args": { + "External id": 150066, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982313, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982313, "pid": 0, "tid": 7, "ts": 6303771830657.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823835.606, "dur": 6.970, + "args": { + "External id": 150066, "cbid": 211, "correlation": 289982313 + } + }, + { + "ph": "s", "id": 289982313, "pid": 5714, "tid": 5714, "ts": 6303771823835.606, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830662.142, "dur": 1.184, + "args": { + "External id": 150071, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982330, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982330, "pid": 0, "tid": 7, "ts": 6303771830662.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823867.846, "dur": 5.490, + "args": { + "External id": 150071, "cbid": 211, "correlation": 289982330 + } + }, + { + "ph": "s", "id": 289982330, "pid": 5714, "tid": 5714, "ts": 6303771823867.846, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830663.966, "dur": 1.024, + "args": { + "External id": 150073, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982340, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982340, "pid": 0, "tid": 7, "ts": 6303771830663.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823887.726, "dur": 4.740, + "args": { + "External id": 150073, "cbid": 211, "correlation": 289982340 + } + }, + { + "ph": "s", "id": 289982340, "pid": 5714, "tid": 5714, "ts": 6303771823887.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830665.662, "dur": 1.056, + "args": { + "External id": 150074, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982346, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982346, "pid": 0, "tid": 7, "ts": 6303771830665.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823899.496, "dur": 4.380, + "args": { + "External id": 150074, "cbid": 211, "correlation": 289982346 + } + }, + { + "ph": "s", "id": 289982346, "pid": 5714, "tid": 5714, "ts": 6303771823899.496, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830667.358, "dur": 1.024, + "args": { + "External id": 150075, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982356, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982356, "pid": 0, "tid": 7, "ts": 6303771830667.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823914.356, "dur": 4.250, + "args": { + "External id": 150075, "cbid": 211, "correlation": 289982356 + } + }, + { + "ph": "s", "id": 289982356, "pid": 5714, "tid": 5714, "ts": 6303771823914.356, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830669.054, "dur": 1.056, + "args": { + "External id": 150076, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982362, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982362, "pid": 0, "tid": 7, "ts": 6303771830669.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823924.286, "dur": 4.170, + "args": { + "External id": 150076, "cbid": 211, "correlation": 289982362 + } + }, + { + "ph": "s", "id": 289982362, "pid": 5714, "tid": 5714, "ts": 6303771823924.286, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771830670.814, "dur": 3.360, + "args": { + "External id": 150077, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982375, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982375, "pid": 0, "tid": 7, "ts": 6303771830670.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823944.326, "dur": 4.730, + "args": { + "External id": 150077, "cbid": 211, "correlation": 289982375 + } + }, + { + "ph": "s", "id": 289982375, "pid": 5714, "tid": 5714, "ts": 6303771823944.326, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830674.782, "dur": 1.088, + "args": { + "External id": 150080, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982381, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982381, "pid": 0, "tid": 7, "ts": 6303771830674.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823955.076, "dur": 4.130, + "args": { + "External id": 150080, "cbid": 211, "correlation": 289982381 + } + }, + { + "ph": "s", "id": 289982381, "pid": 5714, "tid": 5714, "ts": 6303771823955.076, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771830676.478, "dur": 1.024, + "args": { + "External id": 150081, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982387, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982387, "pid": 0, "tid": 7, "ts": 6303771830676.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771823964.186, "dur": 3.960, + "args": { + "External id": 150081, "cbid": 211, "correlation": 289982387 + } + }, + { + "ph": "s", "id": 289982387, "pid": 5714, "tid": 5714, "ts": 6303771823964.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771830678.206, "dur": 233.315, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982401, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289982401, "pid": 0, "tid": 7, "ts": 6303771830678.206, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824052.246, "dur": 8.269, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982401 + } + }, + { + "ph": "s", "id": 289982401, "pid": 5714, "tid": 5714, "ts": 6303771824052.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771824093.735, "dur": 0.520, + "args": { + "External id": 150085, "cbid": 200, "correlation": 289982424 + } + }, + { + "ph": "f", "id": 289982424, "pid": 5714, "tid": 5714, "ts": 6303771824093.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771830912.385, "dur": 0.800, + "args": { + "External id": 150085, "device": 0, "context": 1, "stream": 7, "correlation": 289982427, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289982427, "pid": 0, "tid": 7, "ts": 6303771830912.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771824096.085, "dur": 6.910, + "args": { + "External id": 150085, "cbid": 51, "correlation": 289982427 + } + }, + { + "ph": "s", "id": 289982427, "pid": 5714, "tid": 5714, "ts": 6303771824096.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771830914.913, "dur": 693.255, + "args": { + "External id": 150085, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982428, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982428, "pid": 0, "tid": 7, "ts": 6303771830914.913, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824103.235, "dur": 6.090, + "args": { + "External id": 150085, "cbid": 307, "correlation": 289982428 + } + }, + { + "ph": "s", "id": 289982428, "pid": 5714, "tid": 5714, "ts": 6303771824103.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771831608.904, "dur": 2.977, + "args": { + "External id": 150088, "device": 0, "context": 1, "stream": 7, "correlation": 289982433, "bytes": 3145728, "memory bandwidth (GB/s)": 1056.6771918038294 + } + }, + { + "ph": "f", "id": 289982433, "pid": 0, "tid": 7, "ts": 6303771831608.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771824134.995, "dur": 11.910, + "args": { + "External id": 150088, "cbid": 41, "correlation": 289982433 + } + }, + { + "ph": "s", "id": 289982433, "pid": 5714, "tid": 5714, "ts": 6303771824134.995, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771824187.705, "dur": 0.490, + "args": { + "External id": 150093, "cbid": 200, "correlation": 289982461 + } + }, + { + "ph": "f", "id": 289982461, "pid": 5714, "tid": 5714, "ts": 6303771824187.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771831612.489, "dur": 690.664, + "args": { + "External id": 150093, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289982464, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982464, "pid": 0, "tid": 7, "ts": 6303771831612.489, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824189.665, "dur": 7.130, + "args": { + "External id": 150093, "cbid": 307, "correlation": 289982464 + } + }, + { + "ph": "s", "id": 289982464, "pid": 5714, "tid": 5714, "ts": 6303771824189.665, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771832303.761, "dur": 221.250, + "args": { + "External id": 150094, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982469, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289982469, "pid": 0, "tid": 7, "ts": 6303771832303.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824210.745, "dur": 5.830, + "args": { + "External id": 150094, "cbid": 211, "correlation": 289982469 + } + }, + { + "ph": "s", "id": 289982469, "pid": 5714, "tid": 5714, "ts": 6303771824210.745, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771824259.795, "dur": 1.210, + "args": { + "External id": 150102, "cbid": 210, "correlation": 289982495 + } + }, + { + "ph": "f", "id": 289982495, "pid": 5714, "tid": 5714, "ts": 6303771824259.795, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771832525.747, "dur": 632.264, + "args": { + "External id": 150102, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982496, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982496, "pid": 0, "tid": 7, "ts": 6303771832525.747, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824264.555, "dur": 7.540, + "args": { + "External id": 150102, "cbid": 211, "correlation": 289982496 + } + }, + { + "ph": "s", "id": 289982496, "pid": 5714, "tid": 5714, "ts": 6303771824264.555, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771833158.651, "dur": 170.690, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982515, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289982515, "pid": 0, "tid": 7, "ts": 6303771833158.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824385.255, "dur": 9.140, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982515 + } + }, + { + "ph": "s", "id": 289982515, "pid": 5714, "tid": 5714, "ts": 6303771824385.255, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771833330.013, "dur": 4.256, + "args": { + "External id": 150112, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982532, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982532, "pid": 0, "tid": 7, "ts": 6303771833330.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824427.205, "dur": 7.290, + "args": { + "External id": 150112, "cbid": 211, "correlation": 289982532 + } + }, + { + "ph": "s", "id": 289982532, "pid": 5714, "tid": 5714, "ts": 6303771824427.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833334.909, "dur": 1.216, + "args": { + "External id": 150117, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982549, "pid": 0, "tid": 7, "ts": 6303771833334.909, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824459.525, "dur": 5.140, + "args": { + "External id": 150117, "cbid": 211, "correlation": 289982549 + } + }, + { + "ph": "s", "id": 289982549, "pid": 5714, "tid": 5714, "ts": 6303771824459.525, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833336.797, "dur": 0.992, + "args": { + "External id": 150119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982559, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982559, "pid": 0, "tid": 7, "ts": 6303771833336.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824478.705, "dur": 4.840, + "args": { + "External id": 150119, "cbid": 211, "correlation": 289982559 + } + }, + { + "ph": "s", "id": 289982559, "pid": 5714, "tid": 5714, "ts": 6303771824478.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833338.461, "dur": 1.024, + "args": { + "External id": 150120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982565, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982565, "pid": 0, "tid": 7, "ts": 6303771833338.461, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824490.674, "dur": 4.251, + "args": { + "External id": 150120, "cbid": 211, "correlation": 289982565 + } + }, + { + "ph": "s", "id": 289982565, "pid": 5714, "tid": 5714, "ts": 6303771824490.674, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833340.157, "dur": 1.024, + "args": { + "External id": 150121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982575, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982575, "pid": 0, "tid": 7, "ts": 6303771833340.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824505.634, "dur": 4.171, + "args": { + "External id": 150121, "cbid": 211, "correlation": 289982575 + } + }, + { + "ph": "s", "id": 289982575, "pid": 5714, "tid": 5714, "ts": 6303771824505.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833341.853, "dur": 1.055, + "args": { + "External id": 150122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982581, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982581, "pid": 0, "tid": 7, "ts": 6303771833341.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824515.385, "dur": 4.440, + "args": { + "External id": 150122, "cbid": 211, "correlation": 289982581 + } + }, + { + "ph": "s", "id": 289982581, "pid": 5714, "tid": 5714, "ts": 6303771824515.385, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771833343.612, "dur": 3.392, + "args": { + "External id": 150123, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982594, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982594, "pid": 0, "tid": 7, "ts": 6303771833343.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824535.805, "dur": 5.080, + "args": { + "External id": 150123, "cbid": 211, "correlation": 289982594 + } + }, + { + "ph": "s", "id": 289982594, "pid": 5714, "tid": 5714, "ts": 6303771824535.805, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833347.709, "dur": 1.088, + "args": { + "External id": 150126, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982600, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982600, "pid": 0, "tid": 7, "ts": 6303771833347.709, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824547.094, "dur": 4.210, + "args": { + "External id": 150126, "cbid": 211, "correlation": 289982600 + } + }, + { + "ph": "s", "id": 289982600, "pid": 5714, "tid": 5714, "ts": 6303771824547.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771833349.437, "dur": 0.992, + "args": { + "External id": 150127, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982606, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982606, "pid": 0, "tid": 7, "ts": 6303771833349.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824556.234, "dur": 4.160, + "args": { + "External id": 150127, "cbid": 211, "correlation": 289982606 + } + }, + { + "ph": "s", "id": 289982606, "pid": 5714, "tid": 5714, "ts": 6303771824556.234, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771833351.133, "dur": 235.139, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982620, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289982620, "pid": 0, "tid": 7, "ts": 6303771833351.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824644.664, "dur": 7.980, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982620 + } + }, + { + "ph": "s", "id": 289982620, "pid": 5714, "tid": 5714, "ts": 6303771824644.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771824685.704, "dur": 0.590, + "args": { + "External id": 150131, "cbid": 200, "correlation": 289982643 + } + }, + { + "ph": "f", "id": 289982643, "pid": 5714, "tid": 5714, "ts": 6303771824685.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771833587.136, "dur": 0.832, + "args": { + "External id": 150131, "device": 0, "context": 1, "stream": 7, "correlation": 289982646, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289982646, "pid": 0, "tid": 7, "ts": 6303771833587.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771824688.124, "dur": 7.130, + "args": { + "External id": 150131, "cbid": 51, "correlation": 289982646 + } + }, + { + "ph": "s", "id": 289982646, "pid": 5714, "tid": 5714, "ts": 6303771824688.124, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771833589.760, "dur": 687.463, + "args": { + "External id": 150131, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982647, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982647, "pid": 0, "tid": 7, "ts": 6303771833589.760, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824695.494, "dur": 5.780, + "args": { + "External id": 150131, "cbid": 307, "correlation": 289982647 + } + }, + { + "ph": "s", "id": 289982647, "pid": 5714, "tid": 5714, "ts": 6303771824695.494, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771834277.831, "dur": 2.977, + "args": { + "External id": 150134, "device": 0, "context": 1, "stream": 7, "correlation": 289982652, "bytes": 3145728, "memory bandwidth (GB/s)": 1056.6771918038294 + } + }, + { + "ph": "f", "id": 289982652, "pid": 0, "tid": 7, "ts": 6303771834277.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771824725.844, "dur": 12.280, + "args": { + "External id": 150134, "cbid": 41, "correlation": 289982652 + } + }, + { + "ph": "s", "id": 289982652, "pid": 5714, "tid": 5714, "ts": 6303771824725.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771824781.884, "dur": 0.430, + "args": { + "External id": 150139, "cbid": 200, "correlation": 289982680 + } + }, + { + "ph": "f", "id": 289982680, "pid": 5714, "tid": 5714, "ts": 6303771824781.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771834281.544, "dur": 690.568, + "args": { + "External id": 150139, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289982683, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982683, "pid": 0, "tid": 7, "ts": 6303771834281.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824783.834, "dur": 7.260, + "args": { + "External id": 150139, "cbid": 307, "correlation": 289982683 + } + }, + { + "ph": "s", "id": 289982683, "pid": 5714, "tid": 5714, "ts": 6303771824783.834, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771834972.784, "dur": 220.866, + "args": { + "External id": 150140, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982688, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289982688, "pid": 0, "tid": 7, "ts": 6303771834972.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824805.224, "dur": 5.980, + "args": { + "External id": 150140, "cbid": 211, "correlation": 289982688 + } + }, + { + "ph": "s", "id": 289982688, "pid": 5714, "tid": 5714, "ts": 6303771824805.224, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771824854.264, "dur": 1.230, + "args": { + "External id": 150148, "cbid": 210, "correlation": 289982714 + } + }, + { + "ph": "f", "id": 289982714, "pid": 5714, "tid": 5714, "ts": 6303771824854.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771835194.290, "dur": 633.032, + "args": { + "External id": 150148, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982715, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982715, "pid": 0, "tid": 7, "ts": 6303771835194.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824858.984, "dur": 7.600, + "args": { + "External id": 150148, "cbid": 211, "correlation": 289982715 + } + }, + { + "ph": "s", "id": 289982715, "pid": 5714, "tid": 5714, "ts": 6303771824858.984, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771835828.026, "dur": 171.265, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982734, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289982734, "pid": 0, "tid": 7, "ts": 6303771835828.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771824970.233, "dur": 8.811, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982734 + } + }, + { + "ph": "s", "id": 289982734, "pid": 5714, "tid": 5714, "ts": 6303771824970.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771835999.931, "dur": 4.032, + "args": { + "External id": 150158, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982751, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982751, "pid": 0, "tid": 7, "ts": 6303771835999.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825013.823, "dur": 7.250, + "args": { + "External id": 150158, "cbid": 211, "correlation": 289982751 + } + }, + { + "ph": "s", "id": 289982751, "pid": 5714, "tid": 5714, "ts": 6303771825013.823, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836004.571, "dur": 1.184, + "args": { + "External id": 150163, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982768, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982768, "pid": 0, "tid": 7, "ts": 6303771836004.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825044.633, "dur": 5.730, + "args": { + "External id": 150163, "cbid": 211, "correlation": 289982768 + } + }, + { + "ph": "s", "id": 289982768, "pid": 5714, "tid": 5714, "ts": 6303771825044.633, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836006.427, "dur": 0.992, + "args": { + "External id": 150165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982778, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982778, "pid": 0, "tid": 7, "ts": 6303771836006.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825063.883, "dur": 4.760, + "args": { + "External id": 150165, "cbid": 211, "correlation": 289982778 + } + }, + { + "ph": "s", "id": 289982778, "pid": 5714, "tid": 5714, "ts": 6303771825063.883, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836008.091, "dur": 1.056, + "args": { + "External id": 150166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982784, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982784, "pid": 0, "tid": 7, "ts": 6303771836008.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825075.823, "dur": 4.450, + "args": { + "External id": 150166, "cbid": 211, "correlation": 289982784 + } + }, + { + "ph": "s", "id": 289982784, "pid": 5714, "tid": 5714, "ts": 6303771825075.823, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836009.819, "dur": 1.024, + "args": { + "External id": 150167, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982794, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982794, "pid": 0, "tid": 7, "ts": 6303771836009.819, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825090.363, "dur": 4.080, + "args": { + "External id": 150167, "cbid": 211, "correlation": 289982794 + } + }, + { + "ph": "s", "id": 289982794, "pid": 5714, "tid": 5714, "ts": 6303771825090.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836011.515, "dur": 1.056, + "args": { + "External id": 150168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982800, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982800, "pid": 0, "tid": 7, "ts": 6303771836011.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825099.903, "dur": 4.150, + "args": { + "External id": 150168, "cbid": 211, "correlation": 289982800 + } + }, + { + "ph": "s", "id": 289982800, "pid": 5714, "tid": 5714, "ts": 6303771825099.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771836013.275, "dur": 3.361, + "args": { + "External id": 150169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982813, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982813, "pid": 0, "tid": 7, "ts": 6303771836013.275, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825120.393, "dur": 4.990, + "args": { + "External id": 150169, "cbid": 211, "correlation": 289982813 + } + }, + { + "ph": "s", "id": 289982813, "pid": 5714, "tid": 5714, "ts": 6303771825120.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836017.244, "dur": 1.088, + "args": { + "External id": 150172, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982819, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982819, "pid": 0, "tid": 7, "ts": 6303771836017.244, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825131.363, "dur": 4.220, + "args": { + "External id": 150172, "cbid": 211, "correlation": 289982819 + } + }, + { + "ph": "s", "id": 289982819, "pid": 5714, "tid": 5714, "ts": 6303771825131.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771836018.972, "dur": 0.992, + "args": { + "External id": 150173, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982825, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982825, "pid": 0, "tid": 7, "ts": 6303771836018.972, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825140.483, "dur": 3.840, + "args": { + "External id": 150173, "cbid": 211, "correlation": 289982825 + } + }, + { + "ph": "s", "id": 289982825, "pid": 5714, "tid": 5714, "ts": 6303771825140.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771836020.668, "dur": 232.706, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982839, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289982839, "pid": 0, "tid": 7, "ts": 6303771836020.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825229.973, "dur": 8.150, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982839 + } + }, + { + "ph": "s", "id": 289982839, "pid": 5714, "tid": 5714, "ts": 6303771825229.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771825272.593, "dur": 0.580, + "args": { + "External id": 150177, "cbid": 200, "correlation": 289982862 + } + }, + { + "ph": "f", "id": 289982862, "pid": 5714, "tid": 5714, "ts": 6303771825272.593, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771836254.302, "dur": 0.800, + "args": { + "External id": 150177, "device": 0, "context": 1, "stream": 7, "correlation": 289982865, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289982865, "pid": 0, "tid": 7, "ts": 6303771836254.302, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771825275.063, "dur": 7.840, + "args": { + "External id": 150177, "cbid": 51, "correlation": 289982865 + } + }, + { + "ph": "s", "id": 289982865, "pid": 5714, "tid": 5714, "ts": 6303771825275.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771836256.830, "dur": 687.432, + "args": { + "External id": 150177, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982866, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982866, "pid": 0, "tid": 7, "ts": 6303771836256.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825283.153, "dur": 5.690, + "args": { + "External id": 150177, "cbid": 307, "correlation": 289982866 + } + }, + { + "ph": "s", "id": 289982866, "pid": 5714, "tid": 5714, "ts": 6303771825283.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771836944.934, "dur": 2.945, + "args": { + "External id": 150180, "device": 0, "context": 1, "stream": 7, "correlation": 289982871, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.1589134125636 + } + }, + { + "ph": "f", "id": 289982871, "pid": 0, "tid": 7, "ts": 6303771836944.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771825321.853, "dur": 12.880, + "args": { + "External id": 150180, "cbid": 41, "correlation": 289982871 + } + }, + { + "ph": "s", "id": 289982871, "pid": 5714, "tid": 5714, "ts": 6303771825321.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771825376.123, "dur": 0.480, + "args": { + "External id": 150185, "cbid": 200, "correlation": 289982899 + } + }, + { + "ph": "f", "id": 289982899, "pid": 5714, "tid": 5714, "ts": 6303771825376.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771836948.519, "dur": 687.528, + "args": { + "External id": 150185, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289982902, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982902, "pid": 0, "tid": 7, "ts": 6303771836948.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825378.223, "dur": 7.440, + "args": { + "External id": 150185, "cbid": 307, "correlation": 289982902 + } + }, + { + "ph": "s", "id": 289982902, "pid": 5714, "tid": 5714, "ts": 6303771825378.223, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771837636.751, "dur": 220.962, + "args": { + "External id": 150186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982907, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289982907, "pid": 0, "tid": 7, "ts": 6303771837636.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825399.783, "dur": 6.229, + "args": { + "External id": 150186, "cbid": 211, "correlation": 289982907 + } + }, + { + "ph": "s", "id": 289982907, "pid": 5714, "tid": 5714, "ts": 6303771825399.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771825450.942, "dur": 1.200, + "args": { + "External id": 150194, "cbid": 210, "correlation": 289982933 + } + }, + { + "ph": "f", "id": 289982933, "pid": 5714, "tid": 5714, "ts": 6303771825450.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771837858.321, "dur": 633.640, + "args": { + "External id": 150194, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982934, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289982934, "pid": 0, "tid": 7, "ts": 6303771837858.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825455.672, "dur": 7.590, + "args": { + "External id": 150194, "cbid": 211, "correlation": 289982934 + } + }, + { + "ph": "s", "id": 289982934, "pid": 5714, "tid": 5714, "ts": 6303771825455.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771838492.537, "dur": 170.625, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982953, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289982953, "pid": 0, "tid": 7, "ts": 6303771838492.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825568.022, "dur": 9.310, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289982953 + } + }, + { + "ph": "s", "id": 289982953, "pid": 5714, "tid": 5714, "ts": 6303771825568.022, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771838663.898, "dur": 4.064, + "args": { + "External id": 150204, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982970, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982970, "pid": 0, "tid": 7, "ts": 6303771838663.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825612.722, "dur": 7.410, + "args": { + "External id": 150204, "cbid": 211, "correlation": 289982970 + } + }, + { + "ph": "s", "id": 289982970, "pid": 5714, "tid": 5714, "ts": 6303771825612.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838668.666, "dur": 1.216, + "args": { + "External id": 150209, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982987, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982987, "pid": 0, "tid": 7, "ts": 6303771838668.666, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825642.792, "dur": 5.200, + "args": { + "External id": 150209, "cbid": 211, "correlation": 289982987 + } + }, + { + "ph": "s", "id": 289982987, "pid": 5714, "tid": 5714, "ts": 6303771825642.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838670.522, "dur": 0.992, + "args": { + "External id": 150211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289982997, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289982997, "pid": 0, "tid": 7, "ts": 6303771838670.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825661.592, "dur": 5.150, + "args": { + "External id": 150211, "cbid": 211, "correlation": 289982997 + } + }, + { + "ph": "s", "id": 289982997, "pid": 5714, "tid": 5714, "ts": 6303771825661.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838672.186, "dur": 1.056, + "args": { + "External id": 150212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983003, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983003, "pid": 0, "tid": 7, "ts": 6303771838672.186, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825673.922, "dur": 4.530, + "args": { + "External id": 150212, "cbid": 211, "correlation": 289983003 + } + }, + { + "ph": "s", "id": 289983003, "pid": 5714, "tid": 5714, "ts": 6303771825673.922, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838673.882, "dur": 1.056, + "args": { + "External id": 150213, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983013, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983013, "pid": 0, "tid": 7, "ts": 6303771838673.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825691.622, "dur": 4.350, + "args": { + "External id": 150213, "cbid": 211, "correlation": 289983013 + } + }, + { + "ph": "s", "id": 289983013, "pid": 5714, "tid": 5714, "ts": 6303771825691.622, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838675.610, "dur": 1.024, + "args": { + "External id": 150214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983019, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983019, "pid": 0, "tid": 7, "ts": 6303771838675.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825702.812, "dur": 4.660, + "args": { + "External id": 150214, "cbid": 211, "correlation": 289983019 + } + }, + { + "ph": "s", "id": 289983019, "pid": 5714, "tid": 5714, "ts": 6303771825702.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771838677.338, "dur": 3.360, + "args": { + "External id": 150215, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983032, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983032, "pid": 0, "tid": 7, "ts": 6303771838677.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825722.252, "dur": 4.910, + "args": { + "External id": 150215, "cbid": 211, "correlation": 289983032 + } + }, + { + "ph": "s", "id": 289983032, "pid": 5714, "tid": 5714, "ts": 6303771825722.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838681.306, "dur": 1.089, + "args": { + "External id": 150218, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983038, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983038, "pid": 0, "tid": 7, "ts": 6303771838681.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825732.982, "dur": 4.260, + "args": { + "External id": 150218, "cbid": 211, "correlation": 289983038 + } + }, + { + "ph": "s", "id": 289983038, "pid": 5714, "tid": 5714, "ts": 6303771825732.982, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771838683.035, "dur": 1.024, + "args": { + "External id": 150219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983044, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983044, "pid": 0, "tid": 7, "ts": 6303771838683.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825742.082, "dur": 3.860, + "args": { + "External id": 150219, "cbid": 211, "correlation": 289983044 + } + }, + { + "ph": "s", "id": 289983044, "pid": 5714, "tid": 5714, "ts": 6303771825742.082, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771838684.731, "dur": 233.890, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983058, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289983058, "pid": 0, "tid": 7, "ts": 6303771838684.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825833.142, "dur": 8.120, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289983058 + } + }, + { + "ph": "s", "id": 289983058, "pid": 5714, "tid": 5714, "ts": 6303771825833.142, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771825874.242, "dur": 0.580, + "args": { + "External id": 150223, "cbid": 200, "correlation": 289983081 + } + }, + { + "ph": "f", "id": 289983081, "pid": 5714, "tid": 5714, "ts": 6303771825874.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771838919.549, "dur": 0.800, + "args": { + "External id": 150223, "device": 0, "context": 1, "stream": 7, "correlation": 289983084, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289983084, "pid": 0, "tid": 7, "ts": 6303771838919.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771825876.651, "dur": 6.811, + "args": { + "External id": 150223, "cbid": 51, "correlation": 289983084 + } + }, + { + "ph": "s", "id": 289983084, "pid": 5714, "tid": 5714, "ts": 6303771825876.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771838922.077, "dur": 690.184, + "args": { + "External id": 150223, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983085, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983085, "pid": 0, "tid": 7, "ts": 6303771838922.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825883.702, "dur": 5.789, + "args": { + "External id": 150223, "cbid": 307, "correlation": 289983085 + } + }, + { + "ph": "s", "id": 289983085, "pid": 5714, "tid": 5714, "ts": 6303771825883.702, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771839612.965, "dur": 2.945, + "args": { + "External id": 150226, "device": 0, "context": 1, "stream": 7, "correlation": 289983090, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.1589134125636 + } + }, + { + "ph": "f", "id": 289983090, "pid": 0, "tid": 7, "ts": 6303771839612.965, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771825912.821, "dur": 13.310, + "args": { + "External id": 150226, "cbid": 41, "correlation": 289983090 + } + }, + { + "ph": "s", "id": 289983090, "pid": 5714, "tid": 5714, "ts": 6303771825912.821, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771825966.591, "dur": 0.480, + "args": { + "External id": 150231, "cbid": 200, "correlation": 289983118 + } + }, + { + "ph": "f", "id": 289983118, "pid": 5714, "tid": 5714, "ts": 6303771825966.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771839616.550, "dur": 688.455, + "args": { + "External id": 150231, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289983121, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983121, "pid": 0, "tid": 7, "ts": 6303771839616.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825968.601, "dur": 7.080, + "args": { + "External id": 150231, "cbid": 307, "correlation": 289983121 + } + }, + { + "ph": "s", "id": 289983121, "pid": 5714, "tid": 5714, "ts": 6303771825968.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771840305.613, "dur": 220.739, + "args": { + "External id": 150232, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983126, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983126, "pid": 0, "tid": 7, "ts": 6303771840305.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771825989.681, "dur": 5.850, + "args": { + "External id": 150232, "cbid": 211, "correlation": 289983126 + } + }, + { + "ph": "s", "id": 289983126, "pid": 5714, "tid": 5714, "ts": 6303771825989.681, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771826038.901, "dur": 1.200, + "args": { + "External id": 150240, "cbid": 210, "correlation": 289983152 + } + }, + { + "ph": "f", "id": 289983152, "pid": 5714, "tid": 5714, "ts": 6303771826038.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771840526.960, "dur": 636.872, + "args": { + "External id": 150240, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983153, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983153, "pid": 0, "tid": 7, "ts": 6303771840526.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826043.661, "dur": 7.700, + "args": { + "External id": 150240, "cbid": 211, "correlation": 289983153 + } + }, + { + "ph": "s", "id": 289983153, "pid": 5714, "tid": 5714, "ts": 6303771826043.661, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771841164.536, "dur": 171.169, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983172, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289983172, "pid": 0, "tid": 7, "ts": 6303771841164.536, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826154.721, "dur": 8.650, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289983172 + } + }, + { + "ph": "s", "id": 289983172, "pid": 5714, "tid": 5714, "ts": 6303771826154.721, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771841336.313, "dur": 4.032, + "args": { + "External id": 150250, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983189, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983189, "pid": 0, "tid": 7, "ts": 6303771841336.313, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826196.951, "dur": 7.160, + "args": { + "External id": 150250, "cbid": 211, "correlation": 289983189 + } + }, + { + "ph": "s", "id": 289983189, "pid": 5714, "tid": 5714, "ts": 6303771826196.951, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841340.985, "dur": 1.184, + "args": { + "External id": 150255, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983206, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983206, "pid": 0, "tid": 7, "ts": 6303771841340.985, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826227.551, "dur": 5.260, + "args": { + "External id": 150255, "cbid": 211, "correlation": 289983206 + } + }, + { + "ph": "s", "id": 289983206, "pid": 5714, "tid": 5714, "ts": 6303771826227.551, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841342.809, "dur": 1.024, + "args": { + "External id": 150257, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983216, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983216, "pid": 0, "tid": 7, "ts": 6303771841342.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826246.301, "dur": 4.840, + "args": { + "External id": 150257, "cbid": 211, "correlation": 289983216 + } + }, + { + "ph": "s", "id": 289983216, "pid": 5714, "tid": 5714, "ts": 6303771826246.301, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841344.505, "dur": 1.056, + "args": { + "External id": 150258, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983222, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983222, "pid": 0, "tid": 7, "ts": 6303771841344.505, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826258.181, "dur": 6.369, + "args": { + "External id": 150258, "cbid": 211, "correlation": 289983222 + } + }, + { + "ph": "s", "id": 289983222, "pid": 5714, "tid": 5714, "ts": 6303771826258.181, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841346.201, "dur": 1.024, + "args": { + "External id": 150259, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983232, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983232, "pid": 0, "tid": 7, "ts": 6303771841346.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826275.101, "dur": 4.560, + "args": { + "External id": 150259, "cbid": 211, "correlation": 289983232 + } + }, + { + "ph": "s", "id": 289983232, "pid": 5714, "tid": 5714, "ts": 6303771826275.101, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841347.897, "dur": 1.056, + "args": { + "External id": 150260, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983238, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983238, "pid": 0, "tid": 7, "ts": 6303771841347.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826285.441, "dur": 4.249, + "args": { + "External id": 150260, "cbid": 211, "correlation": 289983238 + } + }, + { + "ph": "s", "id": 289983238, "pid": 5714, "tid": 5714, "ts": 6303771826285.441, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771841349.658, "dur": 3.328, + "args": { + "External id": 150261, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983251, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983251, "pid": 0, "tid": 7, "ts": 6303771841349.658, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826315.290, "dur": 5.851, + "args": { + "External id": 150261, "cbid": 211, "correlation": 289983251 + } + }, + { + "ph": "s", "id": 289983251, "pid": 5714, "tid": 5714, "ts": 6303771826315.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841353.626, "dur": 1.088, + "args": { + "External id": 150264, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983257, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983257, "pid": 0, "tid": 7, "ts": 6303771841353.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826328.970, "dur": 4.351, + "args": { + "External id": 150264, "cbid": 211, "correlation": 289983257 + } + }, + { + "ph": "s", "id": 289983257, "pid": 5714, "tid": 5714, "ts": 6303771826328.970, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771841355.322, "dur": 1.024, + "args": { + "External id": 150265, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983263, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983263, "pid": 0, "tid": 7, "ts": 6303771841355.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826338.470, "dur": 3.891, + "args": { + "External id": 150265, "cbid": 211, "correlation": 289983263 + } + }, + { + "ph": "s", "id": 289983263, "pid": 5714, "tid": 5714, "ts": 6303771826338.470, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771841357.050, "dur": 232.354, + "args": { + "External id": 149881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983277, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289983277, "pid": 0, "tid": 7, "ts": 6303771841357.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826428.060, "dur": 8.050, + "args": { + "External id": 149881, "cbid": 307, "correlation": 289983277 + } + }, + { + "ph": "s", "id": 289983277, "pid": 5714, "tid": 5714, "ts": 6303771826428.060, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771826468.870, "dur": 0.590, + "args": { + "External id": 150269, "cbid": 200, "correlation": 289983300 + } + }, + { + "ph": "f", "id": 289983300, "pid": 5714, "tid": 5714, "ts": 6303771826468.870, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771841590.268, "dur": 0.832, + "args": { + "External id": 150269, "device": 0, "context": 1, "stream": 7, "correlation": 289983303, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289983303, "pid": 0, "tid": 7, "ts": 6303771841590.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771826471.270, "dur": 6.810, + "args": { + "External id": 150269, "cbid": 51, "correlation": 289983303 + } + }, + { + "ph": "s", "id": 289983303, "pid": 5714, "tid": 5714, "ts": 6303771826471.270, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771841592.380, "dur": 690.537, + "args": { + "External id": 150269, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983304, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983304, "pid": 0, "tid": 7, "ts": 6303771841592.380, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826478.320, "dur": 5.600, + "args": { + "External id": 150269, "cbid": 307, "correlation": 289983304 + } + }, + { + "ph": "s", "id": 289983304, "pid": 5714, "tid": 5714, "ts": 6303771826478.320, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771842283.589, "dur": 2.976, + "args": { + "External id": 150272, "device": 0, "context": 1, "stream": 7, "correlation": 289983309, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289983309, "pid": 0, "tid": 7, "ts": 6303771842283.589, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771826508.930, "dur": 13.090, + "args": { + "External id": 150272, "cbid": 41, "correlation": 289983309 + } + }, + { + "ph": "s", "id": 289983309, "pid": 5714, "tid": 5714, "ts": 6303771826508.930, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771826562.100, "dur": 0.550, + "args": { + "External id": 150277, "cbid": 200, "correlation": 289983337 + } + }, + { + "ph": "f", "id": 289983337, "pid": 5714, "tid": 5714, "ts": 6303771826562.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771842287.237, "dur": 693.447, + "args": { + "External id": 150277, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289983340, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983340, "pid": 0, "tid": 7, "ts": 6303771842287.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826564.080, "dur": 6.970, + "args": { + "External id": 150277, "cbid": 307, "correlation": 289983340 + } + }, + { + "ph": "s", "id": 289983340, "pid": 5714, "tid": 5714, "ts": 6303771826564.080, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771842981.356, "dur": 221.091, + "args": { + "External id": 150278, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983345, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983345, "pid": 0, "tid": 7, "ts": 6303771842981.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826586.200, "dur": 5.890, + "args": { + "External id": 150278, "cbid": 211, "correlation": 289983345 + } + }, + { + "ph": "s", "id": 289983345, "pid": 5714, "tid": 5714, "ts": 6303771826586.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771843203.183, "dur": 5.152, + "args": { + "External id": 150280, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983358, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983358, "pid": 0, "tid": 7, "ts": 6303771843203.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826613.790, "dur": 6.220, + "args": { + "External id": 150280, "cbid": 211, "correlation": 289983358 + } + }, + { + "ph": "s", "id": 289983358, "pid": 5714, "tid": 5714, "ts": 6303771826613.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843208.943, "dur": 162.562, + "args": { + "External id": 150285, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983371, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983371, "pid": 0, "tid": 7, "ts": 6303771843208.943, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826642.890, "dur": 5.960, + "args": { + "External id": 150285, "cbid": 211, "correlation": 289983371 + } + }, + { + "ph": "s", "id": 289983371, "pid": 5714, "tid": 5714, "ts": 6303771826642.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843372.177, "dur": 1.600, + "args": { + "External id": 150290, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983379, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983379, "pid": 0, "tid": 7, "ts": 6303771843372.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826706.840, "dur": 7.749, + "args": { + "External id": 150290, "cbid": 211, "correlation": 289983379 + } + }, + { + "ph": "s", "id": 289983379, "pid": 5714, "tid": 5714, "ts": 6303771826706.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843374.513, "dur": 1.312, + "args": { + "External id": 150291, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983385, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983385, "pid": 0, "tid": 7, "ts": 6303771843374.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826726.580, "dur": 5.129, + "args": { + "External id": 150291, "cbid": 211, "correlation": 289983385 + } + }, + { + "ph": "s", "id": 289983385, "pid": 5714, "tid": 5714, "ts": 6303771826726.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771843376.561, "dur": 2.240, + "args": { + "External id": 150310, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983405, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 289983405, "pid": 0, "tid": 7, "ts": 6303771843376.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826812.189, "dur": 8.800, + "args": { + "External id": 150310, "cbid": 211, "correlation": 289983405 + } + }, + { + "ph": "s", "id": 289983405, "pid": 5714, "tid": 5714, "ts": 6303771826812.189, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771843379.537, "dur": 58.945, + "args": { + "External id": 150318, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983423, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983423, "pid": 0, "tid": 7, "ts": 6303771843379.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826914.519, "dur": 9.410, + "args": { + "External id": 150318, "cbid": 211, "correlation": 289983423 + } + }, + { + "ph": "s", "id": 289983423, "pid": 5714, "tid": 5714, "ts": 6303771826914.519, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843439.154, "dur": 15.232, + "args": { + "External id": 150323, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983440, "pid": 0, "tid": 7, "ts": 6303771843439.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826959.179, "dur": 6.580, + "args": { + "External id": 150323, "cbid": 211, "correlation": 289983440 + } + }, + { + "ph": "s", "id": 289983440, "pid": 5714, "tid": 5714, "ts": 6303771826959.179, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843455.058, "dur": 99.713, + "args": { + "External id": 150328, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983456, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983456, "pid": 0, "tid": 7, "ts": 6303771843455.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771826983.119, "dur": 5.180, + "args": { + "External id": 150328, "cbid": 211, "correlation": 289983456 + } + }, + { + "ph": "s", "id": 289983456, "pid": 5714, "tid": 5714, "ts": 6303771826983.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843555.475, "dur": 1.856, + "args": { + "External id": 150332, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983472, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289983472, "pid": 0, "tid": 7, "ts": 6303771843555.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771827006.559, "dur": 4.800, + "args": { + "External id": 150332, "cbid": 211, "correlation": 289983472 + } + }, + { + "ph": "s", "id": 289983472, "pid": 5714, "tid": 5714, "ts": 6303771827006.559, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771843557.939, "dur": 1.760, + "args": { + "External id": 150333, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983484, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289983484, "pid": 0, "tid": 7, "ts": 6303771843557.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771827030.679, "dur": 5.370, + "args": { + "External id": 150333, "cbid": 211, "correlation": 289983484 + } + }, + { + "ph": "s", "id": 289983484, "pid": 5714, "tid": 5714, "ts": 6303771827030.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771843560.435, "dur": 2.080, + "args": { + "External id": 150340, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983502, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289983502, "pid": 0, "tid": 7, "ts": 6303771843560.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771827065.919, "dur": 6.200, + "args": { + "External id": 150340, "cbid": 211, "correlation": 289983502 + } + }, + { + "ph": "s", "id": 289983502, "pid": 5714, "tid": 5714, "ts": 6303771827065.919, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6303771843563.219, "dur": 3.937, + "args": { + "External id": 150335, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983511, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983511, "pid": 0, "tid": 7, "ts": 6303771843563.219, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771827078.859, "dur": 4.500, + "args": { + "External id": 150335, "cbid": 211, "correlation": 289983511 + } + }, + { + "ph": "s", "id": 289983511, "pid": 5714, "tid": 5714, "ts": 6303771827078.859, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771843568.148, "dur": 1.024, + "args": { + "External id": 150342, "device": 0, "context": 1, "stream": 7, "correlation": 289983517, "bytes": 8, "memory bandwidth (GB/s)": 0.0078125 + } + }, + { + "ph": "f", "id": 289983517, "pid": 0, "tid": 7, "ts": 6303771843568.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771827096.869, "dur": 10.540, + "args": { + "External id": 150342, "cbid": 41, "correlation": 289983517 + } + }, + { + "ph": "s", "id": 289983517, "pid": 5714, "tid": 5714, "ts": 6303771827096.869, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303771827107.739, "dur": 16466.313, + "args": { + "External id": 150342, "cbid": 131, "correlation": 289983518 + } + }, + { + "ph": "s", "id": 289983518, "pid": 5714, "tid": 5714, "ts": 6303771827107.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771843644.982, "dur": 1.710, + "args": { + "External id": 150350, "cbid": 210, "correlation": 289983543 + } + }, + { + "ph": "f", "id": 289983543, "pid": 5714, "tid": 5714, "ts": 6303771843644.982, + "cat": "ac2g", "name": "ac2g", "bp": "e" + 
}, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771843662.228, "dur": 644.264, + "args": { + "External id": 150350, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983544, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983544, "pid": 0, "tid": 7, "ts": 6303771843662.228, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843650.492, "dur": 11.530, + "args": { + "External id": 150350, "cbid": 211, "correlation": 289983544 + } + }, + { + "ph": "s", "id": 289983544, "pid": 5714, "tid": 5714, "ts": 6303771843650.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771844307.132, "dur": 171.426, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983563, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289983563, "pid": 0, "tid": 7, "ts": 6303771844307.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843774.952, "dur": 9.260, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289983563 + } + }, + { + "ph": "s", "id": 289983563, "pid": 5714, "tid": 5714, "ts": 6303771843774.952, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771844479.294, "dur": 4.128, + "args": { + "External id": 150360, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983580, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983580, "pid": 0, "tid": 7, "ts": 6303771844479.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843820.662, "dur": 7.629, + "args": { + "External id": 150360, "cbid": 211, "correlation": 289983580 + } + }, + { + "ph": "s", "id": 289983580, "pid": 5714, "tid": 5714, "ts": 6303771843820.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844484.062, "dur": 1.216, + "args": { + "External id": 150365, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983597, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983597, "pid": 0, "tid": 7, "ts": 6303771844484.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843857.951, "dur": 5.911, + "args": { + "External id": 150365, "cbid": 211, "correlation": 289983597 + } + }, + { + "ph": "s", "id": 289983597, "pid": 5714, "tid": 5714, "ts": 6303771843857.951, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844485.950, "dur": 1.024, + "args": { + "External id": 150367, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983607, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983607, "pid": 0, "tid": 7, "ts": 6303771844485.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843878.231, "dur": 4.940, + "args": { + "External id": 150367, "cbid": 211, "correlation": 289983607 + } + }, + { + "ph": "s", "id": 289983607, "pid": 5714, "tid": 5714, "ts": 6303771843878.231, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844487.646, "dur": 1.088, + "args": { + "External id": 150368, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983613, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983613, "pid": 0, "tid": 7, "ts": 6303771844487.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843890.481, "dur": 4.650, + "args": { + "External id": 150368, "cbid": 211, "correlation": 289983613 + } + }, + { + "ph": "s", "id": 289983613, "pid": 5714, "tid": 5714, "ts": 6303771843890.481, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844489.406, "dur": 1.056, + "args": { + "External id": 150369, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983623, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983623, "pid": 0, "tid": 7, "ts": 6303771844489.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843905.721, "dur": 4.660, + "args": { + "External id": 150369, "cbid": 211, "correlation": 289983623 + } + }, + { + "ph": "s", "id": 289983623, "pid": 5714, "tid": 5714, "ts": 6303771843905.721, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844491.166, "dur": 1.056, + "args": { + "External id": 150370, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983629, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983629, "pid": 0, "tid": 7, "ts": 6303771844491.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843916.161, "dur": 4.200, + "args": { + "External id": 150370, "cbid": 211, "correlation": 289983629 + } + }, + { + "ph": "s", "id": 289983629, "pid": 5714, "tid": 5714, "ts": 6303771843916.161, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771844492.958, "dur": 3.360, + "args": { + "External id": 150371, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983642, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983642, "pid": 0, "tid": 7, "ts": 6303771844492.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843937.751, "dur": 5.010, + "args": { + "External id": 150371, "cbid": 211, "correlation": 289983642 + } + }, + { + "ph": "s", "id": 289983642, "pid": 5714, "tid": 5714, "ts": 6303771843937.751, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844496.990, "dur": 1.120, + "args": { + "External id": 150374, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983648, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983648, "pid": 0, "tid": 7, "ts": 6303771844496.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843948.941, "dur": 4.500, + "args": { + "External id": 150374, "cbid": 211, "correlation": 289983648 + } + }, + { + "ph": "s", "id": 289983648, "pid": 5714, "tid": 5714, "ts": 6303771843948.941, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771844498.782, "dur": 1.024, + "args": { + "External id": 150375, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983654, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983654, "pid": 0, "tid": 7, "ts": 6303771844498.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771843958.651, "dur": 3.930, + "args": { + "External id": 150375, "cbid": 211, "correlation": 289983654 + } + }, + { + "ph": "s", "id": 289983654, "pid": 5714, "tid": 5714, "ts": 6303771843958.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771844500.510, "dur": 236.579, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983668, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289983668, "pid": 0, "tid": 7, "ts": 6303771844500.510, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844048.671, "dur": 8.150, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289983668 + } + }, + { + "ph": "s", "id": 289983668, "pid": 5714, "tid": 5714, "ts": 6303771844048.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771844090.141, "dur": 0.630, + "args": { + "External id": 150379, "cbid": 200, "correlation": 289983691 + } + }, + { + "ph": "f", "id": 289983691, "pid": 5714, "tid": 5714, "ts": 6303771844090.141, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771844737.985, "dur": 0.992, + "args": { + "External id": 150379, "device": 0, "context": 1, "stream": 7, "correlation": 289983694, "bytes": 1536, "memory bandwidth (GB/s)": 1.5483870967741935 + } + }, + { + "ph": "f", "id": 289983694, "pid": 0, "tid": 7, "ts": 6303771844737.985, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771844092.631, "dur": 7.470, + "args": { + "External id": 150379, "cbid": 51, "correlation": 289983694 + } + }, + { + "ph": "s", "id": 289983694, "pid": 5714, "tid": 5714, "ts": 6303771844092.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771844740.289, "dur": 691.976, + "args": { + "External id": 150379, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983695, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983695, "pid": 0, "tid": 7, "ts": 6303771844740.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844100.341, "dur": 6.010, + "args": { + "External id": 150379, "cbid": 307, "correlation": 289983695 + } + }, + { + "ph": "s", "id": 289983695, "pid": 5714, "tid": 5714, "ts": 6303771844100.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771845432.969, "dur": 3.040, + "args": { + "External id": 150382, "device": 0, "context": 1, "stream": 7, "correlation": 289983700, "bytes": 3145728, "memory bandwidth (GB/s)": 1034.778947368421 + } + }, + { + "ph": "f", "id": 289983700, "pid": 0, "tid": 7, "ts": 6303771845432.969, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771844132.661, "dur": 13.370, + "args": { + "External id": 150382, "cbid": 41, "correlation": 289983700 + } + }, + { + "ph": "s", "id": 289983700, "pid": 5714, "tid": 5714, "ts": 6303771844132.661, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771844187.671, "dur": 0.510, + "args": { + "External id": 150387, "cbid": 200, "correlation": 289983728 + } + }, + { + "ph": "f", "id": 289983728, "pid": 5714, "tid": 5714, "ts": 6303771844187.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771845436.745, "dur": 696.968, + "args": { + "External id": 150387, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289983731, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983731, "pid": 0, "tid": 7, "ts": 6303771845436.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844189.691, "dur": 7.450, + "args": { + "External id": 150387, "cbid": 307, "correlation": 289983731 + } + }, + { + "ph": "s", "id": 289983731, "pid": 5714, "tid": 5714, "ts": 6303771844189.691, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771846134.449, "dur": 220.995, + "args": { + "External id": 150388, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983736, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983736, "pid": 0, "tid": 7, "ts": 6303771846134.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844211.641, "dur": 6.030, + "args": { + "External id": 150388, "cbid": 211, "correlation": 289983736 + } + }, + { + "ph": "s", "id": 289983736, "pid": 5714, "tid": 5714, "ts": 6303771844211.641, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771844263.421, "dur": 1.329, + "args": { + "External id": 150396, "cbid": 210, "correlation": 289983762 + } + }, + { + "ph": "f", "id": 289983762, "pid": 5714, "tid": 5714, "ts": 6303771844263.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771846356.084, "dur": 643.527, + "args": { + "External id": 150396, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983763, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983763, "pid": 0, "tid": 7, "ts": 6303771846356.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844268.341, "dur": 7.560, + "args": { + "External id": 150396, "cbid": 211, "correlation": 289983763 + } + }, + { + "ph": "s", "id": 289983763, "pid": 5714, "tid": 5714, "ts": 6303771844268.341, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771847000.379, "dur": 170.754, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983782, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289983782, "pid": 0, "tid": 7, "ts": 6303771847000.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844394.720, "dur": 9.560, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289983782 + } + }, + { + "ph": "s", "id": 289983782, "pid": 5714, "tid": 5714, "ts": 6303771844394.720, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771847171.805, "dur": 4.096, + "args": { + "External id": 150406, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983799, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983799, "pid": 0, "tid": 7, "ts": 6303771847171.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844440.640, "dur": 7.200, + "args": { + "External id": 150406, "cbid": 211, "correlation": 289983799 + } + }, + { + "ph": "s", "id": 289983799, "pid": 5714, "tid": 5714, "ts": 6303771844440.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847176.541, "dur": 1.248, + "args": { + "External id": 150411, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983816, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983816, "pid": 0, "tid": 7, "ts": 6303771847176.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844471.960, "dur": 5.300, + "args": { + "External id": 150411, "cbid": 211, "correlation": 289983816 + } + }, + { + "ph": "s", "id": 289983816, "pid": 5714, "tid": 5714, "ts": 6303771844471.960, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847178.429, "dur": 1.056, + "args": { + "External id": 150413, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983826, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983826, "pid": 0, "tid": 7, "ts": 6303771847178.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844492.600, "dur": 5.210, + "args": { + "External id": 150413, "cbid": 211, "correlation": 289983826 + } + }, + { + "ph": "s", "id": 289983826, "pid": 5714, "tid": 5714, "ts": 6303771844492.600, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847180.157, "dur": 1.056, + "args": { + "External id": 150414, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983832, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983832, "pid": 0, "tid": 7, "ts": 6303771847180.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844505.030, "dur": 4.530, + "args": { + "External id": 150414, "cbid": 211, "correlation": 289983832 + } + }, + { + "ph": "s", "id": 289983832, "pid": 5714, "tid": 5714, "ts": 6303771844505.030, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847181.917, "dur": 1.056, + "args": { + "External id": 150415, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983842, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983842, "pid": 0, "tid": 7, "ts": 6303771847181.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844521.570, "dur": 4.510, + "args": { + "External id": 150415, "cbid": 211, "correlation": 289983842 + } + }, + { + "ph": "s", "id": 289983842, "pid": 5714, "tid": 5714, "ts": 6303771844521.570, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847183.645, "dur": 1.056, + "args": { + "External id": 150416, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983848, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983848, "pid": 0, "tid": 7, "ts": 6303771847183.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844533.000, "dur": 4.200, + "args": { + "External id": 150416, "cbid": 211, "correlation": 289983848 + } + }, + { + "ph": "s", "id": 289983848, "pid": 5714, "tid": 5714, "ts": 6303771844533.000, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771847185.437, "dur": 3.456, + "args": { + "External id": 150417, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983861, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983861, "pid": 0, "tid": 7, "ts": 6303771847185.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844552.560, "dur": 5.180, + "args": { + "External id": 150417, "cbid": 211, "correlation": 289983861 + } + }, + { + "ph": "s", "id": 289983861, "pid": 5714, "tid": 5714, "ts": 6303771844552.560, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847189.629, "dur": 1.120, + "args": { + "External id": 150420, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983867, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983867, "pid": 0, "tid": 7, "ts": 6303771847189.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844564.040, "dur": 4.180, + "args": { + "External id": 150420, "cbid": 211, "correlation": 289983867 + } + }, + { + "ph": "s", "id": 289983867, "pid": 5714, "tid": 5714, "ts": 6303771844564.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771847191.357, "dur": 1.024, + "args": { + "External id": 150421, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983873, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289983873, "pid": 0, "tid": 7, "ts": 6303771847191.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844573.240, "dur": 3.930, + "args": { + "External id": 150421, "cbid": 211, "correlation": 289983873 + } + }, + { + "ph": "s", "id": 289983873, "pid": 5714, "tid": 5714, "ts": 6303771844573.240, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771847193.117, "dur": 235.235, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983887, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289983887, "pid": 0, "tid": 7, "ts": 6303771847193.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844665.320, "dur": 8.210, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289983887 + } + }, + { + "ph": "s", "id": 289983887, "pid": 5714, "tid": 5714, "ts": 6303771844665.320, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771844707.100, "dur": 0.600, + "args": { + "External id": 150425, "cbid": 200, "correlation": 289983910 + } + }, + { + "ph": "f", "id": 289983910, "pid": 5714, "tid": 5714, "ts": 6303771844707.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771847429.152, "dur": 0.832, + "args": { + "External id": 150425, "device": 0, "context": 1, "stream": 7, "correlation": 289983913, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289983913, "pid": 0, "tid": 7, "ts": 6303771847429.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771844709.469, "dur": 7.140, + "args": { + "External id": 150425, "cbid": 51, "correlation": 289983913 + } + }, + { + "ph": "s", "id": 289983913, "pid": 5714, "tid": 5714, "ts": 6303771844709.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771847431.264, "dur": 691.528, + "args": { + "External id": 150425, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983914, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983914, "pid": 0, "tid": 7, "ts": 6303771847431.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844716.860, "dur": 5.920, + "args": { + "External id": 150425, "cbid": 307, "correlation": 289983914 + } + }, + { + "ph": "s", "id": 289983914, "pid": 5714, "tid": 5714, "ts": 6303771844716.860, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771848123.464, "dur": 2.976, + "args": { + "External id": 150428, "device": 0, "context": 1, "stream": 7, "correlation": 289983919, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289983919, "pid": 0, "tid": 7, "ts": 6303771848123.464, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771844747.520, "dur": 12.129, + "args": { + "External id": 150428, "cbid": 41, "correlation": 289983919 + } + }, + { + "ph": "s", "id": 289983919, "pid": 5714, "tid": 5714, "ts": 6303771844747.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771844800.069, "dur": 0.450, + "args": { + "External id": 150433, "cbid": 200, "correlation": 289983947 + } + }, + { + "ph": "f", "id": 289983947, "pid": 5714, "tid": 5714, "ts": 6303771844800.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771848127.080, "dur": 690.856, + "args": { + "External id": 150433, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289983950, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983950, "pid": 0, "tid": 7, "ts": 6303771848127.080, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844802.019, "dur": 7.120, + "args": { + "External id": 150433, "cbid": 307, "correlation": 289983950 + } + }, + { + "ph": "s", "id": 289983950, "pid": 5714, "tid": 5714, "ts": 6303771844802.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771848818.640, "dur": 220.995, + "args": { + "External id": 150434, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983955, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289983955, "pid": 0, "tid": 7, "ts": 6303771848818.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844823.229, "dur": 5.820, + "args": { + "External id": 150434, "cbid": 211, "correlation": 289983955 + } + }, + { + "ph": "s", "id": 289983955, "pid": 5714, "tid": 5714, "ts": 6303771844823.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771844871.759, "dur": 1.260, + "args": { + "External id": 150442, "cbid": 210, "correlation": 289983981 + } + }, + { + "ph": "f", "id": 289983981, "pid": 5714, "tid": 5714, "ts": 6303771844871.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771849040.307, "dur": 641.671, + "args": { + "External id": 150442, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289983982, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289983982, "pid": 0, "tid": 7, "ts": 6303771849040.307, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844876.569, "dur": 7.440, + "args": { + "External id": 150442, "cbid": 211, "correlation": 289983982 + } + }, + { + "ph": "s", "id": 289983982, "pid": 5714, "tid": 5714, "ts": 6303771844876.569, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771849682.650, "dur": 171.586, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984001, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289984001, "pid": 0, "tid": 7, "ts": 6303771849682.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771844989.239, "dur": 9.100, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984001 + } + }, + { + "ph": "s", "id": 289984001, "pid": 5714, "tid": 5714, "ts": 6303771844989.239, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771849854.908, "dur": 4.096, + "args": { + "External id": 150452, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984018, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984018, "pid": 0, "tid": 7, "ts": 6303771849854.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845032.429, "dur": 7.110, + "args": { + "External id": 150452, "cbid": 211, "correlation": 289984018 + } + }, + { + "ph": "s", "id": 289984018, "pid": 5714, "tid": 5714, "ts": 6303771845032.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849859.644, "dur": 1.216, + "args": { + "External id": 150457, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984035, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984035, "pid": 0, "tid": 7, "ts": 6303771849859.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845063.409, "dur": 5.430, + "args": { + "External id": 150457, "cbid": 211, "correlation": 289984035 + } + }, + { + "ph": "s", "id": 289984035, "pid": 5714, "tid": 5714, "ts": 6303771845063.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849861.564, "dur": 1.024, + "args": { + "External id": 150459, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984045, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984045, "pid": 0, "tid": 7, "ts": 6303771849861.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845082.919, "dur": 4.900, + "args": { + "External id": 150459, "cbid": 211, "correlation": 289984045 + } + }, + { + "ph": "s", "id": 289984045, "pid": 5714, "tid": 5714, "ts": 6303771845082.919, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849863.260, "dur": 1.056, + "args": { + "External id": 150460, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984051, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984051, "pid": 0, "tid": 7, "ts": 6303771849863.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845094.929, "dur": 4.670, + "args": { + "External id": 150460, "cbid": 211, "correlation": 289984051 + } + }, + { + "ph": "s", "id": 289984051, "pid": 5714, "tid": 5714, "ts": 6303771845094.929, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849865.020, "dur": 1.056, + "args": { + "External id": 150461, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984061, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984061, "pid": 0, "tid": 7, "ts": 6303771849865.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845111.079, "dur": 4.420, + "args": { + "External id": 150461, "cbid": 211, "correlation": 289984061 + } + }, + { + "ph": "s", "id": 289984061, "pid": 5714, "tid": 5714, "ts": 6303771845111.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849866.749, "dur": 1.056, + "args": { + "External id": 150462, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984067, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984067, "pid": 0, "tid": 7, "ts": 6303771849866.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845121.469, "dur": 4.230, + "args": { + "External id": 150462, "cbid": 211, "correlation": 289984067 + } + }, + { + "ph": "s", "id": 289984067, "pid": 5714, "tid": 5714, "ts": 6303771845121.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771849868.541, "dur": 3.392, + "args": { + "External id": 150463, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984080, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984080, "pid": 0, "tid": 7, "ts": 6303771849868.541, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845140.139, "dur": 5.049, + "args": { + "External id": 150463, "cbid": 211, "correlation": 289984080 + } + }, + { + "ph": "s", "id": 289984080, "pid": 5714, "tid": 5714, "ts": 6303771845140.139, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849872.573, "dur": 1.120, + "args": { + "External id": 150466, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984086, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984086, "pid": 0, "tid": 7, "ts": 6303771849872.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845151.208, "dur": 4.440, + "args": { + "External id": 150466, "cbid": 211, "correlation": 289984086 + } + }, + { + "ph": "s", "id": 289984086, "pid": 5714, "tid": 5714, "ts": 6303771845151.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771849874.333, "dur": 1.024, + "args": { + "External id": 150467, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984092, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984092, "pid": 0, "tid": 7, "ts": 6303771849874.333, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845160.659, "dur": 3.949, + "args": { + "External id": 150467, "cbid": 211, "correlation": 289984092 + } + }, + { + "ph": "s", "id": 289984092, "pid": 5714, "tid": 5714, "ts": 6303771845160.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771849876.061, "dur": 235.682, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984106, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289984106, "pid": 0, "tid": 7, "ts": 6303771849876.061, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845250.398, "dur": 8.020, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984106 + } + }, + { + "ph": "s", "id": 289984106, "pid": 5714, "tid": 5714, "ts": 6303771845250.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771845291.168, "dur": 0.570, + "args": { + "External id": 150471, "cbid": 200, "correlation": 289984129 + } + }, + { + "ph": "f", "id": 289984129, "pid": 5714, "tid": 5714, "ts": 6303771845291.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771850112.639, "dur": 0.832, + "args": { + "External id": 150471, "device": 0, "context": 1, "stream": 7, "correlation": 289984132, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289984132, "pid": 0, "tid": 7, "ts": 6303771850112.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771845293.578, "dur": 15.750, + "args": { + "External id": 150471, "cbid": 51, "correlation": 289984132 + } + }, + { + "ph": "s", "id": 289984132, "pid": 5714, "tid": 5714, "ts": 6303771845293.578, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771850115.167, "dur": 688.360, + "args": { + "External id": 150471, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984133, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984133, "pid": 0, "tid": 7, "ts": 6303771850115.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845309.568, "dur": 6.060, + "args": { + "External id": 150471, "cbid": 307, "correlation": 289984133 + } + }, + { + "ph": "s", "id": 289984133, "pid": 5714, "tid": 5714, "ts": 6303771845309.568, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771850804.231, "dur": 2.976, + "args": { + "External id": 150474, "device": 0, "context": 1, "stream": 7, "correlation": 289984138, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289984138, "pid": 0, "tid": 7, "ts": 6303771850804.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771845340.058, "dur": 12.920, + "args": { + "External id": 150474, "cbid": 41, "correlation": 289984138 + } + }, + { + "ph": "s", "id": 289984138, "pid": 5714, "tid": 5714, "ts": 6303771845340.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771845395.428, "dur": 0.490, + "args": { + "External id": 150479, "cbid": 200, "correlation": 289984166 + } + }, + { + "ph": "f", "id": 289984166, "pid": 5714, "tid": 5714, "ts": 6303771845395.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771850807.879, "dur": 690.472, + "args": { + "External id": 150479, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289984169, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984169, "pid": 0, "tid": 7, "ts": 6303771850807.879, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845397.428, "dur": 6.970, + "args": { + "External id": 150479, "cbid": 307, "correlation": 289984169 + } + }, + { + "ph": "s", "id": 289984169, "pid": 5714, "tid": 5714, "ts": 6303771845397.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771851499.087, "dur": 220.739, + "args": { + "External id": 150480, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984174, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289984174, "pid": 0, "tid": 7, "ts": 6303771851499.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845418.678, "dur": 5.910, + "args": { + "External id": 150480, "cbid": 211, "correlation": 289984174 + } + }, + { + "ph": "s", "id": 289984174, "pid": 5714, "tid": 5714, "ts": 6303771845418.678, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771845467.668, "dur": 1.160, + "args": { + "External id": 150488, "cbid": 210, "correlation": 289984200 + } + }, + { + "ph": "f", "id": 289984200, "pid": 5714, "tid": 5714, "ts": 6303771845467.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771851720.498, "dur": 645.383, + "args": { + "External id": 150488, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984201, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984201, "pid": 0, "tid": 7, "ts": 6303771851720.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845472.408, "dur": 7.470, + "args": { + "External id": 150488, "cbid": 211, "correlation": 289984201 + } + }, + { + "ph": "s", "id": 289984201, "pid": 5714, "tid": 5714, "ts": 6303771845472.408, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771852366.585, "dur": 170.659, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984220, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289984220, "pid": 0, "tid": 7, "ts": 6303771852366.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845590.798, "dur": 10.389, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984220 + } + }, + { + "ph": "s", "id": 289984220, "pid": 5714, "tid": 5714, "ts": 6303771845590.798, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771852537.948, "dur": 4.096, + "args": { + "External id": 150498, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984237, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984237, "pid": 0, "tid": 7, "ts": 6303771852537.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845641.447, "dur": 8.411, + "args": { + "External id": 150498, "cbid": 211, "correlation": 289984237 + } + }, + { + "ph": "s", "id": 289984237, "pid": 5714, "tid": 5714, "ts": 6303771845641.447, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852542.684, "dur": 1.216, + "args": { + "External id": 150503, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984254, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984254, "pid": 0, "tid": 7, "ts": 6303771852542.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845678.198, "dur": 6.269, + "args": { + "External id": 150503, "cbid": 211, "correlation": 289984254 + } + }, + { + "ph": "s", "id": 289984254, "pid": 5714, "tid": 5714, "ts": 6303771845678.198, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852544.572, "dur": 1.024, + "args": { + "External id": 150505, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984264, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984264, "pid": 0, "tid": 7, "ts": 6303771852544.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845700.277, "dur": 5.670, + "args": { + "External id": 150505, "cbid": 211, "correlation": 289984264 + } + }, + { + "ph": "s", "id": 289984264, "pid": 5714, "tid": 5714, "ts": 6303771845700.277, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852546.268, "dur": 1.088, + "args": { + "External id": 150506, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984270, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984270, "pid": 0, "tid": 7, "ts": 6303771852546.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845714.297, "dur": 5.050, + "args": { + "External id": 150506, "cbid": 211, "correlation": 289984270 + } + }, + { + "ph": "s", "id": 289984270, "pid": 5714, "tid": 5714, "ts": 6303771845714.297, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852548.028, "dur": 1.056, + "args": { + "External id": 150507, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984280, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984280, "pid": 0, "tid": 7, "ts": 6303771852548.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845731.997, "dur": 4.900, + "args": { + "External id": 150507, "cbid": 211, "correlation": 289984280 + } + }, + { + "ph": "s", "id": 289984280, "pid": 5714, "tid": 5714, "ts": 6303771845731.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852549.788, "dur": 1.056, + "args": { + "External id": 150508, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984286, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984286, "pid": 0, "tid": 7, "ts": 6303771852549.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845743.637, "dur": 5.260, + "args": { + "External id": 150508, "cbid": 211, "correlation": 289984286 + } + }, + { + "ph": "s", "id": 289984286, "pid": 5714, "tid": 5714, "ts": 6303771845743.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771852551.580, "dur": 3.392, + "args": { + "External id": 150509, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984299, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984299, "pid": 0, "tid": 7, "ts": 6303771852551.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845765.547, "dur": 5.070, + "args": { + "External id": 150509, "cbid": 211, "correlation": 289984299 + } + }, + { + "ph": "s", "id": 289984299, "pid": 5714, "tid": 5714, "ts": 6303771845765.547, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852555.612, "dur": 1.120, + "args": { + "External id": 150512, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984305, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984305, "pid": 0, "tid": 7, "ts": 6303771852555.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845778.067, "dur": 4.420, + "args": { + "External id": 150512, "cbid": 211, "correlation": 289984305 + } + }, + { + "ph": "s", "id": 289984305, "pid": 5714, "tid": 5714, "ts": 6303771845778.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771852557.372, "dur": 1.024, + "args": { + "External id": 150513, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984311, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984311, "pid": 0, "tid": 7, "ts": 6303771852557.372, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845787.657, "dur": 3.890, + "args": { + "External id": 150513, "cbid": 211, "correlation": 289984311 + } + }, + { + "ph": "s", "id": 289984311, "pid": 5714, "tid": 5714, "ts": 6303771845787.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771852559.100, "dur": 234.723, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984325, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289984325, "pid": 0, "tid": 7, "ts": 6303771852559.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845868.937, "dur": 7.110, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984325 + } + }, + { + "ph": "s", "id": 289984325, "pid": 5714, "tid": 5714, "ts": 6303771845868.937, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771845904.617, "dur": 0.480, + "args": { + "External id": 150517, "cbid": 200, "correlation": 289984348 + } + }, + { + "ph": "f", "id": 289984348, "pid": 5714, "tid": 5714, "ts": 6303771845904.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771852794.687, "dur": 0.800, + "args": { + "External id": 150517, "device": 0, "context": 1, "stream": 7, "correlation": 289984351, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289984351, "pid": 0, "tid": 7, "ts": 6303771852794.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771845906.667, "dur": 5.950, + "args": { + "External id": 150517, "cbid": 51, "correlation": 289984351 + } + }, + { + "ph": "s", "id": 289984351, "pid": 5714, "tid": 5714, "ts": 6303771845906.667, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771852796.799, "dur": 692.551, + "args": { + "External id": 150517, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984352, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984352, "pid": 0, "tid": 7, "ts": 6303771852796.799, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845912.837, "dur": 5.230, + "args": { + "External id": 150517, "cbid": 307, "correlation": 289984352 + } + }, + { + "ph": "s", "id": 289984352, "pid": 5714, "tid": 5714, "ts": 6303771845912.837, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771853489.990, "dur": 2.944, + "args": { + "External id": 150520, "device": 0, "context": 1, "stream": 7, "correlation": 289984357, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289984357, "pid": 0, "tid": 7, "ts": 6303771853489.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771845939.607, "dur": 10.640, + "args": { + "External id": 150520, "cbid": 41, "correlation": 289984357 + } + }, + { + "ph": "s", "id": 289984357, "pid": 5714, "tid": 5714, "ts": 6303771845939.607, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771845984.717, "dur": 0.410, + "args": { + "External id": 150525, "cbid": 200, "correlation": 289984385 + } + }, + { + "ph": "f", "id": 289984385, "pid": 5714, "tid": 5714, "ts": 6303771845984.717, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771853493.638, "dur": 689.417, + "args": { + "External id": 150525, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289984388, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984388, "pid": 0, "tid": 7, "ts": 6303771853493.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771845986.477, "dur": 6.100, + "args": { + "External id": 150525, "cbid": 307, "correlation": 289984388 + } + }, + { + "ph": "s", "id": 289984388, "pid": 5714, "tid": 5714, "ts": 6303771845986.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771854183.727, "dur": 221.986, + "args": { + "External id": 150526, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984393, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289984393, "pid": 0, "tid": 7, "ts": 6303771854183.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846006.287, "dur": 5.440, + "args": { + "External id": 150526, "cbid": 211, "correlation": 289984393 + } + }, + { + "ph": "s", "id": 289984393, "pid": 5714, "tid": 5714, "ts": 6303771846006.287, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771846064.737, "dur": 1.049, + "args": { + "External id": 150534, "cbid": 210, "correlation": 289984419 + } + }, + { + "ph": "f", "id": 289984419, "pid": 5714, "tid": 5714, "ts": 6303771846064.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771854406.417, "dur": 643.016, + "args": { + "External id": 150534, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984420, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984420, "pid": 0, "tid": 7, "ts": 6303771854406.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846068.917, "dur": 6.829, + "args": { + "External id": 150534, "cbid": 211, "correlation": 289984420 + } + }, + { + "ph": "s", "id": 289984420, "pid": 5714, "tid": 5714, "ts": 6303771846068.917, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771855050.105, "dur": 171.202, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984439, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289984439, "pid": 0, "tid": 7, "ts": 6303771855050.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846167.026, "dur": 7.960, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984439 + } + }, + { + "ph": "s", "id": 289984439, "pid": 5714, "tid": 5714, "ts": 6303771846167.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771855221.979, "dur": 4.064, + "args": { + "External id": 150544, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984456, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984456, "pid": 0, "tid": 7, "ts": 6303771855221.979, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846205.266, "dur": 6.390, + "args": { + "External id": 150544, "cbid": 211, "correlation": 289984456 + } + }, + { + "ph": "s", "id": 289984456, "pid": 5714, "tid": 5714, "ts": 6303771846205.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855226.715, "dur": 1.216, + "args": { + "External id": 150549, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984473, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984473, "pid": 0, "tid": 7, "ts": 6303771855226.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846232.826, "dur": 4.830, + "args": { + "External id": 150549, "cbid": 211, "correlation": 289984473 + } + }, + { + "ph": "s", "id": 289984473, "pid": 5714, "tid": 5714, "ts": 6303771846232.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855228.603, "dur": 1.024, + "args": { + "External id": 150551, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984483, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984483, "pid": 0, "tid": 7, "ts": 6303771855228.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846250.946, "dur": 4.740, + "args": { + "External id": 150551, "cbid": 211, "correlation": 289984483 + } + }, + { + "ph": "s", "id": 289984483, "pid": 5714, "tid": 5714, "ts": 6303771846250.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855230.299, "dur": 1.056, + "args": { + "External id": 150552, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984489, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984489, "pid": 0, "tid": 7, "ts": 6303771855230.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846262.036, "dur": 3.910, + "args": { + "External id": 150552, "cbid": 211, "correlation": 289984489 + } + }, + { + "ph": "s", "id": 289984489, "pid": 5714, "tid": 5714, "ts": 6303771846262.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855232.059, "dur": 1.056, + "args": { + "External id": 150553, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984499, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984499, "pid": 0, "tid": 7, "ts": 6303771855232.059, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846275.276, "dur": 3.830, + "args": { + "External id": 150553, "cbid": 211, "correlation": 289984499 + } + }, + { + "ph": "s", "id": 289984499, "pid": 5714, "tid": 5714, "ts": 6303771846275.276, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855233.819, "dur": 1.056, + "args": { + "External id": 150554, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984505, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984505, "pid": 0, "tid": 7, "ts": 6303771855233.819, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846284.216, "dur": 3.790, + "args": { + "External id": 150554, "cbid": 211, "correlation": 289984505 + } + }, + { + "ph": "s", "id": 289984505, "pid": 5714, "tid": 5714, "ts": 6303771846284.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771855235.611, "dur": 3.360, + "args": { + "External id": 150555, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984518, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984518, "pid": 0, "tid": 7, "ts": 6303771855235.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846311.196, "dur": 5.310, + "args": { + "External id": 150555, "cbid": 211, "correlation": 289984518 + } + }, + { + "ph": "s", "id": 289984518, "pid": 5714, "tid": 5714, "ts": 6303771846311.196, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855239.643, "dur": 1.088, + "args": { + "External id": 150558, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984524, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984524, "pid": 0, "tid": 7, "ts": 6303771855239.643, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846323.146, "dur": 4.010, + "args": { + "External id": 150558, "cbid": 211, "correlation": 289984524 + } + }, + { + "ph": "s", "id": 289984524, "pid": 5714, "tid": 5714, "ts": 6303771846323.146, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771855241.403, "dur": 1.024, + "args": { + "External id": 150559, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984530, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984530, "pid": 0, "tid": 7, "ts": 6303771855241.403, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846331.786, "dur": 3.570, + "args": { + "External id": 150559, "cbid": 211, "correlation": 289984530 + } + }, + { + "ph": "s", "id": 289984530, "pid": 5714, "tid": 5714, "ts": 6303771846331.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771855243.131, "dur": 235.971, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984544, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289984544, "pid": 0, "tid": 7, "ts": 6303771855243.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846411.016, "dur": 7.170, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984544 + } + }, + { + "ph": "s", "id": 289984544, "pid": 5714, "tid": 5714, "ts": 6303771846411.016, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771846447.136, "dur": 0.600, + "args": { + "External id": 150563, "cbid": 200, "correlation": 289984567 + } + }, + { + "ph": "f", "id": 289984567, "pid": 5714, "tid": 5714, "ts": 6303771846447.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771855480.062, "dur": 0.832, + "args": { + "External id": 150563, "device": 0, "context": 1, "stream": 7, "correlation": 289984570, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289984570, "pid": 0, "tid": 7, "ts": 6303771855480.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771846449.316, "dur": 6.140, + "args": { + "External id": 150563, "cbid": 51, "correlation": 289984570 + } + }, + { + "ph": "s", "id": 289984570, "pid": 5714, "tid": 5714, "ts": 6303771846449.316, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771855482.590, "dur": 691.784, + "args": { + "External id": 150563, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984571, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984571, "pid": 0, "tid": 7, "ts": 6303771855482.590, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846455.686, "dur": 5.060, + "args": { + "External id": 150563, "cbid": 307, "correlation": 289984571 + } + }, + { + "ph": "s", "id": 289984571, "pid": 5714, "tid": 5714, "ts": 6303771846455.686, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771856175.014, "dur": 2.944, + "args": { + "External id": 150566, "device": 0, "context": 1, "stream": 7, "correlation": 289984576, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289984576, "pid": 0, "tid": 7, "ts": 6303771856175.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771846483.506, "dur": 10.519, + "args": { + "External id": 150566, "cbid": 41, "correlation": 289984576 + } + }, + { + "ph": "s", "id": 289984576, "pid": 5714, "tid": 5714, "ts": 6303771846483.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771846529.125, "dur": 0.431, + "args": { + "External id": 150571, "cbid": 200, "correlation": 289984604 + } + }, + { + "ph": "f", "id": 289984604, "pid": 5714, "tid": 5714, "ts": 6303771846529.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771856178.662, "dur": 694.024, + "args": { + "External id": 150571, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289984607, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984607, "pid": 0, "tid": 7, "ts": 6303771856178.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846530.816, "dur": 6.369, + "args": { + "External id": 150571, "cbid": 307, "correlation": 289984607 + } + }, + { + "ph": "s", "id": 289984607, "pid": 5714, "tid": 5714, "ts": 6303771846530.816, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771856873.326, "dur": 220.802, + "args": { + "External id": 150572, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984612, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289984612, "pid": 0, "tid": 7, "ts": 6303771856873.326, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846549.636, "dur": 5.260, + "args": { + "External id": 150572, "cbid": 211, "correlation": 289984612 + } + }, + { + "ph": "s", "id": 289984612, "pid": 5714, "tid": 5714, "ts": 6303771846549.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771846593.495, "dur": 1.060, + "args": { + "External id": 150580, "cbid": 210, "correlation": 289984638 + } + }, + { + "ph": "f", "id": 289984638, "pid": 5714, "tid": 5714, "ts": 6303771846593.495, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771857094.800, "dur": 644.680, + "args": { + "External id": 150580, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984639, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984639, "pid": 0, "tid": 7, "ts": 6303771857094.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846597.715, "dur": 6.680, + "args": { + "External id": 150580, "cbid": 211, "correlation": 289984639 + } + }, + { + "ph": "s", "id": 289984639, "pid": 5714, "tid": 5714, "ts": 6303771846597.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771857740.248, "dur": 171.202, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984658, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289984658, "pid": 0, "tid": 7, "ts": 6303771857740.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846696.315, "dur": 7.880, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984658 + } + }, + { + "ph": "s", "id": 289984658, "pid": 5714, "tid": 5714, "ts": 6303771846696.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771857912.058, "dur": 4.096, + "args": { + "External id": 150590, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984675, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984675, "pid": 0, "tid": 7, "ts": 6303771857912.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846734.455, "dur": 6.540, + "args": { + "External id": 150590, "cbid": 211, "correlation": 289984675 + } + }, + { + "ph": "s", "id": 289984675, "pid": 5714, "tid": 5714, "ts": 6303771846734.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857916.826, "dur": 1.216, + "args": { + "External id": 150595, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984692, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984692, "pid": 0, "tid": 7, "ts": 6303771857916.826, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846761.705, "dur": 4.830, + "args": { + "External id": 150595, "cbid": 211, "correlation": 289984692 + } + }, + { + "ph": "s", "id": 289984692, "pid": 5714, "tid": 5714, "ts": 6303771846761.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857918.714, "dur": 1.024, + "args": { + "External id": 150597, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984702, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984702, "pid": 0, "tid": 7, "ts": 6303771857918.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846778.545, "dur": 4.320, + "args": { + "External id": 150597, "cbid": 211, "correlation": 289984702 + } + }, + { + "ph": "s", "id": 289984702, "pid": 5714, "tid": 5714, "ts": 6303771846778.545, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857920.410, "dur": 1.056, + "args": { + "External id": 150598, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984708, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984708, "pid": 0, "tid": 7, "ts": 6303771857920.410, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846789.125, "dur": 3.980, + "args": { + "External id": 150598, "cbid": 211, "correlation": 289984708 + } + }, + { + "ph": "s", "id": 289984708, "pid": 5714, "tid": 5714, "ts": 6303771846789.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857922.170, "dur": 1.056, + "args": { + "External id": 150599, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984718, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984718, "pid": 0, "tid": 7, "ts": 6303771857922.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846802.265, "dur": 4.010, + "args": { + "External id": 150599, "cbid": 211, "correlation": 289984718 + } + }, + { + "ph": "s", "id": 289984718, "pid": 5714, "tid": 5714, "ts": 6303771846802.265, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857923.930, "dur": 1.056, + "args": { + "External id": 150600, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984724, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984724, "pid": 0, "tid": 7, "ts": 6303771857923.930, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846811.215, "dur": 3.820, + "args": { + "External id": 150600, "cbid": 211, "correlation": 289984724 + } + }, + { + "ph": "s", "id": 289984724, "pid": 5714, "tid": 5714, "ts": 6303771846811.215, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771857925.722, "dur": 3.392, + "args": { + "External id": 150601, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984737, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984737, "pid": 0, "tid": 7, "ts": 6303771857925.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846828.955, "dur": 4.490, + "args": { + "External id": 150601, "cbid": 211, "correlation": 289984737 + } + }, + { + "ph": "s", "id": 289984737, "pid": 5714, "tid": 5714, "ts": 6303771846828.955, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857929.754, "dur": 1.088, + "args": { + "External id": 150604, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984743, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984743, "pid": 0, "tid": 7, "ts": 6303771857929.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846838.685, "dur": 3.900, + "args": { + "External id": 150604, "cbid": 211, "correlation": 289984743 + } + }, + { + "ph": "s", "id": 289984743, "pid": 5714, "tid": 5714, "ts": 6303771846838.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771857931.482, "dur": 1.024, + "args": { + "External id": 150605, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984749, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984749, "pid": 0, "tid": 7, "ts": 6303771857931.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846846.985, "dur": 3.530, + "args": { + "External id": 150605, "cbid": 211, "correlation": 289984749 + } + }, + { + "ph": "s", "id": 289984749, "pid": 5714, "tid": 5714, "ts": 6303771846846.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771857933.242, "dur": 234.499, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984763, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289984763, "pid": 0, "tid": 7, "ts": 6303771857933.242, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846924.845, "dur": 7.030, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984763 + } + }, + { + "ph": "s", "id": 289984763, "pid": 5714, "tid": 5714, "ts": 6303771846924.845, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771846961.624, "dur": 0.500, + "args": { + "External id": 150609, "cbid": 200, "correlation": 289984786 + } + }, + { + "ph": "f", "id": 289984786, "pid": 5714, "tid": 5714, "ts": 6303771846961.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771858168.669, "dur": 0.832, + "args": { + "External id": 150609, "device": 0, "context": 1, "stream": 7, "correlation": 289984789, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289984789, "pid": 0, "tid": 7, "ts": 6303771858168.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771846963.695, "dur": 6.020, + "args": { + "External id": 150609, "cbid": 51, "correlation": 289984789 + } + }, + { + "ph": "s", "id": 289984789, "pid": 5714, "tid": 5714, "ts": 6303771846963.695, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771858171.197, "dur": 689.544, + "args": { + "External id": 150609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984790, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984790, "pid": 0, "tid": 7, "ts": 6303771858171.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771846969.935, "dur": 5.100, + "args": { + "External id": 150609, "cbid": 307, "correlation": 289984790 + } + }, + { + "ph": "s", "id": 289984790, "pid": 5714, "tid": 5714, "ts": 6303771846969.935, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771858861.445, "dur": 2.976, + "args": { + "External id": 150612, "device": 0, "context": 1, "stream": 7, "correlation": 289984795, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289984795, "pid": 0, "tid": 7, "ts": 6303771858861.445, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771846995.584, "dur": 10.600, + "args": { + "External id": 150612, "cbid": 41, "correlation": 289984795 + } + }, + { + "ph": "s", "id": 289984795, "pid": 5714, "tid": 5714, "ts": 6303771846995.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771847043.034, "dur": 0.460, + "args": { + "External id": 150617, "cbid": 200, "correlation": 289984823 + } + }, + { + "ph": "f", "id": 289984823, "pid": 5714, "tid": 5714, "ts": 6303771847043.034, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771858865.253, "dur": 690.440, + "args": { + "External id": 150617, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289984826, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984826, "pid": 0, "tid": 7, "ts": 6303771858865.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847044.784, "dur": 6.170, + "args": { + "External id": 150617, "cbid": 307, "correlation": 289984826 + } + }, + { + "ph": "s", "id": 289984826, "pid": 5714, "tid": 5714, "ts": 6303771847044.784, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771859556.685, "dur": 221.443, + "args": { + "External id": 150618, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984831, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289984831, "pid": 0, "tid": 7, "ts": 6303771859556.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847063.394, "dur": 5.380, + "args": { + "External id": 150618, "cbid": 211, "correlation": 289984831 + } + }, + { + "ph": "s", "id": 289984831, "pid": 5714, "tid": 5714, "ts": 6303771847063.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771847106.814, "dur": 1.030, + "args": { + "External id": 150626, "cbid": 210, "correlation": 289984857 + } + }, + { + "ph": "f", "id": 289984857, "pid": 5714, "tid": 5714, "ts": 6303771847106.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771859778.800, "dur": 642.183, + "args": { + "External id": 150626, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984858, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289984858, "pid": 0, "tid": 7, "ts": 6303771859778.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847110.954, "dur": 6.570, + "args": { + "External id": 150626, "cbid": 211, "correlation": 289984858 + } + }, + { + "ph": "s", "id": 289984858, "pid": 5714, "tid": 5714, "ts": 6303771847110.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771860421.687, "dur": 171.234, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984877, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289984877, "pid": 0, "tid": 7, "ts": 6303771860421.687, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847207.854, "dur": 7.500, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984877 + } + }, + { + "ph": "s", "id": 289984877, "pid": 5714, "tid": 5714, "ts": 6303771847207.854, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771860593.625, "dur": 4.096, + "args": { + "External id": 150636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984894, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984894, "pid": 0, "tid": 7, "ts": 6303771860593.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847246.194, "dur": 6.460, + "args": { + "External id": 150636, "cbid": 211, "correlation": 289984894 + } + }, + { + "ph": "s", "id": 289984894, "pid": 5714, "tid": 5714, "ts": 6303771847246.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860598.361, "dur": 1.248, + "args": { + "External id": 150641, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984911, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984911, "pid": 0, "tid": 7, "ts": 6303771860598.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847274.884, "dur": 4.940, + "args": { + "External id": 150641, "cbid": 211, "correlation": 289984911 + } + }, + { + "ph": "s", "id": 289984911, "pid": 5714, "tid": 5714, "ts": 6303771847274.884, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860600.249, "dur": 1.056, + "args": { + "External id": 150643, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984921, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984921, "pid": 0, "tid": 7, "ts": 6303771860600.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847291.634, "dur": 4.230, + "args": { + "External id": 150643, "cbid": 211, "correlation": 289984921 + } + }, + { + "ph": "s", "id": 289984921, "pid": 5714, "tid": 5714, "ts": 6303771847291.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860601.977, "dur": 1.056, + "args": { + "External id": 150644, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984927, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984927, "pid": 0, "tid": 7, "ts": 6303771860601.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847309.954, "dur": 4.480, + "args": { + "External id": 150644, "cbid": 211, "correlation": 289984927 + } + }, + { + "ph": "s", "id": 289984927, "pid": 5714, "tid": 5714, "ts": 6303771847309.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860603.737, "dur": 1.056, + "args": { + "External id": 150645, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984937, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984937, "pid": 0, "tid": 7, "ts": 6303771860603.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847324.464, "dur": 3.890, + "args": { + "External id": 150645, "cbid": 211, "correlation": 289984937 + } + }, + { + "ph": "s", "id": 289984937, "pid": 5714, "tid": 5714, "ts": 6303771847324.464, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860605.497, "dur": 1.056, + "args": { + "External id": 150646, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984943, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984943, "pid": 0, "tid": 7, "ts": 6303771860605.497, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847334.664, "dur": 3.720, + "args": { + "External id": 150646, "cbid": 211, "correlation": 289984943 + } + }, + { + "ph": "s", "id": 289984943, "pid": 5714, "tid": 5714, "ts": 6303771847334.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771860607.289, "dur": 3.360, + "args": { + "External id": 150647, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984956, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984956, "pid": 0, "tid": 7, "ts": 6303771860607.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847353.984, "dur": 4.360, + "args": { + "External id": 150647, "cbid": 211, "correlation": 289984956 + } + }, + { + "ph": "s", "id": 289984956, "pid": 5714, "tid": 5714, "ts": 6303771847353.984, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860611.321, "dur": 1.088, + "args": { + "External id": 150650, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984962, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984962, "pid": 0, "tid": 7, "ts": 6303771860611.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847363.684, "dur": 3.570, + "args": { + "External id": 150650, "cbid": 211, "correlation": 289984962 + } + }, + { + "ph": "s", "id": 289984962, "pid": 5714, "tid": 5714, "ts": 6303771847363.684, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771860613.081, "dur": 1.024, + "args": { + "External id": 150651, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984968, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289984968, "pid": 0, "tid": 7, "ts": 6303771860613.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847371.534, "dur": 3.360, + "args": { + "External id": 150651, "cbid": 211, "correlation": 289984968 + } + }, + { + "ph": "s", "id": 289984968, "pid": 5714, "tid": 5714, "ts": 6303771847371.534, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771860614.809, "dur": 234.755, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289984982, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289984982, "pid": 0, "tid": 7, "ts": 6303771860614.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847452.243, "dur": 7.360, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289984982 + } + }, + { + "ph": "s", "id": 289984982, "pid": 5714, "tid": 5714, "ts": 6303771847452.243, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771847488.583, "dur": 0.490, + "args": { + "External id": 150655, "cbid": 200, "correlation": 289985005 + } + }, + { + "ph": "f", "id": 289985005, "pid": 5714, "tid": 5714, "ts": 6303771847488.583, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771860850.428, "dur": 0.832, + "args": { + "External id": 150655, "device": 0, "context": 1, "stream": 7, "correlation": 289985008, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289985008, "pid": 0, "tid": 7, "ts": 6303771860850.428, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771847490.643, "dur": 5.860, + "args": { + "External id": 150655, "cbid": 51, "correlation": 289985008 + } + }, + { + "ph": "s", "id": 289985008, "pid": 5714, "tid": 5714, "ts": 6303771847490.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771860852.956, "dur": 689.640, + "args": { + "External id": 150655, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985009, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985009, "pid": 0, "tid": 7, "ts": 6303771860852.956, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847496.723, "dur": 5.370, + "args": { + "External id": 150655, "cbid": 307, "correlation": 289985009 + } + }, + { + "ph": "s", "id": 289985009, "pid": 5714, "tid": 5714, "ts": 6303771847496.723, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771861543.300, "dur": 3.008, + "args": { + "External id": 150658, "device": 0, "context": 1, "stream": 7, "correlation": 289985014, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 289985014, "pid": 0, "tid": 7, "ts": 6303771861543.300, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771847523.063, "dur": 10.790, + "args": { + "External id": 150658, "cbid": 41, "correlation": 289985014 + } + }, + { + "ph": "s", "id": 289985014, "pid": 5714, "tid": 5714, "ts": 6303771847523.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771847570.253, "dur": 0.440, + "args": { + "External id": 150663, "cbid": 200, "correlation": 289985042 + } + }, + { + "ph": "f", "id": 289985042, "pid": 5714, "tid": 5714, "ts": 6303771847570.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771861546.948, "dur": 688.136, + "args": { + "External id": 150663, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289985045, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985045, "pid": 0, "tid": 7, "ts": 6303771861546.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847572.013, "dur": 6.030, + "args": { + "External id": 150663, "cbid": 307, "correlation": 289985045 + } + }, + { + "ph": "s", "id": 289985045, "pid": 5714, "tid": 5714, "ts": 6303771847572.013, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771862235.852, "dur": 221.282, + "args": { + "External id": 150664, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985050, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985050, "pid": 0, "tid": 7, "ts": 6303771862235.852, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847590.393, "dur": 5.490, + "args": { + "External id": 150664, "cbid": 211, "correlation": 289985050 + } + }, + { + "ph": "s", "id": 289985050, "pid": 5714, "tid": 5714, "ts": 6303771847590.393, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771847633.963, "dur": 1.070, + "args": { + "External id": 150672, "cbid": 210, "correlation": 289985076 + } + }, + { + "ph": "f", "id": 289985076, "pid": 5714, "tid": 5714, "ts": 6303771847633.963, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771862457.838, "dur": 640.584, + "args": { + "External id": 150672, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985077, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985077, "pid": 0, "tid": 7, "ts": 6303771862457.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847638.133, "dur": 6.620, + "args": { + "External id": 150672, "cbid": 211, "correlation": 289985077 + } + }, + { + "ph": "s", "id": 289985077, "pid": 5714, "tid": 5714, "ts": 6303771847638.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771863099.062, "dur": 170.722, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985096, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289985096, "pid": 0, "tid": 7, "ts": 6303771863099.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847740.883, "dur": 7.600, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289985096 + } + }, + { + "ph": "s", "id": 289985096, "pid": 5714, "tid": 5714, "ts": 6303771847740.883, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771863270.424, "dur": 4.224, + "args": { + "External id": 150682, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985113, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985113, "pid": 0, "tid": 7, "ts": 6303771863270.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847778.593, "dur": 6.450, + "args": { + "External id": 150682, "cbid": 211, "correlation": 289985113 + } + }, + { + "ph": "s", "id": 289985113, "pid": 5714, "tid": 5714, "ts": 6303771847778.593, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863275.320, "dur": 1.248, + "args": { + "External id": 150687, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985130, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985130, "pid": 0, "tid": 7, "ts": 6303771863275.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847807.373, "dur": 4.910, + "args": { + "External id": 150687, "cbid": 211, "correlation": 289985130 + } + }, + { + "ph": "s", "id": 289985130, "pid": 5714, "tid": 5714, "ts": 6303771847807.373, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863277.208, "dur": 1.024, + "args": { + "External id": 150689, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985140, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985140, "pid": 0, "tid": 7, "ts": 6303771863277.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847824.593, "dur": 4.570, + "args": { + "External id": 150689, "cbid": 211, "correlation": 289985140 + } + }, + { + "ph": "s", "id": 289985140, "pid": 5714, "tid": 5714, "ts": 6303771847824.593, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863278.904, "dur": 1.088, + "args": { + "External id": 150690, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985146, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985146, "pid": 0, "tid": 7, "ts": 6303771863278.904, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847835.363, "dur": 3.970, + "args": { + "External id": 150690, "cbid": 211, "correlation": 289985146 + } + }, + { + "ph": "s", "id": 289985146, "pid": 5714, "tid": 5714, "ts": 6303771847835.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863280.664, "dur": 1.056, + "args": { + "External id": 150691, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985156, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985156, "pid": 0, "tid": 7, "ts": 6303771863280.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847848.642, "dur": 3.940, + "args": { + "External id": 150691, "cbid": 211, "correlation": 289985156 + } + }, + { + "ph": "s", "id": 289985156, "pid": 5714, "tid": 5714, "ts": 6303771847848.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863282.424, "dur": 1.056, + "args": { + "External id": 150692, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985162, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985162, "pid": 0, "tid": 7, "ts": 6303771863282.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847857.493, "dur": 3.749, + "args": { + "External id": 150692, "cbid": 211, "correlation": 289985162 + } + }, + { + "ph": "s", "id": 289985162, "pid": 5714, "tid": 5714, "ts": 6303771847857.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771863284.216, "dur": 3.392, + "args": { + "External id": 150693, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985175, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985175, "pid": 0, "tid": 7, "ts": 6303771863284.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847874.662, "dur": 4.500, + "args": { + "External id": 150693, "cbid": 211, "correlation": 289985175 + } + }, + { + "ph": "s", "id": 289985175, "pid": 5714, "tid": 5714, "ts": 6303771847874.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863288.248, "dur": 1.088, + "args": { + "External id": 150696, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985181, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985181, "pid": 0, "tid": 7, "ts": 6303771863288.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847886.022, "dur": 3.700, + "args": { + "External id": 150696, "cbid": 211, "correlation": 289985181 + } + }, + { + "ph": "s", "id": 289985181, "pid": 5714, "tid": 5714, "ts": 6303771847886.022, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771863289.976, "dur": 1.024, + "args": { + "External id": 150697, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985187, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985187, "pid": 0, "tid": 7, "ts": 6303771863289.976, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847894.213, "dur": 3.489, + "args": { + "External id": 150697, "cbid": 211, "correlation": 289985187 + } + }, + { + "ph": "s", "id": 289985187, "pid": 5714, "tid": 5714, "ts": 6303771847894.213, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771863291.736, "dur": 234.691, + "args": { + "External id": 150313, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985201, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289985201, "pid": 0, "tid": 7, "ts": 6303771863291.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771847971.972, "dur": 7.190, + "args": { + "External id": 150313, "cbid": 307, "correlation": 289985201 + } + }, + { + "ph": "s", "id": 289985201, "pid": 5714, "tid": 5714, "ts": 6303771847971.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771848018.272, "dur": 0.520, + "args": { + "External id": 150701, "cbid": 200, "correlation": 289985224 + } + }, + { + "ph": "f", "id": 289985224, "pid": 5714, "tid": 5714, "ts": 6303771848018.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771863527.323, "dur": 0.800, + "args": { + "External id": 150701, "device": 0, "context": 1, "stream": 7, "correlation": 289985227, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289985227, "pid": 0, "tid": 7, "ts": 6303771863527.323, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771848020.392, "dur": 6.080, + "args": { + "External id": 150701, "cbid": 51, "correlation": 289985227 + } + }, + { + "ph": "s", "id": 289985227, "pid": 5714, "tid": 5714, "ts": 6303771848020.392, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771863529.851, "dur": 689.384, + "args": { + "External id": 150701, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985228, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985228, "pid": 0, "tid": 7, "ts": 6303771863529.851, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848026.682, "dur": 5.040, + "args": { + "External id": 150701, "cbid": 307, "correlation": 289985228 + } + }, + { + "ph": "s", "id": 289985228, "pid": 5714, "tid": 5714, "ts": 6303771848026.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771864219.939, "dur": 3.040, + "args": { + "External id": 150704, "device": 0, "context": 1, "stream": 7, "correlation": 289985233, "bytes": 3145728, "memory bandwidth (GB/s)": 1034.778947368421 + } + }, + { + "ph": "f", "id": 289985233, "pid": 0, "tid": 7, "ts": 6303771864219.939, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771848053.612, "dur": 11.600, + "args": { + "External id": 150704, "cbid": 41, "correlation": 289985233 + } + }, + { + "ph": "s", "id": 289985233, "pid": 5714, "tid": 5714, "ts": 6303771848053.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771848100.122, "dur": 0.430, + "args": { + "External id": 150709, "cbid": 200, "correlation": 289985261 + } + }, + { + "ph": "f", "id": 289985261, "pid": 5714, "tid": 5714, "ts": 6303771848100.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771864223.715, "dur": 692.936, + "args": { + "External id": 150709, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289985264, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985264, "pid": 0, "tid": 7, "ts": 6303771864223.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848101.882, "dur": 6.110, + "args": { + "External id": 150709, "cbid": 307, "correlation": 289985264 + } + }, + { + "ph": "s", "id": 289985264, "pid": 5714, "tid": 5714, "ts": 6303771848101.882, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771864917.323, "dur": 222.403, + "args": { + "External id": 150710, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985269, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985269, "pid": 0, "tid": 7, "ts": 6303771864917.323, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848121.512, "dur": 5.500, + "args": { + "External id": 150710, "cbid": 211, "correlation": 289985269 + } + }, + { + "ph": "s", "id": 289985269, "pid": 5714, "tid": 5714, "ts": 6303771848121.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771865140.366, "dur": 5.056, + "args": { + "External id": 150712, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985282, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985282, "pid": 0, "tid": 7, "ts": 6303771865140.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848146.432, "dur": 5.750, + "args": { + "External id": 150712, "cbid": 211, "correlation": 289985282 + } + }, + { + "ph": "s", "id": 289985282, "pid": 5714, "tid": 5714, "ts": 6303771848146.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865146.126, "dur": 158.018, + "args": { + "External id": 150717, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985295, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985295, "pid": 0, "tid": 7, "ts": 6303771865146.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848172.392, "dur": 5.460, + "args": { + "External id": 150717, "cbid": 211, "correlation": 289985295 + } + }, + { + "ph": "s", "id": 289985295, "pid": 5714, "tid": 5714, "ts": 6303771848172.392, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865304.752, "dur": 1.376, + "args": { + "External id": 150722, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985303, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985303, "pid": 0, "tid": 7, "ts": 6303771865304.752, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848239.152, "dur": 7.130, + "args": { + "External id": 150722, "cbid": 211, "correlation": 289985303 + } + }, + { + "ph": "s", "id": 289985303, "pid": 5714, "tid": 5714, "ts": 6303771848239.152, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865306.768, "dur": 1.344, + "args": { + "External id": 150723, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985309, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985309, "pid": 0, "tid": 7, "ts": 6303771865306.768, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848257.122, "dur": 4.350, + "args": { + "External id": 150723, "cbid": 211, "correlation": 289985309 + } + }, + { + "ph": "s", "id": 289985309, "pid": 5714, "tid": 5714, "ts": 6303771848257.122, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771865308.848, "dur": 2.272, + "args": { + "External id": 150742, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985329, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.500000, "warps per SM": 2.000000, "grid": [64, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 4 + } + }, + { + "ph": "f", "id": 289985329, "pid": 0, "tid": 7, "ts": 6303771865308.848, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848341.872, "dur": 8.480, + "args": { + "External id": 150742, "cbid": 211, "correlation": 289985329 + } + }, + { + "ph": "s", "id": 289985329, "pid": 5714, "tid": 5714, "ts": 6303771848341.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771865311.824, "dur": 59.520, + "args": { + "External id": 150750, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985347, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985347, "pid": 0, "tid": 7, "ts": 6303771865311.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848433.801, "dur": 8.370, + "args": { + "External id": 150750, "cbid": 211, "correlation": 289985347 + } + }, + { + "ph": "s", "id": 289985347, "pid": 5714, "tid": 5714, "ts": 6303771848433.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865372.048, "dur": 14.976, + "args": { + "External id": 150755, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985364, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985364, "pid": 0, "tid": 7, "ts": 6303771865372.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848472.681, "dur": 5.970, + "args": { + "External id": 150755, "cbid": 211, "correlation": 289985364 + } + }, + { + "ph": "s", "id": 289985364, "pid": 5714, "tid": 5714, "ts": 6303771848472.681, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865387.664, "dur": 101.346, + "args": { + "External id": 150760, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985380, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985380, "pid": 0, "tid": 7, "ts": 6303771865387.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848493.881, "dur": 4.660, + "args": { + "External id": 150760, "cbid": 211, "correlation": 289985380 + } + }, + { + "ph": "s", "id": 289985380, "pid": 5714, "tid": 5714, "ts": 6303771848493.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865489.682, "dur": 1.312, + "args": { + "External id": 150764, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289985396, "pid": 0, "tid": 7, "ts": 6303771865489.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848516.571, "dur": 4.560, + "args": { + "External id": 150764, "cbid": 211, "correlation": 289985396 + } + }, + { + "ph": "s", "id": 289985396, "pid": 5714, "tid": 5714, "ts": 6303771848516.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771865491.730, "dur": 1.696, + "args": { + "External id": 150765, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985408, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289985408, "pid": 0, "tid": 7, "ts": 6303771865491.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848537.731, "dur": 4.680, + "args": { + "External id": 150765, "cbid": 211, "correlation": 289985408 + } + }, + { + "ph": "s", "id": 289985408, "pid": 5714, "tid": 5714, "ts": 6303771848537.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#4}::operator()() const::{lambda(long)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771865494.066, "dur": 2.112, + "args": { + "External id": 150772, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985426, "registers per thread": 34, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289985426, "pid": 0, "tid": 7, "ts": 6303771865494.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848568.041, "dur": 5.750, + "args": { + "External id": 150772, "cbid": 211, "correlation": 289985426 + } + }, + { + "ph": "s", "id": 289985426, "pid": 5714, "tid": 5714, "ts": 6303771848568.041, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(long, long)#1}>, unsigned int, long, 4>)", "pid": 0, "tid": 7, + "ts": 6303771865496.850, "dur": 3.936, + "args": { + "External id": 150767, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985435, "registers per thread": 36, "shared memory": 4112, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985435, "pid": 0, "tid": 7, "ts": 6303771865496.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771848579.741, "dur": 4.060, + "args": { + "External id": 150767, "cbid": 211, "correlation": 289985435 + } + }, + { + "ph": "s", "id": 289985435, "pid": 5714, "tid": 5714, "ts": 6303771848579.741, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771865502.482, "dur": 0.992, + "args": { + "External id": 150774, "device": 0, "context": 1, "stream": 7, "correlation": 289985441, "bytes": 8, "memory bandwidth (GB/s)": 0.008064516129032258 + } + }, + { + "ph": "f", "id": 289985441, "pid": 0, "tid": 7, "ts": 6303771865502.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771848596.141, "dur": 9.230, + "args": { + "External id": 150774, "cbid": 41, "correlation": 289985441 + } + }, + { + "ph": "s", "id": 289985441, "pid": 5714, "tid": 5714, "ts": 6303771848596.141, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303771848605.641, "dur": 16901.582, + "args": { + "External id": 150774, "cbid": 131, "correlation": 289985442 + } + }, + { + "ph": "s", "id": 289985442, "pid": 5714, "tid": 5714, "ts": 6303771848605.641, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771865572.223, "dur": 1.700, + "args": { + "External id": 150782, "cbid": 210, "correlation": 289985467 + } + }, + { + "ph": "f", "id": 289985467, "pid": 5714, "tid": 5714, "ts": 6303771865572.223, + "cat": "ac2g", "name": "ac2g", 
"bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771865589.203, "dur": 642.024, + "args": { + "External id": 150782, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985468, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985468, "pid": 0, "tid": 7, "ts": 6303771865589.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865577.543, "dur": 11.310, + "args": { + "External id": 150782, "cbid": 211, "correlation": 289985468 + } + }, + { + "ph": "s", "id": 289985468, "pid": 5714, "tid": 5714, "ts": 6303771865577.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771866231.835, "dur": 171.137, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985487, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289985487, "pid": 0, "tid": 7, "ts": 6303771866231.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865710.193, "dur": 9.340, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289985487 + } + }, + { + "ph": "s", "id": 289985487, "pid": 5714, "tid": 5714, "ts": 6303771865710.193, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771866403.708, "dur": 4.064, + "args": { + "External id": 150792, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985504, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985504, "pid": 0, "tid": 7, "ts": 6303771866403.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865755.533, "dur": 7.510, + "args": { + "External id": 150792, "cbid": 211, "correlation": 289985504 + } + }, + { + "ph": "s", "id": 289985504, "pid": 5714, "tid": 5714, "ts": 6303771865755.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866408.476, "dur": 1.184, + "args": { + "External id": 150797, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985521, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985521, "pid": 0, "tid": 7, "ts": 6303771866408.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865787.123, "dur": 5.320, + "args": { + "External id": 150797, "cbid": 211, "correlation": 289985521 + } + }, + { + "ph": "s", "id": 289985521, "pid": 5714, "tid": 5714, "ts": 6303771865787.123, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866410.332, "dur": 1.024, + "args": { + "External id": 150799, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985531, "pid": 0, "tid": 7, "ts": 6303771866410.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865807.063, "dur": 5.040, + "args": { + "External id": 150799, "cbid": 211, "correlation": 289985531 + } + }, + { + "ph": "s", "id": 289985531, "pid": 5714, "tid": 5714, "ts": 6303771865807.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866412.028, "dur": 1.088, + "args": { + "External id": 150800, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985537, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985537, "pid": 0, "tid": 7, "ts": 6303771866412.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865819.463, "dur": 4.560, + "args": { + "External id": 150800, "cbid": 211, "correlation": 289985537 + } + }, + { + "ph": "s", "id": 289985537, "pid": 5714, "tid": 5714, "ts": 6303771865819.463, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866413.788, "dur": 1.056, + "args": { + "External id": 150801, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985547, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985547, "pid": 0, "tid": 7, "ts": 6303771866413.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865834.653, "dur": 4.640, + "args": { + "External id": 150801, "cbid": 211, "correlation": 289985547 + } + }, + { + "ph": "s", "id": 289985547, "pid": 5714, "tid": 5714, "ts": 6303771865834.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866415.516, "dur": 1.056, + "args": { + "External id": 150802, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985553, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985553, "pid": 0, "tid": 7, "ts": 6303771866415.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865845.242, "dur": 4.431, + "args": { + "External id": 150802, "cbid": 211, "correlation": 289985553 + } + }, + { + "ph": "s", "id": 289985553, "pid": 5714, "tid": 5714, "ts": 6303771865845.242, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771866417.180, "dur": 3.425, + "args": { + "External id": 150803, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985566, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985566, "pid": 0, "tid": 7, "ts": 6303771866417.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865867.553, "dur": 5.060, + "args": { + "External id": 150803, "cbid": 211, "correlation": 289985566 + } + }, + { + "ph": "s", "id": 289985566, "pid": 5714, "tid": 5714, "ts": 6303771865867.553, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866421.213, "dur": 1.120, + "args": { + "External id": 150806, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985572, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985572, "pid": 0, "tid": 7, "ts": 6303771866421.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865878.862, "dur": 4.191, + "args": { + "External id": 150806, "cbid": 211, "correlation": 289985572 + } + }, + { + "ph": "s", "id": 289985572, "pid": 5714, "tid": 5714, "ts": 6303771865878.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771866422.973, "dur": 1.024, + "args": { + "External id": 150807, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985578, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985578, "pid": 0, "tid": 7, "ts": 6303771866422.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865888.333, "dur": 3.940, + "args": { + "External id": 150807, "cbid": 211, "correlation": 289985578 + } + }, + { + "ph": "s", "id": 289985578, "pid": 5714, "tid": 5714, "ts": 6303771865888.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771866424.733, "dur": 234.242, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985592, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289985592, "pid": 0, "tid": 7, "ts": 6303771866424.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771865977.072, "dur": 8.080, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289985592 + } + }, + { + "ph": "s", "id": 289985592, "pid": 5714, "tid": 5714, "ts": 6303771865977.072, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771866018.532, "dur": 0.590, + "args": { + "External id": 150811, "cbid": 200, "correlation": 289985615 + } + }, + { + "ph": "f", "id": 289985615, "pid": 5714, "tid": 5714, "ts": 6303771866018.532, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771866659.807, "dur": 0.832, + "args": { + "External id": 150811, "device": 0, "context": 1, "stream": 7, "correlation": 289985618, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289985618, "pid": 0, "tid": 7, "ts": 6303771866659.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771866020.942, "dur": 7.000, + "args": { + "External id": 150811, "cbid": 51, "correlation": 289985618 + } + }, + { + "ph": "s", "id": 289985618, "pid": 5714, "tid": 5714, "ts": 6303771866020.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771866661.951, "dur": 694.441, + "args": { + "External id": 150811, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985619, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985619, "pid": 0, "tid": 7, "ts": 6303771866661.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866028.182, "dur": 5.770, + "args": { + "External id": 150811, "cbid": 307, "correlation": 289985619 + } + }, + { + "ph": "s", "id": 289985619, "pid": 5714, "tid": 5714, "ts": 6303771866028.182, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771867357.128, "dur": 3.008, + "args": { + "External id": 150814, "device": 0, "context": 1, "stream": 7, "correlation": 289985624, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.787234042553 + } + }, + { + "ph": "f", "id": 289985624, "pid": 0, "tid": 7, "ts": 6303771867357.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771866060.372, "dur": 14.690, + "args": { + "External id": 150814, "cbid": 41, "correlation": 289985624 + } + }, + { + "ph": "s", "id": 289985624, "pid": 5714, "tid": 5714, "ts": 6303771866060.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771866125.502, "dur": 0.520, + "args": { + "External id": 150819, "cbid": 200, "correlation": 289985652 + } + }, + { + "ph": "f", "id": 289985652, "pid": 5714, "tid": 5714, "ts": 6303771866125.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771867360.744, "dur": 697.672, + "args": { + "External id": 150819, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289985655, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985655, "pid": 0, "tid": 7, "ts": 6303771867360.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866127.492, "dur": 7.150, + "args": { + "External id": 150819, "cbid": 307, "correlation": 289985655 + } + }, + { + "ph": "s", "id": 289985655, "pid": 5714, "tid": 5714, "ts": 6303771866127.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771868059.120, "dur": 221.218, + "args": { + "External id": 150820, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985660, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985660, "pid": 0, "tid": 7, "ts": 6303771868059.120, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866148.822, "dur": 6.140, + "args": { + "External id": 150820, "cbid": 211, "correlation": 289985660 + } + }, + { + "ph": "s", "id": 289985660, "pid": 5714, "tid": 5714, "ts": 6303771866148.822, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771866200.442, "dur": 1.190, + "args": { + "External id": 150828, "cbid": 210, "correlation": 289985686 + } + }, + { + "ph": "f", "id": 289985686, "pid": 5714, "tid": 5714, "ts": 6303771866200.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771868280.978, "dur": 644.232, + "args": { + "External id": 150828, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985687, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985687, "pid": 0, "tid": 7, "ts": 6303771868280.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866205.202, "dur": 7.670, + "args": { + "External id": 150828, "cbid": 211, "correlation": 289985687 + } + }, + { + "ph": "s", "id": 289985687, "pid": 5714, "tid": 5714, "ts": 6303771866205.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771868925.850, "dur": 171.394, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985706, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289985706, "pid": 0, "tid": 7, "ts": 6303771868925.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866327.612, "dur": 9.860, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289985706 + } + }, + { + "ph": "s", "id": 289985706, "pid": 5714, "tid": 5714, "ts": 6303771866327.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771869097.884, "dur": 4.064, + "args": { + "External id": 150838, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985723, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985723, "pid": 0, "tid": 7, "ts": 6303771869097.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866374.181, "dur": 7.451, + "args": { + "External id": 150838, "cbid": 211, "correlation": 289985723 + } + }, + { + "ph": "s", "id": 289985723, "pid": 5714, "tid": 5714, "ts": 6303771866374.181, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869102.620, "dur": 1.248, + "args": { + "External id": 150843, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985740, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985740, "pid": 0, "tid": 7, "ts": 6303771869102.620, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866404.541, "dur": 5.620, + "args": { + "External id": 150843, "cbid": 211, "correlation": 289985740 + } + }, + { + "ph": "s", "id": 289985740, "pid": 5714, "tid": 5714, "ts": 6303771866404.541, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869104.540, "dur": 1.024, + "args": { + "External id": 150845, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985750, "pid": 0, "tid": 7, "ts": 6303771869104.540, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866424.941, "dur": 5.430, + "args": { + "External id": 150845, "cbid": 211, "correlation": 289985750 + } + }, + { + "ph": "s", "id": 289985750, "pid": 5714, "tid": 5714, "ts": 6303771866424.941, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869106.236, "dur": 1.056, + "args": { + "External id": 150846, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985756, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985756, "pid": 0, "tid": 7, "ts": 6303771869106.236, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866437.531, "dur": 4.540, + "args": { + "External id": 150846, "cbid": 211, "correlation": 289985756 + } + }, + { + "ph": "s", "id": 289985756, "pid": 5714, "tid": 5714, "ts": 6303771866437.531, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869107.964, "dur": 1.056, + "args": { + "External id": 150847, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985766, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985766, "pid": 0, "tid": 7, "ts": 6303771869107.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866453.271, "dur": 4.460, + "args": { + "External id": 150847, "cbid": 211, "correlation": 289985766 + } + }, + { + "ph": "s", "id": 289985766, "pid": 5714, "tid": 5714, "ts": 6303771866453.271, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869109.692, "dur": 1.056, + "args": { + "External id": 150848, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985772, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985772, "pid": 0, "tid": 7, "ts": 6303771869109.692, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866464.621, "dur": 4.210, + "args": { + "External id": 150848, "cbid": 211, "correlation": 289985772 + } + }, + { + "ph": "s", "id": 289985772, "pid": 5714, "tid": 5714, "ts": 6303771866464.621, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771869111.516, "dur": 3.392, + "args": { + "External id": 150849, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985785, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985785, "pid": 0, "tid": 7, "ts": 6303771869111.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866483.821, "dur": 5.090, + "args": { + "External id": 150849, "cbid": 211, "correlation": 289985785 + } + }, + { + "ph": "s", "id": 289985785, "pid": 5714, "tid": 5714, "ts": 6303771866483.821, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869115.548, "dur": 1.088, + "args": { + "External id": 150852, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985791, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985791, "pid": 0, "tid": 7, "ts": 6303771869115.548, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866495.071, "dur": 4.850, + "args": { + "External id": 150852, "cbid": 211, "correlation": 289985791 + } + }, + { + "ph": "s", "id": 289985791, "pid": 5714, "tid": 5714, "ts": 6303771866495.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771869117.308, "dur": 1.024, + "args": { + "External id": 150853, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985797, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985797, "pid": 0, "tid": 7, "ts": 6303771869117.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866505.001, "dur": 4.080, + "args": { + "External id": 150853, "cbid": 211, "correlation": 289985797 + } + }, + { + "ph": "s", "id": 289985797, "pid": 5714, "tid": 5714, "ts": 6303771866505.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771869119.036, "dur": 233.731, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985811, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289985811, "pid": 0, "tid": 7, "ts": 6303771869119.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866596.821, "dur": 8.310, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289985811 + } + }, + { + "ph": "s", "id": 289985811, "pid": 5714, "tid": 5714, "ts": 6303771866596.821, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771866638.151, "dur": 0.590, + "args": { + "External id": 150857, "cbid": 200, "correlation": 289985834 + } + }, + { + "ph": "f", "id": 289985834, "pid": 5714, "tid": 5714, "ts": 6303771866638.151, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771869353.663, "dur": 0.800, + "args": { + "External id": 150857, "device": 0, "context": 1, "stream": 7, "correlation": 289985837, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289985837, "pid": 0, "tid": 7, "ts": 6303771869353.663, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771866640.581, "dur": 6.830, + "args": { + "External id": 150857, "cbid": 51, "correlation": 289985837 + } + }, + { + "ph": "s", "id": 289985837, "pid": 5714, "tid": 5714, "ts": 6303771866640.581, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771869356.191, "dur": 691.815, + "args": { + "External id": 150857, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985838, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985838, "pid": 0, "tid": 7, "ts": 6303771869356.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866647.651, "dur": 5.750, + "args": { + "External id": 150857, "cbid": 307, "correlation": 289985838 + } + }, + { + "ph": "s", "id": 289985838, "pid": 5714, "tid": 5714, "ts": 6303771866647.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771870048.742, "dur": 3.009, + "args": { + "External id": 150860, "device": 0, "context": 1, "stream": 7, "correlation": 289985843, "bytes": 3145728, "memory bandwidth (GB/s)": 1045.4396809571285 + } + }, + { + "ph": "f", "id": 289985843, "pid": 0, "tid": 7, "ts": 6303771870048.742, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771866678.461, "dur": 12.510, + "args": { + "External id": 150860, "cbid": 41, "correlation": 289985843 + } + }, + { + "ph": "s", "id": 289985843, "pid": 5714, "tid": 5714, "ts": 6303771866678.461, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771866731.131, "dur": 0.480, + "args": { + "External id": 150865, "cbid": 200, "correlation": 289985871 + } + }, + { + "ph": "f", "id": 289985871, "pid": 5714, "tid": 5714, "ts": 6303771866731.131, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771870052.391, "dur": 690.664, + "args": { + "External id": 150865, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289985874, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985874, "pid": 0, "tid": 7, "ts": 6303771870052.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866733.071, "dur": 6.909, + "args": { + "External id": 150865, "cbid": 307, "correlation": 289985874 + } + }, + { + "ph": "s", "id": 289985874, "pid": 5714, "tid": 5714, "ts": 6303771866733.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771870743.791, "dur": 221.154, + "args": { + "External id": 150866, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985879, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289985879, "pid": 0, "tid": 7, "ts": 6303771870743.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866754.171, "dur": 6.009, + "args": { + "External id": 150866, "cbid": 211, "correlation": 289985879 + } + }, + { + "ph": "s", "id": 289985879, "pid": 5714, "tid": 5714, "ts": 6303771866754.171, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771866803.340, "dur": 1.191, + "args": { + "External id": 150874, "cbid": 210, "correlation": 289985905 + } + }, + { + "ph": "f", "id": 289985905, "pid": 5714, "tid": 5714, "ts": 6303771866803.340, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771870965.649, "dur": 643.272, + "args": { + "External id": 150874, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985906, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289985906, "pid": 0, "tid": 7, "ts": 6303771870965.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866808.071, "dur": 7.480, + "args": { + "External id": 150874, "cbid": 211, "correlation": 289985906 + } + }, + { + "ph": "s", "id": 289985906, "pid": 5714, "tid": 5714, "ts": 6303771866808.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771871609.561, "dur": 170.914, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985925, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289985925, "pid": 0, "tid": 7, "ts": 6303771871609.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866921.150, "dur": 9.190, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289985925 + } + }, + { + "ph": "s", "id": 289985925, "pid": 5714, "tid": 5714, "ts": 6303771866921.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771871781.179, "dur": 4.096, + "args": { + "External id": 150884, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985942, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985942, "pid": 0, "tid": 7, "ts": 6303771871781.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866967.110, "dur": 7.480, + "args": { + "External id": 150884, "cbid": 211, "correlation": 289985942 + } + }, + { + "ph": "s", "id": 289985942, "pid": 5714, "tid": 5714, "ts": 6303771866967.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871785.915, "dur": 1.216, + "args": { + "External id": 150889, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985959, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985959, "pid": 0, "tid": 7, "ts": 6303771871785.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771866997.260, "dur": 5.660, + "args": { + "External id": 150889, "cbid": 211, "correlation": 289985959 + } + }, + { + "ph": "s", "id": 289985959, "pid": 5714, "tid": 5714, "ts": 6303771866997.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871787.803, "dur": 1.024, + "args": { + "External id": 150891, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985969, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985969, "pid": 0, "tid": 7, "ts": 6303771871787.803, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867017.670, "dur": 5.010, + "args": { + "External id": 150891, "cbid": 211, "correlation": 289985969 + } + }, + { + "ph": "s", "id": 289985969, "pid": 5714, "tid": 5714, "ts": 6303771867017.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871789.499, "dur": 1.088, + "args": { + "External id": 150892, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985975, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985975, "pid": 0, "tid": 7, "ts": 6303771871789.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867029.820, "dur": 4.360, + "args": { + "External id": 150892, "cbid": 211, "correlation": 289985975 + } + }, + { + "ph": "s", "id": 289985975, "pid": 5714, "tid": 5714, "ts": 6303771867029.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871791.259, "dur": 1.056, + "args": { + "External id": 150893, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985985, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985985, "pid": 0, "tid": 7, "ts": 6303771871791.259, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867046.370, "dur": 4.330, + "args": { + "External id": 150893, "cbid": 211, "correlation": 289985985 + } + }, + { + "ph": "s", "id": 289985985, "pid": 5714, "tid": 5714, "ts": 6303771867046.370, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871793.019, "dur": 1.024, + "args": { + "External id": 150894, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289985991, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289985991, "pid": 0, "tid": 7, "ts": 6303771871793.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867056.590, "dur": 4.240, + "args": { + "External id": 150894, "cbid": 211, "correlation": 289985991 + } + }, + { + "ph": "s", "id": 289985991, "pid": 5714, "tid": 5714, "ts": 6303771867056.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771871794.651, "dur": 3.424, + "args": { + "External id": 150895, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986004, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986004, "pid": 0, "tid": 7, "ts": 6303771871794.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867076.080, "dur": 4.980, + "args": { + "External id": 150895, "cbid": 211, "correlation": 289986004 + } + }, + { + "ph": "s", "id": 289986004, "pid": 5714, "tid": 5714, "ts": 6303771867076.080, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871798.715, "dur": 1.120, + "args": { + "External id": 150898, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986010, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986010, "pid": 0, "tid": 7, "ts": 6303771871798.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867087.210, "dur": 4.120, + "args": { + "External id": 150898, "cbid": 211, "correlation": 289986010 + } + }, + { + "ph": "s", "id": 289986010, "pid": 5714, "tid": 5714, "ts": 6303771867087.210, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771871800.443, "dur": 1.056, + "args": { + "External id": 150899, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986016, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986016, "pid": 0, "tid": 7, "ts": 6303771871800.443, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867097.220, "dur": 3.870, + "args": { + "External id": 150899, "cbid": 211, "correlation": 289986016 + } + }, + { + "ph": "s", "id": 289986016, "pid": 5714, "tid": 5714, "ts": 6303771867097.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771871802.203, "dur": 234.083, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986030, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289986030, "pid": 0, "tid": 7, "ts": 6303771871802.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867186.539, "dur": 8.160, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986030 + } + }, + { + "ph": "s", "id": 289986030, "pid": 5714, "tid": 5714, "ts": 6303771867186.539, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771867228.010, "dur": 0.560, + "args": { + "External id": 150903, "cbid": 200, "correlation": 289986053 + } + }, + { + "ph": "f", "id": 289986053, "pid": 5714, "tid": 5714, "ts": 6303771867228.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771872037.182, "dur": 0.832, + "args": { + "External id": 150903, "device": 0, "context": 1, "stream": 7, "correlation": 289986056, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289986056, "pid": 0, "tid": 7, "ts": 6303771872037.182, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771867230.379, "dur": 7.651, + "args": { + "External id": 150903, "cbid": 51, "correlation": 289986056 + } + }, + { + "ph": "s", "id": 289986056, "pid": 5714, "tid": 5714, "ts": 6303771867230.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771872039.294, "dur": 693.736, + "args": { + "External id": 150903, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986057, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986057, "pid": 0, "tid": 7, "ts": 6303771872039.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867238.259, "dur": 5.960, + "args": { + "External id": 150903, "cbid": 307, "correlation": 289986057 + } + }, + { + "ph": "s", "id": 289986057, "pid": 5714, "tid": 5714, "ts": 6303771867238.259, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771872733.734, "dur": 2.976, + "args": { + "External id": 150906, "device": 0, "context": 1, "stream": 7, "correlation": 289986062, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289986062, "pid": 0, "tid": 7, "ts": 6303771872733.734, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771867267.899, "dur": 12.020, + "args": { + "External id": 150906, "cbid": 41, "correlation": 289986062 + } + }, + { + "ph": "s", "id": 289986062, "pid": 5714, "tid": 5714, "ts": 6303771867267.899, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771867330.119, "dur": 0.480, + "args": { + "External id": 150911, "cbid": 200, "correlation": 289986090 + } + }, + { + "ph": "f", "id": 289986090, "pid": 5714, "tid": 5714, "ts": 6303771867330.119, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771872737.350, "dur": 692.392, + "args": { + "External id": 150911, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289986093, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986093, "pid": 0, "tid": 7, "ts": 6303771872737.350, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867332.099, "dur": 7.920, + "args": { + "External id": 150911, "cbid": 307, "correlation": 289986093 + } + }, + { + "ph": "s", "id": 289986093, "pid": 5714, "tid": 5714, "ts": 6303771867332.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771873430.478, "dur": 221.411, + "args": { + "External id": 150912, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986098, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289986098, "pid": 0, "tid": 7, "ts": 6303771873430.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867354.649, "dur": 6.190, + "args": { + "External id": 150912, "cbid": 211, "correlation": 289986098 + } + }, + { + "ph": "s", "id": 289986098, "pid": 5714, "tid": 5714, "ts": 6303771867354.649, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771867406.769, "dur": 1.210, + "args": { + "External id": 150920, "cbid": 210, "correlation": 289986124 + } + }, + { + "ph": "f", "id": 289986124, "pid": 5714, "tid": 5714, "ts": 6303771867406.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771873652.561, "dur": 641.863, + "args": { + "External id": 150920, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986125, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986125, "pid": 0, "tid": 7, "ts": 6303771873652.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867411.539, "dur": 7.620, + "args": { + "External id": 150920, "cbid": 211, "correlation": 289986125 + } + }, + { + "ph": "s", "id": 289986125, "pid": 5714, "tid": 5714, "ts": 6303771867411.539, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771874295.032, "dur": 171.362, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986144, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289986144, "pid": 0, "tid": 7, "ts": 6303771874295.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867525.509, "dur": 8.820, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986144 + } + }, + { + "ph": "s", "id": 289986144, "pid": 5714, "tid": 5714, "ts": 6303771867525.509, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771874467.066, "dur": 4.096, + "args": { + "External id": 150930, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986161, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986161, "pid": 0, "tid": 7, "ts": 6303771874467.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867568.759, "dur": 7.160, + "args": { + "External id": 150930, "cbid": 211, "correlation": 289986161 + } + }, + { + "ph": "s", "id": 289986161, "pid": 5714, "tid": 5714, "ts": 6303771867568.759, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874471.802, "dur": 1.216, + "args": { + "External id": 150935, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986178, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986178, "pid": 0, "tid": 7, "ts": 6303771874471.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867600.279, "dur": 5.750, + "args": { + "External id": 150935, "cbid": 211, "correlation": 289986178 + } + }, + { + "ph": "s", "id": 289986178, "pid": 5714, "tid": 5714, "ts": 6303771867600.279, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874473.690, "dur": 1.024, + "args": { + "External id": 150937, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986188, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986188, "pid": 0, "tid": 7, "ts": 6303771874473.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867619.619, "dur": 5.020, + "args": { + "External id": 150937, "cbid": 211, "correlation": 289986188 + } + }, + { + "ph": "s", "id": 289986188, "pid": 5714, "tid": 5714, "ts": 6303771867619.619, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874475.418, "dur": 1.056, + "args": { + "External id": 150938, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986194, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986194, "pid": 0, "tid": 7, "ts": 6303771874475.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867631.679, "dur": 4.490, + "args": { + "External id": 150938, "cbid": 211, "correlation": 289986194 + } + }, + { + "ph": "s", "id": 289986194, "pid": 5714, "tid": 5714, "ts": 6303771867631.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874477.146, "dur": 1.056, + "args": { + "External id": 150939, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986204, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986204, "pid": 0, "tid": 7, "ts": 6303771874477.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867646.498, "dur": 4.360, + "args": { + "External id": 150939, "cbid": 211, "correlation": 289986204 + } + }, + { + "ph": "s", "id": 289986204, "pid": 5714, "tid": 5714, "ts": 6303771867646.498, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874478.874, "dur": 1.056, + "args": { + "External id": 150940, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986210, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986210, "pid": 0, "tid": 7, "ts": 6303771874478.874, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867656.469, "dur": 4.069, + "args": { + "External id": 150940, "cbid": 211, "correlation": 289986210 + } + }, + { + "ph": "s", "id": 289986210, "pid": 5714, "tid": 5714, "ts": 6303771867656.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771874480.538, "dur": 3.392, + "args": { + "External id": 150941, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986223, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986223, "pid": 0, "tid": 7, "ts": 6303771874480.538, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867675.049, "dur": 4.889, + "args": { + "External id": 150941, "cbid": 211, "correlation": 289986223 + } + }, + { + "ph": "s", "id": 289986223, "pid": 5714, "tid": 5714, "ts": 6303771867675.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874484.570, "dur": 1.120, + "args": { + "External id": 150944, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986229, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986229, "pid": 0, "tid": 7, "ts": 6303771874484.570, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867687.098, "dur": 4.131, + "args": { + "External id": 150944, "cbid": 211, "correlation": 289986229 + } + }, + { + "ph": "s", "id": 289986229, "pid": 5714, "tid": 5714, "ts": 6303771867687.098, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771874486.330, "dur": 1.024, + "args": { + "External id": 150945, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986235, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986235, "pid": 0, "tid": 7, "ts": 6303771874486.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867696.338, "dur": 3.860, + "args": { + "External id": 150945, "cbid": 211, "correlation": 289986235 + } + }, + { + "ph": "s", "id": 289986235, "pid": 5714, "tid": 5714, "ts": 6303771867696.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771874488.090, "dur": 233.667, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986249, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289986249, "pid": 0, "tid": 7, "ts": 6303771874488.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867783.848, "dur": 8.190, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986249 + } + }, + { + "ph": "s", "id": 289986249, "pid": 5714, "tid": 5714, "ts": 6303771867783.848, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771867824.378, "dur": 0.610, + "args": { + "External id": 150949, "cbid": 200, "correlation": 289986272 + } + }, + { + "ph": "f", "id": 289986272, "pid": 5714, "tid": 5714, "ts": 6303771867824.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771874722.621, "dur": 0.832, + "args": { + "External id": 150949, "device": 0, "context": 1, "stream": 7, "correlation": 289986275, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289986275, "pid": 0, "tid": 7, "ts": 6303771874722.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771867826.898, "dur": 6.610, + "args": { + "External id": 150949, "cbid": 51, "correlation": 289986275 + } + }, + { + "ph": "s", "id": 289986275, "pid": 5714, "tid": 5714, "ts": 6303771867826.898, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771874725.149, "dur": 690.184, + "args": { + "External id": 150949, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986276, "registers per thread": 94, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986276, "pid": 0, "tid": 7, "ts": 6303771874725.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867833.738, "dur": 6.030, + "args": { + "External id": 150949, "cbid": 307, "correlation": 289986276 + } + }, + { + "ph": "s", "id": 289986276, "pid": 5714, "tid": 5714, "ts": 6303771867833.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771875416.069, "dur": 2.976, + "args": { + "External id": 150952, "device": 0, "context": 1, "stream": 7, "correlation": 289986281, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289986281, "pid": 0, "tid": 7, "ts": 6303771875416.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771867865.768, "dur": 11.930, + "args": { + "External id": 150952, "cbid": 41, "correlation": 289986281 + } + }, + { + "ph": "s", "id": 289986281, "pid": 5714, "tid": 5714, "ts": 6303771867865.768, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771867916.838, "dur": 0.490, + "args": { + "External id": 150957, "cbid": 200, "correlation": 289986309 + } + }, + { + "ph": "f", "id": 289986309, "pid": 5714, "tid": 5714, "ts": 6303771867916.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771875419.685, "dur": 690.504, + "args": { + "External id": 150957, "queued": 
0, "device": 0, "context": 1, "stream": 7, "correlation": 289986312, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986312, "pid": 0, "tid": 7, "ts": 6303771875419.685, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867918.878, "dur": 7.340, + "args": { + "External id": 150957, "cbid": 307, "correlation": 289986312 + } + }, + { + "ph": "s", "id": 289986312, "pid": 5714, "tid": 5714, "ts": 6303771867918.878, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771876110.829, "dur": 220.963, + "args": { + "External id": 150958, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986317, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289986317, "pid": 0, "tid": 7, "ts": 6303771876110.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867941.488, "dur": 5.730, + "args": { + "External id": 150958, "cbid": 211, "correlation": 289986317 + } + }, + { + "ph": "s", "id": 289986317, "pid": 5714, "tid": 5714, "ts": 6303771867941.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771867990.708, "dur": 1.210, + "args": { + "External id": 150966, "cbid": 210, "correlation": 289986343 + } + }, + { + "ph": "f", "id": 289986343, "pid": 5714, "tid": 5714, "ts": 6303771867990.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771876332.400, "dur": 644.071, + "args": { + "External id": 150966, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986344, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986344, "pid": 0, "tid": 7, "ts": 6303771876332.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771867995.418, "dur": 7.220, + "args": { + "External id": 150966, "cbid": 211, "correlation": 289986344 + } + }, + { + "ph": "s", "id": 289986344, "pid": 5714, "tid": 5714, "ts": 6303771867995.418, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771876977.079, "dur": 171.298, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986363, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289986363, "pid": 0, "tid": 7, "ts": 6303771876977.079, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868107.328, "dur": 8.809, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986363 + } + }, + { + "ph": "s", "id": 289986363, "pid": 5714, "tid": 5714, "ts": 6303771868107.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771877149.113, "dur": 4.256, + "args": { + "External id": 150976, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986380, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986380, "pid": 0, "tid": 7, "ts": 6303771877149.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868148.797, "dur": 7.391, + "args": { + "External id": 150976, "cbid": 211, "correlation": 289986380 + } + }, + { + "ph": "s", "id": 289986380, "pid": 5714, "tid": 5714, "ts": 6303771868148.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877154.009, "dur": 1.216, + "args": { + "External id": 150981, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986397, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986397, "pid": 0, "tid": 7, "ts": 6303771877154.009, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868180.777, "dur": 5.320, + "args": { + "External id": 150981, "cbid": 211, "correlation": 289986397 + } + }, + { + "ph": "s", "id": 289986397, "pid": 5714, "tid": 5714, "ts": 6303771868180.777, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877155.897, "dur": 1.024, + "args": { + "External id": 150983, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986407, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986407, "pid": 0, "tid": 7, "ts": 6303771877155.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868200.377, "dur": 4.920, + "args": { + "External id": 150983, "cbid": 211, "correlation": 289986407 + } + }, + { + "ph": "s", "id": 289986407, "pid": 5714, "tid": 5714, "ts": 6303771868200.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877157.625, "dur": 1.056, + "args": { + "External id": 150984, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986413, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986413, "pid": 0, "tid": 7, "ts": 6303771877157.625, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868212.417, "dur": 4.390, + "args": { + "External id": 150984, "cbid": 211, "correlation": 289986413 + } + }, + { + "ph": "s", "id": 289986413, "pid": 5714, "tid": 5714, "ts": 6303771868212.417, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877159.353, "dur": 1.056, + "args": { + "External id": 150985, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986423, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986423, "pid": 0, "tid": 7, "ts": 6303771877159.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868226.637, "dur": 4.150, + "args": { + "External id": 150985, "cbid": 211, "correlation": 289986423 + } + }, + { + "ph": "s", "id": 289986423, "pid": 5714, "tid": 5714, "ts": 6303771868226.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877161.113, "dur": 1.024, + "args": { + "External id": 150986, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986429, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986429, "pid": 0, "tid": 7, "ts": 6303771877161.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868236.477, "dur": 4.070, + "args": { + "External id": 150986, "cbid": 211, "correlation": 289986429 + } + }, + { + "ph": "s", "id": 289986429, "pid": 5714, "tid": 5714, "ts": 6303771868236.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771877162.745, "dur": 3.392, + "args": { + "External id": 150987, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986442, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986442, "pid": 0, "tid": 7, "ts": 6303771877162.745, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868255.637, "dur": 5.020, + "args": { + "External id": 150987, "cbid": 211, "correlation": 289986442 + } + }, + { + "ph": "s", "id": 289986442, "pid": 5714, "tid": 5714, "ts": 6303771868255.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877166.809, "dur": 1.088, + "args": { + "External id": 150990, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986448, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986448, "pid": 0, "tid": 7, "ts": 6303771877166.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868267.837, "dur": 4.120, + "args": { + "External id": 150990, "cbid": 211, "correlation": 289986448 + } + }, + { + "ph": "s", "id": 289986448, "pid": 5714, "tid": 5714, "ts": 6303771868267.837, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771877168.569, "dur": 0.992, + "args": { + "External id": 150991, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986454, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986454, "pid": 0, "tid": 7, "ts": 6303771877168.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868276.797, "dur": 3.910, + "args": { + "External id": 150991, "cbid": 211, "correlation": 289986454 + } + }, + { + "ph": "s", "id": 289986454, "pid": 5714, "tid": 5714, "ts": 6303771868276.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771877170.297, "dur": 233.059, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986468, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289986468, "pid": 0, "tid": 7, "ts": 6303771877170.297, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868385.377, "dur": 9.320, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986468 + } + }, + { + "ph": "s", "id": 289986468, "pid": 5714, "tid": 5714, "ts": 6303771868385.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771868427.607, "dur": 0.560, + "args": { + "External id": 150995, "cbid": 200, "correlation": 289986491 + } + }, + { + "ph": "f", "id": 289986491, "pid": 5714, "tid": 5714, "ts": 6303771868427.607, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771877404.284, "dur": 0.800, + "args": { + "External id": 150995, "device": 0, "context": 1, "stream": 7, "correlation": 289986494, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289986494, "pid": 0, "tid": 7, "ts": 6303771877404.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771868429.977, "dur": 6.920, + "args": { + "External id": 150995, "cbid": 51, "correlation": 289986494 + } + }, + { + "ph": "s", "id": 289986494, "pid": 5714, "tid": 5714, "ts": 6303771868429.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771877406.812, "dur": 690.920, + "args": { + "External id": 150995, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986495, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986495, "pid": 0, "tid": 7, "ts": 6303771877406.812, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868437.137, "dur": 5.840, + "args": { + "External id": 150995, "cbid": 307, "correlation": 289986495 + } + }, + { + "ph": "s", "id": 289986495, "pid": 5714, "tid": 5714, "ts": 6303771868437.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771878098.468, "dur": 2.912, + "args": { + "External id": 150998, "device": 0, "context": 1, "stream": 7, "correlation": 289986500, "bytes": 3145728, "memory bandwidth (GB/s)": 1080.2637362637363 + } + }, + { + "ph": "f", "id": 289986500, "pid": 0, "tid": 7, "ts": 6303771878098.468, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771868467.337, "dur": 12.120, + "args": { + "External id": 150998, "cbid": 41, "correlation": 289986500 + } + }, + { + "ph": "s", "id": 289986500, "pid": 5714, "tid": 5714, "ts": 6303771868467.337, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771868520.547, "dur": 0.450, + "args": { + "External id": 151003, "cbid": 200, "correlation": 289986528 + } + }, + { + "ph": "f", "id": 289986528, "pid": 5714, "tid": 5714, "ts": 6303771868520.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771878102.084, "dur": 692.744, + "args": { + "External id": 151003, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289986531, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986531, "pid": 0, "tid": 7, "ts": 6303771878102.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868522.477, "dur": 7.090, + "args": { + "External id": 151003, "cbid": 307, "correlation": 289986531 + } + }, + { + "ph": "s", "id": 289986531, "pid": 5714, "tid": 5714, "ts": 6303771868522.477, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771878795.564, "dur": 221.091, + "args": { + "External id": 151004, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986536, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289986536, "pid": 0, "tid": 7, "ts": 6303771878795.564, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868543.416, "dur": 5.911, + "args": { + "External id": 151004, "cbid": 211, "correlation": 289986536 + } + }, + { + "ph": "s", "id": 289986536, "pid": 5714, "tid": 5714, "ts": 6303771868543.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771868593.556, "dur": 1.171, + "args": { + "External id": 151012, "cbid": 210, "correlation": 289986562 + } + }, + { + "ph": "f", "id": 289986562, "pid": 5714, "tid": 5714, "ts": 6303771868593.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771879017.359, "dur": 643.335, + "args": { + "External id": 151012, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986563, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986563, "pid": 0, "tid": 7, "ts": 6303771879017.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868598.256, "dur": 7.500, + "args": { + "External id": 151012, "cbid": 211, "correlation": 289986563 + } + }, + { + "ph": "s", "id": 289986563, "pid": 5714, "tid": 5714, "ts": 6303771868598.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771879661.430, "dur": 171.074, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986582, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289986582, "pid": 0, "tid": 7, "ts": 6303771879661.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868707.456, "dur": 8.800, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986582 + } + }, + { + "ph": "s", "id": 289986582, "pid": 5714, "tid": 5714, "ts": 6303771868707.456, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771879833.144, "dur": 4.192, + "args": { + "External id": 151022, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986599, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986599, "pid": 0, "tid": 7, "ts": 6303771879833.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868750.426, "dur": 7.290, + "args": { + "External id": 151022, "cbid": 211, "correlation": 289986599 + } + }, + { + "ph": "s", "id": 289986599, "pid": 5714, "tid": 5714, "ts": 6303771868750.426, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879838.040, "dur": 1.216, + "args": { + "External id": 151027, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986616, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986616, "pid": 0, "tid": 7, "ts": 6303771879838.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868781.176, "dur": 5.480, + "args": { + "External id": 151027, "cbid": 211, "correlation": 289986616 + } + }, + { + "ph": "s", "id": 289986616, "pid": 5714, "tid": 5714, "ts": 6303771868781.176, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879839.928, "dur": 1.024, + "args": { + "External id": 151029, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986626, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986626, "pid": 0, "tid": 7, "ts": 6303771879839.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868800.126, "dur": 4.930, + "args": { + "External id": 151029, "cbid": 211, "correlation": 289986626 + } + }, + { + "ph": "s", "id": 289986626, "pid": 5714, "tid": 5714, "ts": 6303771868800.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879841.624, "dur": 1.088, + "args": { + "External id": 151030, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986632, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986632, "pid": 0, "tid": 7, "ts": 6303771879841.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868812.126, "dur": 4.340, + "args": { + "External id": 151030, "cbid": 211, "correlation": 289986632 + } + }, + { + "ph": "s", "id": 289986632, "pid": 5714, "tid": 5714, "ts": 6303771868812.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879843.384, "dur": 1.056, + "args": { + "External id": 151031, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986642, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986642, "pid": 0, "tid": 7, "ts": 6303771879843.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868826.696, "dur": 4.300, + "args": { + "External id": 151031, "cbid": 211, "correlation": 289986642 + } + }, + { + "ph": "s", "id": 289986642, "pid": 5714, "tid": 5714, "ts": 6303771868826.696, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879845.112, "dur": 1.056, + "args": { + "External id": 151032, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986648, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986648, "pid": 0, "tid": 7, "ts": 6303771879845.112, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868836.616, "dur": 4.130, + "args": { + "External id": 151032, "cbid": 211, "correlation": 289986648 + } + }, + { + "ph": "s", "id": 289986648, "pid": 5714, "tid": 5714, "ts": 6303771868836.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771879846.808, "dur": 3.392, + "args": { + "External id": 151033, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986661, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986661, "pid": 0, "tid": 7, "ts": 6303771879846.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868858.126, "dur": 5.230, + "args": { + "External id": 151033, "cbid": 211, "correlation": 289986661 + } + }, + { + "ph": "s", "id": 289986661, "pid": 5714, "tid": 5714, "ts": 6303771868858.126, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879850.808, "dur": 1.121, + "args": { + "External id": 151036, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986667, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986667, "pid": 0, "tid": 7, "ts": 6303771879850.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868869.506, "dur": 4.320, + "args": { + "External id": 151036, "cbid": 211, "correlation": 289986667 + } + }, + { + "ph": "s", "id": 289986667, "pid": 5714, "tid": 5714, "ts": 6303771868869.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771879852.569, "dur": 1.024, + "args": { + "External id": 151037, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986673, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986673, "pid": 0, "tid": 7, "ts": 6303771879852.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868878.856, "dur": 3.970, + "args": { + "External id": 151037, "cbid": 211, "correlation": 289986673 + } + }, + { + "ph": "s", "id": 289986673, "pid": 5714, "tid": 5714, "ts": 6303771868878.856, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771879854.329, "dur": 233.538, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986687, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289986687, "pid": 0, "tid": 7, "ts": 6303771879854.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771868968.186, "dur": 7.970, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986687 + } + }, + { + "ph": "s", "id": 289986687, "pid": 5714, "tid": 5714, "ts": 6303771868968.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771869023.246, "dur": 0.569, + "args": { + "External id": 151041, "cbid": 200, "correlation": 289986710 + } + }, + { + "ph": "f", "id": 289986710, "pid": 5714, "tid": 5714, "ts": 6303771869023.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771880088.891, "dur": 0.800, + "args": { + "External id": 151041, "device": 0, "context": 1, "stream": 7, "correlation": 289986713, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289986713, "pid": 0, "tid": 7, "ts": 6303771880088.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771869025.655, "dur": 6.931, + "args": { + "External id": 151041, "cbid": 51, "correlation": 289986713 + } + }, + { + "ph": "s", "id": 289986713, "pid": 5714, "tid": 5714, "ts": 6303771869025.655, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771880091.419, "dur": 691.016, + "args": { + "External id": 151041, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986714, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986714, "pid": 0, "tid": 7, "ts": 6303771880091.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869032.815, "dur": 5.911, + "args": { + "External id": 151041, "cbid": 307, "correlation": 289986714 + } + }, + { + "ph": "s", "id": 289986714, "pid": 5714, "tid": 5714, "ts": 6303771869032.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771880783.075, "dur": 2.976, + "args": { + "External id": 151044, "device": 0, "context": 1, "stream": 7, "correlation": 289986719, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289986719, "pid": 0, "tid": 7, "ts": 6303771880783.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771869063.786, "dur": 12.209, + "args": { + "External id": 151044, "cbid": 41, "correlation": 289986719 + } + }, + { + "ph": "s", "id": 289986719, "pid": 5714, "tid": 5714, "ts": 6303771869063.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771869116.465, "dur": 0.500, + "args": { + "External id": 151049, "cbid": 200, "correlation": 289986747 + } + }, + { + "ph": "f", "id": 289986747, "pid": 5714, "tid": 5714, "ts": 6303771869116.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771880786.723, "dur": 689.000, + "args": { + "External id": 151049, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289986750, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986750, "pid": 0, "tid": 7, "ts": 6303771880786.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869118.465, "dur": 7.050, + "args": { + "External id": 151049, "cbid": 307, "correlation": 289986750 + } + }, + { + "ph": "s", "id": 289986750, "pid": 5714, "tid": 5714, "ts": 6303771869118.465, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771881476.459, "dur": 221.411, + "args": { + "External id": 151050, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986755, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289986755, "pid": 0, "tid": 7, "ts": 6303771881476.459, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869140.075, "dur": 5.880, + "args": { + "External id": 151050, "cbid": 211, "correlation": 289986755 + } + }, + { + "ph": "s", "id": 289986755, "pid": 5714, "tid": 5714, "ts": 6303771869140.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771869188.515, "dur": 1.180, + "args": { + "External id": 151058, "cbid": 210, "correlation": 289986781 + } + }, + { + "ph": "f", "id": 289986781, "pid": 5714, "tid": 5714, "ts": 6303771869188.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771881698.606, "dur": 644.743, + "args": { + "External id": 151058, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986782, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986782, "pid": 0, "tid": 7, "ts": 6303771881698.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869193.235, "dur": 7.450, + "args": { + "External id": 151058, "cbid": 211, "correlation": 289986782 + } + }, + { + "ph": "s", "id": 289986782, "pid": 5714, "tid": 5714, "ts": 6303771869193.235, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771882343.989, "dur": 171.010, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986801, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289986801, "pid": 0, "tid": 7, "ts": 6303771882343.989, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869314.615, "dur": 9.210, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986801 + } + }, + { + "ph": "s", "id": 289986801, "pid": 5714, "tid": 5714, "ts": 6303771869314.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771882515.735, "dur": 4.193, + "args": { + "External id": 151068, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986818, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986818, "pid": 0, "tid": 7, "ts": 6303771882515.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869360.605, "dur": 7.550, + "args": { + "External id": 151068, "cbid": 211, "correlation": 289986818 + } + }, + { + "ph": "s", "id": 289986818, "pid": 5714, "tid": 5714, "ts": 6303771869360.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882520.664, "dur": 1.184, + "args": { + "External id": 151073, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986835, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986835, "pid": 0, "tid": 7, "ts": 6303771882520.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869390.585, "dur": 5.470, + "args": { + "External id": 151073, "cbid": 211, "correlation": 289986835 + } + }, + { + "ph": "s", "id": 289986835, "pid": 5714, "tid": 5714, "ts": 6303771869390.585, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882522.552, "dur": 1.024, + "args": { + "External id": 151075, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986845, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986845, "pid": 0, "tid": 7, "ts": 6303771882522.552, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869411.205, "dur": 4.830, + "args": { + "External id": 151075, "cbid": 211, "correlation": 289986845 + } + }, + { + "ph": "s", "id": 289986845, "pid": 5714, "tid": 5714, "ts": 6303771869411.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882524.248, "dur": 1.088, + "args": { + "External id": 151076, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986851, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986851, "pid": 0, "tid": 7, "ts": 6303771882524.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869423.155, "dur": 4.290, + "args": { + "External id": 151076, "cbid": 211, "correlation": 289986851 + } + }, + { + "ph": "s", "id": 289986851, "pid": 5714, "tid": 5714, "ts": 6303771869423.155, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882526.008, "dur": 1.056, + "args": { + "External id": 151077, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986861, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986861, "pid": 0, "tid": 7, "ts": 6303771882526.008, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869437.594, "dur": 4.291, + "args": { + "External id": 151077, "cbid": 211, "correlation": 289986861 + } + }, + { + "ph": "s", "id": 289986861, "pid": 5714, "tid": 5714, "ts": 6303771869437.594, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882527.736, "dur": 1.056, + "args": { + "External id": 151078, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986867, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986867, "pid": 0, "tid": 7, "ts": 6303771882527.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869447.514, "dur": 4.060, + "args": { + "External id": 151078, "cbid": 211, "correlation": 289986867 + } + }, + { + "ph": "s", "id": 289986867, "pid": 5714, "tid": 5714, "ts": 6303771869447.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771882529.400, "dur": 3.424, + "args": { + "External id": 151079, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986880, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986880, "pid": 0, "tid": 7, "ts": 6303771882529.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869467.954, "dur": 5.080, + "args": { + "External id": 151079, "cbid": 211, "correlation": 289986880 + } + }, + { + "ph": "s", "id": 289986880, "pid": 5714, "tid": 5714, "ts": 6303771869467.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882533.432, "dur": 1.120, + "args": { + "External id": 151082, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986886, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986886, "pid": 0, "tid": 7, "ts": 6303771882533.432, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869479.014, "dur": 4.180, + "args": { + "External id": 151082, "cbid": 211, "correlation": 289986886 + } + }, + { + "ph": "s", "id": 289986886, "pid": 5714, "tid": 5714, "ts": 6303771869479.014, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771882535.192, "dur": 1.024, + "args": { + "External id": 151083, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986892, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289986892, "pid": 0, "tid": 7, "ts": 6303771882535.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869488.085, "dur": 3.769, + "args": { + "External id": 151083, "cbid": 211, "correlation": 289986892 + } + }, + { + "ph": "s", "id": 289986892, "pid": 5714, "tid": 5714, "ts": 6303771869488.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771882536.952, "dur": 233.122, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986906, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289986906, "pid": 0, "tid": 7, "ts": 6303771882536.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869577.414, "dur": 8.140, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289986906 + } + }, + { + "ph": "s", "id": 289986906, "pid": 5714, "tid": 5714, "ts": 6303771869577.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771869619.304, "dur": 0.560, + "args": { + "External id": 151087, "cbid": 200, "correlation": 289986929 + } + }, + { + "ph": "f", "id": 289986929, "pid": 5714, "tid": 5714, "ts": 6303771869619.304, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771882771.002, "dur": 0.800, + "args": { + "External id": 151087, "device": 0, "context": 1, "stream": 7, "correlation": 289986932, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289986932, "pid": 0, "tid": 7, "ts": 6303771882771.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771869621.654, "dur": 6.760, + "args": { + "External id": 151087, "cbid": 51, "correlation": 289986932 + } + }, + { + "ph": "s", "id": 289986932, "pid": 5714, "tid": 5714, "ts": 6303771869621.654, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771882773.114, "dur": 693.640, + "args": { + "External id": 151087, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986933, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986933, "pid": 0, "tid": 7, "ts": 6303771882773.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869628.664, "dur": 5.960, + "args": { + "External id": 151087, "cbid": 307, "correlation": 289986933 + } + }, + { + "ph": "s", "id": 289986933, "pid": 5714, "tid": 5714, "ts": 6303771869628.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771883467.490, "dur": 2.976, + "args": { + "External id": 151090, "device": 0, "context": 1, "stream": 7, "correlation": 289986938, "bytes": 3145728, "memory bandwidth (GB/s)": 1057.032258064516 + } + }, + { + "ph": "f", "id": 289986938, "pid": 0, "tid": 7, "ts": 6303771883467.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771869659.284, "dur": 12.090, + "args": { + "External id": 151090, "cbid": 41, "correlation": 289986938 + } + }, + { + "ph": "s", "id": 289986938, "pid": 5714, "tid": 5714, "ts": 6303771869659.284, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771869711.944, "dur": 0.460, + "args": { + "External id": 151095, "cbid": 200, "correlation": 289986966 + } + }, + { + "ph": "f", "id": 289986966, "pid": 5714, "tid": 5714, "ts": 6303771869711.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771883471.106, "dur": 688.649, + "args": { + "External id": 151095, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289986969, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289986969, "pid": 0, "tid": 7, "ts": 6303771883471.106, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869713.854, "dur": 7.090, + "args": { + "External id": 151095, "cbid": 307, "correlation": 289986969 + } + }, + { + "ph": "s", "id": 289986969, "pid": 5714, "tid": 5714, "ts": 6303771869713.854, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771884160.459, "dur": 222.274, + "args": { + "External id": 151096, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289986974, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289986974, "pid": 0, "tid": 7, "ts": 6303771884160.459, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869735.244, "dur": 6.020, + "args": { + "External id": 151096, "cbid": 211, "correlation": 289986974 + } + }, + { + "ph": "s", "id": 289986974, "pid": 5714, "tid": 5714, "ts": 6303771869735.244, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaOccupancyMaxActiveBlocksPerMultiprocessor", "pid": 5714, "tid": 5714, + "ts": 6303771869783.784, "dur": 1.210, + "args": { + "External id": 151104, "cbid": 210, "correlation": 289987000 + } + }, + { + "ph": "f", "id": 289987000, "pid": 5714, "tid": 5714, "ts": 6303771869783.784, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_tn", "pid": 0, "tid": 7, + "ts": 6303771884383.373, "dur": 640.807, + "args": { + "External id": 151104, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987001, "registers per thread": 210, "shared memory": 24576, "blocks per SM": 62.500000, "warps per SM": 125.000000, "grid": [250, 32, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289987001, "pid": 0, "tid": 7, "ts": 6303771884383.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869788.554, "dur": 7.820, + "args": { + "External id": 151104, "cbid": 211, "correlation": 289987001 + } + }, + { + "ph": "s", "id": 289987001, "pid": 5714, "tid": 5714, "ts": 6303771869788.554, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "logsumexp_fwd_kernel", "pid": 0, "tid": 7, + "ts": 6303771885024.884, "dur": 171.267, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987020, "registers per thread": 78, "shared memory": 64, "blocks per SM": 16.000000, "warps per SM": 256.000000, "grid": [2048, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 289987020, "pid": 0, "tid": 7, "ts": 6303771885024.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869899.853, "dur": 8.651, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289987020 + } + }, + { + "ph": "s", "id": 289987020, "pid": 5714, "tid": 5714, "ts": 6303771869899.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp >, unsigned int, float, 4> >(at::native::ReduceOp >, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771885196.791, "dur": 4.096, + "args": { + "External id": 151114, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987037, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987037, "pid": 0, "tid": 7, "ts": 6303771885196.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869942.584, "dur": 7.520, + "args": { + "External id": 151114, "cbid": 211, "correlation": 289987037 + } + }, + { + "ph": "s", "id": 289987037, "pid": 5714, "tid": 5714, "ts": 6303771869942.584, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885201.527, "dur": 1.248, + "args": { + "External id": 151119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987054, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987054, "pid": 0, "tid": 7, "ts": 6303771885201.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771869973.664, "dur": 5.540, + "args": { + "External id": 151119, "cbid": 211, "correlation": 289987054 + } + }, + { + "ph": "s", "id": 289987054, "pid": 5714, "tid": 5714, "ts": 6303771869973.664, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885203.447, "dur": 1.024, + "args": { + "External id": 151121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987064, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 
1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987064, "pid": 0, "tid": 7, "ts": 6303771885203.447, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870002.333, "dur": 4.810, + "args": { + "External id": 151121, "cbid": 211, "correlation": 289987064 + } + }, + { + "ph": "s", "id": 289987064, "pid": 5714, "tid": 5714, "ts": 6303771870002.333, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array >(int, at::native::(anonymous namespace)::masked_fill_kernel(at::TensorIterator&, c10::Scalar const&)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float, bool)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885205.143, "dur": 1.056, + "args": { + "External id": 151122, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987070, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987070, "pid": 0, "tid": 7, "ts": 6303771885205.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870014.253, "dur": 4.430, + "args": { + "External id": 151122, "cbid": 211, "correlation": 289987070 + } + }, + { + "ph": "s", "id": 289987070, "pid": 5714, "tid": 5714, "ts": 6303771870014.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885206.903, "dur": 1.024, + "args": { + "External id": 151123, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987080, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987080, "pid": 0, "tid": 7, "ts": 6303771885206.903, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870028.643, "dur": 4.280, + "args": { + "External id": 151123, "cbid": 211, "correlation": 289987080 + } + }, + { + "ph": "s", "id": 289987080, "pid": 5714, "tid": 5714, "ts": 6303771870028.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::exp_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885208.631, "dur": 1.024, + "args": { + "External id": 151124, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987086, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987086, "pid": 0, "tid": 7, "ts": 6303771885208.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870038.493, "dur": 4.100, + "args": { + "External id": 151124, "cbid": 211, "correlation": 289987086 + } + }, + { + "ph": "s", "id": 289987086, "pid": 5714, "tid": 5714, "ts": 6303771870038.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771885210.295, "dur": 3.392, + "args": { + "External id": 151125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987099, "registers per thread": 48, "shared memory": 16, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987099, "pid": 0, "tid": 7, "ts": 6303771885210.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870058.463, "dur": 5.210, + "args": { + "External id": 151125, "cbid": 211, "correlation": 289987099 + } + }, + { + "ph": "s", "id": 289987099, "pid": 5714, "tid": 5714, "ts": 6303771870058.463, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::log_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885214.295, "dur": 1.120, + "args": { + "External id": 151128, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987105, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987105, "pid": 0, "tid": 7, "ts": 6303771885214.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870070.673, "dur": 4.260, + "args": { + "External id": 151128, "cbid": 211, "correlation": 289987105 + } + }, + { + "ph": "s", "id": 289987105, "pid": 5714, "tid": 5714, "ts": 6303771870070.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771885216.055, "dur": 1.024, + "args": { + "External id": 151129, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987111, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.031250, "warps per SM": 0.125000, "grid": [4, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987111, "pid": 0, "tid": 7, "ts": 6303771885216.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870079.833, "dur": 3.990, + "args": { + "External id": 151129, "cbid": 211, "correlation": 289987111 + } + }, + { + "ph": "s", "id": 289987111, "pid": 5714, "tid": 5714, "ts": 6303771870079.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "cross_entropy_kernel", "pid": 0, "tid": 7, + "ts": 6303771885217.815, "dur": 233.475, + "args": { + "External id": 150745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987125, "registers per thread": 50, "shared memory": 0, "blocks per SM": 16.000000, "warps per SM": 512.000000, "grid": [2048, 1, 1], "block": [1024, 1, 1], "est. 
achieved occupancy %": 67 + } + }, + { + "ph": "f", "id": 289987125, "pid": 0, "tid": 7, "ts": 6303771885217.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870168.533, "dur": 8.150, + "args": { + "External id": 150745, "cbid": 307, "correlation": 289987125 + } + }, + { + "ph": "s", "id": 289987125, "pid": 5714, "tid": 5714, "ts": 6303771870168.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771870209.423, "dur": 0.570, + "args": { + "External id": 151133, "cbid": 200, "correlation": 289987148 + } + }, + { + "ph": "f", "id": 289987148, "pid": 5714, "tid": 5714, "ts": 6303771870209.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771885452.154, "dur": 0.800, + "args": { + "External id": 151133, "device": 0, "context": 1, "stream": 7, "correlation": 289987151, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289987151, "pid": 0, "tid": 7, "ts": 6303771885452.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 5714, + "ts": 6303771870211.833, "dur": 8.030, + "args": { + "External id": 151133, "cbid": 51, "correlation": 289987151 + } + }, + { + "ph": "s", "id": 289987151, "pid": 5714, "tid": 5714, "ts": 6303771870211.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771885454.298, "dur": 691.943, + "args": { + "External id": 151133, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987152, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289987152, "pid": 0, "tid": 7, "ts": 6303771885454.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870220.103, "dur": 5.910, + "args": { + "External id": 151133, "cbid": 307, "correlation": 289987152 + } + }, + { + "ph": "s", "id": 289987152, "pid": 5714, "tid": 5714, "ts": 6303771870220.103, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771886146.977, "dur": 2.944, + "args": { + "External id": 151136, "device": 0, "context": 1, "stream": 7, "correlation": 289987157, "bytes": 3145728, "memory bandwidth (GB/s)": 1068.5217391304348 + } + }, + { + "ph": "f", "id": 289987157, "pid": 0, "tid": 7, "ts": 6303771886146.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771870251.253, "dur": 13.110, + "args": { + "External id": 151136, "cbid": 41, "correlation": 289987157 + } + }, + { + "ph": "s", "id": 289987157, "pid": 5714, "tid": 5714, "ts": 6303771870251.253, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 5714, + "ts": 6303771870313.373, "dur": 0.480, + "args": { + "External id": 151141, "cbid": 200, "correlation": 289987185 + } + }, + { + "ph": "f", "id": 289987185, "pid": 5714, "tid": 5714, "ts": 6303771870313.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771886150.594, "dur": 691.528, + "args": { + "External id": 151141, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 289987188, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 62.500000, "warps per SM": 250.000000, "grid": [4000, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289987188, "pid": 0, "tid": 7, "ts": 6303771886150.594, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870315.353, "dur": 7.630, + "args": { + "External id": 151141, "cbid": 307, "correlation": 289987188 + } + }, + { + "ph": "s", "id": 289987188, "pid": 5714, "tid": 5714, "ts": 6303771870315.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, std::array, 4, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1> >(int, at::native::CUDAFunctor_add, std::array, TrivialOffsetCalculator<2, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<2>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771886842.762, "dur": 220.578, + "args": { + "External id": 151142, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987193, "registers per thread": 34, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987193, "pid": 0, "tid": 7, "ts": 6303771886842.762, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870339.792, "dur": 6.011, + "args": { + "External id": 151142, "cbid": 211, "correlation": 289987193 + } + }, + { + "ph": "s", "id": 289987193, "pid": 5714, "tid": 5714, "ts": 6303771870339.792, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771887063.948, "dur": 5.312, + "args": { + "External id": 151144, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987206, "registers per thread": 32, "shared memory": 2064, "blocks per SM": 0.007812, "warps per SM": 0.125000, "grid": [1, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987206, "pid": 0, "tid": 7, "ts": 6303771887063.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870369.332, "dur": 6.580, + "args": { + "External id": 151144, "cbid": 211, "correlation": 289987206 + } + }, + { + "ph": "s", "id": 289987206, "pid": 5714, "tid": 5714, "ts": 6303771870369.332, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887069.964, "dur": 158.530, + "args": { + "External id": 151149, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987219, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987219, "pid": 0, "tid": 7, "ts": 6303771887069.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870399.343, "dur": 6.109, + "args": { + "External id": 151149, "cbid": 211, "correlation": 289987219 + } + }, + { + "ph": "s", "id": 289987219, "pid": 5714, "tid": 5714, "ts": 6303771870399.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887229.262, "dur": 1.376, + "args": { + "External id": 151154, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987227, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987227, "pid": 0, "tid": 7, "ts": 6303771887229.262, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870464.292, "dur": 7.900, + "args": { + "External id": 151154, "cbid": 211, "correlation": 289987227 + } + }, + { + "ph": "s", "id": 289987227, "pid": 5714, "tid": 5714, "ts": 6303771870464.292, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887231.278, "dur": 1.344, + "args": { + "External id": 151155, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987233, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987233, "pid": 0, "tid": 7, "ts": 6303771887231.278, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870483.222, "dur": 4.870, + "args": { + "External id": 151155, "cbid": 211, "correlation": 289987233 + } + }, + { + "ph": "s", "id": 289987233, "pid": 5714, "tid": 5714, "ts": 6303771870483.222, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771870711.302, "dur": 3.470, + "args": { + "cbid": 147, "correlation": 289987238 + } + }, + { + "ph": "s", "id": 289987238, "pid": 5714, "tid": 5714, "ts": 6303771870711.302, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771870718.142, "dur": 1.410, + "args": { + "cbid": 147, "correlation": 289987242 + } + }, + { + "ph": "s", "id": 289987242, "pid": 5714, "tid": 5714, "ts": 6303771870718.142, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887233.326, "dur": 1.024, + "args": { + "External id": 151157, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987259, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987259, "pid": 0, "tid": 7, "ts": 6303771887233.326, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870791.762, "dur": 13.580, + "args": { + "External id": 151157, "cbid": 211, "correlation": 289987259 + } + }, + { + "ph": "s", "id": 289987259, "pid": 5714, "tid": 5714, "ts": 6303771870791.762, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887235.054, "dur": 0.896, + "args": { + "External id": 151161, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987272, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987272, "pid": 0, "tid": 7, "ts": 6303771887235.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771870880.131, "dur": 8.070, + "args": { + "External id": 151161, "cbid": 211, "correlation": 289987272 + } + }, + { + "ph": "s", "id": 289987272, "pid": 5714, "tid": 5714, "ts": 6303771870880.131, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887236.654, "dur": 1.088, + "args": { + "External id": 151555, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987288, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987288, "pid": 0, "tid": 7, "ts": 6303771887236.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771871368.270, "dur": 17.740, + "args": { + "External id": 151555, "cbid": 211, "correlation": 289987288 + } + }, + { + "ph": "s", "id": 289987288, "pid": 5714, "tid": 6744, "ts": 6303771871368.270, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771887238.446, "dur": 1.856, + "args": { + "External id": 151561, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987306, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987306, "pid": 0, "tid": 7, "ts": 6303771887238.446, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771871562.210, "dur": 13.940, + "args": { + "External id": 151561, "cbid": 211, "correlation": 289987306 + } + }, + { + "ph": "s", "id": 289987306, "pid": 5714, "tid": 6744, "ts": 6303771871562.210, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6303771887241.102, "dur": 0.384, + "args": { + "External id": 151569, "device": 0, "context": 1, "stream": 7, "correlation": 289987324, "bytes": 4, "memory bandwidth (GB/s)": 0.010416666666666666 + } + }, + { + "ph": "f", "id": 289987324, "pid": 0, "tid": 7, "ts": 6303771887241.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771871669.290, "dur": 16.819, + "args": { + "External id": 151569, "cbid": 41, "correlation": 289987324 + } + }, + { + "ph": "s", "id": 289987324, "pid": 5714, "tid": 6744, "ts": 6303771871669.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771871686.569, "dur": 15557.406, + "args": { + "External id": 151569, "cbid": 131, "correlation": 289987325 + } + }, + { + "ph": "s", "id": 289987325, "pid": 5714, "tid": 6744, "ts": 6303771871686.569, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887287.503, "dur": 1.152, + "args": { + "External id": 151573, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987334, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987334, "pid": 0, "tid": 7, "ts": 6303771887287.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887274.075, "dur": 14.540, + "args": { + "External id": 151573, "cbid": 211, "correlation": 289987334 + } + }, + { + "ph": "s", "id": 289987334, "pid": 5714, "tid": 6744, "ts": 6303771887274.075, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771887327.695, "dur": 0.992, + "args": { + "External id": 151576, "device": 0, "context": 1, "stream": 7, "correlation": 289987340, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 289987340, "pid": 0, "tid": 7, "ts": 6303771887327.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771887316.815, "dur": 10.630, + "args": { + "External id": 151576, "cbid": 41, "correlation": 289987340 + } + }, + { + "ph": "s", "id": 289987340, "pid": 5714, "tid": 6744, "ts": 6303771887316.815, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771887327.755, "dur": 3.860, + "args": { + "External id": 151576, "cbid": 131, "correlation": 289987341 + } + }, + { + "ph": "s", "id": 289987341, "pid": 5714, "tid": 6744, "ts": 6303771887327.755, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887430.384, 
"dur": 16.033, + "args": { + "External id": 151587, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987363, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987363, "pid": 0, "tid": 7, "ts": 6303771887430.384, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887419.565, "dur": 10.780, + "args": { + "External id": 151587, "cbid": 211, "correlation": 289987363 + } + }, + { + "ph": "s", "id": 289987363, "pid": 5714, "tid": 6744, "ts": 6303771887419.565, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771887461.713, "dur": 44.480, + "args": { + "External id": 151590, "device": 0, "context": 1, "stream": 7, "correlation": 289987370, "bytes": 25165824, "memory bandwidth (GB/s)": 565.778417266187 + } + }, + { + "ph": "f", "id": 289987370, "pid": 0, "tid": 7, "ts": 6303771887461.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771887445.025, "dur": 16.140, + "args": { + "External id": 151590, "cbid": 41, "correlation": 289987370 + } + }, + { + "ph": "s", "id": 289987370, "pid": 5714, "tid": 6744, "ts": 6303771887445.025, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887510.737, "dur": 85.569, + "args": { + "External id": 151597, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987388, "registers per thread": 16, "shared memory": 0, "blocks per SM": 
384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987388, "pid": 0, "tid": 7, "ts": 6303771887510.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887502.065, "dur": 8.009, + "args": { + "External id": 151597, "cbid": 211, "correlation": 289987388 + } + }, + { + "ph": "s", "id": 289987388, "pid": 5714, "tid": 6744, "ts": 6303771887502.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771887596.947, "dur": 69.952, + "args": { + "External id": 151600, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987396, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987396, "pid": 0, "tid": 7, "ts": 6303771887596.947, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887527.174, "dur": 5.740, + "args": { + "External id": 151600, "cbid": 211, "correlation": 289987396 + } + }, + { + "ph": "s", "id": 289987396, "pid": 5714, "tid": 6744, "ts": 6303771887527.174, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887667.507, "dur": 86.689, + "args": { + "External id": 151607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987415, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987415, "pid": 0, "tid": 7, "ts": 6303771887667.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887566.284, "dur": 6.350, + "args": { + "External id": 151607, "cbid": 211, "correlation": 289987415 + } + }, + { + "ph": "s", "id": 289987415, "pid": 5714, "tid": 6744, "ts": 6303771887566.284, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771887754.836, "dur": 226.211, + "args": { + "External id": 151610, "device": 0, "context": 1, "stream": 7, "correlation": 289987422, "bytes": 100663296, "memory bandwidth (GB/s)": 444.99735202974216 + } + }, + { + "ph": "f", "id": 289987422, "pid": 0, "tid": 7, "ts": 6303771887754.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771887583.294, "dur": 17.930, + "args": { + "External id": 151610, "cbid": 41, "correlation": 289987422 + } + }, + { + "ph": "s", "id": 289987422, "pid": 5714, "tid": 6744, "ts": 6303771887583.294, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771887981.783, "dur": 98.657, + "args": { + "External id": 151617, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987440, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987440, "pid": 0, "tid": 7, "ts": 6303771887981.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887645.504, "dur": 6.740, + "args": { + "External id": 151617, "cbid": 211, "correlation": 289987440 + } + }, + { + "ph": "s", "id": 289987440, "pid": 5714, "tid": 6744, "ts": 6303771887645.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771888081.144, "dur": 224.963, + "args": { + "External id": 151620, "device": 0, "context": 1, "stream": 7, "correlation": 289987447, "bytes": 100663296, "memory bandwidth (GB/s)": 447.46600996608333 + } + }, + { + "ph": "f", "id": 289987447, "pid": 0, "tid": 7, "ts": 6303771888081.144, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771887664.814, "dur": 10.140, + "args": { + "External id": 151620, "cbid": 41, "correlation": 289987447 + } + }, + { + "ph": "s", "id": 289987447, "pid": 5714, "tid": 6744, "ts": 6303771887664.814, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771888306.843, "dur": 2.560, + "args": { + "External id": 151624, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987465, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987465, "pid": 0, "tid": 7, "ts": 6303771888306.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771887709.364, "dur": 7.740, + "args": { + "External id": 151624, "cbid": 211, "correlation": 289987465 + } + }, + { + "ph": "s", "id": 289987465, "pid": 5714, "tid": 6744, "ts": 6303771887709.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6303771888310.203, "dur": 0.352, + "args": { + "External id": 151632, "device": 0, "context": 1, "stream": 7, "correlation": 289987483, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 289987483, "pid": 0, "tid": 7, "ts": 6303771888310.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771887780.224, "dur": 10.260, + "args": { + "External id": 151632, "cbid": 41, "correlation": 289987483 + } + }, + { + "ph": "s", "id": 289987483, "pid": 5714, "tid": 6744, "ts": 6303771887780.224, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771887790.844, "dur": 521.989, + "args": { + "External id": 151632, "cbid": 131, "correlation": 289987484 + } + }, + { + "ph": "s", "id": 289987484, "pid": 5714, "tid": 6744, "ts": 6303771887790.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771888345.147, "dur": 1.408, + "args": { + "External id": 151636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987493, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987493, "pid": 0, "tid": 7, "ts": 6303771888345.147, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888335.263, "dur": 10.029, + "args": { + "External id": 151636, "cbid": 211, "correlation": 289987493 + } + }, + { + "ph": "s", "id": 289987493, "pid": 5714, "tid": 6744, "ts": 6303771888335.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771888366.523, "dur": 0.992, + "args": { + "External id": 151639, "device": 0, "context": 1, "stream": 7, "correlation": 289987499, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 289987499, "pid": 0, "tid": 7, "ts": 6303771888366.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771888357.803, "dur": 8.140, + "args": { + "External id": 151639, "cbid": 41, "correlation": 289987499 + } + }, + { + "ph": "s", "id": 289987499, "pid": 5714, "tid": 6744, "ts": 6303771888357.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771888366.223, "dur": 3.740, + "args": { + "External id": 151639, "cbid": 131, "correlation": 289987500 + } + }, + { + "ph": "s", "id": 289987500, "pid": 5714, "tid": 6744, "ts": 6303771888366.223, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 
6303771888405.244, "dur": 160.610, + "args": { + "External id": 151640, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987507, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987507, "pid": 0, "tid": 7, "ts": 6303771888405.244, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888396.803, "dur": 8.040, + "args": { + "External id": 151640, "cbid": 211, "correlation": 289987507 + } + }, + { + "ph": "s", "id": 289987507, "pid": 5714, "tid": 6744, "ts": 6303771888396.803, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771888566.526, "dur": 18.304, + "args": { + "External id": 151651, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987531, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987531, "pid": 0, "tid": 7, "ts": 6303771888566.526, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888458.572, "dur": 8.140, + "args": { + "External id": 151651, "cbid": 211, "correlation": 289987531 + } + }, + { + "ph": "s", "id": 289987531, "pid": 5714, "tid": 6744, "ts": 6303771888458.572, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771888585.534, "dur": 44.512, + "args": { + "External id": 151654, "device": 0, "context": 1, "stream": 7, "correlation": 289987538, "bytes": 25165824, "memory bandwidth (GB/s)": 565.371675053918 + } + }, + { + "ph": "f", "id": 289987538, "pid": 0, "tid": 7, "ts": 6303771888585.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771888479.662, "dur": 11.090, + "args": { + "External id": 151654, "cbid": 41, "correlation": 289987538 + } + }, + { + "ph": "s", "id": 289987538, "pid": 5714, "tid": 6744, "ts": 6303771888479.662, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771888630.782, "dur": 86.401, + "args": { + "External id": 151661, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987556, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987556, "pid": 0, "tid": 7, "ts": 6303771888630.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888527.002, "dur": 7.530, + "args": { + "External id": 151661, "cbid": 211, "correlation": 289987556 + } + }, + { + "ph": "s", "id": 289987556, "pid": 5714, "tid": 6744, "ts": 6303771888527.002, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771888717.823, "dur": 70.817, + "args": { + "External id": 151664, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987564, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987564, "pid": 0, "tid": 7, "ts": 6303771888717.823, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888550.592, "dur": 5.180, + "args": { + "External id": 151664, "cbid": 211, "correlation": 289987564 + } + }, + { + "ph": "s", "id": 289987564, "pid": 5714, "tid": 6744, "ts": 6303771888550.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771888789.344, "dur": 87.393, + "args": { + "External id": 151671, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987583, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987583, "pid": 0, "tid": 7, "ts": 6303771888789.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888592.332, "dur": 7.490, + "args": { + "External id": 151671, "cbid": 211, "correlation": 289987583 + } + }, + { + "ph": "s", "id": 289987583, "pid": 5714, "tid": 6744, "ts": 6303771888592.332, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771888877.473, "dur": 226.691, + "args": { + "External id": 151674, "device": 0, "context": 1, "stream": 7, "correlation": 289987590, "bytes": 100663296, "memory bandwidth (GB/s)": 444.0551058489309 + } + }, + { + "ph": "f", "id": 289987590, "pid": 0, "tid": 7, "ts": 6303771888877.473, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771888614.432, "dur": 16.740, + "args": { + "External id": 151674, "cbid": 41, "correlation": 289987590 + } + }, + { + "ph": "s", "id": 289987590, "pid": 5714, "tid": 6744, "ts": 6303771888614.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771889104.900, "dur": 98.241, + "args": { + "External id": 151681, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987608, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987608, "pid": 0, "tid": 7, "ts": 6303771889104.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888667.672, "dur": 20.620, + "args": { + "External id": 151681, "cbid": 211, "correlation": 289987608 + } + }, + { + "ph": "s", "id": 289987608, "pid": 5714, "tid": 6744, "ts": 6303771888667.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771889203.813, "dur": 225.891, + "args": { + "External id": 151684, "device": 0, "context": 1, "stream": 7, "correlation": 289987615, "bytes": 100663296, "memory bandwidth (GB/s)": 445.627740813047 + } + }, + { + "ph": "f", "id": 289987615, "pid": 0, "tid": 7, "ts": 6303771889203.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771888700.522, "dur": 11.270, + "args": { + "External id": 151684, "cbid": 41, "correlation": 289987615 + } + }, + { + "ph": "s", "id": 289987615, "pid": 5714, "tid": 6744, "ts": 6303771888700.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771889430.344, "dur": 323.620, + "args": { + "External id": 151685, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987622, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987622, "pid": 0, "tid": 7, "ts": 6303771889430.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888726.422, "dur": 6.490, + "args": { + "External id": 151685, "cbid": 211, "correlation": 289987622 + } + }, + { + "ph": "s", "id": 289987622, "pid": 5714, "tid": 6744, "ts": 6303771888726.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771889754.604, "dur": 2.656, + "args": { + "External id": 151689, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987642, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987642, "pid": 0, "tid": 7, "ts": 6303771889754.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771888771.842, "dur": 8.249, + "args": { + "External id": 151689, "cbid": 211, "correlation": 289987642 + } + }, + { + "ph": "s", "id": 289987642, "pid": 5714, "tid": 6744, "ts": 6303771888771.842, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6303771889758.124, "dur": 0.352, + "args": { + "External id": 151697, "device": 0, "context": 1, "stream": 7, "correlation": 289987660, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 289987660, "pid": 0, "tid": 7, "ts": 6303771889758.124, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771888843.462, "dur": 8.740, + "args": { + "External id": 151697, "cbid": 41, "correlation": 289987660 + } + }, + { + "ph": "s", "id": 289987660, "pid": 5714, "tid": 6744, "ts": 6303771888843.462, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771888852.491, "dur": 908.049, + "args": { + "External id": 151697, "cbid": 131, "correlation": 289987661 + } + }, + { + "ph": "s", "id": 289987661, "pid": 5714, "tid": 6744, "ts": 6303771888852.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771889789.932, "dur": 1.408, + "args": { + "External id": 151701, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987670, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987670, "pid": 0, "tid": 7, "ts": 6303771889789.932, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771889780.809, "dur": 8.870, + "args": { + "External id": 151701, "cbid": 211, "correlation": 289987670 + } + }, + { + "ph": "s", "id": 289987670, "pid": 5714, "tid": 6744, "ts": 6303771889780.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771889808.332, "dur": 0.992, + "args": { + "External id": 151704, "device": 0, "context": 1, "stream": 7, "correlation": 289987676, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 289987676, "pid": 0, "tid": 7, "ts": 6303771889808.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771889800.729, "dur": 6.920, + "args": { + "External id": 151704, "cbid": 41, "correlation": 289987676 + } + }, + { + "ph": "s", "id": 289987676, "pid": 5714, "tid": 6744, "ts": 6303771889800.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771889807.929, "dur": 3.600, + "args": { + "External id": 151704, "cbid": 131, "correlation": 289987677 + } + }, + { + "ph": "s", "id": 289987677, "pid": 5714, "tid": 6744, "ts": 6303771889807.929, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 
6303771889839.596, "dur": 159.650, + "args": { + "External id": 151705, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987684, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987684, "pid": 0, "tid": 7, "ts": 6303771889839.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771889832.419, "dur": 6.660, + "args": { + "External id": 151705, "cbid": 211, "correlation": 289987684 + } + }, + { + "ph": "s", "id": 289987684, "pid": 5714, "tid": 6744, "ts": 6303771889832.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771889999.822, "dur": 17.345, + "args": { + "External id": 151716, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987708, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987708, "pid": 0, "tid": 7, "ts": 6303771889999.822, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771889888.479, "dur": 7.560, + "args": { + "External id": 151716, "cbid": 211, "correlation": 289987708 + } + }, + { + "ph": "s", "id": 289987708, "pid": 5714, "tid": 6744, "ts": 6303771889888.479, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771890017.807, "dur": 44.480, + "args": { + "External id": 151719, "device": 0, "context": 1, "stream": 7, "correlation": 289987715, "bytes": 25165824, "memory bandwidth (GB/s)": 565.778417266187 + } + }, + { + "ph": "f", "id": 289987715, "pid": 0, "tid": 7, "ts": 6303771890017.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771889906.319, "dur": 10.180, + "args": { + "External id": 151719, "cbid": 41, "correlation": 289987715 + } + }, + { + "ph": "s", "id": 289987715, "pid": 5714, "tid": 6744, "ts": 6303771889906.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771890062.927, "dur": 86.625, + "args": { + "External id": 151726, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987733, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987733, "pid": 0, "tid": 7, "ts": 6303771890062.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771889948.419, "dur": 6.270, + "args": { + "External id": 151726, "cbid": 211, "correlation": 289987733 + } + }, + { + "ph": "s", "id": 289987733, "pid": 5714, "tid": 6744, "ts": 6303771889948.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771890150.192, "dur": 69.793, + "args": { + "External id": 151729, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987741, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987741, "pid": 0, "tid": 7, "ts": 6303771890150.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771889967.749, "dur": 4.500, + "args": { + "External id": 151729, "cbid": 211, "correlation": 289987741 + } + }, + { + "ph": "s", "id": 289987741, "pid": 5714, "tid": 6744, "ts": 6303771889967.749, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771890220.657, "dur": 87.873, + "args": { + "External id": 151736, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987760, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987760, "pid": 0, "tid": 7, "ts": 6303771890220.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771889999.759, "dur": 5.560, + "args": { + "External id": 151736, "cbid": 211, "correlation": 289987760 + } + }, + { + "ph": "s", "id": 289987760, "pid": 5714, "tid": 6744, "ts": 6303771889999.759, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771890309.234, "dur": 226.818, + "args": { + "External id": 151739, "device": 0, "context": 1, "stream": 7, "correlation": 289987767, "bytes": 100663296, "memory bandwidth (GB/s)": 443.8064703859482 + } + }, + { + "ph": "f", "id": 289987767, "pid": 0, "tid": 7, "ts": 6303771890309.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771890014.019, "dur": 9.980, + "args": { + "External id": 151739, "cbid": 41, "correlation": 289987767 + } + }, + { + "ph": "s", "id": 289987767, "pid": 5714, "tid": 6744, "ts": 6303771890014.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771890536.788, "dur": 98.018, + "args": { + "External id": 151746, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987785, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987785, "pid": 0, "tid": 7, "ts": 6303771890536.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771890054.709, "dur": 6.010, + "args": { + "External id": 151746, "cbid": 211, "correlation": 289987785 + } + }, + { + "ph": "s", "id": 289987785, "pid": 5714, "tid": 6744, "ts": 6303771890054.709, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771890635.542, "dur": 225.442, + "args": { + "External id": 151749, "device": 0, "context": 1, "stream": 7, "correlation": 289987792, "bytes": 100663296, "memory bandwidth (GB/s)": 446.51527222079295 + } + }, + { + "ph": "f", "id": 289987792, "pid": 0, "tid": 7, "ts": 6303771890635.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771890068.809, "dur": 9.000, + "args": { + "External id": 151749, "cbid": 41, "correlation": 289987792 + } + }, + { + "ph": "s", "id": 289987792, "pid": 5714, "tid": 6744, "ts": 6303771890068.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771890861.720, "dur": 324.868, + "args": { + "External id": 151750, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987799, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987799, "pid": 0, "tid": 7, "ts": 6303771890861.720, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771890088.169, "dur": 4.390, + "args": { + "External id": 151750, "cbid": 211, "correlation": 289987799 + } + }, + { + "ph": "s", "id": 289987799, "pid": 5714, "tid": 6744, "ts": 6303771890088.169, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771891187.260, "dur": 2.528, + "args": { + "External id": 151754, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987819, "registers per thread": 32, "shared memory": 16, "blocks per SM": 0.007812, "warps per SM": 0.000244, "grid": [1, 1, 1], "block": [1, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987819, "pid": 0, "tid": 7, "ts": 6303771891187.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771890120.779, "dur": 6.489, + "args": { + "External id": 151754, "cbid": 211, "correlation": 289987819 + } + }, + { + "ph": "s", "id": 289987819, "pid": 5714, "tid": 6744, "ts": 6303771890120.779, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pageable -> Device)", "pid": 0, "tid": 7, + "ts": 6303771891190.620, "dur": 0.352, + "args": { + "External id": 151762, "device": 0, "context": 1, "stream": 7, "correlation": 289987837, "bytes": 4, "memory bandwidth (GB/s)": 0.011363636363636364 + } + }, + { + "ph": "f", "id": 289987837, "pid": 0, "tid": 7, "ts": 6303771891190.620, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771890187.119, "dur": 9.829, + "args": { + "External id": 151762, "cbid": 41, "correlation": 289987837 + } + }, + { + "ph": "s", "id": 289987837, "pid": 5714, "tid": 6744, "ts": 6303771890187.119, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771890197.279, "dur": 995.717, + "args": { + "External id": 151762, "cbid": 131, "correlation": 289987838 + } + }, + { + "ph": "s", "id": 289987838, "pid": 5714, "tid": 6744, "ts": 6303771890197.279, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771891220.668, "dur": 1.344, + "args": { + "External id": 151766, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987847, 
"registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289987847, "pid": 0, "tid": 7, "ts": 6303771891220.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891211.406, "dur": 9.080, + "args": { + "External id": 151766, "cbid": 211, "correlation": 289987847 + } + }, + { + "ph": "s", "id": 289987847, "pid": 5714, "tid": 6744, "ts": 6303771891211.406, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303771891239.645, "dur": 0.864, + "args": { + "External id": 151769, "device": 0, "context": 1, "stream": 7, "correlation": 289987853, "bytes": 1, "memory bandwidth (GB/s)": 0.0011574074074074073 + } + }, + { + "ph": "f", "id": 289987853, "pid": 0, "tid": 7, "ts": 6303771891239.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771891231.666, "dur": 7.250, + "args": { + "External id": 151769, "cbid": 41, "correlation": 289987853 + } + }, + { + "ph": "s", "id": 289987853, "pid": 5714, "tid": 6744, "ts": 6303771891231.666, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 6744, + "ts": 6303771891239.136, "dur": 3.630, + "args": { + "External id": 151769, "cbid": 131, "correlation": 289987854 + } + }, + { + "ph": "s", "id": 289987854, "pid": 5714, "tid": 6744, "ts": 6303771891239.136, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 
6303771891271.421, "dur": 159.202, + "args": { + "External id": 151770, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987861, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987861, "pid": 0, "tid": 7, "ts": 6303771891271.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891264.246, "dur": 6.680, + "args": { + "External id": 151770, "cbid": 211, "correlation": 289987861 + } + }, + { + "ph": "s", "id": 289987861, "pid": 5714, "tid": 6744, "ts": 6303771891264.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771891431.295, "dur": 17.664, + "args": { + "External id": 151781, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987885, "registers per thread": 16, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 384.000000, "grid": [12288, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987885, "pid": 0, "tid": 7, "ts": 6303771891431.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891327.226, "dur": 7.720, + "args": { + "External id": 151781, "cbid": 211, "correlation": 289987885 + } + }, + { + "ph": "s", "id": 289987885, "pid": 5714, "tid": 6744, "ts": 6303771891327.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771891450.271, "dur": 44.321, + "args": { + "External id": 151784, "device": 0, "context": 1, "stream": 7, "correlation": 289987892, "bytes": 25165824, "memory bandwidth (GB/s)": 567.8081270729451 + } + }, + { + "ph": "f", "id": 289987892, "pid": 0, "tid": 7, "ts": 6303771891450.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771891345.516, "dur": 10.810, + "args": { + "External id": 151784, "cbid": 41, "correlation": 289987892 + } + }, + { + "ph": "s", "id": 289987892, "pid": 5714, "tid": 6744, "ts": 6303771891345.516, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771891495.264, "dur": 86.689, + "args": { + "External id": 151791, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987910, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987910, "pid": 0, "tid": 7, "ts": 6303771891495.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891391.166, "dur": 6.470, + "args": { + "External id": 151791, "cbid": 211, "correlation": 289987910 + } + }, + { + "ph": "s", "id": 289987910, "pid": 5714, "tid": 6744, "ts": 6303771891391.166, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast(at::TensorIteratorBase&, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#12}::operator()() const::{lambda(c10::BFloat16)#1} const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771891582.657, "dur": 69.985, + "args": { + "External id": 151794, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987918, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987918, "pid": 0, "tid": 7, "ts": 6303771891582.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891410.516, "dur": 4.590, + "args": { + "External id": 151794, "cbid": 211, "correlation": 289987918 + } + }, + { + "ph": "s", "id": 289987918, "pid": 5714, "tid": 6744, "ts": 6303771891410.516, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771891653.314, "dur": 87.360, + "args": { + "External id": 151801, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987937, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987937, "pid": 0, "tid": 7, "ts": 6303771891653.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891443.726, "dur": 5.810, + "args": { + "External id": 151801, "cbid": 211, "correlation": 289987937 + } + }, + { + "ph": "s", "id": 289987937, "pid": 5714, "tid": 6744, "ts": 6303771891443.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771891741.346, "dur": 226.595, + "args": { + "External id": 151804, "device": 0, "context": 1, "stream": 7, "correlation": 289987944, "bytes": 100663296, "memory bandwidth (GB/s)": 444.2432357289437 + } + }, + { + "ph": "f", "id": 289987944, "pid": 0, "tid": 7, "ts": 6303771891741.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771891457.946, "dur": 16.070, + "args": { + "External id": 151804, "cbid": 41, "correlation": 289987944 + } + }, + { + "ph": "s", "id": 289987944, "pid": 5714, "tid": 6744, "ts": 6303771891457.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303771891968.581, "dur": 98.113, + "args": { + "External id": 151811, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987962, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987962, "pid": 0, "tid": 7, "ts": 6303771891968.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891503.145, "dur": 7.020, + "args": { + "External id": 151811, "cbid": 211, "correlation": 289987962 + } + }, + { + "ph": "s", "id": 289987962, "pid": 5714, "tid": 6744, "ts": 6303771891503.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771892067.430, "dur": 226.083, + "args": { + "External id": 151814, "device": 0, "context": 1, "stream": 7, "correlation": 289987969, "bytes": 100663296, "memory bandwidth (GB/s)": 445.24929340109605 + } + }, + { + "ph": "f", "id": 289987969, "pid": 0, "tid": 7, "ts": 6303771892067.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771891519.705, "dur": 8.651, + "args": { + "External id": 151814, "cbid": 41, "correlation": 289987969 + } + }, + { + "ph": "s", "id": 289987969, "pid": 5714, "tid": 6744, "ts": 6303771891519.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771892294.249, "dur": 322.724, + "args": { + "External id": 151815, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987976, "registers per thread": 23, "shared memory": 0, "blocks per SM": 768.000000, "warps per SM": 3072.000000, "grid": [98304, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987976, "pid": 0, "tid": 7, "ts": 6303771892294.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891539.056, "dur": 4.689, + "args": { + "External id": 151815, "cbid": 211, "correlation": 289987976 + } + }, + { + "ph": "s", "id": 289987976, "pid": 5714, "tid": 6744, "ts": 6303771891539.056, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771892617.613, "dur": 199.010, + "args": { + "External id": 151818, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289987997, "registers per thread": 35, "shared memory": 1024, "blocks per SM": 16.031250, "warps per SM": 64.125000, "grid": [2052, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289987997, "pid": 0, "tid": 7, "ts": 6303771892617.613, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891699.205, "dur": 9.480, + "args": { + "External id": 151818, "cbid": 307, "correlation": 289987997 + } + }, + { + "ph": "s", "id": 289987997, "pid": 5714, "tid": 6744, "ts": 6303771891699.205, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771892817.327, "dur": 4.544, + "args": { + "External id": 151819, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988005, "registers per thread": 21, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289988005, "pid": 0, "tid": 7, "ts": 6303771892817.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891742.005, "dur": 7.210, + "args": { + "External id": 151819, "cbid": 307, "correlation": 289988005 + } + }, + { + "ph": "s", "id": 289988005, "pid": 5714, "tid": 6744, "ts": 6303771891742.005, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771892822.511, "dur": 311.460, + "args": { + "External id": 151820, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988016, "registers per thread": 24, "shared memory": 32, "blocks per SM": 512.000000, "warps per SM": 4096.000000, "grid": [65536, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988016, "pid": 0, "tid": 7, "ts": 6303771892822.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891784.315, "dur": 7.280, + "args": { + "External id": 151820, "cbid": 307, "correlation": 289988016 + } + }, + { + "ph": "s", "id": 289988016, "pid": 5714, "tid": 6744, "ts": 6303771891784.315, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771893134.675, "dur": 329.475, + "args": { + "External id": 151849, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988053, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988053, "pid": 0, "tid": 7, "ts": 6303771893134.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771891987.175, "dur": 11.389, + "args": { + "External id": 151849, "cbid": 211, "correlation": 289988053 + } + }, + { + "ph": "s", "id": 289988053, "pid": 5714, "tid": 6744, "ts": 6303771891987.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771893464.854, "dur": 433.926, + "args": { + "External id": 151838, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988081, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988081, "pid": 0, "tid": 7, "ts": 6303771893464.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892070.564, "dur": 7.710, + "args": { + "External id": 151838, "cbid": 307, "correlation": 289988081 + } + }, + { + "ph": "s", "id": 289988081, "pid": 5714, "tid": 6744, "ts": 6303771892070.564, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771892190.674, "dur": 0.630, + "args": { + "External id": 151874, "cbid": 200, "correlation": 289988106 + } + }, + { + "ph": "f", "id": 289988106, "pid": 5714, "tid": 6744, "ts": 6303771892190.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771893899.676, "dur": 0.832, + "args": { + "External id": 151874, "device": 0, "context": 1, "stream": 7, "correlation": 
289988109, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289988109, "pid": 0, "tid": 7, "ts": 6303771893899.676, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771892192.954, "dur": 8.810, + "args": { + "External id": 151874, "cbid": 51, "correlation": 289988109 + } + }, + { + "ph": "s", "id": 289988109, "pid": 5714, "tid": 6744, "ts": 6303771892192.954, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771893901.724, "dur": 369.636, + "args": { + "External id": 151874, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988110, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988110, "pid": 0, "tid": 7, "ts": 6303771893901.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892201.984, "dur": 7.300, + "args": { + "External id": 151874, "cbid": 307, "correlation": 289988110 + } + }, + { + "ph": "s", "id": 289988110, "pid": 5714, "tid": 6744, "ts": 6303771892201.984, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771892331.074, "dur": 0.530, + "args": { + "External id": 151892, "cbid": 200, "correlation": 289988147 + } + }, + { + "ph": "f", "id": 289988147, "pid": 5714, "tid": 6744, "ts": 6303771892331.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771894272.224, "dur": 0.832, + "args": { + "External id": 151892, "device": 0, "context": 1, "stream": 7, "correlation": 289988150, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289988150, "pid": 0, "tid": 7, "ts": 6303771894272.224, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771892333.094, "dur": 6.410, + "args": { + "External id": 151892, "cbid": 51, "correlation": 289988150 + } + }, + { + "ph": "s", "id": 289988150, "pid": 5714, "tid": 6744, "ts": 6303771892333.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771894274.272, "dur": 354.596, + "args": { + "External id": 151892, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988151, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988151, "pid": 0, "tid": 7, "ts": 6303771894274.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892339.694, "dur": 8.170, + "args": { + "External id": 151892, "cbid": 307, "correlation": 289988151 + } + }, + { + "ph": "s", "id": 289988151, "pid": 5714, "tid": 6744, "ts": 6303771892339.694, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771892377.883, "dur": 0.291, + "args": { + "External id": 151899, "cbid": 200, "correlation": 289988176 + } + }, + { + "ph": "f", "id": 289988176, "pid": 5714, "tid": 6744, "ts": 6303771892377.883, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771894630.116, "dur": 353.828, + "args": { + "External id": 151899, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988179, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988179, "pid": 0, "tid": 7, "ts": 6303771894630.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892379.234, "dur": 5.549, + "args": { + "External id": 151899, "cbid": 307, "correlation": 289988179 + } + }, + { + "ph": "s", "id": 289988179, "pid": 5714, "tid": 6744, "ts": 6303771892379.234, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771892479.513, "dur": 0.430, + "args": { + "External id": 151922, "cbid": 200, "correlation": 289988224 + } + }, + { + "ph": "f", "id": 289988224, "pid": 5714, "tid": 6744, "ts": 6303771892479.513, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771894984.808, "dur": 0.800, + "args": { + "External id": 151922, "device": 0, "context": 1, "stream": 7, "correlation": 289988227, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289988227, "pid": 0, "tid": 7, "ts": 6303771894984.808, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771892481.263, "dur": 5.570, + "args": { + "External id": 151922, "cbid": 51, "correlation": 289988227 + } + }, + { + "ph": "s", "id": 289988227, "pid": 5714, "tid": 6744, "ts": 6303771892481.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771894986.408, "dur": 355.268, + "args": { + "External id": 151922, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988228, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988228, "pid": 0, "tid": 7, "ts": 6303771894986.408, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892486.993, "dur": 6.690, + "args": { + "External id": 151922, "cbid": 307, "correlation": 289988228 + } + }, + { + "ph": "s", "id": 289988228, "pid": 5714, "tid": 6744, "ts": 6303771892486.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771892521.433, "dur": 0.300, + "args": { + "External id": 151929, "cbid": 200, "correlation": 289988253 + } + }, + { + "ph": "f", "id": 289988253, "pid": 5714, "tid": 6744, "ts": 6303771892521.433, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771895342.284, "dur": 358.725, + "args": { + "External id": 151929, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988256, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988256, "pid": 0, "tid": 7, "ts": 6303771895342.284, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892522.653, "dur": 5.270, + "args": { + "External id": 151929, "cbid": 307, "correlation": 289988256 + } + }, + { + "ph": "s", "id": 289988256, "pid": 5714, "tid": 6744, "ts": 6303771892522.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771895701.617, "dur": 51.104, + "args": { + "External id": 151934, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988270, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988270, "pid": 0, "tid": 7, "ts": 6303771895701.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892567.833, "dur": 6.950, + "args": { + "External id": 151934, "cbid": 211, "correlation": 289988270 + } + }, + { + "ph": "s", "id": 289988270, "pid": 5714, "tid": 6744, "ts": 6303771892567.833, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771895753.361, "dur": 44.609, + "args": { + "External id": 151946, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988294, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988294, "pid": 0, "tid": 7, "ts": 6303771895753.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892628.353, "dur": 9.120, + "args": { + "External id": 151946, "cbid": 211, "correlation": 289988294 + } + }, + { + "ph": "s", "id": 289988294, "pid": 5714, "tid": 6744, "ts": 6303771892628.353, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771895798.578, "dur": 24.640, + "args": { + "External id": 151947, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988304, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988304, "pid": 0, "tid": 7, "ts": 6303771895798.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892647.133, "dur": 5.480, + "args": { + "External id": 151947, "cbid": 211, "correlation": 289988304 + } + }, + { + "ph": "s", "id": 289988304, "pid": 5714, "tid": 6744, "ts": 6303771892647.133, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771895824.050, "dur": 0.768, + "args": { + "External id": 151948, "device": 0, "context": 1, "stream": 7, "correlation": 289988319, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 289988319, "pid": 0, "tid": 7, "ts": 6303771895824.050, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771892670.703, "dur": 6.430, + "args": { + "External id": 151948, "cbid": 51, "correlation": 289988319 + } + }, + { + "ph": "s", "id": 289988319, "pid": 5714, "tid": 6744, "ts": 6303771892670.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771895825.650, "dur": 42.528, + "args": { + "External id": 151948, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988321, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289988321, "pid": 0, "tid": 7, "ts": 6303771895825.650, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892678.703, "dur": 6.430, + "args": { + "External id": 151948, "cbid": 211, "correlation": 289988321 + } + }, + { + "ph": "s", "id": 289988321, "pid": 5714, "tid": 6744, "ts": 6303771892678.703, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771895868.786, "dur": 51.873, + "args": { + "External id": 151959, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988342, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988342, "pid": 0, "tid": 7, "ts": 6303771895868.786, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892740.533, "dur": 8.780, + "args": { + "External id": 151959, "cbid": 211, "correlation": 289988342 + } + }, + { + "ph": "s", "id": 289988342, "pid": 5714, "tid": 6744, "ts": 6303771892740.533, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771895921.299, "dur": 141.602, + "args": { + "External id": 151962, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988357, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988357, "pid": 0, "tid": 7, "ts": 6303771895921.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892774.363, "dur": 6.830, + "args": { + "External id": 151962, "cbid": 211, "correlation": 289988357 + } + }, + { + "ph": "s", "id": 289988357, "pid": 5714, "tid": 6744, "ts": 6303771892774.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771896063.509, "dur": 107.841, + "args": { + "External id": 151963, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988367, "pid": 0, "tid": 7, "ts": 6303771896063.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892790.943, "dur": 5.530, + "args": { + "External id": 151963, "cbid": 211, "correlation": 289988367 + } + }, + { + "ph": "s", "id": 289988367, "pid": 5714, "tid": 6744, "ts": 6303771892790.943, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771896172.086, "dur": 77.857, + "args": { + "External id": 151964, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988381, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988381, "pid": 0, "tid": 7, "ts": 6303771896172.086, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892809.163, "dur": 4.640, + "args": { + "External id": 151964, "cbid": 211, "correlation": 289988381 + } + }, + { + "ph": "s", "id": 289988381, "pid": 5714, "tid": 6744, "ts": 6303771892809.163, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896250.615, "dur": 1.472, + "args": { + "External id": 151967, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988395, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289988395, "pid": 0, "tid": 7, "ts": 6303771896250.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892846.702, "dur": 8.000, + "args": { + "External id": 151967, "cbid": 211, "correlation": 289988395 + } + }, + { + "ph": "s", "id": 289988395, "pid": 5714, "tid": 6744, "ts": 6303771892846.702, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896252.791, "dur": 1.216, + "args": { + "External id": 151971, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988405, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289988405, "pid": 0, "tid": 7, "ts": 6303771896252.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892871.042, "dur": 5.880, + "args": { + "External id": 151971, "cbid": 211, "correlation": 289988405 + } + }, + { + "ph": "s", "id": 289988405, "pid": 5714, "tid": 6744, "ts": 6303771892871.042, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896254.647, "dur": 1.024, + "args": { + "External id": 151972, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988415, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289988415, "pid": 0, "tid": 7, "ts": 6303771896254.647, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892884.362, "dur": 3.960, + "args": { + "External id": 151972, "cbid": 211, "correlation": 289988415 + } + }, + { + "ph": "s", "id": 289988415, "pid": 5714, "tid": 6744, "ts": 6303771892884.362, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771896256.407, "dur": 26.944, + "args": { + "External id": 151980, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988433, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988433, "pid": 0, "tid": 7, "ts": 6303771896256.407, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771892938.732, "dur": 7.770, + "args": { + "External id": 151980, "cbid": 211, "correlation": 289988433 + } + }, + { + "ph": "s", "id": 289988433, "pid": 5714, "tid": 6744, "ts": 6303771892938.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771896284.055, "dur": 114.273, + "args": { + "External id": 151986, "device": 0, "context": 1, "stream": 7, "correlation": 289988447, "bytes": 50331648, "memory bandwidth (GB/s)": 440.45092016486836 + } + }, + { + "ph": "f", "id": 289988447, "pid": 0, "tid": 7, "ts": 6303771896284.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771892973.612, "dur": 15.130, + "args": { + "External id": 151986, "cbid": 41, "correlation": 289988447 + } + }, + { + "ph": "s", "id": 289988447, "pid": 5714, "tid": 6744, "ts": 6303771892973.612, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896399.000, "dur": 71.778, + "args": { + "External id": 151988, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988459, "pid": 0, "tid": 7, "ts": 6303771896399.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893001.372, "dur": 5.710, + "args": { + "External id": 151988, "cbid": 211, "correlation": 289988459 + } + }, + { + "ph": "s", "id": 289988459, "pid": 5714, "tid": 6744, "ts": 6303771893001.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896471.482, "dur": 150.817, + "args": { + "External id": 151989, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988469, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988469, "pid": 0, "tid": 7, "ts": 6303771896471.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893014.862, "dur": 3.980, + "args": { + "External id": 151989, "cbid": 211, "correlation": 289988469 + } + }, + { + "ph": "s", "id": 289988469, "pid": 5714, "tid": 6744, "ts": 6303771893014.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896623.003, "dur": 142.338, + "args": { + "External id": 151990, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988476, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988476, "pid": 0, "tid": 7, "ts": 6303771896623.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893031.132, "dur": 4.360, + "args": { + "External id": 151990, "cbid": 211, "correlation": 289988476 + } + }, + { + "ph": "s", "id": 289988476, "pid": 5714, "tid": 6744, "ts": 6303771893031.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771896766.013, "dur": 46.272, + "args": { + "External id": 151996, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988495, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988495, "pid": 0, "tid": 7, "ts": 6303771896766.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893066.422, "dur": 6.100, + "args": { + "External id": 151996, "cbid": 211, "correlation": 289988495 + } + }, + { + "ph": "s", "id": 289988495, "pid": 5714, "tid": 6744, "ts": 6303771893066.422, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771896813.469, "dur": 57.857, + "args": { + "External id": 151997, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988507, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988507, "pid": 0, "tid": 7, "ts": 6303771896813.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893087.522, "dur": 6.100, + "args": { + "External id": 151997, "cbid": 211, "correlation": 289988507 + } + }, + { + "ph": "s", "id": 289988507, "pid": 5714, "tid": 6744, "ts": 6303771893087.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771893162.942, "dur": 0.500, + "args": { + "External id": 152009, "cbid": 200, "correlation": 289988547 + } + }, + { + "ph": "f", "id": 289988547, "pid": 5714, "tid": 6744, "ts": 6303771893162.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771896872.190, "dur": 0.800, + "args": { + "External id": 152009, "device": 0, "context": 1, "stream": 7, "correlation": 289988550, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 289988550, "pid": 0, "tid": 7, "ts": 6303771896872.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771893165.032, "dur": 6.610, + "args": { + "External id": 152009, "cbid": 51, "correlation": 289988550 + } + }, + { + "ph": "s", "id": 289988550, "pid": 5714, "tid": 6744, "ts": 6303771893165.032, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771896873.758, "dur": 138.753, + "args": { + "External id": 152009, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988551, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988551, "pid": 0, "tid": 7, "ts": 6303771896873.758, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893171.872, "dur": 6.370, + "args": { + "External id": 152009, "cbid": 307, "correlation": 289988551 + } + }, + { + "ph": "s", "id": 289988551, "pid": 5714, "tid": 6744, "ts": 6303771893171.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771897013.248, "dur": 122.785, + "args": { + "External id": 152016, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988573, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988573, "pid": 0, "tid": 7, "ts": 6303771897013.248, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893208.362, "dur": 6.890, + "args": { + "External id": 152016, "cbid": 211, "correlation": 289988573 + } + }, + { + "ph": "s", "id": 289988573, "pid": 5714, "tid": 6744, "ts": 6303771893208.362, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771893420.951, "dur": 0.590, + "args": { + "External id": 152042, "cbid": 200, "correlation": 289988620 + } + }, + { + "ph": "f", "id": 289988620, "pid": 5714, "tid": 6744, "ts": 6303771893420.951, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771893421.661, "dur": 0.190, + "args": { + "External id": 152042, "cbid": 200, "correlation": 289988621 + } + }, + { + "ph": "f", "id": 289988621, "pid": 5714, "tid": 6744, "ts": 6303771893421.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771893442.801, "dur": 0.220, + "args": { + "External id": 152042, "cbid": 200, "correlation": 289988639 + } + }, + { + "ph": "f", "id": 289988639, "pid": 5714, "tid": 6744, "ts": 6303771893442.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771897136.801, "dur": 92.833, + "args": { + "External id": 152042, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988640, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988640, "pid": 0, "tid": 7, "ts": 6303771897136.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893445.111, "dur": 12.270, + "args": { + "External id": 152042, "cbid": 211, "correlation": 289988640 + } + }, + { + "ph": "s", "id": 289988640, "pid": 5714, "tid": 6744, "ts": 6303771893445.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771893459.081, "dur": 1.650, + "args": { + "External id": 152042, "cbid": 273, "correlation": 289988642 + } + }, + { + "ph": "f", "id": 289988642, "pid": 5714, "tid": 6744, "ts": 6303771893459.081, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771897230.338, "dur": 997.676, + "args": { + "External id": 152042, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988643, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289988643, "pid": 0, "tid": 7, "ts": 6303771897230.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893461.011, "dur": 5.240, + "args": { + "External id": 152042, "cbid": 211, "correlation": 289988643 + } + }, + { + "ph": "s", "id": 289988643, "pid": 5714, "tid": 6744, "ts": 6303771893461.011, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771898253.966, "dur": 71.585, + "args": { + "External id": 152042, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988645, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289988645, "pid": 0, "tid": 7, "ts": 6303771898253.966, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893467.161, "dur": 5.120, + "args": { + "External id": 152042, "cbid": 211, "correlation": 289988645 + } + }, + { + "ph": "s", "id": 289988645, "pid": 5714, "tid": 6744, "ts": 6303771893467.161, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771898326.223, "dur": 48.160, + "args": { + "External id": 152052, "device": 0, "context": 1, "stream": 7, "correlation": 289988671, "bytes": 25165824, "memory bandwidth (GB/s)": 522.5461794019933 + } + }, + { + "ph": "f", "id": 289988671, "pid": 0, "tid": 7, "ts": 6303771898326.223, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771893601.051, "dur": 17.320, + "args": { + "External id": 152052, "cbid": 41, "correlation": 289988671 + } + }, + { + "ph": "s", "id": 289988671, "pid": 5714, "tid": 6744, "ts": 6303771893601.051, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771898375.087, "dur": 33.185, + "args": { + "External id": 152049, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988689, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988689, "pid": 0, "tid": 7, "ts": 6303771898375.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893728.400, "dur": 8.600, + "args": { + "External id": 152049, "cbid": 307, "correlation": 289988689 + } + }, + { + "ph": "s", "id": 289988689, "pid": 5714, "tid": 6744, "ts": 6303771893728.400, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771898408.944, "dur": 38.112, + "args": { + "External id": 152059, "device": 0, "context": 1, "stream": 7, "correlation": 289988704, "bytes": 25165824, "memory bandwidth (GB/s)": 660.3123425692695 + } + }, + { + "ph": "f", "id": 289988704, "pid": 0, "tid": 7, "ts": 6303771898408.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771893796.771, "dur": 13.049, + "args": { + "External id": 152059, "cbid": 41, "correlation": 289988704 + } + }, + { + "ph": "s", "id": 289988704, "pid": 5714, "tid": 6744, "ts": 6303771893796.771, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771898447.792, "dur": 26.977, + "args": { + "External id": 152056, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988722, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988722, "pid": 0, "tid": 7, "ts": 6303771898447.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771893894.620, "dur": 7.580, + "args": { + "External id": 152056, "cbid": 307, "correlation": 289988722 + } + }, + { + "ph": "s", "id": 289988722, "pid": 5714, "tid": 6744, "ts": 6303771893894.620, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771894020.130, "dur": 0.520, + "args": { + "External id": 152083, "cbid": 200, "correlation": 289988766 + } + }, + { + "ph": "f", "id": 289988766, "pid": 5714, "tid": 6744, "ts": 6303771894020.130, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771898475.601, "dur": 1.088, + "args": { + "External id": 152083, "device": 0, "context": 1, "stream": 7, "correlation": 289988769, "bytes": 576, "memory bandwidth (GB/s)": 0.5294117647058824 + } + }, + { + "ph": "f", "id": 289988769, "pid": 0, "tid": 7, "ts": 6303771898475.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771894022.260, "dur": 6.800, + "args": { + "External id": 152083, "cbid": 51, "correlation": 289988769 + } + }, + { + "ph": "s", "id": 289988769, "pid": 5714, "tid": 6744, "ts": 6303771894022.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771898477.873, "dur": 141.922, + "args": { + "External id": 152083, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988770, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988770, "pid": 0, "tid": 7, "ts": 6303771898477.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894029.290, "dur": 7.590, + "args": { + "External id": 152083, "cbid": 307, "correlation": 289988770 + } + }, + { + "ph": "s", "id": 289988770, "pid": 5714, "tid": 6744, "ts": 6303771894029.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771898620.499, "dur": 122.689, + "args": { + "External id": 152090, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988792, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988792, "pid": 0, "tid": 7, "ts": 6303771898620.499, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894065.420, "dur": 5.720, + "args": { + "External id": 152090, "cbid": 211, "correlation": 289988792 + } + }, + { + "ph": "s", "id": 289988792, "pid": 5714, "tid": 6744, "ts": 6303771894065.420, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771894166.040, "dur": 0.460, + "args": { + "External id": 152113, "cbid": 200, "correlation": 289988838 + } + }, + { + "ph": "f", "id": 289988838, "pid": 5714, "tid": 6744, "ts": 6303771894166.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771898743.988, "dur": 0.768, + "args": { + "External id": 152113, "device": 0, "context": 1, "stream": 7, "correlation": 289988841, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289988841, "pid": 0, "tid": 7, "ts": 6303771898743.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771894167.920, "dur": 5.550, + "args": { + "External id": 152113, "cbid": 51, "correlation": 289988841 + } + }, + { + "ph": "s", "id": 289988841, "pid": 5714, "tid": 6744, "ts": 6303771894167.920, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771898746.100, "dur": 142.273, + "args": { + "External id": 152113, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988842, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988842, "pid": 0, "tid": 7, "ts": 6303771898746.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894173.670, "dur": 6.880, + "args": { + "External id": 152113, "cbid": 307, "correlation": 289988842 + } + }, + { + "ph": "s", "id": 289988842, "pid": 5714, "tid": 6744, "ts": 6303771894173.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771898888.981, "dur": 122.754, + "args": { + "External id": 152120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988864, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988864, "pid": 0, "tid": 7, "ts": 6303771898888.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894206.039, "dur": 5.180, + "args": { + "External id": 152120, "cbid": 211, "correlation": 289988864 + } + }, + { + "ph": "s", "id": 289988864, "pid": 5714, "tid": 6744, "ts": 6303771894206.039, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899012.439, "dur": 39.232, + "args": { + "External id": 152125, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988879, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988879, "pid": 0, "tid": 7, "ts": 6303771899012.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894247.590, "dur": 7.140, + "args": { + "External id": 152125, "cbid": 211, "correlation": 289988879 + } + }, + { + "ph": "s", "id": 289988879, "pid": 5714, "tid": 6744, "ts": 6303771894247.590, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771894344.089, "dur": 0.480, + "args": { + "External id": 152144, "cbid": 200, "correlation": 289988923 + } + }, + { + "ph": "f", "id": 289988923, "pid": 5714, "tid": 6744, "ts": 6303771894344.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771899052.567, "dur": 0.768, + "args": { + "External id": 152144, "device": 0, "context": 1, "stream": 7, "correlation": 289988926, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289988926, "pid": 0, "tid": 7, "ts": 6303771899052.567, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771894345.939, "dur": 5.950, + "args": { + "External id": 152144, "cbid": 51, "correlation": 289988926 + } + }, + { + "ph": "s", "id": 289988926, "pid": 5714, "tid": 6744, "ts": 6303771894345.939, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771899054.551, "dur": 141.858, + "args": { + "External id": 152144, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988927, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988927, "pid": 0, "tid": 7, "ts": 6303771899054.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894352.059, "dur": 7.850, + "args": { + "External id": 152144, "cbid": 307, "correlation": 289988927 + } + }, + { + "ph": "s", "id": 289988927, "pid": 5714, "tid": 6744, "ts": 6303771894352.059, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771899197.145, "dur": 123.137, + "args": { + "External id": 152151, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988949, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289988949, "pid": 0, "tid": 7, "ts": 6303771899197.145, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894385.709, "dur": 5.250, + "args": { + "External id": 152151, "cbid": 211, "correlation": 289988949 + } + }, + { + "ph": "s", "id": 289988949, "pid": 5714, "tid": 6744, "ts": 6303771894385.709, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899320.922, "dur": 37.953, + "args": { + "External id": 152156, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988960, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988960, "pid": 0, "tid": 7, "ts": 6303771899320.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894425.719, "dur": 6.880, + "args": { + "External id": 152156, "cbid": 211, "correlation": 289988960 + } + }, + { + "ph": "s", "id": 289988960, "pid": 5714, "tid": 6744, "ts": 6303771894425.719, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899359.579, "dur": 42.784, + "args": { + "External id": 152168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988984, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988984, "pid": 0, "tid": 7, "ts": 6303771899359.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894482.019, "dur": 7.340, + "args": { + "External id": 152168, "cbid": 211, "correlation": 289988984 + } + }, + { + "ph": "s", "id": 289988984, "pid": 5714, "tid": 6744, "ts": 6303771894482.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771899403.035, "dur": 25.313, + "args": { + "External id": 152169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289988994, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289988994, "pid": 0, "tid": 7, "ts": 6303771899403.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894497.949, "dur": 4.200, + "args": { + "External id": 152169, "cbid": 211, "correlation": 289988994 + } + }, + { + "ph": "s", "id": 289988994, "pid": 5714, "tid": 6744, "ts": 6303771894497.949, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771899429.148, "dur": 0.768, + "args": { + "External id": 152170, "device": 0, "context": 1, "stream": 7, "correlation": 289989009, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 289989009, "pid": 0, "tid": 7, "ts": 6303771899429.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771894516.349, "dur": 5.590, + "args": { + "External id": 152170, "cbid": 51, "correlation": 289989009 + } + }, + { + "ph": "s", "id": 289989009, "pid": 5714, "tid": 6744, "ts": 6303771894516.349, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771899431.612, "dur": 42.048, + "args": { + "External id": 152170, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989011, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289989011, "pid": 0, "tid": 7, "ts": 6303771899431.612, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894523.019, "dur": 4.930, + "args": { + "External id": 152170, "cbid": 211, "correlation": 289989011 + } + }, + { + "ph": "s", "id": 289989011, "pid": 5714, "tid": 6744, "ts": 6303771894523.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771899474.364, "dur": 50.561, + "args": { + "External id": 152181, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989032, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989032, "pid": 0, "tid": 7, "ts": 6303771899474.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894579.569, "dur": 7.610, + "args": { + "External id": 152181, "cbid": 211, "correlation": 289989032 + } + }, + { + "ph": "s", "id": 289989032, "pid": 5714, "tid": 6744, "ts": 6303771894579.569, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899525.597, "dur": 141.665, + "args": { + "External id": 152184, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989047, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989047, "pid": 0, "tid": 7, "ts": 6303771899525.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894609.999, "dur": 5.390, + "args": { + "External id": 152184, "cbid": 211, "correlation": 289989047 + } + }, + { + "ph": "s", "id": 289989047, "pid": 5714, "tid": 6744, "ts": 6303771894609.999, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771899667.934, "dur": 107.521, + "args": { + "External id": 152185, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989057, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989057, "pid": 0, "tid": 7, "ts": 6303771899667.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894623.679, "dur": 4.290, + "args": { + "External id": 152185, "cbid": 211, "correlation": 289989057 + } + }, + { + "ph": "s", "id": 289989057, "pid": 5714, "tid": 6744, "ts": 6303771894623.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771899776.095, "dur": 77.474, + "args": { + "External id": 152186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989071, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989071, "pid": 0, "tid": 7, "ts": 6303771899776.095, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894641.989, "dur": 5.240, + "args": { + "External id": 152186, "cbid": 211, "correlation": 289989071 + } + }, + { + "ph": "s", "id": 289989071, "pid": 5714, "tid": 6744, "ts": 6303771894641.989, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899854.177, "dur": 1.472, + "args": { + "External id": 152189, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989085, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289989085, "pid": 0, "tid": 7, "ts": 6303771899854.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894674.378, "dur": 6.311, + "args": { + "External id": 152189, "cbid": 211, "correlation": 289989085 + } + }, + { + "ph": "s", "id": 289989085, "pid": 5714, "tid": 6744, "ts": 6303771894674.378, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899856.353, "dur": 1.120, + "args": { + "External id": 152193, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989095, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289989095, "pid": 0, "tid": 7, "ts": 6303771899856.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894693.329, "dur": 4.489, + "args": { + "External id": 152193, "cbid": 211, "correlation": 289989095 + } + }, + { + "ph": "s", "id": 289989095, "pid": 5714, "tid": 6744, "ts": 6303771894693.329, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771899858.113, "dur": 1.024, + "args": { + "External id": 152194, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989105, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289989105, "pid": 0, "tid": 7, "ts": 6303771899858.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894705.638, "dur": 3.931, + "args": { + "External id": 152194, "cbid": 211, "correlation": 289989105 + } + }, + { + "ph": "s", "id": 289989105, "pid": 5714, "tid": 6744, "ts": 6303771894705.638, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771899859.873, "dur": 27.552, + "args": { + "External id": 152202, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989123, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989123, "pid": 0, "tid": 7, "ts": 6303771899859.873, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894750.538, "dur": 6.740, + "args": { + "External id": 152202, "cbid": 211, "correlation": 289989123 + } + }, + { + "ph": "s", "id": 289989123, "pid": 5714, "tid": 6744, "ts": 6303771894750.538, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771899888.033, "dur": 112.129, + "args": { + "External id": 152208, "device": 0, "context": 1, "stream": 7, "correlation": 289989137, "bytes": 50331648, "memory bandwidth (GB/s)": 448.87270911182657 + } + }, + { + "ph": "f", "id": 289989137, "pid": 0, "tid": 7, "ts": 6303771899888.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771894784.388, "dur": 13.740, + "args": { + "External id": 152208, "cbid": 41, "correlation": 289989137 + } + }, + { + "ph": "s", "id": 289989137, "pid": 5714, "tid": 6744, "ts": 6303771894784.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771900000.898, "dur": 69.729, + "args": { + "External id": 152210, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989149, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989149, "pid": 0, "tid": 7, "ts": 6303771900000.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894812.118, "dur": 5.170, + "args": { + "External id": 152210, "cbid": 211, "correlation": 289989149 + } + }, + { + "ph": "s", "id": 289989149, "pid": 5714, "tid": 6744, "ts": 6303771894812.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771900071.363, "dur": 150.018, + "args": { + "External id": 152211, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989159, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989159, "pid": 0, "tid": 7, "ts": 6303771900071.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894826.028, "dur": 3.980, + "args": { + "External id": 152211, "cbid": 211, "correlation": 289989159 + } + }, + { + "ph": "s", "id": 289989159, "pid": 5714, "tid": 6744, "ts": 6303771894826.028, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771900222.021, "dur": 142.114, + "args": { + "External id": 152212, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989166, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989166, "pid": 0, "tid": 7, "ts": 6303771900222.021, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894842.138, "dur": 4.380, + "args": { + "External id": 152212, "cbid": 211, "correlation": 289989166 + } + }, + { + "ph": "s", "id": 289989166, "pid": 5714, "tid": 6744, "ts": 6303771894842.138, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771900364.839, "dur": 46.240, + "args": { + "External id": 152218, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989185, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989185, "pid": 0, "tid": 7, "ts": 6303771900364.839, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894875.338, "dur": 6.240, + "args": { + "External id": 152218, "cbid": 211, "correlation": 289989185 + } + }, + { + "ph": "s", "id": 289989185, "pid": 5714, "tid": 6744, "ts": 6303771894875.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771900411.751, "dur": 40.673, + "args": { + "External id": 152219, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989193, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989193, "pid": 0, "tid": 7, "ts": 6303771900411.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894891.318, "dur": 4.050, + "args": { + "External id": 152219, "cbid": 211, "correlation": 289989193 + } + }, + { + "ph": "s", "id": 289989193, "pid": 5714, "tid": 6744, "ts": 6303771894891.318, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771900453.064, "dur": 331.331, + "args": { + "External id": 152234, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989226, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989226, "pid": 0, "tid": 7, "ts": 6303771900453.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771894986.388, "dur": 8.850, + "args": { + "External id": 152234, "cbid": 211, "correlation": 289989226 + } + }, + { + "ph": "s", "id": 289989226, "pid": 5714, "tid": 6744, "ts": 6303771894986.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771900785.611, "dur": 429.221, + "args": { + "External id": 152223, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989254, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989254, "pid": 0, "tid": 7, "ts": 6303771900785.611, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895050.978, "dur": 6.810, + "args": { + "External id": 152223, "cbid": 307, "correlation": 289989254 + } + }, + { + "ph": "s", "id": 289989254, "pid": 5714, "tid": 6744, "ts": 6303771895050.978, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771895150.108, "dur": 0.509, + "args": { + "External id": 152259, "cbid": 200, "correlation": 289989279 + } + }, + { + "ph": "f", "id": 289989279, "pid": 5714, "tid": 6744, "ts": 6303771895150.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771901215.728, "dur": 0.800, + "args": { + "External id": 152259, "device": 0, "context": 1, "stream": 7, "correlation": 
289989282, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289989282, "pid": 0, "tid": 7, "ts": 6303771901215.728, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771895152.237, "dur": 7.680, + "args": { + "External id": 152259, "cbid": 51, "correlation": 289989282 + } + }, + { + "ph": "s", "id": 289989282, "pid": 5714, "tid": 6744, "ts": 6303771895152.237, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771901217.296, "dur": 370.117, + "args": { + "External id": 152259, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989283, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989283, "pid": 0, "tid": 7, "ts": 6303771901217.296, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895160.148, "dur": 7.169, + "args": { + "External id": 152259, "cbid": 307, "correlation": 289989283 + } + }, + { + "ph": "s", "id": 289989283, "pid": 5714, "tid": 6744, "ts": 6303771895160.148, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771895274.237, "dur": 0.460, + "args": { + "External id": 152277, "cbid": 200, "correlation": 289989320 + } + }, + { + "ph": "f", "id": 289989320, "pid": 5714, "tid": 6744, "ts": 6303771895274.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771901588.277, "dur": 0.864, + "args": { + "External id": 152277, "device": 0, "context": 1, "stream": 7, "correlation": 289989323, "bytes": 1536, "memory bandwidth (GB/s)": 1.7777777777777777 + } + }, + { + "ph": "f", "id": 289989323, "pid": 0, "tid": 7, "ts": 6303771901588.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771895276.167, "dur": 6.260, + "args": { + "External id": 152277, "cbid": 51, "correlation": 289989323 + } + }, + { + "ph": "s", "id": 289989323, "pid": 5714, "tid": 6744, "ts": 6303771895276.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771901590.293, "dur": 354.788, + "args": { + "External id": 152277, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989324, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989324, "pid": 0, "tid": 7, "ts": 6303771901590.293, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895282.627, "dur": 7.360, + "args": { + "External id": 152277, "cbid": 307, "correlation": 289989324 + } + }, + { + "ph": "s", "id": 289989324, "pid": 5714, "tid": 6744, "ts": 6303771895282.627, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771895327.387, "dur": 0.300, + "args": { + "External id": 152284, "cbid": 200, "correlation": 289989349 + } + }, + { + "ph": "f", "id": 289989349, "pid": 5714, "tid": 6744, "ts": 6303771895327.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771901945.721, "dur": 354.052, + "args": { + "External id": 152284, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989352, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989352, "pid": 0, "tid": 7, "ts": 6303771901945.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895328.827, "dur": 5.960, + "args": { + "External id": 152284, "cbid": 307, "correlation": 289989352 + } + }, + { + "ph": "s", "id": 289989352, "pid": 5714, "tid": 6744, "ts": 6303771895328.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771895423.917, "dur": 0.390, + "args": { + "External id": 152307, "cbid": 200, "correlation": 289989397 + } + }, + { + "ph": "f", "id": 289989397, "pid": 5714, "tid": 6744, "ts": 6303771895423.917, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771902300.573, "dur": 0.800, + "args": { + "External id": 152307, "device": 0, "context": 1, "stream": 7, "correlation": 289989400, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289989400, "pid": 0, "tid": 7, "ts": 6303771902300.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771895425.647, "dur": 5.730, + "args": { + "External id": 152307, "cbid": 51, "correlation": 289989400 + } + }, + { + "ph": "s", "id": 289989400, "pid": 5714, "tid": 6744, "ts": 6303771895425.647, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771902302.557, "dur": 353.412, + "args": { + "External id": 152307, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989401, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989401, "pid": 0, "tid": 7, "ts": 6303771902302.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895431.557, "dur": 6.620, + "args": { + "External id": 152307, "cbid": 307, "correlation": 289989401 + } + }, + { + "ph": "s", "id": 289989401, "pid": 5714, "tid": 6744, "ts": 6303771895431.557, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771895464.847, "dur": 0.260, + "args": { + "External id": 152314, "cbid": 200, "correlation": 289989426 + } + }, + { + "ph": "f", "id": 289989426, "pid": 5714, "tid": 6744, "ts": 6303771895464.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771902656.609, "dur": 354.660, + "args": { + "External id": 152314, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989429, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989429, "pid": 0, "tid": 7, "ts": 6303771902656.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895466.037, "dur": 5.030, + "args": { + "External id": 152314, "cbid": 307, "correlation": 289989429 + } + }, + { + "ph": "s", "id": 289989429, "pid": 5714, "tid": 6744, "ts": 6303771895466.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903011.941, "dur": 52.801, + "args": { + "External id": 152319, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989443, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989443, "pid": 0, "tid": 7, "ts": 6303771903011.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895507.697, "dur": 7.080, + "args": { + "External id": 152319, "cbid": 211, "correlation": 289989443 + } + }, + { + "ph": "s", "id": 289989443, "pid": 5714, "tid": 6744, "ts": 6303771895507.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903065.382, "dur": 45.536, + "args": { + "External id": 152331, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989467, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989467, "pid": 0, "tid": 7, "ts": 6303771903065.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895564.067, "dur": 7.260, + "args": { + "External id": 152331, "cbid": 211, "correlation": 289989467 + } + }, + { + "ph": "s", "id": 289989467, "pid": 5714, "tid": 6744, "ts": 6303771895564.067, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771903112.166, "dur": 26.753, + "args": { + "External id": 152332, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989477, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989477, "pid": 0, "tid": 7, "ts": 6303771903112.166, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895579.976, "dur": 4.171, + "args": { + "External id": 152332, "cbid": 211, "correlation": 289989477 + } + }, + { + "ph": "s", "id": 289989477, "pid": 5714, "tid": 6744, "ts": 6303771895579.976, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771903139.719, "dur": 0.736, + "args": { + "External id": 152333, "device": 0, "context": 1, "stream": 7, "correlation": 289989492, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 289989492, "pid": 0, "tid": 7, "ts": 6303771903139.719, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771895600.296, "dur": 5.651, + "args": { + "External id": 152333, "cbid": 51, "correlation": 289989492 + } + }, + { + "ph": "s", "id": 289989492, "pid": 5714, "tid": 6744, "ts": 6303771895600.296, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771903142.023, "dur": 42.592, + "args": { + "External id": 152333, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989494, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289989494, "pid": 0, "tid": 7, "ts": 6303771903142.023, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895607.036, "dur": 4.851, + "args": { + "External id": 152333, "cbid": 211, "correlation": 289989494 + } + }, + { + "ph": "s", "id": 289989494, "pid": 5714, "tid": 6744, "ts": 6303771895607.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771903185.287, "dur": 51.713, + "args": { + "External id": 152344, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989515, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989515, "pid": 0, "tid": 7, "ts": 6303771903185.287, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895662.626, "dur": 7.770, + "args": { + "External id": 152344, "cbid": 211, "correlation": 289989515 + } + }, + { + "ph": "s", "id": 289989515, "pid": 5714, "tid": 6744, "ts": 6303771895662.626, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903237.640, "dur": 145.538, + "args": { + "External id": 152347, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989530, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989530, "pid": 0, "tid": 7, "ts": 6303771903237.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895693.066, "dur": 5.540, + "args": { + "External id": 152347, "cbid": 211, "correlation": 289989530 + } + }, + { + "ph": "s", "id": 289989530, "pid": 5714, "tid": 6744, "ts": 6303771895693.066, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771903383.850, "dur": 109.921, + "args": { + "External id": 152348, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989540, "pid": 0, "tid": 7, "ts": 6303771903383.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895707.166, "dur": 4.300, + "args": { + "External id": 152348, "cbid": 211, "correlation": 289989540 + } + }, + { + "ph": "s", "id": 289989540, "pid": 5714, "tid": 6744, "ts": 6303771895707.166, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771903494.475, "dur": 77.921, + "args": { + "External id": 152349, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989554, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989554, "pid": 0, "tid": 7, "ts": 6303771903494.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895724.066, "dur": 4.550, + "args": { + "External id": 152349, "cbid": 211, "correlation": 289989554 + } + }, + { + "ph": "s", "id": 289989554, "pid": 5714, "tid": 6744, "ts": 6303771895724.066, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903573.132, "dur": 1.472, + "args": { + "External id": 152352, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989568, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289989568, "pid": 0, "tid": 7, "ts": 6303771903573.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895754.346, "dur": 6.320, + "args": { + "External id": 152352, "cbid": 211, "correlation": 289989568 + } + }, + { + "ph": "s", "id": 289989568, "pid": 5714, "tid": 6744, "ts": 6303771895754.346, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903575.276, "dur": 1.184, + "args": { + "External id": 152356, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989578, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289989578, "pid": 0, "tid": 7, "ts": 6303771903575.276, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895773.556, "dur": 4.600, + "args": { + "External id": 152356, "cbid": 211, "correlation": 289989578 + } + }, + { + "ph": "s", "id": 289989578, "pid": 5714, "tid": 6744, "ts": 6303771895773.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903577.100, "dur": 1.024, + "args": { + "External id": 152357, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989588, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289989588, "pid": 0, "tid": 7, "ts": 6303771903577.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895785.256, "dur": 4.100, + "args": { + "External id": 152357, "cbid": 211, "correlation": 289989588 + } + }, + { + "ph": "s", "id": 289989588, "pid": 5714, "tid": 6744, "ts": 6303771895785.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771903578.732, "dur": 26.592, + "args": { + "External id": 152365, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989606, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989606, "pid": 0, "tid": 7, "ts": 6303771903578.732, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895830.276, "dur": 6.960, + "args": { + "External id": 152365, "cbid": 211, "correlation": 289989606 + } + }, + { + "ph": "s", "id": 289989606, "pid": 5714, "tid": 6744, "ts": 6303771895830.276, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771903605.996, "dur": 126.242, + "args": { + "External id": 152371, "device": 0, "context": 1, "stream": 7, "correlation": 289989620, "bytes": 50331648, "memory bandwidth (GB/s)": 398.69178244958096 + } + }, + { + "ph": "f", "id": 289989620, "pid": 0, "tid": 7, "ts": 6303771903605.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771895863.986, "dur": 15.460, + "args": { + "External id": 152371, "cbid": 41, "correlation": 289989620 + } + }, + { + "ph": "s", "id": 289989620, "pid": 5714, "tid": 6744, "ts": 6303771895863.986, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903732.846, "dur": 93.313, + "args": { + "External id": 152373, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989632, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989632, "pid": 0, "tid": 7, "ts": 6303771903732.846, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895892.156, "dur": 5.300, + "args": { + "External id": 152373, "cbid": 211, "correlation": 289989632 + } + }, + { + "ph": "s", "id": 289989632, "pid": 5714, "tid": 6744, "ts": 6303771895892.156, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903826.831, "dur": 156.673, + "args": { + "External id": 152374, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989642, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989642, "pid": 0, "tid": 7, "ts": 6303771903826.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895908.036, "dur": 6.230, + "args": { + "External id": 152374, "cbid": 211, "correlation": 289989642 + } + }, + { + "ph": "s", "id": 289989642, "pid": 5714, "tid": 6744, "ts": 6303771895908.036, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771903984.208, "dur": 464.166, + "args": { + "External id": 152375, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989649, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989649, "pid": 0, "tid": 7, "ts": 6303771903984.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895925.906, "dur": 4.310, + "args": { + "External id": 152375, "cbid": 211, "correlation": 289989649 + } + }, + { + "ph": "s", "id": 289989649, "pid": 5714, "tid": 6744, "ts": 6303771895925.906, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771904463.126, "dur": 347.972, + "args": { + "External id": 152381, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989668, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989668, "pid": 0, "tid": 7, "ts": 6303771904463.126, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895959.226, "dur": 5.990, + "args": { + "External id": 152381, "cbid": 211, "correlation": 289989668 + } + }, + { + "ph": "s", "id": 289989668, "pid": 5714, "tid": 6744, "ts": 6303771895959.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771904811.802, "dur": 57.825, + "args": { + "External id": 152382, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989680, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989680, "pid": 0, "tid": 7, "ts": 6303771904811.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771895979.466, "dur": 4.869, + "args": { + "External id": 152382, "cbid": 211, "correlation": 289989680 + } + }, + { + "ph": "s", "id": 289989680, "pid": 5714, "tid": 6744, "ts": 6303771895979.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771904870.875, "dur": 42.048, + "args": { + "External id": 152385, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989693, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989693, "pid": 0, "tid": 7, "ts": 6303771904870.875, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896007.606, "dur": 5.340, + "args": { + "External id": 152385, "cbid": 211, "correlation": 289989693 + } + }, + { + "ph": "s", "id": 289989693, "pid": 5714, "tid": 6744, "ts": 6303771896007.606, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896066.595, "dur": 0.491, + "args": { + "External id": 152395, "cbid": 200, "correlation": 289989729 + } + }, + { + "ph": "f", "id": 289989729, "pid": 5714, "tid": 6744, "ts": 6303771896066.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771904914.235, "dur": 1.184, + "args": { + "External id": 152395, "device": 0, "context": 1, "stream": 7, "correlation": 289989732, "bytes": 576, "memory bandwidth (GB/s)": 0.4864864864864865 + } + }, + { + "ph": "f", "id": 289989732, "pid": 0, "tid": 7, "ts": 6303771904914.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771896068.635, "dur": 6.270, + "args": { + "External id": 152395, "cbid": 51, "correlation": 289989732 + } + }, + { + "ph": "s", "id": 289989732, "pid": 5714, "tid": 6744, "ts": 6303771896068.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771904916.763, "dur": 142.434, + "args": { + "External id": 152395, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989733, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989733, "pid": 0, "tid": 7, "ts": 6303771904916.763, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896075.125, "dur": 6.480, + "args": { + "External id": 152395, "cbid": 307, "correlation": 289989733 + } + }, + { + "ph": "s", "id": 289989733, "pid": 5714, "tid": 6744, "ts": 6303771896075.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771905059.901, "dur": 142.178, + "args": { + "External id": 152402, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989755, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989755, "pid": 0, "tid": 7, "ts": 6303771905059.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896110.225, "dur": 5.350, + "args": { + "External id": 152402, "cbid": 211, "correlation": 289989755 + } + }, + { + "ph": "s", "id": 289989755, "pid": 5714, "tid": 6744, "ts": 6303771896110.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896275.765, "dur": 0.440, + "args": { + "External id": 152428, "cbid": 200, "correlation": 289989802 + } + }, + { + "ph": "f", "id": 289989802, "pid": 5714, "tid": 6744, "ts": 6303771896275.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896276.315, "dur": 0.210, + "args": { + "External id": 152428, "cbid": 200, "correlation": 289989803 + } + }, + { + "ph": "f", "id": 289989803, "pid": 5714, "tid": 6744, "ts": 6303771896276.315, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896292.755, "dur": 0.200, + "args": { + "External id": 152428, "cbid": 200, "correlation": 289989821 + } + }, + { + "ph": "f", "id": 289989821, "pid": 5714, "tid": 6744, "ts": 6303771896292.755, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771905202.751, "dur": 96.705, + "args": { + "External id": 152428, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989822, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989822, "pid": 0, "tid": 7, "ts": 6303771905202.751, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896294.115, "dur": 18.850, + "args": { + "External id": 152428, "cbid": 211, "correlation": 289989822 + } + }, + { + "ph": "s", "id": 289989822, "pid": 5714, "tid": 6744, "ts": 6303771896294.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896313.775, "dur": 1.050, + "args": { + "External id": 152428, "cbid": 273, "correlation": 289989824 + } + }, + { + "ph": "f", "id": 289989824, "pid": 5714, "tid": 6744, "ts": 6303771896313.775, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771905300.096, "dur": 1102.989, + "args": { + "External id": 152428, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989825, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289989825, "pid": 0, "tid": 7, "ts": 6303771905300.096, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896315.135, "dur": 4.320, + "args": { + "External id": 152428, "cbid": 211, "correlation": 289989825 + } + }, + { + "ph": "s", "id": 289989825, "pid": 5714, "tid": 6744, "ts": 6303771896315.135, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771906403.789, "dur": 73.344, + "args": { + "External id": 152428, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989827, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289989827, "pid": 0, "tid": 7, "ts": 6303771906403.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896320.015, "dur": 3.740, + "args": { + "External id": 152428, "cbid": 211, "correlation": 289989827 + } + }, + { + "ph": "s", "id": 289989827, "pid": 5714, "tid": 6744, "ts": 6303771896320.015, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771906477.805, "dur": 48.321, + "args": { + "External id": 152438, "device": 0, "context": 1, "stream": 7, "correlation": 289989853, "bytes": 25165824, "memory bandwidth (GB/s)": 520.8051157881666 + } + }, + { + "ph": "f", "id": 289989853, "pid": 0, "tid": 7, "ts": 6303771906477.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771896441.045, "dur": 16.669, + "args": { + "External id": 152438, "cbid": 41, "correlation": 289989853 + } + }, + { + "ph": "s", "id": 289989853, "pid": 5714, "tid": 6744, "ts": 6303771896441.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771906526.830, "dur": 34.273, + "args": { + "External id": 152435, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989871, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989871, "pid": 0, "tid": 7, "ts": 6303771906526.830, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896549.704, "dur": 8.140, + "args": { + "External id": 152435, "cbid": 307, "correlation": 289989871 + } + }, + { + "ph": "s", "id": 289989871, "pid": 5714, "tid": 6744, "ts": 6303771896549.704, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771906561.711, "dur": 40.128, + "args": { + "External id": 152445, "device": 0, "context": 1, "stream": 7, "correlation": 289989886, "bytes": 25165824, "memory bandwidth (GB/s)": 627.1387559808612 + } + }, + { + "ph": "f", "id": 289989886, "pid": 0, "tid": 7, "ts": 6303771906561.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771896616.164, "dur": 14.760, + "args": { + "External id": 152445, "cbid": 41, "correlation": 289989886 + } + }, + { + "ph": "s", "id": 289989886, "pid": 5714, "tid": 6744, "ts": 6303771896616.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771906602.447, "dur": 29.856, + "args": { + "External id": 152442, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989904, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289989904, "pid": 0, "tid": 7, "ts": 6303771906602.447, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896714.264, "dur": 7.450, + "args": { + "External id": 152442, "cbid": 307, "correlation": 289989904 + } + }, + { + "ph": "s", "id": 289989904, "pid": 5714, "tid": 6744, "ts": 6303771896714.264, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896831.744, "dur": 0.520, + "args": { + "External id": 152469, "cbid": 200, "correlation": 289989948 + } + }, + { + "ph": "f", "id": 289989948, "pid": 5714, "tid": 6744, "ts": 6303771896831.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771906633.487, "dur": 1.024, + "args": { + "External id": 152469, "device": 0, "context": 1, "stream": 7, "correlation": 289989951, "bytes": 576, "memory bandwidth (GB/s)": 0.5625 + } + }, + { + "ph": "f", "id": 289989951, "pid": 0, "tid": 7, "ts": 6303771906633.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771896833.924, "dur": 7.030, + "args": { + "External id": 152469, "cbid": 51, "correlation": 289989951 + } + }, + { + "ph": "s", "id": 289989951, "pid": 5714, "tid": 6744, "ts": 6303771896833.924, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771906635.983, "dur": 145.922, + "args": { + "External id": 152469, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989952, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989952, "pid": 0, "tid": 7, "ts": 6303771906635.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896841.184, "dur": 7.260, + "args": { + "External id": 152469, "cbid": 307, "correlation": 289989952 + } + }, + { + "ph": "s", "id": 289989952, "pid": 5714, "tid": 6744, "ts": 6303771896841.184, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771906782.545, "dur": 141.314, + "args": { + "External id": 152476, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289989974, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289989974, "pid": 0, "tid": 7, "ts": 6303771906782.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896874.973, "dur": 5.720, + "args": { + "External id": 152476, "cbid": 211, "correlation": 289989974 + } + }, + { + "ph": "s", "id": 289989974, "pid": 5714, "tid": 6744, "ts": 6303771896874.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771896974.653, "dur": 0.450, + "args": { + "External id": 152499, "cbid": 200, "correlation": 289990020 + } + }, + { + "ph": "f", "id": 289990020, "pid": 5714, "tid": 6744, "ts": 6303771896974.653, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771906925.299, "dur": 0.928, + "args": { + "External id": 152499, "device": 0, "context": 1, "stream": 7, "correlation": 289990023, "bytes": 576, "memory bandwidth (GB/s)": 0.6206896551724138 + } + }, + { + "ph": "f", "id": 289990023, "pid": 0, "tid": 7, "ts": 6303771906925.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771896976.423, "dur": 5.920, + "args": { + "External id": 152499, "cbid": 51, "correlation": 289990023 + } + }, + { + "ph": "s", "id": 289990023, "pid": 5714, "tid": 6744, "ts": 6303771896976.423, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771906928.371, "dur": 145.634, + "args": { + "External id": 152499, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990024, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990024, "pid": 0, "tid": 7, "ts": 6303771906928.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771896982.523, "dur": 6.830, + "args": { + "External id": 152499, "cbid": 307, "correlation": 289990024 + } + }, + { + "ph": "s", "id": 289990024, "pid": 5714, "tid": 6744, "ts": 6303771896982.523, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771907074.581, "dur": 140.897, + "args": { + "External id": 152506, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990046, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990046, "pid": 0, "tid": 7, "ts": 6303771907074.581, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897016.063, "dur": 5.350, + "args": { + "External id": 152506, "cbid": 211, "correlation": 289990046 + } + }, + { + "ph": "s", "id": 289990046, "pid": 5714, "tid": 6744, "ts": 6303771897016.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771907216.246, "dur": 43.040, + "args": { + "External id": 152511, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990061, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990061, "pid": 0, "tid": 7, "ts": 6303771907216.246, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897058.493, "dur": 7.050, + "args": { + "External id": 152511, "cbid": 211, "correlation": 289990061 + } + }, + { + "ph": "s", "id": 289990061, "pid": 5714, "tid": 6744, "ts": 6303771897058.493, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771897144.443, "dur": 0.500, + "args": { + "External id": 152530, "cbid": 200, "correlation": 289990105 + } + }, + { + "ph": "f", "id": 289990105, "pid": 5714, "tid": 6744, "ts": 6303771897144.443, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771907260.790, "dur": 1.056, + "args": { + "External id": 152530, "device": 0, "context": 1, "stream": 7, "correlation": 289990108, "bytes": 576, "memory bandwidth (GB/s)": 0.5454545454545454 + } + }, + { + "ph": "f", "id": 289990108, "pid": 0, "tid": 7, "ts": 6303771907260.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771897146.363, "dur": 5.660, + "args": { + "External id": 152530, "cbid": 51, "correlation": 289990108 + } + }, + { + "ph": "s", "id": 289990108, "pid": 5714, "tid": 6744, "ts": 6303771897146.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771907263.350, "dur": 147.778, + "args": { + "External id": 152530, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990109, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990109, "pid": 0, "tid": 7, "ts": 6303771907263.350, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897152.183, "dur": 6.640, + "args": { + "External id": 152530, "cbid": 307, "correlation": 289990109 + } + }, + { + "ph": "s", "id": 289990109, "pid": 5714, "tid": 6744, "ts": 6303771897152.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771907411.864, "dur": 140.802, + "args": { + "External id": 152537, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990131, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990131, "pid": 0, "tid": 7, "ts": 6303771907411.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897184.713, "dur": 5.210, + "args": { + "External id": 152537, "cbid": 211, "correlation": 289990131 + } + }, + { + "ph": "s", "id": 289990131, "pid": 5714, "tid": 6744, "ts": 6303771897184.713, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771907553.338, "dur": 39.009, + "args": { + "External id": 152542, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990142, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990142, "pid": 0, "tid": 7, "ts": 6303771907553.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897224.613, "dur": 6.830, + "args": { + "External id": 152542, "cbid": 211, "correlation": 289990142 + } + }, + { + "ph": "s", "id": 289990142, "pid": 5714, "tid": 6744, "ts": 6303771897224.613, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771907593.019, "dur": 44.416, + "args": { + "External id": 152554, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990166, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990166, "pid": 0, "tid": 7, "ts": 6303771907593.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897281.063, "dur": 7.930, + "args": { + "External id": 152554, "cbid": 211, "correlation": 289990166 + } + }, + { + "ph": "s", "id": 289990166, "pid": 5714, "tid": 6744, "ts": 6303771897281.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771907638.107, "dur": 24.800, + "args": { + "External id": 152555, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990176, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990176, "pid": 0, "tid": 7, "ts": 6303771907638.107, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897305.293, "dur": 4.950, + "args": { + "External id": 152555, "cbid": 211, "correlation": 289990176 + } + }, + { + "ph": "s", "id": 289990176, "pid": 5714, "tid": 6744, "ts": 6303771897305.293, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771907664.091, "dur": 1.184, + "args": { + "External id": 152556, "device": 0, "context": 1, "stream": 7, "correlation": 289990191, "bytes": 24, "memory bandwidth (GB/s)": 0.02027027027027027 + } + }, + { + "ph": "f", "id": 289990191, "pid": 0, "tid": 7, "ts": 6303771907664.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771897327.452, "dur": 5.971, + "args": { + "External id": 152556, "cbid": 51, "correlation": 289990191 + } + }, + { + "ph": "s", "id": 289990191, "pid": 5714, "tid": 6744, "ts": 6303771897327.452, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771907667.419, "dur": 43.745, + "args": { + "External id": 152556, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990193, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289990193, "pid": 0, "tid": 7, "ts": 6303771907667.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897334.492, "dur": 5.331, + "args": { + "External id": 152556, "cbid": 211, "correlation": 289990193 + } + }, + { + "ph": "s", "id": 289990193, "pid": 5714, "tid": 6744, "ts": 6303771897334.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771907711.900, "dur": 48.448, + "args": { + "External id": 152567, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990214, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990214, "pid": 0, "tid": 7, "ts": 6303771907711.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897393.192, "dur": 7.731, + "args": { + "External id": 152567, "cbid": 211, "correlation": 289990214 + } + }, + { + "ph": "s", "id": 289990214, "pid": 5714, "tid": 6744, "ts": 6303771897393.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771907761.020, "dur": 151.458, + "args": { + "External id": 152570, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990229, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990229, "pid": 0, "tid": 7, "ts": 6303771907761.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897423.732, "dur": 5.500, + "args": { + "External id": 152570, "cbid": 211, "correlation": 289990229 + } + }, + { + "ph": "s", "id": 289990229, "pid": 5714, "tid": 6744, "ts": 6303771897423.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771907913.214, "dur": 110.945, + "args": { + "External id": 152571, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990239, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990239, "pid": 0, "tid": 7, "ts": 6303771907913.214, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897437.152, "dur": 4.260, + "args": { + "External id": 152571, "cbid": 211, "correlation": 289990239 + } + }, + { + "ph": "s", "id": 289990239, "pid": 5714, "tid": 6744, "ts": 6303771897437.152, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771908024.831, "dur": 79.202, + "args": { + "External id": 152572, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990253, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990253, "pid": 0, "tid": 7, "ts": 6303771908024.831, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897455.152, "dur": 4.740, + "args": { + "External id": 152572, "cbid": 211, "correlation": 289990253 + } + }, + { + "ph": "s", "id": 289990253, "pid": 5714, "tid": 6744, "ts": 6303771897455.152, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908104.737, "dur": 3.264, + "args": { + "External id": 152575, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990267, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289990267, "pid": 0, "tid": 7, "ts": 6303771908104.737, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897497.752, "dur": 6.420, + "args": { + "External id": 152575, "cbid": 211, "correlation": 289990267 + } + }, + { + "ph": "s", "id": 289990267, "pid": 5714, "tid": 6744, "ts": 6303771897497.752, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908108.705, "dur": 2.656, + "args": { + "External id": 152579, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990277, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289990277, "pid": 0, "tid": 7, "ts": 6303771908108.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897516.142, "dur": 4.520, + "args": { + "External id": 152579, "cbid": 211, "correlation": 289990277 + } + }, + { + "ph": "s", "id": 289990277, "pid": 5714, "tid": 6744, "ts": 6303771897516.142, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908111.937, "dur": 2.752, + "args": { + "External id": 152580, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990287, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289990287, "pid": 0, "tid": 7, "ts": 6303771908111.937, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897527.872, "dur": 3.900, + "args": { + "External id": 152580, "cbid": 211, "correlation": 289990287 + } + }, + { + "ph": "s", "id": 289990287, "pid": 5714, "tid": 6744, "ts": 6303771897527.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771908115.361, "dur": 28.640, + "args": { + "External id": 152588, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990305, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990305, "pid": 0, "tid": 7, "ts": 6303771908115.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897574.652, "dur": 7.100, + "args": { + "External id": 152588, "cbid": 211, "correlation": 289990305 + } + }, + { + "ph": "s", "id": 289990305, "pid": 5714, "tid": 6744, "ts": 6303771897574.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771908144.673, "dur": 114.785, + "args": { + "External id": 152594, "device": 0, "context": 1, "stream": 7, "correlation": 289990319, "bytes": 50331648, "memory bandwidth (GB/s)": 438.4862830509213 + } + }, + { + "ph": "f", "id": 289990319, "pid": 0, "tid": 7, "ts": 6303771908144.673, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771897608.442, "dur": 13.530, + "args": { + "External id": 152594, "cbid": 41, "correlation": 289990319 + } + }, + { + "ph": "s", "id": 289990319, "pid": 5714, "tid": 6744, "ts": 6303771897608.442, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908260.098, "dur": 76.737, + "args": { + "External id": 152596, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990331, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990331, "pid": 0, "tid": 7, "ts": 6303771908260.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897634.372, "dur": 5.500, + "args": { + "External id": 152596, "cbid": 211, "correlation": 289990331 + } + }, + { + "ph": "s", "id": 289990331, "pid": 5714, "tid": 6744, "ts": 6303771897634.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908337.507, "dur": 151.586, + "args": { + "External id": 152597, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990341, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990341, "pid": 0, "tid": 7, "ts": 6303771908337.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897648.242, "dur": 3.750, + "args": { + "External id": 152597, "cbid": 211, "correlation": 289990341 + } + }, + { + "ph": "s", "id": 289990341, "pid": 5714, "tid": 6744, "ts": 6303771897648.242, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908489.797, "dur": 144.961, + "args": { + "External id": 152598, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990348, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990348, "pid": 0, "tid": 7, "ts": 6303771908489.797, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897663.402, "dur": 4.280, + "args": { + "External id": 152598, "cbid": 211, "correlation": 289990348 + } + }, + { + "ph": "s", "id": 289990348, "pid": 5714, "tid": 6744, "ts": 6303771897663.402, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908635.494, "dur": 47.265, + "args": { + "External id": 152604, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990367, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990367, "pid": 0, "tid": 7, "ts": 6303771908635.494, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897697.272, "dur": 6.050, + "args": { + "External id": 152604, "cbid": 211, "correlation": 289990367 + } + }, + { + "ph": "s", "id": 289990367, "pid": 5714, "tid": 6744, "ts": 6303771897697.272, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771908683.431, "dur": 41.281, + "args": { + "External id": 152605, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990375, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990375, "pid": 0, "tid": 7, "ts": 6303771908683.431, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897712.992, "dur": 4.360, + "args": { + "External id": 152605, "cbid": 211, "correlation": 289990375 + } + }, + { + "ph": "s", "id": 289990375, "pid": 5714, "tid": 6744, "ts": 6303771897712.992, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771908725.352, "dur": 485.445, + "args": { + "External id": 152620, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990408, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990408, "pid": 0, "tid": 7, "ts": 6303771908725.352, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897807.202, "dur": 8.989, + "args": { + "External id": 152620, "cbid": 211, "correlation": 289990408 + } + }, + { + "ph": "s", "id": 289990408, "pid": 5714, "tid": 6744, "ts": 6303771897807.202, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771909222.317, "dur": 456.550, + "args": { + "External id": 152609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990436, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990436, "pid": 0, "tid": 7, "ts": 6303771909222.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897869.222, "dur": 6.439, + "args": { + "External id": 152609, "cbid": 307, "correlation": 289990436 + } + }, + { + "ph": "s", "id": 289990436, "pid": 5714, "tid": 6744, "ts": 6303771897869.222, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771897969.781, "dur": 0.510, + "args": { + "External id": 152645, "cbid": 200, "correlation": 289990461 + } + }, + { + "ph": "f", "id": 289990461, "pid": 5714, "tid": 6744, "ts": 6303771897969.781, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771909680.115, "dur": 1.152, + "args": { + "External id": 152645, "device": 0, "context": 1, "stream": 7, "correlation": 
289990464, "bytes": 1536, "memory bandwidth (GB/s)": 1.3333333333333333 + } + }, + { + "ph": "f", "id": 289990464, "pid": 0, "tid": 7, "ts": 6303771909680.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771897971.891, "dur": 6.870, + "args": { + "External id": 152645, "cbid": 51, "correlation": 289990464 + } + }, + { + "ph": "s", "id": 289990464, "pid": 5714, "tid": 6744, "ts": 6303771897971.891, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771909682.579, "dur": 361.316, + "args": { + "External id": 152645, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990465, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990465, "pid": 0, "tid": 7, "ts": 6303771909682.579, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771897978.981, "dur": 7.040, + "args": { + "External id": 152645, "cbid": 307, "correlation": 289990465 + } + }, + { + "ph": "s", "id": 289990465, "pid": 5714, "tid": 6744, "ts": 6303771897978.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771898093.651, "dur": 0.450, + "args": { + "External id": 152663, "cbid": 200, "correlation": 289990502 + } + }, + { + "ph": "f", "id": 289990502, "pid": 5714, "tid": 6744, "ts": 6303771898093.651, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771910045.047, "dur": 0.800, + "args": { + "External id": 152663, "device": 0, "context": 1, "stream": 7, "correlation": 289990505, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289990505, "pid": 0, "tid": 7, "ts": 6303771910045.047, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771898095.521, "dur": 6.080, + "args": { + "External id": 152663, "cbid": 51, "correlation": 289990505 + } + }, + { + "ph": "s", "id": 289990505, "pid": 5714, "tid": 6744, "ts": 6303771898095.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771910046.999, "dur": 350.276, + "args": { + "External id": 152663, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990506, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990506, "pid": 0, "tid": 7, "ts": 6303771910046.999, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898101.801, "dur": 7.440, + "args": { + "External id": 152663, "cbid": 307, "correlation": 289990506 + } + }, + { + "ph": "s", "id": 289990506, "pid": 5714, "tid": 6744, "ts": 6303771898101.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771898137.661, "dur": 0.240, + "args": { + "External id": 152670, "cbid": 200, "correlation": 289990531 + } + }, + { + "ph": "f", "id": 289990531, "pid": 5714, "tid": 6744, "ts": 6303771898137.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771910397.947, "dur": 355.716, + "args": { + "External id": 152670, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990534, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990534, "pid": 0, "tid": 7, "ts": 6303771910397.947, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898138.901, "dur": 5.410, + "args": { + "External id": 152670, "cbid": 307, "correlation": 289990534 + } + }, + { + "ph": "s", "id": 289990534, "pid": 5714, "tid": 6744, "ts": 6303771898138.901, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771898353.630, "dur": 0.440, + "args": { + "External id": 152693, "cbid": 200, "correlation": 289990579 + } + }, + { + "ph": "f", "id": 289990579, "pid": 5714, "tid": 6744, "ts": 6303771898353.630, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771910754.463, "dur": 0.768, + "args": { + "External id": 152693, "device": 0, "context": 1, "stream": 7, "correlation": 289990582, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 289990582, "pid": 0, "tid": 7, "ts": 6303771910754.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771898355.290, "dur": 5.800, + "args": { + "External id": 152693, "cbid": 51, "correlation": 289990582 + } + }, + { + "ph": "s", "id": 289990582, "pid": 5714, "tid": 6744, "ts": 6303771898355.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771910756.383, "dur": 354.596, + "args": { + "External id": 152693, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990583, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990583, "pid": 0, "tid": 7, "ts": 6303771910756.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898361.300, "dur": 6.960, + "args": { + "External id": 152693, "cbid": 307, "correlation": 289990583 + } + }, + { + "ph": "s", "id": 289990583, "pid": 5714, "tid": 6744, "ts": 6303771898361.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771898395.340, "dur": 0.290, + "args": { + "External id": 152700, "cbid": 200, "correlation": 289990608 + } + }, + { + "ph": "f", "id": 289990608, "pid": 5714, "tid": 6744, "ts": 6303771898395.340, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771911111.587, "dur": 357.220, + "args": { + "External id": 152700, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990611, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990611, "pid": 0, "tid": 7, "ts": 6303771911111.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898396.520, "dur": 5.000, + "args": { + "External id": 152700, "cbid": 307, "correlation": 289990611 + } + }, + { + "ph": "s", "id": 289990611, "pid": 5714, "tid": 6744, "ts": 6303771898396.520, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771911469.479, "dur": 50.945, + "args": { + "External id": 152705, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990625, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990625, "pid": 0, "tid": 7, "ts": 6303771911469.479, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898438.650, "dur": 7.040, + "args": { + "External id": 152705, "cbid": 211, "correlation": 289990625 + } + }, + { + "ph": "s", "id": 289990625, "pid": 5714, "tid": 6744, "ts": 6303771898438.650, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771911521.032, "dur": 45.664, + "args": { + "External id": 152717, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990649, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990649, "pid": 0, "tid": 7, "ts": 6303771911521.032, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898494.160, "dur": 7.220, + "args": { + "External id": 152717, "cbid": 211, "correlation": 289990649 + } + }, + { + "ph": "s", "id": 289990649, "pid": 5714, "tid": 6744, "ts": 6303771898494.160, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771911567.912, "dur": 28.577, + "args": { + "External id": 152718, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990659, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990659, "pid": 0, "tid": 7, "ts": 6303771911567.912, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898510.060, "dur": 4.120, + "args": { + "External id": 152718, "cbid": 211, "correlation": 289990659 + } + }, + { + "ph": "s", "id": 289990659, "pid": 5714, "tid": 6744, "ts": 6303771898510.060, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771911597.545, "dur": 0.768, + "args": { + "External id": 152719, "device": 0, "context": 1, "stream": 7, "correlation": 289990674, "bytes": 24, "memory bandwidth (GB/s)": 0.03125 + } + }, + { + "ph": "f", "id": 289990674, "pid": 0, "tid": 7, "ts": 6303771911597.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771898528.950, "dur": 5.240, + "args": { + "External id": 152719, "cbid": 51, "correlation": 289990674 + } + }, + { + "ph": "s", "id": 289990674, "pid": 5714, "tid": 6744, "ts": 6303771898528.950, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771911599.465, "dur": 41.665, + "args": { + "External id": 152719, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990676, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289990676, "pid": 0, "tid": 7, "ts": 6303771911599.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898535.270, "dur": 4.820, + "args": { + "External id": 152719, "cbid": 211, "correlation": 289990676 + } + }, + { + "ph": "s", "id": 289990676, "pid": 5714, "tid": 6744, "ts": 6303771898535.270, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771911641.834, "dur": 50.112, + "args": { + "External id": 152730, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990697, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990697, "pid": 0, "tid": 7, "ts": 6303771911641.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898591.560, "dur": 7.670, + "args": { + "External id": 152730, "cbid": 211, "correlation": 289990697 + } + }, + { + "ph": "s", "id": 289990697, "pid": 5714, "tid": 6744, "ts": 6303771898591.560, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771911692.682, "dur": 143.458, + "args": { + "External id": 152733, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990712, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990712, "pid": 0, "tid": 7, "ts": 6303771911692.682, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898622.010, "dur": 5.570, + "args": { + "External id": 152733, "cbid": 211, "correlation": 289990712 + } + }, + { + "ph": "s", "id": 289990712, "pid": 5714, "tid": 6744, "ts": 6303771898622.010, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771911836.780, "dur": 106.625, + "args": { + "External id": 152734, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990722, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990722, "pid": 0, "tid": 7, "ts": 6303771911836.780, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898635.870, "dur": 4.080, + "args": { + "External id": 152734, "cbid": 211, "correlation": 289990722 + } + }, + { + "ph": "s", "id": 289990722, "pid": 5714, "tid": 6744, "ts": 6303771898635.870, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771911944.077, "dur": 77.793, + "args": { + "External id": 152735, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990736, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990736, "pid": 0, "tid": 7, "ts": 6303771911944.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898652.670, "dur": 4.550, + "args": { + "External id": 152735, "cbid": 211, "correlation": 289990736 + } + }, + { + "ph": "s", "id": 289990736, "pid": 5714, "tid": 6744, "ts": 6303771898652.670, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912022.510, "dur": 1.440, + "args": { + "External id": 152738, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990750, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289990750, "pid": 0, "tid": 7, "ts": 6303771912022.510, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898684.429, "dur": 6.060, + "args": { + "External id": 152738, "cbid": 211, "correlation": 289990750 + } + }, + { + "ph": "s", "id": 289990750, "pid": 5714, "tid": 6744, "ts": 6303771898684.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912024.622, "dur": 1.216, + "args": { + "External id": 152742, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990760, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289990760, "pid": 0, "tid": 7, "ts": 6303771912024.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898704.289, "dur": 4.431, + "args": { + "External id": 152742, "cbid": 211, "correlation": 289990760 + } + }, + { + "ph": "s", "id": 289990760, "pid": 5714, "tid": 6744, "ts": 6303771898704.289, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912026.478, "dur": 0.992, + "args": { + "External id": 152743, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990770, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289990770, "pid": 0, "tid": 7, "ts": 6303771912026.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898716.100, "dur": 3.969, + "args": { + "External id": 152743, "cbid": 211, "correlation": 289990770 + } + }, + { + "ph": "s", "id": 289990770, "pid": 5714, "tid": 6744, "ts": 6303771898716.100, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771912028.142, "dur": 27.616, + "args": { + "External id": 152751, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990788, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990788, "pid": 0, "tid": 7, "ts": 6303771912028.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898761.840, "dur": 7.149, + "args": { + "External id": 152751, "cbid": 211, "correlation": 289990788 + } + }, + { + "ph": "s", "id": 289990788, "pid": 5714, "tid": 6744, "ts": 6303771898761.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771912056.430, "dur": 112.898, + "args": { + "External id": 152757, "device": 0, "context": 1, "stream": 7, "correlation": 289990802, "bytes": 50331648, "memory bandwidth (GB/s)": 445.8152314478556 + } + }, + { + "ph": "f", "id": 289990802, "pid": 0, "tid": 7, "ts": 6303771912056.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771898795.379, "dur": 15.800, + "args": { + "External id": 152757, "cbid": 41, "correlation": 289990802 + } + }, + { + "ph": "s", "id": 289990802, "pid": 5714, "tid": 6744, "ts": 6303771898795.379, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912170.031, "dur": 75.681, + "args": { + "External id": 152759, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990814, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990814, "pid": 0, "tid": 7, "ts": 6303771912170.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898824.569, "dur": 5.190, + "args": { + "External id": 152759, "cbid": 211, "correlation": 289990814 + } + }, + { + "ph": "s", "id": 289990814, "pid": 5714, "tid": 6744, "ts": 6303771898824.569, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912246.320, "dur": 147.106, + "args": { + "External id": 152760, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990824, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990824, "pid": 0, "tid": 7, "ts": 6303771912246.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898837.409, "dur": 3.820, + "args": { + "External id": 152760, "cbid": 211, "correlation": 289990824 + } + }, + { + "ph": "s", "id": 289990824, "pid": 5714, "tid": 6744, "ts": 6303771898837.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912394.066, "dur": 144.066, + "args": { + "External id": 152761, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990831, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990831, "pid": 0, "tid": 7, "ts": 6303771912394.066, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898853.079, "dur": 4.200, + "args": { + "External id": 152761, "cbid": 211, "correlation": 289990831 + } + }, + { + "ph": "s", "id": 289990831, "pid": 5714, "tid": 6744, "ts": 6303771898853.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912538.740, "dur": 46.464, + "args": { + "External id": 152767, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990850, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990850, "pid": 0, "tid": 7, "ts": 6303771912538.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898886.019, "dur": 5.850, + "args": { + "External id": 152767, "cbid": 211, "correlation": 289990850 + } + }, + { + "ph": "s", "id": 289990850, "pid": 5714, "tid": 6744, "ts": 6303771898886.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771912585.908, "dur": 56.961, + "args": { + "External id": 152768, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990862, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990862, "pid": 0, "tid": 7, "ts": 6303771912585.908, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898905.819, "dur": 4.500, + "args": { + "External id": 152768, "cbid": 211, "correlation": 289990862 + } + }, + { + "ph": "s", "id": 289990862, "pid": 5714, "tid": 6744, "ts": 6303771898905.819, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771912643.989, "dur": 41.440, + "args": { + "External id": 152771, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990875, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289990875, "pid": 0, "tid": 7, "ts": 6303771912643.989, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771898933.069, "dur": 5.340, + "args": { + "External id": 152771, "cbid": 211, "correlation": 289990875 + } + }, + { + "ph": "s", "id": 289990875, "pid": 5714, "tid": 6744, "ts": 6303771898933.069, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771898992.829, "dur": 0.490, + "args": { + "External id": 152781, "cbid": 200, "correlation": 289990911 + } + }, + { + "ph": "f", "id": 289990911, "pid": 5714, "tid": 6744, "ts": 6303771898992.829, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771912686.325, "dur": 0.768, + "args": { + "External id": 152781, "device": 0, "context": 1, "stream": 7, "correlation": 289990914, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289990914, "pid": 0, "tid": 7, "ts": 6303771912686.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771898994.889, "dur": 6.200, + "args": { + "External id": 152781, "cbid": 51, "correlation": 289990914 + } + }, + { + "ph": "s", "id": 289990914, "pid": 5714, "tid": 6744, "ts": 6303771898994.889, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771912688.277, "dur": 137.250, + "args": { + "External id": 152781, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990915, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990915, "pid": 0, "tid": 7, "ts": 6303771912688.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899001.299, "dur": 6.230, + "args": { + "External id": 152781, "cbid": 307, "correlation": 289990915 + } + }, + { + "ph": "s", "id": 289990915, "pid": 5714, "tid": 6744, "ts": 6303771899001.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771912826.263, "dur": 120.961, + "args": { + "External id": 152788, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289990937, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289990937, "pid": 0, "tid": 7, "ts": 6303771912826.263, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899033.469, "dur": 5.480, + "args": { + "External id": 152788, "cbid": 211, "correlation": 289990937 + } + }, + { + "ph": "s", "id": 289990937, "pid": 5714, "tid": 6744, "ts": 6303771899033.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771899193.608, "dur": 0.440, + "args": { + "External id": 152814, "cbid": 200, "correlation": 289990984 + } + }, + { + "ph": "f", "id": 289990984, "pid": 5714, "tid": 6744, "ts": 6303771899193.608, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771899194.168, "dur": 0.200, + "args": { + "External id": 152814, "cbid": 200, "correlation": 289990985 + } + }, + { + "ph": "f", "id": 289990985, "pid": 5714, "tid": 6744, "ts": 6303771899194.168, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771899210.179, "dur": 0.220, + "args": { + "External id": 152814, "cbid": 200, "correlation": 289991003 + } + }, + { + "ph": "f", "id": 289991003, "pid": 5714, "tid": 6744, "ts": 6303771899210.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771912947.832, "dur": 95.010, + "args": { + "External id": 152814, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991004, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991004, "pid": 0, "tid": 7, "ts": 6303771912947.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899211.508, "dur": 10.891, + "args": { + "External id": 152814, "cbid": 211, "correlation": 289991004 + } + }, + { + "ph": "s", "id": 289991004, "pid": 5714, "tid": 6744, "ts": 6303771899211.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771899223.139, "dur": 0.919, + "args": { + "External id": 152814, "cbid": 273, "correlation": 289991006 + } + }, + { + "ph": "f", "id": 289991006, "pid": 5714, "tid": 6744, "ts": 6303771899223.139, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771913043.482, "dur": 980.555, + "args": { + "External id": 152814, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991007, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289991007, "pid": 0, "tid": 7, "ts": 6303771913043.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899224.328, "dur": 3.890, + "args": { + "External id": 152814, "cbid": 211, "correlation": 289991007 + } + }, + { + "ph": "s", "id": 289991007, "pid": 5714, "tid": 6744, "ts": 6303771899224.328, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771914024.773, "dur": 71.233, + "args": { + "External id": 152814, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991009, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289991009, "pid": 0, "tid": 7, "ts": 6303771914024.773, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899228.738, "dur": 3.650, + "args": { + "External id": 152814, "cbid": 211, "correlation": 289991009 + } + }, + { + "ph": "s", "id": 289991009, "pid": 5714, "tid": 6744, "ts": 6303771899228.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771914096.678, "dur": 46.593, + "args": { + "External id": 152824, "device": 0, "context": 1, "stream": 7, "correlation": 289991035, "bytes": 25165824, "memory bandwidth (GB/s)": 540.1202755778766 + } + }, + { + "ph": "f", "id": 289991035, "pid": 0, "tid": 7, "ts": 6303771914096.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771899356.088, "dur": 14.980, + "args": { + "External id": 152824, "cbid": 41, "correlation": 289991035 + } + }, + { + "ph": "s", "id": 289991035, "pid": 5714, "tid": 6744, "ts": 6303771899356.088, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771914143.911, "dur": 32.000, + "args": { + "External id": 152821, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991053, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991053, "pid": 0, "tid": 7, "ts": 6303771914143.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899461.948, "dur": 8.060, + "args": { + "External id": 152821, "cbid": 307, "correlation": 289991053 + } + }, + { + "ph": "s", "id": 289991053, "pid": 5714, "tid": 6744, "ts": 6303771899461.948, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771914176.551, "dur": 38.752, + "args": { + "External id": 152831, "device": 0, "context": 1, "stream": 7, "correlation": 289991068, "bytes": 25165824, "memory bandwidth (GB/s)": 649.4071015689512 + } + }, + { + "ph": "f", "id": 289991068, "pid": 0, "tid": 7, "ts": 6303771914176.551, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771899529.748, "dur": 14.410, + "args": { + "External id": 152831, "cbid": 41, "correlation": 289991068 + } + }, + { + "ph": "s", "id": 289991068, "pid": 5714, "tid": 6744, "ts": 6303771899529.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771914215.975, "dur": 25.569, + "args": { + "External id": 152828, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991086, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991086, "pid": 0, "tid": 7, "ts": 6303771914215.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899626.767, "dur": 7.531, + "args": { + "External id": 152828, "cbid": 307, "correlation": 289991086 + } + }, + { + "ph": "s", "id": 289991086, "pid": 5714, "tid": 6744, "ts": 6303771899626.767, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771899740.067, "dur": 0.510, + "args": { + "External id": 152855, "cbid": 200, "correlation": 289991130 + } + }, + { + "ph": "f", "id": 289991130, "pid": 5714, "tid": 6744, "ts": 6303771899740.067, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771914242.344, "dur": 0.768, + "args": { + "External id": 152855, "device": 0, "context": 1, "stream": 7, "correlation": 289991133, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289991133, "pid": 0, "tid": 7, "ts": 6303771914242.344, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771899742.167, "dur": 6.850, + "args": { + "External id": 152855, "cbid": 51, "correlation": 289991133 + } + }, + { + "ph": "s", "id": 289991133, "pid": 5714, "tid": 6744, "ts": 6303771899742.167, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771914244.296, "dur": 139.745, + "args": { + "External id": 152855, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991134, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991134, "pid": 0, "tid": 7, "ts": 6303771914244.296, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899749.217, "dur": 7.180, + "args": { + "External id": 152855, "cbid": 307, "correlation": 289991134 + } + }, + { + "ph": "s", "id": 289991134, "pid": 5714, "tid": 6744, "ts": 6303771899749.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771914384.649, "dur": 120.738, + "args": { + "External id": 152862, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991156, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991156, "pid": 0, "tid": 7, "ts": 6303771914384.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899783.137, "dur": 5.570, + "args": { + "External id": 152862, "cbid": 211, "correlation": 289991156 + } + }, + { + "ph": "s", "id": 289991156, "pid": 5714, "tid": 6744, "ts": 6303771899783.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771899880.407, "dur": 0.450, + "args": { + "External id": 152885, "cbid": 200, "correlation": 289991202 + } + }, + { + "ph": "f", "id": 289991202, "pid": 5714, "tid": 6744, "ts": 6303771899880.407, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771914506.187, "dur": 0.928, + "args": { + "External id": 152885, "device": 0, "context": 1, "stream": 7, "correlation": 289991205, "bytes": 576, "memory bandwidth (GB/s)": 0.6206896551724138 + } + }, + { + "ph": "f", "id": 289991205, "pid": 0, "tid": 7, "ts": 6303771914506.187, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771899882.237, "dur": 5.540, + "args": { + "External id": 152885, "cbid": 51, "correlation": 289991205 + } + }, + { + "ph": "s", "id": 289991205, "pid": 5714, "tid": 6744, "ts": 6303771899882.237, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771914508.299, "dur": 141.729, + "args": { + "External id": 152885, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991206, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991206, "pid": 0, "tid": 7, "ts": 6303771914508.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899887.957, "dur": 6.770, + "args": { + "External id": 152885, "cbid": 307, "correlation": 289991206 + } + }, + { + "ph": "s", "id": 289991206, "pid": 5714, "tid": 6744, "ts": 6303771899887.957, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771914650.636, "dur": 120.866, + "args": { + "External id": 152892, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991228, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991228, "pid": 0, "tid": 7, "ts": 6303771914650.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899920.277, "dur": 5.210, + "args": { + "External id": 152892, "cbid": 211, "correlation": 289991228 + } + }, + { + "ph": "s", "id": 289991228, "pid": 5714, "tid": 6744, "ts": 6303771899920.277, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771914772.110, "dur": 41.504, + "args": { + "External id": 152897, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991243, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991243, "pid": 0, "tid": 7, "ts": 6303771914772.110, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771899963.747, "dur": 6.760, + "args": { + "External id": 152897, "cbid": 211, "correlation": 289991243 + } + }, + { + "ph": "s", "id": 289991243, "pid": 5714, "tid": 6744, "ts": 6303771899963.747, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771900045.617, "dur": 0.480, + "args": { + "External id": 152916, "cbid": 200, "correlation": 289991287 + } + }, + { + "ph": "f", "id": 289991287, "pid": 5714, "tid": 6744, "ts": 6303771900045.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771914814.382, "dur": 0.768, + "args": { + "External id": 152916, "device": 0, "context": 1, "stream": 7, "correlation": 289991290, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289991290, "pid": 0, "tid": 7, "ts": 6303771914814.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771900047.526, "dur": 5.980, + "args": { + "External id": 152916, "cbid": 51, "correlation": 289991290 + } + }, + { + "ph": "s", "id": 289991290, "pid": 5714, "tid": 6744, "ts": 6303771900047.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771914816.270, "dur": 141.218, + "args": { + "External id": 152916, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991291, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991291, "pid": 0, "tid": 7, "ts": 6303771914816.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900053.686, "dur": 6.400, + "args": { + "External id": 152916, "cbid": 307, "correlation": 289991291 + } + }, + { + "ph": "s", "id": 289991291, "pid": 5714, "tid": 6744, "ts": 6303771900053.686, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771914958.192, "dur": 120.321, + "args": { + "External id": 152923, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991313, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991313, "pid": 0, "tid": 7, "ts": 6303771914958.192, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900092.037, "dur": 5.300, + "args": { + "External id": 152923, "cbid": 211, "correlation": 289991313 + } + }, + { + "ph": "s", "id": 289991313, "pid": 5714, "tid": 6744, "ts": 6303771900092.037, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915079.121, "dur": 37.665, + "args": { + "External id": 152928, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991324, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991324, "pid": 0, "tid": 7, "ts": 6303771915079.121, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900131.256, "dur": 6.540, + "args": { + "External id": 152928, "cbid": 211, "correlation": 289991324 + } + }, + { + "ph": "s", "id": 289991324, "pid": 5714, "tid": 6744, "ts": 6303771900131.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915117.394, "dur": 42.848, + "args": { + "External id": 152940, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991348, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991348, "pid": 0, "tid": 7, "ts": 6303771915117.394, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900187.606, "dur": 7.390, + "args": { + "External id": 152940, "cbid": 211, "correlation": 289991348 + } + }, + { + "ph": "s", "id": 289991348, "pid": 5714, "tid": 6744, "ts": 6303771900187.606, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771915160.946, "dur": 23.969, + "args": { + "External id": 152941, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991358, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991358, "pid": 0, "tid": 7, "ts": 6303771915160.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900203.536, "dur": 4.180, + "args": { + "External id": 152941, "cbid": 211, "correlation": 289991358 + } + }, + { + "ph": "s", "id": 289991358, "pid": 5714, "tid": 6744, "ts": 6303771900203.536, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771915185.715, "dur": 0.736, + "args": { + "External id": 152942, "device": 0, "context": 1, "stream": 7, "correlation": 289991373, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 289991373, "pid": 0, "tid": 7, "ts": 6303771915185.715, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771900223.016, "dur": 5.450, + "args": { + "External id": 152942, "cbid": 51, "correlation": 289991373 + } + }, + { + "ph": "s", "id": 289991373, "pid": 5714, "tid": 6744, "ts": 6303771900223.016, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771915187.635, "dur": 42.144, + "args": { + "External id": 152942, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991375, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289991375, "pid": 0, "tid": 7, "ts": 6303771915187.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900229.556, "dur": 4.950, + "args": { + "External id": 152942, "cbid": 211, "correlation": 289991375 + } + }, + { + "ph": "s", "id": 289991375, "pid": 5714, "tid": 6744, "ts": 6303771900229.556, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771915230.387, "dur": 45.185, + "args": { + "External id": 152953, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991396, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991396, "pid": 0, "tid": 7, "ts": 6303771915230.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900283.906, "dur": 7.680, + "args": { + "External id": 152953, "cbid": 211, "correlation": 289991396 + } + }, + { + "ph": "s", "id": 289991396, "pid": 5714, "tid": 6744, "ts": 6303771900283.906, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915276.180, "dur": 145.281, + "args": { + "External id": 152956, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991411, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991411, "pid": 0, "tid": 7, "ts": 6303771915276.180, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900323.876, "dur": 6.100, + "args": { + "External id": 152956, "cbid": 211, "correlation": 289991411 + } + }, + { + "ph": "s", "id": 289991411, "pid": 5714, "tid": 6744, "ts": 6303771900323.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771915422.133, "dur": 108.386, + "args": { + "External id": 152957, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991421, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991421, "pid": 0, "tid": 7, "ts": 6303771915422.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900339.396, "dur": 4.200, + "args": { + "External id": 152957, "cbid": 211, "correlation": 289991421 + } + }, + { + "ph": "s", "id": 289991421, "pid": 5714, "tid": 6744, "ts": 6303771900339.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771915531.255, "dur": 77.440, + "args": { + "External id": 152958, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991435, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991435, "pid": 0, "tid": 7, "ts": 6303771915531.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900356.576, "dur": 4.650, + "args": { + "External id": 152958, "cbid": 211, "correlation": 289991435 + } + }, + { + "ph": "s", "id": 289991435, "pid": 5714, "tid": 6744, "ts": 6303771900356.576, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915609.399, "dur": 1.440, + "args": { + "External id": 152961, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991449, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289991449, "pid": 0, "tid": 7, "ts": 6303771915609.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900389.636, "dur": 6.160, + "args": { + "External id": 152961, "cbid": 211, "correlation": 289991449 + } + }, + { + "ph": "s", "id": 289991449, "pid": 5714, "tid": 6744, "ts": 6303771900389.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915611.543, "dur": 1.216, + "args": { + "External id": 152965, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991459, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289991459, "pid": 0, "tid": 7, "ts": 6303771915611.543, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900408.226, "dur": 4.590, + "args": { + "External id": 152965, "cbid": 211, "correlation": 289991459 + } + }, + { + "ph": "s", "id": 289991459, "pid": 5714, "tid": 6744, "ts": 6303771900408.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915613.399, "dur": 0.992, + "args": { + "External id": 152966, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991469, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289991469, "pid": 0, "tid": 7, "ts": 6303771915613.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900420.306, "dur": 3.840, + "args": { + "External id": 152966, "cbid": 211, "correlation": 289991469 + } + }, + { + "ph": "s", "id": 289991469, "pid": 5714, "tid": 6744, "ts": 6303771900420.306, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771915615.127, "dur": 26.529, + "args": { + "External id": 152974, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991487, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991487, "pid": 0, "tid": 7, "ts": 6303771915615.127, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900465.186, "dur": 6.670, + "args": { + "External id": 152974, "cbid": 211, "correlation": 289991487 + } + }, + { + "ph": "s", "id": 289991487, "pid": 5714, "tid": 6744, "ts": 6303771900465.186, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771915643.416, "dur": 110.337, + "args": { + "External id": 152980, "device": 0, "context": 1, "stream": 7, "correlation": 289991501, "bytes": 50331648, "memory bandwidth (GB/s)": 456.1629190570706 + } + }, + { + "ph": "f", "id": 289991501, "pid": 0, "tid": 7, "ts": 6303771915643.416, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771900498.796, "dur": 13.540, + "args": { + "External id": 152980, "cbid": 41, "correlation": 289991501 + } + }, + { + "ph": "s", "id": 289991501, "pid": 5714, "tid": 6744, "ts": 6303771900498.796, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915754.329, "dur": 71.521, + "args": { + "External id": 152982, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991513, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991513, "pid": 0, "tid": 7, "ts": 6303771915754.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900525.045, "dur": 5.271, + "args": { + "External id": 152982, "cbid": 211, "correlation": 289991513 + } + }, + { + "ph": "s", "id": 289991513, "pid": 5714, "tid": 6744, "ts": 6303771900525.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915826.522, "dur": 145.985, + "args": { + "External id": 152983, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991523, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991523, "pid": 0, "tid": 7, "ts": 6303771915826.522, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900539.305, "dur": 3.691, + "args": { + "External id": 152983, "cbid": 211, "correlation": 289991523 + } + }, + { + "ph": "s", "id": 289991523, "pid": 5714, "tid": 6744, "ts": 6303771900539.305, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771915973.179, "dur": 150.594, + "args": { + "External id": 152984, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991530, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991530, "pid": 0, "tid": 7, "ts": 6303771915973.179, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900554.936, "dur": 4.100, + "args": { + "External id": 152984, "cbid": 211, "correlation": 289991530 + } + }, + { + "ph": "s", "id": 289991530, "pid": 5714, "tid": 6744, "ts": 6303771900554.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771916124.413, "dur": 49.505, + "args": { + "External id": 152990, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991549, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991549, "pid": 0, "tid": 7, "ts": 6303771916124.413, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900587.605, "dur": 6.120, + "args": { + "External id": 152990, "cbid": 211, "correlation": 289991549 + } + }, + { + "ph": "s", "id": 289991549, "pid": 5714, "tid": 6744, "ts": 6303771900587.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771916174.622, "dur": 40.545, + "args": { + "External id": 152991, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991557, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991557, "pid": 0, "tid": 7, "ts": 6303771916174.622, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900603.615, "dur": 4.160, + "args": { + "External id": 152991, "cbid": 211, "correlation": 289991557 + } + }, + { + "ph": "s", "id": 289991557, "pid": 5714, "tid": 6744, "ts": 6303771900603.615, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771916215.871, "dur": 324.195, + "args": { + "External id": 153006, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991590, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991590, "pid": 0, "tid": 7, "ts": 6303771916215.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900694.595, "dur": 8.920, + "args": { + "External id": 153006, "cbid": 211, "correlation": 289991590 + } + }, + { + "ph": "s", "id": 289991590, "pid": 5714, "tid": 6744, "ts": 6303771900694.595, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "swiglu_fwdbwd_vectorized4_kernel", "pid": 0, "tid": 7, + "ts": 6303771916540.770, "dur": 429.957, + "args": { + "External id": 152995, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991618, "registers per thread": 31, "shared memory": 0, "blocks per SM": 512.000000, "warps per SM": 2048.000000, "grid": [65536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991618, "pid": 0, "tid": 7, "ts": 6303771916540.770, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900757.155, "dur": 6.350, + "args": { + "External id": 152995, "cbid": 307, "correlation": 289991618 + } + }, + { + "ph": "s", "id": 289991618, "pid": 5714, "tid": 6744, "ts": 6303771900757.155, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771900851.865, "dur": 0.530, + "args": { + "External id": 153031, "cbid": 200, "correlation": 289991643 + } + }, + { + "ph": "f", "id": 289991643, "pid": 5714, "tid": 6744, "ts": 6303771900851.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771916971.975, "dur": 0.832, + "args": { + "External id": 153031, "device": 0, "context": 1, "stream": 7, "correlation": 
289991646, "bytes": 1536, "memory bandwidth (GB/s)": 1.8461538461538463 + } + }, + { + "ph": "f", "id": 289991646, "pid": 0, "tid": 7, "ts": 6303771916971.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771900854.055, "dur": 6.960, + "args": { + "External id": 153031, "cbid": 51, "correlation": 289991646 + } + }, + { + "ph": "s", "id": 289991646, "pid": 5714, "tid": 6744, "ts": 6303771900854.055, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771916973.959, "dur": 366.021, + "args": { + "External id": 153031, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991647, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991647, "pid": 0, "tid": 7, "ts": 6303771916973.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900861.245, "dur": 7.080, + "args": { + "External id": 153031, "cbid": 307, "correlation": 289991647 + } + }, + { + "ph": "s", "id": 289991647, "pid": 5714, "tid": 6744, "ts": 6303771900861.245, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771900974.435, "dur": 0.480, + "args": { + "External id": 153049, "cbid": 200, "correlation": 289991684 + } + }, + { + "ph": "f", "id": 289991684, "pid": 5714, "tid": 6744, "ts": 6303771900974.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771917340.844, "dur": 0.768, + "args": { + "External id": 153049, "device": 0, "context": 1, "stream": 7, "correlation": 289991687, "bytes": 1536, "memory bandwidth (GB/s)": 2 + } + }, + { + "ph": "f", "id": 289991687, "pid": 0, "tid": 7, "ts": 6303771917340.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771900976.364, "dur": 6.040, + "args": { + "External id": 153049, "cbid": 51, "correlation": 289991687 + } + }, + { + "ph": "s", "id": 289991687, "pid": 5714, "tid": 6744, "ts": 6303771900976.364, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771917342.764, "dur": 354.436, + "args": { + "External id": 153049, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991688, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991688, "pid": 0, "tid": 7, "ts": 6303771917342.764, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771900982.595, "dur": 7.280, + "args": { + "External id": 153049, "cbid": 307, "correlation": 289991688 + } + }, + { + "ph": "s", "id": 289991688, "pid": 5714, "tid": 6744, "ts": 6303771900982.595, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901018.075, "dur": 0.269, + "args": { + "External id": 153056, "cbid": 200, "correlation": 289991713 + } + }, + { + "ph": "f", "id": 289991713, "pid": 5714, "tid": 6744, "ts": 6303771901018.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771917697.840, "dur": 353.188, + "args": { + "External id": 153056, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991716, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991716, "pid": 0, "tid": 7, "ts": 6303771917697.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901019.395, "dur": 5.259, + "args": { + "External id": 153056, "cbid": 307, "correlation": 289991716 + } + }, + { + "ph": "s", "id": 289991716, "pid": 5714, "tid": 6744, "ts": 6303771901019.395, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901113.604, "dur": 0.390, + "args": { + "External id": 153079, "cbid": 200, "correlation": 289991761 + } + }, + { + "ph": "f", "id": 289991761, "pid": 5714, "tid": 6744, "ts": 6303771901113.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771918051.860, "dur": 0.800, + "args": { + "External id": 153079, "device": 0, "context": 1, "stream": 7, "correlation": 289991764, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289991764, "pid": 0, "tid": 7, "ts": 6303771918051.860, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771901115.314, "dur": 5.360, + "args": { + "External id": 153079, "cbid": 51, "correlation": 289991764 + } + }, + { + "ph": "s", "id": 289991764, "pid": 5714, "tid": 6744, "ts": 6303771901115.314, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771918053.844, "dur": 356.164, + "args": { + "External id": 153079, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991765, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991765, "pid": 0, "tid": 7, "ts": 6303771918053.844, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901120.864, "dur": 6.640, + "args": { + "External id": 153079, "cbid": 307, "correlation": 289991765 + } + }, + { + "ph": "s", "id": 289991765, "pid": 5714, "tid": 6744, "ts": 6303771901120.864, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901153.264, "dur": 0.250, + "args": { + "External id": 153086, "cbid": 200, "correlation": 289991790 + } + }, + { + "ph": "f", "id": 289991790, "pid": 5714, "tid": 6744, "ts": 6303771901153.264, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771918410.712, "dur": 356.324, + "args": { + "External id": 153086, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991793, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289991793, "pid": 0, "tid": 7, "ts": 6303771918410.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901154.374, "dur": 4.950, + "args": { + "External id": 153086, "cbid": 307, "correlation": 289991793 + } + }, + { + "ph": "s", "id": 289991793, "pid": 5714, "tid": 6744, "ts": 6303771901154.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771918767.740, "dur": 51.713, + "args": { + "External id": 153091, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991807, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991807, "pid": 0, "tid": 7, "ts": 6303771918767.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901205.294, "dur": 6.800, + "args": { + "External id": 153091, "cbid": 211, "correlation": 289991807 + } + }, + { + "ph": "s", "id": 289991807, "pid": 5714, "tid": 6744, "ts": 6303771901205.294, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771918820.093, "dur": 43.648, + "args": { + "External id": 153103, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991831, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991831, "pid": 0, "tid": 7, "ts": 6303771918820.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901260.994, "dur": 7.280, + "args": { + "External id": 153103, "cbid": 211, "correlation": 289991831 + } + }, + { + "ph": "s", "id": 289991831, "pid": 5714, "tid": 6744, "ts": 6303771901260.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771918864.381, "dur": 26.080, + "args": { + "External id": 153104, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991841, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991841, "pid": 0, "tid": 7, "ts": 6303771918864.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901277.164, "dur": 4.180, + "args": { + "External id": 153104, "cbid": 211, "correlation": 289991841 + } + }, + { + "ph": "s", "id": 289991841, "pid": 5714, "tid": 6744, "ts": 6303771901277.164, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771918891.389, "dur": 0.736, + "args": { + "External id": 153105, "device": 0, "context": 1, "stream": 7, "correlation": 289991856, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 289991856, "pid": 0, "tid": 7, "ts": 6303771918891.389, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771901305.184, "dur": 5.790, + "args": { + "External id": 153105, "cbid": 51, "correlation": 289991856 + } + }, + { + "ph": "s", "id": 289991856, "pid": 5714, "tid": 6744, "ts": 6303771901305.184, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771918893.309, "dur": 42.145, + "args": { + "External id": 153105, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991858, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289991858, "pid": 0, "tid": 7, "ts": 6303771918893.309, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901312.094, "dur": 5.610, + "args": { + "External id": 153105, "cbid": 211, "correlation": 289991858 + } + }, + { + "ph": "s", "id": 289991858, "pid": 5714, "tid": 6744, "ts": 6303771901312.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771918936.062, "dur": 50.337, + "args": { + "External id": 153116, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991879, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991879, "pid": 0, "tid": 7, "ts": 6303771918936.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901371.774, "dur": 7.629, + "args": { + "External id": 153116, "cbid": 211, "correlation": 289991879 + } + }, + { + "ph": "s", "id": 289991879, "pid": 5714, "tid": 6744, "ts": 6303771901371.774, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771918987.103, "dur": 147.233, + "args": { + "External id": 153119, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991894, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991894, "pid": 0, "tid": 7, "ts": 6303771918987.103, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901403.274, "dur": 5.429, + "args": { + "External id": 153119, "cbid": 211, "correlation": 289991894 + } + }, + { + "ph": "s", "id": 289991894, "pid": 5714, "tid": 6744, "ts": 6303771901403.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771919134.944, "dur": 107.394, + "args": { + "External id": 153120, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991904, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991904, "pid": 0, "tid": 7, "ts": 6303771919134.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901417.443, "dur": 4.091, + "args": { + "External id": 153120, "cbid": 211, "correlation": 289991904 + } + }, + { + "ph": "s", "id": 289991904, "pid": 5714, "tid": 6744, "ts": 6303771901417.443, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771919242.946, "dur": 78.145, + "args": { + "External id": 153121, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991918, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991918, "pid": 0, "tid": 7, "ts": 6303771919242.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901435.034, "dur": 4.760, + "args": { + "External id": 153121, "cbid": 211, "correlation": 289991918 + } + }, + { + "ph": "s", "id": 289991918, "pid": 5714, "tid": 6744, "ts": 6303771901435.034, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919321.699, "dur": 1.472, + "args": { + "External id": 153124, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991932, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289991932, "pid": 0, "tid": 7, "ts": 6303771919321.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901466.354, "dur": 6.109, + "args": { + "External id": 153124, "cbid": 211, "correlation": 289991932 + } + }, + { + "ph": "s", "id": 289991932, "pid": 5714, "tid": 6744, "ts": 6303771901466.354, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919323.843, "dur": 1.088, + "args": { + "External id": 153128, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991942, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289991942, "pid": 0, "tid": 7, "ts": 6303771919323.843, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901485.673, "dur": 4.440, + "args": { + "External id": 153128, "cbid": 211, "correlation": 289991942 + } + }, + { + "ph": "s", "id": 289991942, "pid": 5714, "tid": 6744, "ts": 6303771901485.673, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919325.539, "dur": 1.024, + "args": { + "External id": 153129, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991952, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289991952, "pid": 0, "tid": 7, "ts": 6303771919325.539, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901497.983, "dur": 3.880, + "args": { + "External id": 153129, "cbid": 211, "correlation": 289991952 + } + }, + { + "ph": "s", "id": 289991952, "pid": 5714, "tid": 6744, "ts": 6303771901497.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771919327.299, "dur": 26.976, + "args": { + "External id": 153137, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991970, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991970, "pid": 0, "tid": 7, "ts": 6303771919327.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901542.323, "dur": 6.660, + "args": { + "External id": 153137, "cbid": 211, "correlation": 289991970 + } + }, + { + "ph": "s", "id": 289991970, "pid": 5714, "tid": 6744, "ts": 6303771901542.323, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771919354.979, "dur": 111.873, + "args": { + "External id": 153143, "device": 0, "context": 1, "stream": 7, "correlation": 289991984, "bytes": 50331648, "memory bandwidth (GB/s)": 449.89986860100294 + } + }, + { + "ph": "f", "id": 289991984, "pid": 0, "tid": 7, "ts": 6303771919354.979, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771901575.983, "dur": 15.940, + "args": { + "External id": 153143, "cbid": 41, "correlation": 289991984 + } + }, + { + "ph": "s", "id": 289991984, "pid": 5714, "tid": 6744, "ts": 6303771901575.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919467.524, "dur": 70.433, + "args": { + "External id": 153145, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289991996, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289991996, "pid": 0, "tid": 7, "ts": 6303771919467.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901605.003, "dur": 5.400, + "args": { + "External id": 153145, "cbid": 211, "correlation": 289991996 + } + }, + { + "ph": "s", "id": 289991996, "pid": 5714, "tid": 6744, "ts": 6303771901605.003, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919538.629, "dur": 149.058, + "args": { + "External id": 153146, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992006, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992006, "pid": 0, "tid": 7, "ts": 6303771919538.629, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901618.813, "dur": 3.760, + "args": { + "External id": 153146, "cbid": 211, "correlation": 289992006 + } + }, + { + "ph": "s", "id": 289992006, "pid": 5714, "tid": 6744, "ts": 6303771901618.813, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919688.327, "dur": 139.169, + "args": { + "External id": 153147, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992013, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992013, "pid": 0, "tid": 7, "ts": 6303771919688.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901633.983, "dur": 4.100, + "args": { + "External id": 153147, "cbid": 211, "correlation": 289992013 + } + }, + { + "ph": "s", "id": 289992013, "pid": 5714, "tid": 6744, "ts": 6303771901633.983, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919828.104, "dur": 43.553, + "args": { + "External id": 153153, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992032, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992032, "pid": 0, "tid": 7, "ts": 6303771919828.104, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901667.743, "dur": 5.920, + "args": { + "External id": 153153, "cbid": 211, "correlation": 289992032 + } + }, + { + "ph": "s", "id": 289992032, "pid": 5714, "tid": 6744, "ts": 6303771901667.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast >(at::TensorIteratorBase&, at::native::CUDAFunctor_add const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771919872.265, "dur": 57.953, + "args": { + "External id": 153154, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992044, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992044, "pid": 0, "tid": 7, "ts": 6303771919872.265, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901687.553, "dur": 4.820, + "args": { + "External id": 153154, "cbid": 211, "correlation": 289992044 + } + }, + { + "ph": "s", "id": 289992044, "pid": 5714, "tid": 6744, "ts": 6303771901687.553, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771919930.858, "dur": 41.248, + "args": { + "External id": 153157, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992057, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992057, "pid": 0, "tid": 7, "ts": 6303771919930.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901716.143, "dur": 5.320, + "args": { + "External id": 153157, "cbid": 211, "correlation": 289992057 + } + }, + { + "ph": "s", "id": 289992057, "pid": 5714, "tid": 6744, "ts": 6303771901716.143, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901774.933, "dur": 0.490, + "args": { + "External id": 153167, "cbid": 200, "correlation": 289992093 + } + }, + { + "ph": "f", "id": 289992093, "pid": 5714, "tid": 6744, "ts": 6303771901774.933, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771919973.002, "dur": 0.768, + "args": { + "External id": 153167, "device": 0, "context": 1, "stream": 7, "correlation": 289992096, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289992096, "pid": 0, "tid": 7, "ts": 6303771919973.002, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771901776.993, "dur": 6.260, + "args": { + "External id": 153167, "cbid": 51, "correlation": 289992096 + } + }, + { + "ph": "s", "id": 289992096, "pid": 5714, "tid": 6744, "ts": 6303771901776.993, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771919974.922, "dur": 137.058, + "args": { + "External id": 153167, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992097, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992097, "pid": 0, "tid": 7, "ts": 6303771919974.922, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901783.453, "dur": 6.260, + "args": { + "External id": 153167, "cbid": 307, "correlation": 289992097 + } + }, + { + "ph": "s", "id": 289992097, "pid": 5714, "tid": 6744, "ts": 6303771901783.453, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771920112.684, "dur": 121.121, + "args": { + "External id": 153174, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992119, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992119, "pid": 0, "tid": 7, "ts": 6303771920112.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901815.873, "dur": 5.370, + "args": { + "External id": 153174, "cbid": 211, "correlation": 289992119 + } + }, + { + "ph": "s", "id": 289992119, "pid": 5714, "tid": 6744, "ts": 6303771901815.873, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901977.842, "dur": 0.470, + "args": { + "External id": 153200, "cbid": 200, "correlation": 289992166 + } + }, + { + "ph": "f", "id": 289992166, "pid": 5714, "tid": 6744, "ts": 6303771901977.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901978.442, "dur": 0.190, + "args": { + "External id": 153200, "cbid": 200, "correlation": 289992167 + } + }, + { + "ph": "f", "id": 289992167, "pid": 5714, "tid": 6744, "ts": 6303771901978.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771901995.402, "dur": 0.200, + "args": { + "External id": 153200, "cbid": 200, "correlation": 289992185 + } + }, + { + "ph": "f", "id": 289992185, "pid": 5714, "tid": 6744, "ts": 6303771901995.402, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771920234.509, "dur": 96.321, + "args": { + "External id": 153200, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992186, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992186, "pid": 0, "tid": 7, "ts": 6303771920234.509, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771901996.752, "dur": 10.840, + "args": { + "External id": 153200, "cbid": 211, "correlation": 289992186 + } + }, + { + "ph": "s", "id": 289992186, "pid": 5714, "tid": 6744, "ts": 6303771901996.752, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771902008.322, "dur": 0.930, + "args": { + "External id": 153200, "cbid": 273, "correlation": 289992188 + } + }, + { + "ph": "f", "id": 289992188, "pid": 5714, "tid": 6744, "ts": 6303771902008.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771920331.534, "dur": 980.204, + "args": { + "External id": 153200, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992189, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289992189, "pid": 0, "tid": 7, "ts": 6303771920331.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902009.522, "dur": 3.950, + "args": { + "External id": 153200, "cbid": 211, "correlation": 289992189 + } + }, + { + "ph": "s", "id": 289992189, "pid": 5714, "tid": 6744, "ts": 6303771902009.522, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771921312.474, "dur": 71.169, + "args": { + "External id": 153200, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992191, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289992191, "pid": 0, "tid": 7, "ts": 6303771921312.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902013.992, "dur": 3.570, + "args": { + "External id": 153200, "cbid": 211, "correlation": 289992191 + } + }, + { + "ph": "s", "id": 289992191, "pid": 5714, "tid": 6744, "ts": 6303771902013.992, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771921384.347, "dur": 48.416, + "args": { + "External id": 153210, "device": 0, "context": 1, "stream": 7, "correlation": 289992217, "bytes": 25165824, "memory bandwidth (GB/s)": 519.783212161269 + } + }, + { + "ph": "f", "id": 289992217, "pid": 0, "tid": 7, "ts": 6303771921384.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771902131.742, "dur": 17.300, + "args": { + "External id": 153210, "cbid": 41, "correlation": 289992217 + } + }, + { + "ph": "s", "id": 289992217, "pid": 5714, "tid": 6744, "ts": 6303771902131.742, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771921433.467, "dur": 30.336, + "args": { + "External id": 153207, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992235, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992235, "pid": 0, "tid": 7, "ts": 6303771921433.467, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902240.592, "dur": 7.900, + "args": { + "External id": 153207, "cbid": 307, "correlation": 289992235 + } + }, + { + "ph": "s", "id": 289992235, "pid": 5714, "tid": 6744, "ts": 6303771902240.592, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771921464.411, "dur": 38.209, + "args": { + "External id": 153217, "device": 0, "context": 1, "stream": 7, "correlation": 289992250, "bytes": 25165824, "memory bandwidth (GB/s)": 658.6360281609045 + } + }, + { + "ph": "f", "id": 289992250, "pid": 0, "tid": 7, "ts": 6303771921464.411, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771902332.041, "dur": 13.611, + "args": { + "External id": 153217, "cbid": 41, "correlation": 289992250 + } + }, + { + "ph": "s", "id": 289992250, "pid": 5714, "tid": 6744, "ts": 6303771902332.041, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771921503.292, "dur": 26.944, + "args": { + "External id": 153214, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992268, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992268, "pid": 0, "tid": 7, "ts": 6303771921503.292, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902431.001, "dur": 7.510, + "args": { + "External id": 153214, "cbid": 307, "correlation": 289992268 + } + }, + { + "ph": "s", "id": 289992268, "pid": 5714, "tid": 6744, "ts": 6303771902431.001, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771902542.971, "dur": 0.540, + "args": { + "External id": 153241, "cbid": 200, "correlation": 289992312 + } + }, + { + "ph": "f", "id": 289992312, "pid": 5714, "tid": 6744, "ts": 6303771902542.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771921531.068, "dur": 0.800, + "args": { + "External id": 153241, "device": 0, "context": 1, "stream": 7, "correlation": 289992315, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 289992315, "pid": 0, "tid": 7, "ts": 6303771921531.068, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771902545.091, "dur": 6.950, + "args": { + "External id": 153241, "cbid": 51, "correlation": 289992315 + } + }, + { + "ph": "s", "id": 289992315, "pid": 5714, "tid": 6744, "ts": 6303771902545.091, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771921533.052, "dur": 139.906, + "args": { + "External id": 153241, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992316, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992316, "pid": 0, "tid": 7, "ts": 6303771921533.052, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902552.271, "dur": 7.120, + "args": { + "External id": 153241, "cbid": 307, "correlation": 289992316 + } + }, + { + "ph": "s", "id": 289992316, "pid": 5714, "tid": 6744, "ts": 6303771902552.271, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771921673.662, "dur": 120.513, + "args": { + "External id": 153248, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992338, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992338, "pid": 0, "tid": 7, "ts": 6303771921673.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902587.731, "dur": 5.710, + "args": { + "External id": 153248, "cbid": 211, "correlation": 289992338 + } + }, + { + "ph": "s", "id": 289992338, "pid": 5714, "tid": 6744, "ts": 6303771902587.731, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771902685.021, "dur": 0.470, + "args": { + "External id": 153271, "cbid": 200, "correlation": 289992384 + } + }, + { + "ph": "f", "id": 289992384, "pid": 5714, "tid": 6744, "ts": 6303771902685.021, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771921795.071, "dur": 0.768, + "args": { + "External id": 153271, "device": 0, "context": 1, "stream": 7, "correlation": 289992387, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289992387, "pid": 0, "tid": 7, "ts": 6303771921795.071, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771902686.881, "dur": 5.440, + "args": { + "External id": 153271, "cbid": 51, "correlation": 289992387 + } + }, + { + "ph": "s", "id": 289992387, "pid": 5714, "tid": 6744, "ts": 6303771902686.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771921797.535, "dur": 141.378, + "args": { + "External id": 153271, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992388, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992388, "pid": 0, "tid": 7, "ts": 6303771921797.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902692.501, "dur": 6.720, + "args": { + "External id": 153271, "cbid": 307, "correlation": 289992388 + } + }, + { + "ph": "s", "id": 289992388, "pid": 5714, "tid": 6744, "ts": 6303771902692.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771921939.585, "dur": 120.769, + "args": { + "External id": 153278, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992410, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992410, "pid": 0, "tid": 7, "ts": 6303771921939.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902723.851, "dur": 4.980, + "args": { + "External id": 153278, "cbid": 211, "correlation": 289992410 + } + }, + { + "ph": "s", "id": 289992410, "pid": 5714, "tid": 6744, "ts": 6303771902723.851, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922061.186, "dur": 41.345, + "args": { + "External id": 153283, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992425, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992425, "pid": 0, "tid": 7, "ts": 6303771922061.186, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902766.651, "dur": 7.069, + "args": { + "External id": 153283, "cbid": 211, "correlation": 289992425 + } + }, + { + "ph": "s", "id": 289992425, "pid": 5714, "tid": 6744, "ts": 6303771902766.651, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771902847.980, "dur": 0.460, + "args": { + "External id": 153302, "cbid": 200, "correlation": 289992469 + } + }, + { + "ph": "f", "id": 289992469, "pid": 5714, "tid": 6744, "ts": 6303771902847.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771922103.395, "dur": 0.800, + "args": { + "External id": 153302, "device": 0, "context": 1, "stream": 7, "correlation": 289992472, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 289992472, "pid": 0, "tid": 7, "ts": 6303771922103.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771902849.840, "dur": 5.740, + "args": { + "External id": 153302, "cbid": 51, "correlation": 289992472 + } + }, + { + "ph": "s", "id": 289992472, "pid": 5714, "tid": 6744, "ts": 6303771902849.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771922105.379, "dur": 140.386, + "args": { + "External id": 153302, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992473, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992473, "pid": 0, "tid": 7, "ts": 6303771922105.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902855.750, "dur": 6.650, + "args": { + "External id": 153302, "cbid": 307, "correlation": 289992473 + } + }, + { + "ph": "s", "id": 289992473, "pid": 5714, "tid": 6744, "ts": 6303771902855.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771922246.469, "dur": 120.577, + "args": { + "External id": 153309, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992495, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992495, "pid": 0, "tid": 7, "ts": 6303771922246.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902888.830, "dur": 5.220, + "args": { + "External id": 153309, "cbid": 211, "correlation": 289992495 + } + }, + { + "ph": "s", "id": 289992495, "pid": 5714, "tid": 6744, "ts": 6303771902888.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922367.750, "dur": 38.625, + "args": { + "External id": 153314, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992506, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992506, "pid": 0, "tid": 7, "ts": 6303771922367.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902928.580, "dur": 6.490, + "args": { + "External id": 153314, "cbid": 211, "correlation": 289992506 + } + }, + { + "ph": "s", "id": 289992506, "pid": 5714, "tid": 6744, "ts": 6303771902928.580, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922407.047, "dur": 47.744, + "args": { + "External id": 153326, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992530, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992530, "pid": 0, "tid": 7, "ts": 6303771922407.047, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902983.420, "dur": 7.250, + "args": { + "External id": 153326, "cbid": 211, "correlation": 289992530 + } + }, + { + "ph": "s", "id": 289992530, "pid": 5714, "tid": 6744, "ts": 6303771902983.420, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 4, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771922455.431, "dur": 31.808, + "args": { + "External id": 153327, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992540, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992540, "pid": 0, "tid": 7, "ts": 6303771922455.431, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771902999.830, "dur": 3.980, + "args": { + "External id": 153327, "cbid": 211, "correlation": 289992540 + } + }, + { + "ph": "s", "id": 289992540, "pid": 5714, "tid": 6744, "ts": 6303771902999.830, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771922488.039, "dur": 0.736, + "args": { + "External id": 153328, "device": 0, "context": 1, "stream": 7, "correlation": 289992555, "bytes": 24, "memory bandwidth (GB/s)": 0.03260869565217391 + } + }, + { + "ph": "f", "id": 289992555, "pid": 0, "tid": 7, "ts": 6303771922488.039, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771903018.840, "dur": 5.450, + "args": { + "External id": 153328, "cbid": 51, "correlation": 289992555 + } + }, + { + "ph": "s", "id": 289992555, "pid": 5714, "tid": 6744, "ts": 6303771903018.840, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<128, 4, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, c10::BFloat16, 4>)", "pid": 0, "tid": 7, + "ts": 6303771922490.439, "dur": 44.257, + "args": { + "External id": 153328, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992557, "registers per thread": 47, "shared memory": 2064, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [6, 256, 1], "block": [32, 4, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289992557, "pid": 0, "tid": 7, "ts": 6303771922490.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903025.360, "dur": 4.900, + "args": { + "External id": 153328, "cbid": 211, "correlation": 289992557 + } + }, + { + "ph": "s", "id": 289992557, "pid": 5714, "tid": 6744, "ts": 6303771903025.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::unrolled_elementwise_kernel, 4, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1> >(int, at::native::direct_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda()#3}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array, TrivialOffsetCalculator<1, unsigned int>, TrivialOffsetCalculator<1, unsigned int>, at::native::memory::LoadWithCast<1>, at::native::memory::StoreWithCast<1>)", "pid": 0, "tid": 7, + "ts": 6303771922535.432, "dur": 52.224, + "args": { + "External id": 153339, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992578, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992578, "pid": 0, "tid": 7, "ts": 6303771922535.432, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903081.030, "dur": 7.830, + "args": { + "External id": 153339, "cbid": 211, "correlation": 289992578 + } + }, + { + "ph": "s", "id": 289992578, "pid": 5714, "tid": 6744, "ts": 6303771903081.030, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922588.328, "dur": 149.442, + "args": { + "External id": 153342, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992593, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992593, "pid": 0, "tid": 7, "ts": 6303771922588.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903112.070, "dur": 5.290, + "args": { + "External id": 153342, "cbid": 211, "correlation": 289992593 + } + }, + { + "ph": "s", "id": 289992593, "pid": 5714, "tid": 6744, "ts": 6303771903112.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BinaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771922738.474, "dur": 109.089, + "args": { + "External id": 153343, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992603, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992603, "pid": 0, "tid": 7, "ts": 6303771922738.474, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903125.680, "dur": 4.090, + "args": { + "External id": 153343, "cbid": 211, "correlation": 289992603 + } + }, + { + "ph": "s", "id": 289992603, "pid": 5714, "tid": 6744, "ts": 6303771903125.680, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4> >(at::native::ReduceOp::operator()(at::TensorIterator&)::{lambda(float, float)#1}>, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303771922848.171, "dur": 77.505, + "args": { + "External id": 153344, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992617, "registers per thread": 32, "shared memory": 16, "blocks per SM": 8.000000, "warps per SM": 128.000000, "grid": [1024, 1, 1], "block": [32, 16, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992617, "pid": 0, "tid": 7, "ts": 6303771922848.171, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903142.790, "dur": 4.760, + "args": { + "External id": 153344, "cbid": 211, "correlation": 289992617 + } + }, + { + "ph": "s", "id": 289992617, "pid": 5714, "tid": 6744, "ts": 6303771903142.790, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#2}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922926.316, "dur": 1.472, + "args": { + "External id": 153347, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992631, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289992631, "pid": 0, "tid": 7, "ts": 6303771922926.316, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903174.150, "dur": 6.080, + "args": { + "External id": 153347, "cbid": 211, "correlation": 289992631 + } + }, + { + "ph": "s", "id": 289992631, "pid": 5714, "tid": 6744, "ts": 6303771903174.150, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922928.460, "dur": 0.992, + "args": { + "External id": 153351, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992641, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289992641, "pid": 0, "tid": 7, "ts": 6303771922928.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903192.639, "dur": 4.471, + "args": { + "External id": 153351, "cbid": 211, "correlation": 289992641 + } + }, + { + "ph": "s", "id": 289992641, "pid": 5714, "tid": 6744, "ts": 6303771903192.639, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771922930.188, "dur": 0.992, + "args": { + "External id": 153352, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992651, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 1.000000, "grid": [32, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 289992651, "pid": 0, "tid": 7, "ts": 6303771922930.188, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903204.310, "dur": 3.780, + "args": { + "External id": 153352, "cbid": 211, "correlation": 289992651 + } + }, + { + "ph": "s", "id": 289992651, "pid": 5714, "tid": 6744, "ts": 6303771903204.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::elementwise_kernel<128, 2, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1}>(int, at::native::gpu_kernel_impl_nocast > >(at::TensorIteratorBase&, at::native::BUnaryFunctor > const&)::{lambda(int)#1})", "pid": 0, "tid": 7, + "ts": 6303771922931.788, "dur": 27.937, + "args": { + "External id": 153360, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992669, "registers per thread": 16, "shared memory": 0, "blocks per SM": 384.000000, "warps per SM": 1536.000000, "grid": [49152, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992669, "pid": 0, "tid": 7, "ts": 6303771922931.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903248.410, "dur": 6.900, + "args": { + "External id": 153360, "cbid": 211, "correlation": 289992669 + } + }, + { + "ph": "s", "id": 289992669, "pid": 5714, "tid": 6744, "ts": 6303771903248.410, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771922960.429, "dur": 113.505, + "args": { + "External id": 153366, "device": 0, "context": 1, "stream": 7, "correlation": 289992683, "bytes": 50331648, "memory bandwidth (GB/s)": 443.4311087617286 + } + }, + { + "ph": "f", "id": 289992683, "pid": 0, "tid": 7, "ts": 6303771922960.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771903281.459, "dur": 14.070, + "args": { + "External id": 153366, "cbid": 41, "correlation": 289992683 + } + }, + { + "ph": "s", "id": 289992683, "pid": 5714, "tid": 6744, "ts": 6303771903281.459, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771923074.542, "dur": 68.417, + "args": { + "External id": 153368, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992695, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992695, "pid": 0, "tid": 7, "ts": 6303771923074.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903317.519, "dur": 5.680, + "args": { + "External id": 153368, "cbid": 211, "correlation": 289992695 + } + }, + { + "ph": "s", "id": 289992695, "pid": 5714, "tid": 6744, "ts": 6303771903317.519, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303771923143.599, "dur": 146.178, + "args": { + "External id": 153369, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992705, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992705, "pid": 0, "tid": 7, "ts": 6303771923143.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903331.159, "dur": 3.840, + "args": { + "External id": 153369, "cbid": 211, "correlation": 289992705 + } + }, + { + "ph": "s", "id": 289992705, "pid": 5714, "tid": 6744, "ts": 6303771903331.159, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771923290.481, "dur": 144.033, + "args": { + "External id": 153370, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992712, "registers per thread": 22, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992712, "pid": 0, "tid": 7, "ts": 6303771923290.481, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903346.539, "dur": 4.140, + "args": { + "External id": 153370, "cbid": 211, "correlation": 289992712 + } + }, + { + "ph": "s", "id": 289992712, "pid": 5714, "tid": 6744, "ts": 6303771903346.539, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array >(int, at::native::bfloat16_copy_kernel_cuda(at::TensorIteratorBase&)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303771923435.218, "dur": 47.169, + "args": { + "External id": 153376, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992731, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992731, "pid": 0, "tid": 7, "ts": 6303771923435.218, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903381.129, "dur": 6.040, + "args": { + "External id": 153376, "cbid": 211, "correlation": 289992731 + } + }, + { + "ph": "s", "id": 289992731, "pid": 5714, "tid": 6744, "ts": 6303771903381.129, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771923483.027, "dur": 40.384, + "args": { + "External id": 153377, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992739, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289992739, "pid": 0, "tid": 7, "ts": 6303771923483.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903397.129, "dur": 4.170, + "args": { + "External id": 153377, "cbid": 211, "correlation": 289992739 + } + }, + { + "ph": "s", "id": 289992739, "pid": 5714, "tid": 6744, "ts": 6303771903397.129, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771903618.508, "dur": 111.201, + "args": { + "External id": 153393, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289992762, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289992762, "pid": 0, "tid": 17, "ts": 6303771903618.508, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903599.409, "dur": 11.490, + "args": { + "External id": 153393, "cbid": 211, "correlation": 289992762 + } + }, + { + "ph": "s", "id": 289992762, "pid": 5714, "tid": 6744, "ts": 6303771903599.409, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771903738.766, "dur": 161.474, + "args": { + "External id": 153409, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289992775, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289992775, "pid": 0, "tid": 17, "ts": 6303771903738.766, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771903726.568, "dur": 9.800, + "args": { + "External id": 153409, "cbid": 211, "correlation": 289992775 + } + }, + { + "ph": "s", "id": 289992775, "pid": 5714, "tid": 6744, "ts": 6303771903726.568, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903762.798, "dur": 1.700, + "args": { + "External id": 153378, "cbid": 135, "correlation": 289992785 + } + }, + { + "ph": "f", "id": 289992785, "pid": 5714, "tid": 6744, "ts": 6303771903762.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771903766.568, "dur": 1.460, + "args": { + "External id": 153378, "cbid": 147, "correlation": 289992789 + } + }, + { + "ph": "s", "id": 289992789, "pid": 5714, "tid": 6744, "ts": 6303771903766.568, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771903841.788, "dur": 1.390, + "args": { + "External id": 153411, "cbid": 317, "correlation": 289992802 + } + }, + { + "ph": "f", "id": 289992802, "pid": 5714, "tid": 6744, "ts": 6303771903841.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903846.368, "dur": 1.440, + "args": { + "External id": 153411, "cbid": 135, "correlation": 289992804 + } + }, + { + "ph": "f", "id": 289992804, "pid": 5714, "tid": 6744, "ts": 6303771903846.368, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771903849.658, "dur": 1.930, + "args": { + "External id": 153411, "cbid": 147, "correlation": 289992808 + } + }, + { + "ph": "s", "id": 289992808, "pid": 5714, "tid": 6744, "ts": 6303771903849.658, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771903877.108, "dur": 1.480, + "args": { + "External id": 153411, "cbid": 409, "correlation": 289992811 + } + }, + { + "ph": "f", "id": 289992811, "pid": 5714, "tid": 6744, "ts": 6303771903877.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903887.418, "dur": 1.140, + "args": { + "External id": 153411, "cbid": 135, "correlation": 289992814 + } + }, + { + "ph": "f", "id": 289992814, "pid": 5714, "tid": 6744, "ts": 6303771903887.418, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771903888.748, "dur": 0.940, + "args": { + "External id": 153411, "cbid": 147, "correlation": 289992815 + } + }, + { + "ph": "s", "id": 289992815, "pid": 5714, "tid": 6744, "ts": 6303771903888.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771903978.544, "dur": 5992.518, + "args": { + "External id": 153411, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289992817, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289992817, "pid": 0, "tid": 20, "ts": 6303771903978.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771903892.008, "dur": 11.490, + "args": { + "External id": 153411, "cbid": 430, "correlation": 289992817 + } + }, + { + "ph": "s", "id": 289992817, "pid": 5714, "tid": 6744, "ts": 6303771903892.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903904.878, "dur": 0.400, + "args": { + "External id": 153411, "cbid": 135, "correlation": 289992819 + } + }, + { + "ph": "f", "id": 289992819, "pid": 5714, "tid": 6744, "ts": 6303771903904.878, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771903905.388, "dur": 0.510, + "args": { + "External id": 153411, "cbid": 147, "correlation": 289992820 + } + }, + { + "ph": "s", "id": 289992820, "pid": 5714, "tid": 6744, "ts": 6303771903905.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903907.388, "dur": 0.750, + "args": { + "External id": 153411, "cbid": 135, "correlation": 289992823 + } + }, + { + "ph": "f", "id": 289992823, "pid": 5714, "tid": 6744, "ts": 6303771903907.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903918.288, "dur": 0.450, + "args": { + "External 
id": 153411, "cbid": 135, "correlation": 289992830 + } + }, + { + "ph": "f", "id": 289992830, "pid": 5714, "tid": 6744, "ts": 6303771903918.288, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771903950.208, "dur": 1.070, + "args": { + "External id": 153413, "cbid": 147, "correlation": 289992835 + } + }, + { + "ph": "s", "id": 289992835, "pid": 5714, "tid": 6744, "ts": 6303771903950.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771903969.708, "dur": 0.880, + "args": { + "External id": 153378, "cbid": 135, "correlation": 289992850 + } + }, + { + "ph": "f", "id": 289992850, "pid": 5714, "tid": 6744, "ts": 6303771903969.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771923524.115, "dur": 320.804, + "args": { + "External id": 153415, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992875, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992875, "pid": 0, "tid": 7, "ts": 6303771923524.115, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904131.377, "dur": 10.951, + "args": { + "External id": 153415, "cbid": 211, "correlation": 289992875 + } + }, + { + "ph": "s", "id": 289992875, "pid": 5714, "tid": 6744, "ts": 6303771904131.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303771923846.135, "dur": 432.069, + "args": { + "External id": 153416, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992898, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289992898, "pid": 0, "tid": 7, "ts": 6303771923846.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904193.807, "dur": 7.330, + "args": { + "External id": 153416, "cbid": 307, "correlation": 289992898 + } + }, + { + "ph": "s", "id": 289992898, "pid": 5714, "tid": 6744, "ts": 6303771904193.807, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904238.197, "dur": 0.510, + "args": { + "External id": 153417, "cbid": 200, "correlation": 289992921 + } + }, + { + "ph": "f", "id": 289992921, "pid": 5714, "tid": 6744, "ts": 6303771904238.197, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771924279.004, "dur": 0.800, + "args": { + "External id": 153417, "device": 0, "context": 1, "stream": 7, 
"correlation": 289992924, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289992924, "pid": 0, "tid": 7, "ts": 6303771924279.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771904240.307, "dur": 6.520, + "args": { + "External id": 153417, "cbid": 51, "correlation": 289992924 + } + }, + { + "ph": "s", "id": 289992924, "pid": 5714, "tid": 6744, "ts": 6303771904240.307, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771924280.956, "dur": 355.556, + "args": { + "External id": 153417, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992925, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992925, "pid": 0, "tid": 7, "ts": 6303771924280.956, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904247.007, "dur": 5.810, + "args": { + "External id": 153417, "cbid": 307, "correlation": 289992925 + } + }, + { + "ph": "s", "id": 289992925, "pid": 5714, "tid": 6744, "ts": 6303771904247.007, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904293.337, "dur": 0.300, + "args": { + "External id": 153418, "cbid": 200, "correlation": 289992950 + } + }, + { + "ph": "f", "id": 289992950, "pid": 5714, "tid": 6744, "ts": 6303771904293.337, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771924637.376, "dur": 0.800, + "args": { + "External id": 153418, "device": 0, "context": 1, "stream": 7, "correlation": 289992953, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289992953, "pid": 0, "tid": 7, "ts": 6303771924637.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771904294.657, "dur": 11.200, + "args": { + "External id": 153418, "cbid": 51, "correlation": 289992953 + } + }, + { + "ph": "s", "id": 289992953, "pid": 5714, "tid": 6744, "ts": 6303771904294.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771924639.328, "dur": 353.028, + "args": { + "External id": 153418, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992954, "registers per thread": 92, "shared memory": 49152, "blocks per 
SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992954, "pid": 0, "tid": 7, "ts": 6303771924639.328, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904306.027, "dur": 5.590, + "args": { + "External id": 153418, "cbid": 307, "correlation": 289992954 + } + }, + { + "ph": "s", "id": 289992954, "pid": 5714, "tid": 6744, "ts": 6303771904306.027, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904338.207, "dur": 0.300, + "args": { + "External id": 153419, "cbid": 200, "correlation": 289992979 + } + }, + { + "ph": "f", "id": 289992979, "pid": 5714, "tid": 6744, "ts": 6303771904338.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771924993.028, "dur": 352.516, + "args": { + "External id": 153419, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289992982, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289992982, "pid": 0, "tid": 7, "ts": 6303771924993.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904339.687, "dur": 5.420, + "args": { + "External id": 153419, "cbid": 307, "correlation": 289992982 + } + }, + { + "ph": "s", "id": 289992982, "pid": 5714, "tid": 6744, "ts": 6303771904339.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904366.767, "dur": 0.230, + "args": { + "External id": 153420, "cbid": 200, "correlation": 289993007 + } + }, + { + "ph": "f", "id": 289993007, "pid": 5714, "tid": 6744, "ts": 6303771904366.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771925346.376, "dur": 0.800, + "args": { + "External id": 153420, "device": 0, "context": 1, "stream": 7, "correlation": 289993010, "bytes": 1536, "memory bandwidth (GB/s)": 1.92 + } + }, + { + "ph": "f", "id": 289993010, "pid": 0, "tid": 7, "ts": 6303771925346.376, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771904367.937, "dur": 4.570, + "args": { + "External id": 153420, "cbid": 51, "correlation": 289993010 + } + }, + { + "ph": "s", "id": 289993010, "pid": 5714, "tid": 6744, "ts": 6303771904367.937, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771925348.360, "dur": 354.597, + "args": { + "External id": 153420, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993011, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993011, "pid": 0, "tid": 7, "ts": 6303771925348.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904372.647, "dur": 4.750, + "args": { + "External id": 153420, "cbid": 307, "correlation": 289993011 + } + }, + { + "ph": "s", "id": 289993011, "pid": 5714, "tid": 6744, "ts": 6303771904372.647, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904400.077, "dur": 0.260, + "args": { + "External id": 153421, "cbid": 200, "correlation": 289993036 + } + }, + { + "ph": "f", "id": 289993036, "pid": 5714, "tid": 6744, "ts": 6303771904400.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771925703.661, "dur": 356.132, + "args": { + "External id": 153421, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993039, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993039, "pid": 0, "tid": 7, "ts": 6303771925703.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904401.337, "dur": 4.850, + "args": { + "External id": 153421, "cbid": 307, "correlation": 289993039 + } + }, + { + "ph": "s", "id": 289993039, "pid": 5714, "tid": 6744, "ts": 6303771904401.337, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771926060.465, "dur": 77.921, + "args": { + "External id": 153422, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993052, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993052, "pid": 0, "tid": 7, "ts": 6303771926060.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904445.697, "dur": 6.510, + "args": { + "External id": 153422, "cbid": 307, "correlation": 289993052 + } + }, + { + "ph": "s", "id": 289993052, "pid": 5714, "tid": 6744, "ts": 6303771904445.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303771926139.026, "dur": 1.920, + "args": { + "External id": 153423, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993060, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289993060, "pid": 0, "tid": 7, "ts": 6303771926139.026, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904478.677, "dur": 6.670, + "args": { + "External id": 153423, "cbid": 307, "correlation": 289993060 + } + }, + { + "ph": "s", "id": 289993060, "pid": 5714, "tid": 6744, "ts": 6303771904478.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303771926142.130, "dur": 111.905, + "args": { + "External id": 153424, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993068, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993068, "pid": 0, "tid": 7, "ts": 6303771926142.130, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904516.997, "dur": 6.510, + "args": { + "External id": 153424, "cbid": 307, "correlation": 289993068 + } + }, + { + "ph": "s", "id": 289993068, "pid": 5714, "tid": 6744, "ts": 6303771904516.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904698.606, "dur": 0.490, + "args": { + "External id": 153443, "cbid": 200, "correlation": 289993114 + } + }, + { + "ph": "f", "id": 289993114, "pid": 5714, "tid": 6744, "ts": 6303771904698.606, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771926254.835, "dur": 0.768, + "args": { + "External id": 153443, "device": 0, "context": 1, "stream": 7, 
"correlation": 289993117, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289993117, "pid": 0, "tid": 7, "ts": 6303771926254.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771904700.726, "dur": 7.140, + "args": { + "External id": 153443, "cbid": 51, "correlation": 289993117 + } + }, + { + "ph": "s", "id": 289993117, "pid": 5714, "tid": 6744, "ts": 6303771904700.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771926256.723, "dur": 137.858, + "args": { + "External id": 153443, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993118, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993118, "pid": 0, "tid": 7, "ts": 6303771926256.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904708.096, "dur": 7.740, + "args": { + "External id": 153443, "cbid": 307, "correlation": 289993118 + } + }, + { + "ph": "s", "id": 289993118, "pid": 5714, "tid": 6744, "ts": 6303771904708.096, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771926395.253, "dur": 120.705, + "args": { + "External id": 153444, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993140, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993140, "pid": 0, "tid": 7, "ts": 6303771926395.253, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904740.786, "dur": 5.580, + "args": { + "External id": 153444, "cbid": 211, "correlation": 289993140 + } + }, + { + "ph": "s", "id": 289993140, "pid": 5714, "tid": 6744, "ts": 6303771904740.786, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904823.796, "dur": 0.420, + "args": { + "External id": 153445, "cbid": 200, "correlation": 289993158 + } + }, + { + "ph": "f", "id": 289993158, "pid": 5714, "tid": 6744, "ts": 6303771904823.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904824.326, "dur": 0.190, + "args": { + "External id": 153445, "cbid": 200, "correlation": 289993159 + } + }, + { + "ph": "f", "id": 289993159, "pid": 5714, "tid": 6744, "ts": 6303771904824.326, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904843.486, "dur": 0.210, + "args": { + "External id": 153445, "cbid": 200, "correlation": 289993177 + } + }, + { + "ph": "f", "id": 289993177, "pid": 5714, "tid": 6744, "ts": 6303771904843.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771926516.662, "dur": 91.809, + "args": { + "External id": 153445, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993178, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993178, "pid": 0, "tid": 7, "ts": 6303771926516.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904844.856, "dur": 8.900, + "args": { + "External id": 153445, "cbid": 211, "correlation": 289993178 + } + }, + { + "ph": "s", "id": 289993178, "pid": 5714, "tid": 6744, "ts": 6303771904844.856, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771904854.476, "dur": 0.940, + "args": { + "External id": 153445, "cbid": 273, "correlation": 289993180 + } + }, + { + "ph": "f", "id": 289993180, "pid": 5714, "tid": 6744, "ts": 6303771904854.476, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771926609.111, "dur": 980.140, + "args": { + "External id": 153445, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993181, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289993181, "pid": 0, "tid": 7, "ts": 6303771926609.111, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904855.726, "dur": 3.960, + "args": { + "External id": 153445, "cbid": 211, "correlation": 289993181 + } + }, + { + "ph": "s", "id": 289993181, "pid": 5714, "tid": 6744, "ts": 6303771904855.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771927589.987, "dur": 71.712, + "args": { + "External id": 153445, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993183, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289993183, "pid": 0, "tid": 7, "ts": 6303771927589.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771904860.226, "dur": 3.550, + "args": { + "External id": 153445, "cbid": 211, "correlation": 289993183 + } + }, + { + "ph": "s", "id": 289993183, "pid": 5714, "tid": 6744, "ts": 6303771904860.226, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771927662.371, "dur": 47.201, + "args": { + "External id": 153456, "device": 0, "context": 1, "stream": 7, "correlation": 289993205, "bytes": 25165824, "memory bandwidth (GB/s)": 533.1629414631046 + } + }, + { + "ph": "f", "id": 289993205, "pid": 0, "tid": 7, "ts": 6303771927662.371, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771904989.635, "dur": 16.571, + "args": { + "External id": 153456, "cbid": 41, "correlation": 289993205 + } + }, + { + "ph": "s", "id": 289993205, "pid": 5714, "tid": 6744, "ts": 6303771904989.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771927710.244, "dur": 33.472, + "args": { + "External id": 153453, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993223, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993223, "pid": 0, "tid": 7, "ts": 6303771927710.244, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905099.645, "dur": 8.180, + "args": { + "External id": 153453, "cbid": 307, "correlation": 289993223 + } + }, + { + "ph": "s", "id": 289993223, "pid": 5714, "tid": 6744, "ts": 6303771905099.645, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771927744.452, "dur": 38.561, + "args": { + "External id": 153463, "device": 0, "context": 1, "stream": 7, "correlation": 289993238, "bytes": 25165824, "memory bandwidth (GB/s)": 652.6237390109178 + } + }, + { + "ph": "f", "id": 289993238, "pid": 0, "tid": 7, "ts": 6303771927744.452, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771905164.875, "dur": 12.830, + "args": { + "External id": 153463, "cbid": 41, "correlation": 289993238 + } + }, + { + "ph": "s", "id": 289993238, "pid": 5714, "tid": 6744, "ts": 6303771905164.875, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771927783.621, "dur": 26.720, + "args": { + "External id": 153460, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993256, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993256, "pid": 0, "tid": 7, "ts": 6303771927783.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905261.115, "dur": 7.470, + "args": { + "External id": 153460, "cbid": 307, "correlation": 289993256 + } + }, + { + "ph": "s", "id": 289993256, "pid": 5714, "tid": 6744, "ts": 6303771905261.115, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771905401.355, "dur": 0.520, + "args": { + "External id": 153468, "cbid": 200, "correlation": 289993286 + } + }, + { + "ph": "f", "id": 289993286, "pid": 5714, "tid": 6744, "ts": 6303771905401.355, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771927811.205, "dur": 0.768, + "args": { + "External id": 153468, "device": 0, "context": 1, "stream": 7, "correlation": 289993289, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289993289, "pid": 0, "tid": 7, "ts": 6303771927811.205, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771905403.555, "dur": 7.320, + "args": { + "External id": 153468, "cbid": 51, "correlation": 289993289 + } + }, + { + "ph": "s", "id": 289993289, "pid": 5714, "tid": 6744, "ts": 6303771905403.555, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771927813.157, "dur": 140.578, + "args": { + "External id": 153468, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993290, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993290, "pid": 0, "tid": 7, "ts": 6303771927813.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905411.095, "dur": 7.800, + "args": { + "External id": 153468, "cbid": 307, "correlation": 289993290 + } + }, + { + "ph": "s", "id": 289993290, "pid": 5714, "tid": 6744, "ts": 6303771905411.095, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771905444.825, "dur": 0.289, + "args": { + "External id": 153469, "cbid": 200, "correlation": 289993315 + } + }, + { + "ph": "f", "id": 289993315, "pid": 5714, "tid": 6744, "ts": 6303771905444.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771927954.503, "dur": 0.768, + "args": { + "External id": 153469, "device": 0, "context": 1, "stream": 7, "correlation": 289993318, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289993318, "pid": 0, "tid": 7, "ts": 6303771927954.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771905446.054, "dur": 4.600, + "args": { + "External id": 153469, "cbid": 51, "correlation": 289993318 + } + }, + { + "ph": "s", "id": 289993318, "pid": 5714, "tid": 6744, 
"ts": 6303771905446.054, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771927956.423, "dur": 136.802, + "args": { + "External id": 153469, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993319, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993319, "pid": 0, "tid": 7, "ts": 6303771927956.423, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905450.774, "dur": 4.871, + "args": { + "External id": 153469, "cbid": 307, "correlation": 289993319 + } + }, + { + "ph": "s", "id": 289993319, "pid": 5714, "tid": 6744, "ts": 6303771905450.774, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771905477.345, "dur": 0.309, + "args": { + "External id": 153470, "cbid": 200, "correlation": 289993344 + } + }, + { + "ph": "f", "id": 289993344, "pid": 5714, "tid": 6744, "ts": 6303771905477.345, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771928094.089, "dur": 0.768, + "args": { + "External id": 153470, "device": 0, "context": 1, "stream": 7, "correlation": 289993347, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289993347, "pid": 0, "tid": 7, "ts": 6303771928094.089, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771905478.625, "dur": 4.320, + "args": { + "External id": 153470, "cbid": 51, 
"correlation": 289993347 + } + }, + { + "ph": "s", "id": 289993347, "pid": 5714, "tid": 6744, "ts": 6303771905478.625, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771928096.169, "dur": 136.129, + "args": { + "External id": 153470, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993348, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993348, "pid": 0, "tid": 7, "ts": 6303771928096.169, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905483.085, "dur": 4.660, + "args": { + "External id": 153470, "cbid": 307, "correlation": 289993348 + } + }, + { + "ph": "s", "id": 289993348, "pid": 5714, "tid": 6744, "ts": 6303771905483.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771928232.970, "dur": 120.322, + "args": { + "External id": 153471, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993370, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993370, "pid": 0, "tid": 7, "ts": 6303771928232.970, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905510.245, "dur": 5.289, + "args": { + "External id": 153471, "cbid": 211, "correlation": 289993370 + } + }, + { + "ph": "s", "id": 289993370, "pid": 5714, "tid": 6744, "ts": 6303771905510.245, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771928353.996, "dur": 121.089, + "args": { + "External id": 153472, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993393, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993393, "pid": 0, "tid": 7, "ts": 6303771928353.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905533.474, "dur": 4.650, + "args": { + "External id": 153472, "cbid": 211, "correlation": 289993393 + } + }, + { + "ph": "s", "id": 289993393, "pid": 5714, "tid": 6744, "ts": 6303771905533.474, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771928475.789, "dur": 122.113, + "args": { + "External id": 153473, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993416, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993416, "pid": 0, "tid": 7, "ts": 6303771928475.789, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905556.424, "dur": 4.430, + "args": { + "External id": 153473, "cbid": 211, "correlation": 289993416 + } + }, + { + "ph": "s", "id": 289993416, "pid": 5714, "tid": 6744, "ts": 6303771905556.424, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303771928598.542, "dur": 80.449, + "args": { + "External id": 153474, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993424, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993424, "pid": 0, "tid": 7, "ts": 6303771928598.542, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905596.214, "dur": 6.520, + "args": { + "External id": 153474, "cbid": 307, "correlation": 289993424 + } + }, + { + "ph": "s", "id": 289993424, "pid": 5714, "tid": 6744, "ts": 6303771905596.214, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771928679.695, "dur": 42.817, + "args": { + "External id": 153489, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993453, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993453, "pid": 0, "tid": 7, "ts": 6303771928679.695, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905760.444, "dur": 9.160, + "args": { + "External id": 153489, "cbid": 307, "correlation": 289993453 + } + }, + { + "ph": "s", "id": 289993453, "pid": 5714, "tid": 6744, "ts": 6303771905760.444, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771928723.216, "dur": 1.952, + "args": { + "External id": 153490, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993461, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289993461, "pid": 0, "tid": 7, "ts": 6303771928723.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905797.494, "dur": 6.080, + "args": { + "External id": 153490, "cbid": 307, "correlation": 289993461 + } + }, + { + "ph": "s", "id": 289993461, "pid": 5714, "tid": 6744, "ts": 6303771905797.494, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771928725.776, "dur": 48.608, + "args": { + "External id": 153491, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993472, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993472, "pid": 0, "tid": 7, "ts": 6303771928725.776, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905833.514, "dur": 6.260, + "args": { + "External id": 153491, "cbid": 307, "correlation": 289993472 + } + }, + { + "ph": "s", "id": 289993472, "pid": 5714, "tid": 6744, "ts": 6303771905833.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771928775.024, "dur": 44.897, + "args": { + "External id": 153492, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993477, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993477, "pid": 0, "tid": 7, "ts": 6303771928775.024, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771905873.384, "dur": 6.680, + "args": { + "External id": 153492, "cbid": 211, "correlation": 289993477 + } + }, + { + "ph": "s", "id": 289993477, "pid": 5714, "tid": 6744, "ts": 6303771905873.384, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771928822.257, "dur": 2.272, + "args": { + "External id": 153510, "device": 0, "context": 1, "stream": 7, "correlation": 289993508, "bytes": 28112, "memory bandwidth (GB/s)": 12.373239436619718 + } + }, + { + "ph": "f", "id": 289993508, "pid": 0, "tid": 7, "ts": 6303771928822.257, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771906176.903, "dur": 11.630, + "args": { + "External id": 153510, "cbid": 41, "correlation": 289993508 + } + }, + { + "ph": "s", "id": 289993508, "pid": 5714, "tid": 6744, "ts": 6303771906176.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906192.983, "dur": 1.820, + "args": { + "External id": 153505, "cbid": 135, "correlation": 289993512 + } + }, + { + "ph": "f", "id": 289993512, "pid": 5714, "tid": 6744, "ts": 6303771906192.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303771928826.449, "dur": 34.784, + "args": { + "External id": 153505, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993516, 
"registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993516, "pid": 0, "tid": 7, "ts": 6303771928826.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771906198.113, "dur": 10.720, + "args": { + "External id": 153505, "cbid": 211, "correlation": 289993516 + } + }, + { + "ph": "s", "id": 289993516, "pid": 5714, "tid": 6744, "ts": 6303771906198.113, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906248.183, "dur": 1.020, + "args": { + "External id": 153498, "cbid": 135, "correlation": 289993527 + } + }, + { + "ph": "f", "id": 289993527, "pid": 5714, "tid": 6744, "ts": 6303771906248.183, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771906251.543, "dur": 1.690, + "args": { + "External id": 153498, "cbid": 147, "correlation": 289993531 + } + }, + { + "ph": "s", "id": 289993531, "pid": 5714, "tid": 6744, "ts": 6303771906251.543, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771906339.143, "dur": 1.049, + "args": { + "External id": 153514, "cbid": 317, "correlation": 289993551 + } + }, + { + "ph": "f", "id": 289993551, "pid": 5714, "tid": 6744, "ts": 6303771906339.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906341.952, "dur": 1.440, + "args": { + "External id": 153514, "cbid": 135, "correlation": 289993553 + } + }, + { + "ph": "f", "id": 289993553, "pid": 5714, "tid": 6744, "ts": 
6303771906341.952, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771906344.732, "dur": 1.160, + "args": { + "External id": 153514, "cbid": 147, "correlation": 289993557 + } + }, + { + "ph": "s", "id": 289993557, "pid": 5714, "tid": 6744, "ts": 6303771906344.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771906360.872, "dur": 0.740, + "args": { + "External id": 153514, "cbid": 409, "correlation": 289993560 + } + }, + { + "ph": "f", "id": 289993560, "pid": 5714, "tid": 6744, "ts": 6303771906360.872, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906365.603, "dur": 0.809, + "args": { + "External id": 153514, "cbid": 135, "correlation": 289993563 + } + }, + { + "ph": "f", "id": 289993563, "pid": 5714, "tid": 6744, "ts": 6303771906365.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771906366.572, "dur": 0.811, + "args": { + "External id": 153514, "cbid": 147, "correlation": 289993564 + } + }, + { + "ph": "s", "id": 289993564, "pid": 5714, "tid": 6744, "ts": 6303771906366.572, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771928863.346, "dur": 9149.961, + "args": { + "External id": 153514, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289993566, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289993566, "pid": 0, "tid": 20, "ts": 6303771928863.346, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771906368.732, "dur": 10.871, + "args": { + "External id": 153514, "cbid": 430, "correlation": 289993566 + } + }, + { + "ph": "s", "id": 289993566, "pid": 5714, "tid": 6744, "ts": 6303771906368.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906380.563, "dur": 0.520, + "args": { + "External id": 153514, "cbid": 135, "correlation": 289993568 + } + }, + { + "ph": "f", "id": 289993568, "pid": 5714, "tid": 6744, "ts": 6303771906380.563, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771906381.192, "dur": 0.551, + "args": { + "External id": 153514, "cbid": 147, "correlation": 289993569 + } + }, + { + "ph": "s", "id": 289993569, "pid": 5714, "tid": 6744, "ts": 6303771906381.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906383.132, "dur": 0.900, + "args": { + "External id": 153514, "cbid": 135, "correlation": 289993572 + } + }, + { + "ph": "f", "id": 289993572, "pid": 5714, "tid": 6744, "ts": 6303771906383.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906391.892, "dur": 0.440, + "args": { + "External 
id": 153514, "cbid": 135, "correlation": 289993579 + } + }, + { + "ph": "f", "id": 289993579, "pid": 5714, "tid": 6744, "ts": 6303771906391.892, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771906417.732, "dur": 1.220, + "args": { + "External id": 153516, "cbid": 147, "correlation": 289993584 + } + }, + { + "ph": "s", "id": 289993584, "pid": 5714, "tid": 6744, "ts": 6303771906417.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906435.132, "dur": 0.750, + "args": { + "External id": 153498, "cbid": 135, "correlation": 289993599 + } + }, + { + "ph": "f", "id": 289993599, "pid": 5714, "tid": 6744, "ts": 6303771906435.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906634.162, "dur": 1.070, + "args": { + "External id": 153498, "cbid": 135, "correlation": 289993612 + } + }, + { + "ph": "f", "id": 289993612, "pid": 5714, "tid": 6744, "ts": 6303771906634.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771906737.352, "dur": 3.030, + "args": { + "External id": 153526, "cbid": 147, "correlation": 289993623 + } + }, + { + "ph": "s", "id": 289993623, "pid": 5714, "tid": 6744, "ts": 6303771906737.352, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771906848.451, "dur": 1.131, + "args": { + "External id": 153540, "cbid": 317, "correlation": 289993664 + } + }, + { + "ph": "f", "id": 289993664, "pid": 5714, "tid": 6744, "ts": 6303771906848.451, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771906858.082, "dur": 2.100, + "args": { + "External id": 153541, "cbid": 138, "correlation": 289993667 + } + }, + { + "ph": "f", "id": 289993667, "pid": 5714, "tid": 6744, "ts": 6303771906858.082, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771928864.370, "dur": 1.567, + "args": { + "External id": 153545, "device": 0, "context": 1, "stream": 7, "correlation": 289993678, "bytes": 7224, "memory bandwidth (GB/s)": 4.610082961072112 + } + }, + { + "ph": "f", "id": 289993678, "pid": 0, "tid": 7, "ts": 6303771928864.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771906880.741, "dur": 11.510, + "args": { + "External id": 153545, "cbid": 41, "correlation": 289993678 + } + }, + { + "ph": "s", "id": 289993678, "pid": 5714, "tid": 6744, "ts": 6303771906880.741, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906896.241, "dur": 1.630, + "args": { + "External id": 153540, "cbid": 135, "correlation": 289993682 + } + }, + { + "ph": "f", "id": 289993682, "pid": 5714, "tid": 6744, "ts": 6303771906896.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771928867.857, "dur": 359.045, + "args": { + "External id": 153540, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993686, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289993686, "pid": 0, "tid": 7, "ts": 6303771928867.857, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771906900.331, "dur": 11.080, + "args": { + "External id": 153540, "cbid": 211, "correlation": 289993686 + } + }, + { + "ph": "s", "id": 289993686, "pid": 5714, "tid": 6744, "ts": 6303771906900.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771906996.791, "dur": 1.350, + "args": { + "External id": 153526, "cbid": 135, "correlation": 289993697 + } + }, + { + "ph": "f", "id": 289993697, "pid": 5714, "tid": 6744, "ts": 6303771906996.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771907001.691, "dur": 1.140, + "args": { + "External id": 153526, "cbid": 147, "correlation": 289993701 + } + }, + { + "ph": "s", "id": 289993701, "pid": 5714, "tid": 6744, "ts": 6303771907001.691, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771907004.591, "dur": 0.800, + "args": { + "External id": 153526, "cbid": 147, "correlation": 289993705 + } + }, + { + "ph": "s", "id": 289993705, "pid": 5714, "tid": 6744, "ts": 6303771907004.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771930076.959, "dur": 458.502, + "args": { + "External id": 153559, "queued": 0, "device": 0, "context": 1, "stream": 
17, "correlation": 289993729, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289993729, "pid": 0, "tid": 17, "ts": 6303771930076.959, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907148.801, "dur": 11.180, + "args": { + "External id": 153559, "cbid": 211, "correlation": 289993729 + } + }, + { + "ph": "s", "id": 289993729, "pid": 5714, "tid": 6744, "ts": 6303771907148.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771930551.237, "dur": 118.849, + "args": { + "External id": 153575, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289993742, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289993742, "pid": 0, "tid": 17, "ts": 6303771930551.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907264.070, "dur": 9.131, + "args": { + "External id": 153575, "cbid": 211, "correlation": 289993742 + } + }, + { + "ph": "s", "id": 289993742, "pid": 5714, "tid": 6744, "ts": 6303771907264.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907303.750, "dur": 1.271, + "args": { + "External id": 153526, "cbid": 135, "correlation": 289993752 + } + }, + { + "ph": "f", "id": 289993752, "pid": 5714, "tid": 6744, "ts": 6303771907303.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771907307.081, "dur": 1.080, + "args": { + "External id": 153526, "cbid": 147, "correlation": 289993756 + } + }, + { + "ph": "s", "id": 289993756, "pid": 5714, "tid": 6744, "ts": 6303771907307.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771907358.320, "dur": 0.900, + "args": { + "External id": 153577, "cbid": 317, "correlation": 289993769 + } + }, + { + "ph": "f", "id": 289993769, "pid": 5714, "tid": 6744, "ts": 6303771907358.320, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907360.980, "dur": 1.160, + "args": { + "External id": 153577, "cbid": 135, "correlation": 289993771 + } + }, + { + "ph": "f", "id": 289993771, "pid": 5714, "tid": 6744, "ts": 6303771907360.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771907363.480, "dur": 1.170, + "args": { + "External id": 153577, "cbid": 147, "correlation": 289993775 + } + }, + { + "ph": "s", "id": 289993775, "pid": 5714, "tid": 6744, "ts": 6303771907363.480, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771907377.900, "dur": 0.620, + "args": { + "External id": 153577, "cbid": 409, "correlation": 289993778 + } + }, + { + "ph": "f", "id": 289993778, "pid": 5714, "tid": 6744, "ts": 6303771907377.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907382.330, "dur": 0.730, + "args": { + "External id": 153577, "cbid": 135, "correlation": 289993781 + } + }, + { + "ph": "f", "id": 289993781, "pid": 5714, "tid": 6744, "ts": 6303771907382.330, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771907383.230, "dur": 0.820, + "args": { + "External id": 153577, "cbid": 147, "correlation": 289993782 + } + }, + { + "ph": "s", "id": 289993782, "pid": 5714, "tid": 6744, "ts": 6303771907383.230, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771938014.363, "dur": 4333.202, + "args": { + "External id": 153577, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289993784, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289993784, "pid": 0, "tid": 20, "ts": 6303771938014.363, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771907385.040, "dur": 9.930, + "args": { + "External id": 153577, "cbid": 430, "correlation": 289993784 + } + }, + { + "ph": "s", "id": 289993784, "pid": 5714, "tid": 6744, "ts": 6303771907385.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907395.950, "dur": 0.400, + "args": { + "External id": 153577, "cbid": 135, "correlation": 289993786 + } + }, + { + "ph": "f", "id": 289993786, "pid": 5714, "tid": 6744, "ts": 6303771907395.950, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771907396.460, "dur": 0.570, + "args": { + "External id": 153577, "cbid": 147, "correlation": 289993787 + } + }, + { + "ph": "s", "id": 289993787, "pid": 5714, "tid": 6744, "ts": 6303771907396.460, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907398.470, "dur": 0.720, + "args": { + "External id": 153577, "cbid": 135, "correlation": 289993790 + } + }, + { + "ph": "f", "id": 289993790, "pid": 5714, "tid": 6744, "ts": 6303771907398.470, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907406.550, "dur": 0.430, + "args": { + "External id": 
153577, "cbid": 135, "correlation": 289993797 + } + }, + { + "ph": "f", "id": 289993797, "pid": 5714, "tid": 6744, "ts": 6303771907406.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771907432.990, "dur": 0.990, + "args": { + "External id": 153579, "cbid": 147, "correlation": 289993802 + } + }, + { + "ph": "s", "id": 289993802, "pid": 5714, "tid": 6744, "ts": 6303771907432.990, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771907458.090, "dur": 0.880, + "args": { + "External id": 153526, "cbid": 135, "correlation": 289993817 + } + }, + { + "ph": "f", "id": 289993817, "pid": 5714, "tid": 6744, "ts": 6303771907458.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771929344.055, "dur": 1870.934, + "args": { + "External id": 153581, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993842, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993842, "pid": 0, "tid": 7, "ts": 6303771929344.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907595.970, "dur": 10.780, + "args": { + "External id": 153581, "cbid": 211, "correlation": 289993842 + } + }, + { + "ph": "s", "id": 289993842, "pid": 5714, "tid": 6744, "ts": 6303771907595.970, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303771931218.669, "dur": 564.390, + "args": { + "External id": 153582, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993865, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289993865, "pid": 0, "tid": 7, "ts": 6303771931218.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907652.750, "dur": 6.150, + "args": { + "External id": 153582, "cbid": 307, "correlation": 289993865 + } + }, + { + "ph": "s", "id": 289993865, "pid": 5714, "tid": 6744, "ts": 6303771907652.750, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771907693.380, "dur": 0.509, + "args": { + "External id": 153583, "cbid": 200, "correlation": 289993888 + } + }, + { + "ph": "f", "id": 289993888, "pid": 5714, "tid": 6744, "ts": 6303771907693.380, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771931818.644, "dur": 58.336, + "args": { + "External id": 153583, "device": 0, "context": 1, "stream": 7, 
"correlation": 289993891, "bytes": 1536, "memory bandwidth (GB/s)": 0.026330224904004388 + } + }, + { + "ph": "f", "id": 289993891, "pid": 0, "tid": 7, "ts": 6303771931818.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771907695.549, "dur": 6.520, + "args": { + "External id": 153583, "cbid": 51, "correlation": 289993891 + } + }, + { + "ph": "s", "id": 289993891, "pid": 5714, "tid": 6744, "ts": 6303771907695.549, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771931936.565, "dur": 373.572, + "args": { + "External id": 153583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993892, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993892, "pid": 0, "tid": 7, "ts": 6303771931936.565, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907702.280, "dur": 5.880, + "args": { + "External id": 153583, "cbid": 307, "correlation": 289993892 + } + }, + { + "ph": "s", "id": 289993892, "pid": 5714, "tid": 6744, "ts": 6303771907702.280, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771907735.529, "dur": 0.340, + "args": { + "External id": 153584, "cbid": 200, "correlation": 289993917 + } + }, + { + "ph": "f", "id": 289993917, "pid": 5714, "tid": 6744, "ts": 6303771907735.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771932311.385, "dur": 1.248, + "args": { + "External id": 153584, "device": 0, "context": 1, "stream": 7, "correlation": 289993920, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 289993920, "pid": 0, "tid": 7, "ts": 6303771932311.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771907736.940, "dur": 4.569, + "args": { + "External id": 153584, "cbid": 51, "correlation": 289993920 + } + }, + { + "ph": "s", "id": 289993920, "pid": 5714, "tid": 6744, "ts": 6303771907736.940, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771932314.105, "dur": 356.260, + "args": { + "External id": 153584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993921, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993921, "pid": 0, "tid": 7, "ts": 6303771932314.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907741.640, "dur": 5.000, + "args": { + "External id": 153584, "cbid": 307, "correlation": 289993921 + } + }, + { + "ph": "s", "id": 289993921, "pid": 5714, "tid": 6744, "ts": 6303771907741.640, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771907779.059, "dur": 0.290, + "args": { + "External id": 153585, "cbid": 200, "correlation": 289993946 + } + }, + { + "ph": "f", "id": 289993946, "pid": 5714, "tid": 6744, "ts": 6303771907779.059, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771932671.069, "dur": 360.581, + "args": { + "External id": 153585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993949, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993949, "pid": 0, "tid": 7, "ts": 6303771932671.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907780.369, "dur": 5.770, + "args": { + "External id": 153585, "cbid": 307, "correlation": 289993949 + } + }, + { + "ph": "s", "id": 289993949, "pid": 5714, "tid": 6744, "ts": 6303771907780.369, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771907807.249, "dur": 0.260, + "args": { + "External id": 153586, "cbid": 200, "correlation": 289993974 + } + }, + { + "ph": "f", "id": 289993974, "pid": 5714, "tid": 6744, "ts": 6303771907807.249, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771933032.802, "dur": 1.248, + "args": { + "External id": 153586, "device": 0, "context": 1, "stream": 7, "correlation": 289993977, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 289993977, "pid": 0, "tid": 7, "ts": 6303771933032.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771907808.439, "dur": 4.380, + "args": { + "External id": 153586, "cbid": 51, "correlation": 289993977 + } + }, + { + "ph": "s", "id": 289993977, "pid": 5714, "tid": 6744, "ts": 6303771907808.439, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771933035.266, "dur": 356.068, + "args": { + "External id": 153586, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289993978, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289993978, "pid": 0, "tid": 7, "ts": 6303771933035.266, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907812.939, "dur": 4.790, + "args": { + "External id": 153586, "cbid": 307, "correlation": 289993978 + } + }, + { + "ph": "s", "id": 289993978, "pid": 5714, "tid": 6744, "ts": 6303771907812.939, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771907840.749, "dur": 0.270, + "args": { + "External id": 153587, "cbid": 200, "correlation": 289994003 + } + }, + { + "ph": "f", "id": 289994003, "pid": 5714, "tid": 6744, "ts": 6303771907840.749, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771933391.974, "dur": 358.724, + "args": { + "External id": 153587, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994006, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994006, "pid": 0, "tid": 7, "ts": 6303771933391.974, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907842.019, "dur": 4.940, + "args": { + "External id": 153587, "cbid": 307, "correlation": 289994006 + } + }, + { + "ph": "s", "id": 289994006, "pid": 5714, "tid": 6744, "ts": 6303771907842.019, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771933751.370, "dur": 86.529, + "args": { + "External id": 153588, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994019, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994019, "pid": 0, "tid": 7, "ts": 6303771933751.370, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907882.329, "dur": 5.820, + "args": { + "External id": 153588, "cbid": 307, "correlation": 289994019 + } + }, + { + "ph": "s", "id": 289994019, "pid": 5714, "tid": 6744, "ts": 6303771907882.329, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303771933838.571, "dur": 4.160, + "args": { + "External id": 153589, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994027, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289994027, "pid": 0, "tid": 7, "ts": 6303771933838.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907911.729, "dur": 5.160, + "args": { + "External id": 153589, "cbid": 307, "correlation": 289994027 + } + }, + { + "ph": "s", "id": 289994027, "pid": 5714, "tid": 6744, "ts": 6303771907911.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303771933843.915, "dur": 113.537, + "args": { + "External id": 153590, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994035, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994035, "pid": 0, "tid": 7, "ts": 6303771933843.915, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771907942.129, "dur": 4.940, + "args": { + "External id": 153590, "cbid": 307, "correlation": 289994035 + } + }, + { + "ph": "s", "id": 289994035, "pid": 5714, "tid": 6744, "ts": 6303771907942.129, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908123.828, "dur": 0.511, + "args": { + "External id": 153609, "cbid": 200, "correlation": 289994081 + } + }, + { + "ph": "f", "id": 289994081, "pid": 5714, "tid": 6744, "ts": 6303771908123.828, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771933958.636, "dur": 0.768, + "args": { + "External id": 153609, "device": 0, "context": 1, "stream": 7, 
"correlation": 289994084, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 289994084, "pid": 0, "tid": 7, "ts": 6303771933958.636, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771908125.868, "dur": 6.931, + "args": { + "External id": 153609, "cbid": 51, "correlation": 289994084 + } + }, + { + "ph": "s", "id": 289994084, "pid": 5714, "tid": 6744, "ts": 6303771908125.868, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771933960.556, "dur": 143.746, + "args": { + "External id": 153609, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994085, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994085, "pid": 0, "tid": 7, "ts": 6303771933960.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908133.039, "dur": 7.809, + "args": { + "External id": 153609, "cbid": 307, "correlation": 289994085 + } + }, + { + "ph": "s", "id": 289994085, "pid": 5714, "tid": 6744, "ts": 6303771908133.039, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771934105.006, "dur": 141.762, + "args": { + "External id": 153610, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994107, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994107, "pid": 0, "tid": 7, "ts": 6303771934105.006, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908166.319, "dur": 5.700, + "args": { + "External id": 153610, "cbid": 211, "correlation": 289994107 + } + }, + { + "ph": "s", "id": 289994107, "pid": 5714, "tid": 6744, "ts": 6303771908166.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908245.028, "dur": 0.430, + "args": { + "External id": 153611, "cbid": 200, "correlation": 289994125 + } + }, + { + "ph": "f", "id": 289994125, "pid": 5714, "tid": 6744, "ts": 6303771908245.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908245.568, "dur": 0.190, + "args": { + "External id": 153611, "cbid": 200, "correlation": 289994126 + } + }, + { + "ph": "f", "id": 289994126, "pid": 5714, "tid": 6744, "ts": 6303771908245.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908264.968, "dur": 0.230, + "args": { + "External id": 153611, "cbid": 200, "correlation": 289994144 + } + }, + { + "ph": "f", "id": 289994144, "pid": 5714, "tid": 6744, "ts": 6303771908264.968, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771934247.440, "dur": 92.705, + "args": { + "External id": 153611, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994145, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994145, "pid": 0, "tid": 7, "ts": 6303771934247.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908266.428, "dur": 9.040, + "args": { + "External id": 153611, "cbid": 211, "correlation": 289994145 + } + }, + { + "ph": "s", "id": 289994145, "pid": 5714, "tid": 6744, "ts": 6303771908266.428, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908276.208, "dur": 0.910, + "args": { + "External id": 153611, "cbid": 273, "correlation": 289994147 + } + }, + { + "ph": "f", "id": 289994147, "pid": 5714, "tid": 6744, "ts": 6303771908276.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771934340.753, "dur": 1010.315, + "args": { + "External id": 153611, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994148, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289994148, "pid": 0, "tid": 7, "ts": 6303771934340.753, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908277.488, "dur": 4.060, + "args": { + "External id": 153611, "cbid": 211, "correlation": 289994148 + } + }, + { + "ph": "s", "id": 289994148, "pid": 5714, "tid": 6744, "ts": 6303771908277.488, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771935351.708, "dur": 73.986, + "args": { + "External id": 153611, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994150, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289994150, "pid": 0, "tid": 7, "ts": 6303771935351.708, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908282.068, "dur": 3.710, + "args": { + "External id": 153611, "cbid": 211, "correlation": 289994150 + } + }, + { + "ph": "s", "id": 289994150, "pid": 5714, "tid": 6744, "ts": 6303771908282.068, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771935426.398, "dur": 49.536, + "args": { + "External id": 153622, "device": 0, "context": 1, "stream": 7, "correlation": 289994172, "bytes": 25165824, "memory bandwidth (GB/s)": 508.031007751938 + } + }, + { + "ph": "f", "id": 289994172, "pid": 0, "tid": 7, "ts": 6303771935426.398, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771908420.578, "dur": 17.900, + "args": { + "External id": 153622, "cbid": 41, "correlation": 289994172 + } + }, + { + "ph": "s", "id": 289994172, "pid": 5714, "tid": 6744, "ts": 6303771908420.578, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771935476.638, "dur": 34.336, + "args": { + "External id": 153619, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994190, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994190, "pid": 0, "tid": 7, "ts": 6303771935476.638, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908532.448, "dur": 8.120, + "args": { + "External id": 153619, "cbid": 307, "correlation": 289994190 + } + }, + { + "ph": "s", "id": 289994190, "pid": 5714, "tid": 6744, "ts": 6303771908532.448, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771935511.678, "dur": 40.001, + "args": { + "External id": 153629, "device": 0, "context": 1, "stream": 7, "correlation": 289994205, "bytes": 25165824, "memory bandwidth (GB/s)": 629.1298717532062 + } + }, + { + "ph": "f", "id": 289994205, "pid": 0, "tid": 7, "ts": 6303771935511.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771908598.987, "dur": 14.140, + "args": { + "External id": 153629, "cbid": 41, "correlation": 289994205 + } + }, + { + "ph": "s", "id": 289994205, "pid": 5714, "tid": 6744, "ts": 6303771908598.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771935552.383, "dur": 30.176, + "args": { + "External id": 153626, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994223, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994223, "pid": 0, "tid": 7, "ts": 6303771935552.383, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908700.727, "dur": 7.840, + "args": { + "External id": 153626, "cbid": 307, "correlation": 289994223 + } + }, + { + "ph": "s", "id": 289994223, "pid": 5714, "tid": 6744, "ts": 6303771908700.727, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908822.767, "dur": 0.530, + "args": { + "External id": 153634, "cbid": 200, "correlation": 289994253 + } + }, + { + "ph": "f", "id": 289994253, "pid": 5714, "tid": 6744, "ts": 6303771908822.767, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771935583.807, "dur": 1.312, + "args": { + "External id": 153634, "device": 0, "context": 1, "stream": 7, "correlation": 289994256, "bytes": 576, "memory bandwidth (GB/s)": 0.43902439024390244 + } + }, + { + "ph": "f", "id": 289994256, "pid": 0, "tid": 7, "ts": 6303771935583.807, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771908824.977, "dur": 7.470, + "args": { + "External id": 153634, "cbid": 51, "correlation": 289994256 + } + }, + { + "ph": "s", "id": 289994256, "pid": 5714, "tid": 6744, "ts": 6303771908824.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771935586.623, "dur": 147.266, + "args": { + "External id": 153634, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994257, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994257, "pid": 0, "tid": 7, "ts": 6303771935586.623, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908832.697, "dur": 7.770, + "args": { + "External id": 153634, "cbid": 307, "correlation": 289994257 + } + }, + { + "ph": "s", "id": 289994257, "pid": 5714, "tid": 6744, "ts": 6303771908832.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908866.507, "dur": 0.290, + "args": { + "External id": 153635, "cbid": 200, "correlation": 289994282 + } + }, + { + "ph": "f", "id": 289994282, "pid": 5714, "tid": 6744, "ts": 6303771908866.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771935735.041, "dur": 1.120, + "args": { + "External id": 153635, "device": 0, "context": 1, "stream": 7, "correlation": 289994285, "bytes": 576, "memory bandwidth (GB/s)": 0.5142857142857142 + } + }, + { + "ph": "f", "id": 289994285, "pid": 0, "tid": 7, "ts": 6303771935735.041, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771908867.737, "dur": 4.560, + "args": { + "External id": 153635, "cbid": 51, "correlation": 289994285 + } + }, + { + "ph": "s", "id": 289994285, 
"pid": 5714, "tid": 6744, "ts": 6303771908867.737, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771935737.601, "dur": 156.705, + "args": { + "External id": 153635, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994286, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994286, "pid": 0, "tid": 7, "ts": 6303771935737.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908872.447, "dur": 5.150, + "args": { + "External id": 153635, "cbid": 307, "correlation": 289994286 + } + }, + { + "ph": "s", "id": 289994286, "pid": 5714, "tid": 6744, "ts": 6303771908872.447, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771908901.997, "dur": 0.270, + "args": { + "External id": 153636, "cbid": 200, "correlation": 289994311 + } + }, + { + "ph": "f", "id": 289994311, "pid": 5714, "tid": 6744, "ts": 6303771908901.997, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771935906.243, "dur": 4.256, + "args": { + "External id": 153636, "device": 0, "context": 1, "stream": 7, "correlation": 289994314, "bytes": 576, "memory bandwidth (GB/s)": 0.13533834586466165 + } + }, + { + "ph": "f", "id": 289994314, "pid": 0, "tid": 7, "ts": 6303771935906.243, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771908903.207, "dur": 4.560, + "args": { + 
"External id": 153636, "cbid": 51, "correlation": 289994314 + } + }, + { + "ph": "s", "id": 289994314, "pid": 5714, "tid": 6744, "ts": 6303771908903.207, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771935926.211, "dur": 339.428, + "args": { + "External id": 153636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994315, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994315, "pid": 0, "tid": 7, "ts": 6303771935926.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908907.907, "dur": 5.030, + "args": { + "External id": 153636, "cbid": 307, "correlation": 289994315 + } + }, + { + "ph": "s", "id": 289994315, "pid": 5714, "tid": 6744, "ts": 6303771908907.907, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771936266.311, "dur": 346.948, + "args": { + "External id": 153637, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994337, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994337, "pid": 0, "tid": 7, "ts": 6303771936266.311, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908938.577, "dur": 5.650, + "args": { + "External id": 153637, "cbid": 211, "correlation": 289994337 + } + }, + { + "ph": "s", "id": 289994337, "pid": 5714, "tid": 6744, "ts": 6303771908938.577, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771936613.963, "dur": 143.201, + "args": { + "External id": 153638, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994360, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994360, "pid": 0, "tid": 7, "ts": 6303771936613.963, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908962.697, "dur": 4.990, + "args": { + "External id": 153638, "cbid": 211, "correlation": 289994360 + } + }, + { + "ph": "s", "id": 289994360, "pid": 5714, "tid": 6744, "ts": 6303771908962.697, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771936757.804, "dur": 142.658, + "args": { + "External id": 153639, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994383, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994383, "pid": 0, "tid": 7, "ts": 6303771936757.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771908984.797, "dur": 4.540, + "args": { + "External id": 153639, "cbid": 211, "correlation": 289994383 + } + }, + { + "ph": "s", "id": 289994383, "pid": 5714, "tid": 6744, "ts": 6303771908984.797, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303771936901.198, "dur": 140.898, + "args": { + "External id": 153640, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994391, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994391, "pid": 0, "tid": 7, "ts": 6303771936901.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771909022.977, "dur": 5.629, + "args": { + "External id": 153640, "cbid": 307, "correlation": 289994391 + } + }, + { + "ph": "s", "id": 289994391, "pid": 5714, "tid": 6744, "ts": 6303771909022.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771937042.704, "dur": 169.314, + "args": { + "External id": 153655, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994420, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994420, "pid": 0, "tid": 7, "ts": 6303771937042.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771909182.636, "dur": 8.980, + "args": { + "External id": 153655, "cbid": 307, "correlation": 289994420 + } + }, + { + "ph": "s", "id": 289994420, "pid": 5714, "tid": 6744, "ts": 6303771909182.636, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771937212.722, "dur": 120.513, + "args": { + "External id": 153656, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994428, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289994428, "pid": 0, "tid": 7, "ts": 6303771937212.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771909216.876, "dur": 5.410, + "args": { + "External id": 153656, "cbid": 307, "correlation": 289994428 + } + }, + { + "ph": "s", "id": 289994428, "pid": 5714, "tid": 6744, "ts": 6303771909216.876, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771937333.971, "dur": 184.066, + "args": { + "External id": 153657, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994439, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994439, "pid": 0, "tid": 7, "ts": 6303771937333.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771909250.146, "dur": 5.180, + "args": { + "External id": 153657, "cbid": 307, "correlation": 289994439 + } + }, + { + "ph": "s", "id": 289994439, "pid": 5714, "tid": 6744, "ts": 6303771909250.146, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771937518.773, "dur": 156.610, + "args": { + "External id": 153658, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994444, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994444, "pid": 0, "tid": 7, "ts": 6303771937518.773, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771909290.396, "dur": 15.710, + "args": { + "External id": 153658, "cbid": 211, "correlation": 289994444 + } + }, + { + "ph": "s", "id": 289994444, "pid": 5714, "tid": 6744, "ts": 6303771909290.396, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771909473.845, "dur": 2.600, + "args": { + "External id": 153664, "cbid": 147, "correlation": 289994461 + } + }, + { + "ph": "s", "id": 289994461, "pid": 5714, "tid": 6744, "ts": 6303771909473.845, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771909577.595, "dur": 2.340, + "args": { + "External id": 153672, "cbid": 138, "correlation": 289994476 + } + }, + { + "ph": "f", "id": 289994476, "pid": 5714, "tid": 6744, "ts": 6303771909577.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771938019.931, "dur": 2.720, + "args": { + "External id": 153676, "device": 0, "context": 1, "stream": 7, "correlation": 289994487, "bytes": 28112, "memory bandwidth (GB/s)": 10.33529411764706 + } + }, + { + "ph": "f", "id": 289994487, "pid": 0, "tid": 7, "ts": 6303771938019.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771909601.375, "dur": 11.850, + "args": { + "External id": 153676, "cbid": 41, "correlation": 289994487 + } + }, + { + "ph": "s", "id": 289994487, "pid": 5714, "tid": 6744, "ts": 6303771909601.375, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909617.455, "dur": 1.780, + "args": { + "External id": 153671, "cbid": 135, "correlation": 289994491 + } + }, + { + "ph": "f", "id": 289994491, "pid": 5714, "tid": 6744, "ts": 6303771909617.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303771938024.635, "dur": 34.721, + "args": { + "External id": 153671, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994495, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994495, "pid": 0, "tid": 7, "ts": 6303771938024.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771909623.085, "dur": 10.360, + "args": { + "External id": 153671, "cbid": 211, "correlation": 289994495 + } + }, + { + "ph": "s", "id": 289994495, "pid": 5714, "tid": 6744, "ts": 6303771909623.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909671.785, "dur": 0.990, + "args": { + "External id": 153664, "cbid": 135, "correlation": 289994506 + } + }, + { + "ph": "f", "id": 289994506, "pid": 5714, "tid": 6744, "ts": 6303771909671.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771909674.785, "dur": 1.200, + "args": { + "External id": 153664, "cbid": 147, "correlation": 289994510 + } + }, + { + "ph": "s", "id": 289994510, "pid": 5714, "tid": 6744, "ts": 
6303771909674.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771909744.675, "dur": 1.040, + "args": { + "External id": 153680, "cbid": 317, "correlation": 289994530 + } + }, + { + "ph": "f", "id": 289994530, "pid": 5714, "tid": 6744, "ts": 6303771909744.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909748.245, "dur": 1.590, + "args": { + "External id": 153680, "cbid": 135, "correlation": 289994532 + } + }, + { + "ph": "f", "id": 289994532, "pid": 5714, "tid": 6744, "ts": 6303771909748.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771909751.145, "dur": 0.780, + "args": { + "External id": 153680, "cbid": 147, "correlation": 289994536 + } + }, + { + "ph": "s", "id": 289994536, "pid": 5714, "tid": 6744, "ts": 6303771909751.145, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771909766.215, "dur": 0.690, + "args": { + "External id": 153680, "cbid": 409, "correlation": 289994539 + } + }, + { + "ph": "f", "id": 289994539, "pid": 5714, "tid": 6744, "ts": 6303771909766.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909771.015, "dur": 0.860, + "args": { + "External id": 153680, "cbid": 135, "correlation": 289994542 + } + }, + { + "ph": "f", "id": 289994542, "pid": 5714, "tid": 6744, "ts": 6303771909771.015, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771909772.045, "dur": 1.000, + "args": { + "External id": 153680, 
"cbid": 147, "correlation": 289994543 + } + }, + { + "ph": "s", "id": 289994543, "pid": 5714, "tid": 6744, "ts": 6303771909772.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771942349.357, "dur": 7842.107, + "args": { + "External id": 153680, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289994545, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289994545, "pid": 0, "tid": 20, "ts": 6303771942349.357, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771909774.065, "dur": 10.150, + "args": { + "External id": 153680, "cbid": 430, "correlation": 289994545 + } + }, + { + "ph": "s", "id": 289994545, "pid": 5714, "tid": 6744, "ts": 6303771909774.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909785.255, "dur": 0.390, + "args": { + "External id": 153680, "cbid": 135, "correlation": 289994547 + } + }, + { + "ph": "f", "id": 289994547, "pid": 5714, "tid": 6744, "ts": 6303771909785.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771909785.775, "dur": 0.560, + "args": { + "External id": 153680, "cbid": 147, "correlation": 289994548 + } + }, 
+ { + "ph": "s", "id": 289994548, "pid": 5714, "tid": 6744, "ts": 6303771909785.775, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909787.825, "dur": 0.920, + "args": { + "External id": 153680, "cbid": 135, "correlation": 289994551 + } + }, + { + "ph": "f", "id": 289994551, "pid": 5714, "tid": 6744, "ts": 6303771909787.825, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909796.455, "dur": 0.440, + "args": { + "External id": 153680, "cbid": 135, "correlation": 289994558 + } + }, + { + "ph": "f", "id": 289994558, "pid": 5714, "tid": 6744, "ts": 6303771909796.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771909822.805, "dur": 0.850, + "args": { + "External id": 153682, "cbid": 147, "correlation": 289994563 + } + }, + { + "ph": "s", "id": 289994563, "pid": 5714, "tid": 6744, "ts": 6303771909822.805, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771909840.035, "dur": 0.780, + "args": { + "External id": 153664, "cbid": 135, "correlation": 289994578 + } + }, + { + "ph": "f", "id": 289994578, "pid": 5714, "tid": 6744, "ts": 6303771909840.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910027.044, "dur": 1.140, + "args": { + "External id": 153664, "cbid": 135, "correlation": 289994591 + } + }, + { + "ph": "f", "id": 289994591, "pid": 5714, "tid": 6744, "ts": 6303771910027.044, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910130.814, 
"dur": 2.940, + "args": { + "External id": 153692, "cbid": 147, "correlation": 289994602 + } + }, + { + "ph": "s", "id": 289994602, "pid": 5714, "tid": 6744, "ts": 6303771910130.814, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771910240.204, "dur": 1.130, + "args": { + "External id": 153706, "cbid": 317, "correlation": 289994643 + } + }, + { + "ph": "f", "id": 289994643, "pid": 5714, "tid": 6744, "ts": 6303771910240.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771910249.804, "dur": 2.140, + "args": { + "External id": 153707, "cbid": 138, "correlation": 289994646 + } + }, + { + "ph": "f", "id": 289994646, "pid": 5714, "tid": 6744, "ts": 6303771910249.804, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771942348.429, "dur": 1.632, + "args": { + "External id": 153711, "device": 0, "context": 1, "stream": 7, "correlation": 289994657, "bytes": 7224, "memory bandwidth (GB/s)": 4.426470588235294 + } + }, + { + "ph": "f", "id": 289994657, "pid": 0, "tid": 7, "ts": 6303771942348.429, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771910273.004, "dur": 12.150, + "args": { + "External id": 153711, "cbid": 41, "correlation": 289994657 + } + }, + { + "ph": "s", "id": 289994657, "pid": 5714, "tid": 6744, "ts": 6303771910273.004, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910289.604, "dur": 1.560, + "args": { + "External id": 153706, "cbid": 135, "correlation": 289994661 + } + }, + { + "ph": "f", "id": 289994661, "pid": 5714, "tid": 6744, "ts": 
6303771910289.604, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771942352.045, "dur": 11.329, + "args": { + "External id": 153706, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994665, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994665, "pid": 0, "tid": 7, "ts": 6303771942352.045, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771910293.504, "dur": 18.900, + "args": { + "External id": 153706, "cbid": 211, "correlation": 289994665 + } + }, + { + "ph": "s", "id": 289994665, "pid": 5714, "tid": 6744, "ts": 6303771910293.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910401.703, "dur": 1.380, + "args": { + "External id": 153692, "cbid": 135, "correlation": 289994676 + } + }, + { + "ph": "f", "id": 289994676, "pid": 5714, "tid": 6744, "ts": 6303771910401.703, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910406.243, "dur": 1.231, + "args": { + "External id": 153692, "cbid": 147, "correlation": 289994680 + } + }, + { + "ph": "s", "id": 289994680, "pid": 5714, "tid": 6744, "ts": 6303771910406.243, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910409.274, "dur": 0.780, + "args": { + "External id": 153692, "cbid": 147, "correlation": 289994684 + } + }, + { + "ph": 
"s", "id": 289994684, "pid": 5714, "tid": 6744, "ts": 6303771910409.274, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771942397.070, "dur": 28.992, + "args": { + "External id": 153725, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289994708, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289994708, "pid": 0, "tid": 17, "ts": 6303771942397.070, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771910557.243, "dur": 11.600, + "args": { + "External id": 153725, "cbid": 211, "correlation": 289994708 + } + }, + { + "ph": "s", "id": 289994708, "pid": 5714, "tid": 6744, "ts": 6303771910557.243, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771942438.702, "dur": 283.844, + "args": { + "External id": 153741, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289994721, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289994721, "pid": 0, "tid": 17, "ts": 6303771942438.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771910672.043, "dur": 9.330, + "args": { + "External id": 153741, "cbid": 211, "correlation": 289994721 + } + }, + { + "ph": "s", "id": 289994721, "pid": 5714, "tid": 6744, "ts": 6303771910672.043, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910704.753, "dur": 1.270, + "args": { + "External id": 153692, "cbid": 135, "correlation": 289994731 + } + }, + { + "ph": "f", "id": 289994731, "pid": 5714, "tid": 6744, "ts": 6303771910704.753, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910707.853, "dur": 1.200, + "args": { + "External id": 153692, "cbid": 147, "correlation": 289994735 + } + }, + { + "ph": "s", "id": 289994735, "pid": 5714, "tid": 6744, "ts": 6303771910707.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771910759.303, "dur": 0.900, + "args": { + "External id": 153743, "cbid": 317, "correlation": 289994748 + } + }, + { + "ph": "f", "id": 289994748, "pid": 5714, "tid": 6744, "ts": 6303771910759.303, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910762.033, "dur": 1.130, + "args": { + "External id": 153743, "cbid": 135, "correlation": 289994750 + } + }, + { + "ph": "f", "id": 289994750, "pid": 5714, "tid": 6744, "ts": 6303771910762.033, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771910764.643, "dur": 1.140, + "args": { + "External id": 153743, "cbid": 147, "correlation": 289994754 + } + }, + { + "ph": "s", "id": 289994754, "pid": 5714, "tid": 6744, "ts": 6303771910764.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771910779.463, "dur": 0.670, + "args": { + "External id": 153743, "cbid": 409, "correlation": 289994757 + } + }, + { + "ph": "f", "id": 289994757, "pid": 5714, "tid": 6744, "ts": 6303771910779.463, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910784.063, "dur": 0.790, + "args": { + "External id": 153743, "cbid": 135, "correlation": 289994760 + } + }, + { + "ph": "f", "id": 289994760, "pid": 5714, "tid": 6744, "ts": 6303771910784.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910785.033, "dur": 0.880, + "args": { + "External id": 153743, "cbid": 147, "correlation": 289994761 + } + }, + { + "ph": "s", "id": 289994761, "pid": 5714, "tid": 6744, "ts": 6303771910785.033, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771950230.281, "dur": 5081.403, + "args": { + "External id": 153743, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289994763, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289994763, "pid": 0, "tid": 20, "ts": 6303771950230.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771910786.973, "dur": 9.610, + "args": { + "External id": 153743, "cbid": 430, "correlation": 289994763 + } + }, + { + "ph": "s", "id": 289994763, "pid": 5714, "tid": 6744, "ts": 6303771910786.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910797.643, "dur": 0.430, + "args": { + "External id": 153743, "cbid": 135, "correlation": 289994765 + } + }, + { + "ph": "f", "id": 289994765, "pid": 5714, "tid": 6744, "ts": 6303771910797.643, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910798.183, "dur": 0.540, + "args": { + "External id": 153743, "cbid": 147, "correlation": 289994766 + } + }, + { + "ph": "s", "id": 289994766, "pid": 5714, "tid": 6744, "ts": 6303771910798.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910800.123, "dur": 0.750, + "args": { + "External id": 153743, "cbid": 135, "correlation": 289994769 + } + }, + { + "ph": "f", "id": 289994769, "pid": 5714, "tid": 6744, "ts": 6303771910800.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910808.393, "dur": 0.480, + "args": { + "External id": 
153743, "cbid": 135, "correlation": 289994776 + } + }, + { + "ph": "f", "id": 289994776, "pid": 5714, "tid": 6744, "ts": 6303771910808.393, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771910833.373, "dur": 1.009, + "args": { + "External id": 153745, "cbid": 147, "correlation": 289994781 + } + }, + { + "ph": "s", "id": 289994781, "pid": 5714, "tid": 6744, "ts": 6303771910833.373, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771910850.573, "dur": 0.960, + "args": { + "External id": 153692, "cbid": 135, "correlation": 289994796 + } + }, + { + "ph": "f", "id": 289994796, "pid": 5714, "tid": 6744, "ts": 6303771910850.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771942364.014, "dur": 2020.887, + "args": { + "External id": 153747, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994821, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994821, "pid": 0, "tid": 7, "ts": 6303771942364.014, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771910993.432, "dur": 11.400, + "args": { + "External id": 153747, "cbid": 211, "correlation": 289994821 + } + }, + { + "ph": "s", "id": 289994821, "pid": 5714, "tid": 6744, "ts": 6303771910993.432, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303771944394.149, "dur": 549.959, + "args": { + "External id": 153748, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994844, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289994844, "pid": 0, "tid": 7, "ts": 6303771944394.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911050.812, "dur": 6.470, + "args": { + "External id": 153748, "cbid": 307, "correlation": 289994844 + } + }, + { + "ph": "s", "id": 289994844, "pid": 5714, "tid": 6744, "ts": 6303771911050.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911092.322, "dur": 0.560, + "args": { + "External id": 153749, "cbid": 200, "correlation": 289994867 + } + }, + { + "ph": "f", "id": 289994867, "pid": 5714, "tid": 6744, "ts": 6303771911092.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771945002.892, "dur": 57.345, + "args": { + "External id": 153749, "device": 0, "context": 1, "stream": 7, 
"correlation": 289994870, "bytes": 1536, "memory bandwidth (GB/s)": 0.026785247188072195 + } + }, + { + "ph": "f", "id": 289994870, "pid": 0, "tid": 7, "ts": 6303771945002.892, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771911094.512, "dur": 6.650, + "args": { + "External id": 153749, "cbid": 51, "correlation": 289994870 + } + }, + { + "ph": "s", "id": 289994870, "pid": 5714, "tid": 6744, "ts": 6303771911094.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771945100.813, "dur": 594.087, + "args": { + "External id": 153749, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994871, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994871, "pid": 0, "tid": 7, "ts": 6303771945100.813, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911101.372, "dur": 5.950, + "args": { + "External id": 153749, "cbid": 307, "correlation": 289994871 + } + }, + { + "ph": "s", "id": 289994871, "pid": 5714, "tid": 6744, "ts": 6303771911101.372, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911134.662, "dur": 0.300, + "args": { + "External id": 153750, "cbid": 200, "correlation": 289994896 + } + }, + { + "ph": "f", "id": 289994896, "pid": 5714, "tid": 6744, "ts": 6303771911134.662, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771945696.148, "dur": 1.152, + "args": { + "External id": 153750, "device": 0, "context": 1, "stream": 7, "correlation": 289994899, "bytes": 1536, "memory bandwidth (GB/s)": 1.3333333333333333 + } + }, + { + "ph": "f", "id": 289994899, "pid": 0, "tid": 7, "ts": 6303771945696.148, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771911136.032, "dur": 4.890, + "args": { + "External id": 153750, "cbid": 51, "correlation": 289994899 + } + }, + { + "ph": "s", "id": 289994899, "pid": 5714, "tid": 6744, "ts": 6303771911136.032, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771945698.900, "dur": 352.836, + "args": { + "External id": 153750, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994900, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994900, "pid": 0, "tid": 7, "ts": 6303771945698.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911141.072, "dur": 5.120, + "args": { + "External id": 153750, "cbid": 307, "correlation": 289994900 + } + }, + { + "ph": "s", "id": 289994900, "pid": 5714, "tid": 6744, "ts": 6303771911141.072, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911169.702, "dur": 0.300, + "args": { + "External id": 153751, "cbid": 200, "correlation": 289994925 + } + }, + { + "ph": "f", "id": 289994925, "pid": 5714, "tid": 6744, "ts": 6303771911169.702, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771946052.440, "dur": 360.453, + "args": { + "External id": 153751, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994928, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994928, "pid": 0, "tid": 7, "ts": 6303771946052.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911171.152, "dur": 5.300, + "args": { + "External id": 153751, "cbid": 307, "correlation": 289994928 + } + }, + { + "ph": "s", "id": 289994928, "pid": 5714, "tid": 6744, "ts": 6303771911171.152, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911198.272, "dur": 0.280, + "args": { + "External id": 153752, "cbid": 200, "correlation": 289994953 + } + }, + { + "ph": "f", "id": 289994953, "pid": 5714, "tid": 6744, "ts": 6303771911198.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771946414.077, "dur": 1.280, + "args": { + "External id": 153752, "device": 0, "context": 1, "stream": 7, "correlation": 289994956, "bytes": 1536, "memory bandwidth (GB/s)": 1.2 + } + }, + { + "ph": "f", "id": 289994956, "pid": 0, "tid": 7, "ts": 6303771946414.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771911199.502, "dur": 4.860, + "args": { + "External id": 153752, "cbid": 51, "correlation": 289994956 + } + }, + { + "ph": "s", "id": 289994956, "pid": 5714, "tid": 6744, "ts": 6303771911199.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771946416.637, "dur": 355.140, + "args": { + "External id": 153752, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994957, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994957, "pid": 0, "tid": 7, "ts": 6303771946416.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911204.502, "dur": 5.140, + "args": { + "External id": 153752, "cbid": 307, "correlation": 289994957 + } + }, + { + "ph": "s", "id": 289994957, "pid": 5714, "tid": 6744, "ts": 6303771911204.502, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911237.592, "dur": 0.330, + "args": { + "External id": 153753, "cbid": 200, "correlation": 289994982 + } + }, + { + "ph": "f", "id": 289994982, "pid": 5714, "tid": 6744, "ts": 6303771911237.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771946772.385, "dur": 363.364, + "args": { + "External id": 153753, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994985, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289994985, "pid": 0, "tid": 7, "ts": 6303771946772.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911239.132, "dur": 5.980, + "args": { + "External id": 153753, "cbid": 307, "correlation": 289994985 + } + }, + { + "ph": "s", "id": 289994985, "pid": 5714, "tid": 6744, "ts": 6303771911239.132, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771947136.485, "dur": 85.377, + "args": { + "External id": 153754, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289994998, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289994998, "pid": 0, "tid": 7, "ts": 6303771947136.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911290.492, "dur": 17.120, + "args": { + "External id": 153754, "cbid": 307, "correlation": 289994998 + } + }, + { + "ph": "s", "id": 289994998, "pid": 5714, "tid": 6744, "ts": 6303771911290.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303771947222.502, "dur": 4.128, + "args": { + "External id": 153755, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995006, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289995006, "pid": 0, "tid": 7, "ts": 6303771947222.502, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911342.081, "dur": 7.471, + "args": { + "External id": 153755, "cbid": 307, "correlation": 289995006 + } + }, + { + "ph": "s", "id": 289995006, "pid": 5714, "tid": 6744, "ts": 6303771911342.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303771947227.910, "dur": 113.922, + "args": { + "External id": 153756, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995014, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995014, "pid": 0, "tid": 7, "ts": 6303771947227.910, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911382.911, "dur": 6.110, + "args": { + "External id": 153756, "cbid": 307, "correlation": 289995014 + } + }, + { + "ph": "s", "id": 289995014, "pid": 5714, "tid": 6744, "ts": 6303771911382.911, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911574.721, "dur": 0.540, + "args": { + "External id": 153775, "cbid": 200, "correlation": 289995060 + } + }, + { + "ph": "f", "id": 289995060, "pid": 5714, "tid": 6744, "ts": 6303771911574.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771947343.048, "dur": 0.800, + "args": { + "External id": 153775, "device": 0, "context": 1, "stream": 7, 
"correlation": 289995063, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 289995063, "pid": 0, "tid": 7, "ts": 6303771947343.048, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771911577.071, "dur": 8.180, + "args": { + "External id": 153775, "cbid": 51, "correlation": 289995063 + } + }, + { + "ph": "s", "id": 289995063, "pid": 5714, "tid": 6744, "ts": 6303771911577.071, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771947345.000, "dur": 143.297, + "args": { + "External id": 153775, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995064, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995064, "pid": 0, "tid": 7, "ts": 6303771947345.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911585.501, "dur": 8.820, + "args": { + "External id": 153775, "cbid": 307, "correlation": 289995064 + } + }, + { + "ph": "s", "id": 289995064, "pid": 5714, "tid": 6744, "ts": 6303771911585.501, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771947489.001, "dur": 141.602, + "args": { + "External id": 153776, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995086, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995086, "pid": 0, "tid": 7, "ts": 6303771947489.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911625.081, "dur": 6.300, + "args": { + "External id": 153776, "cbid": 211, "correlation": 289995086 + } + }, + { + "ph": "s", "id": 289995086, "pid": 5714, "tid": 6744, "ts": 6303771911625.081, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911709.361, "dur": 0.510, + "args": { + "External id": 153777, "cbid": 200, "correlation": 289995104 + } + }, + { + "ph": "f", "id": 289995104, "pid": 5714, "tid": 6744, "ts": 6303771911709.361, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911710.001, "dur": 0.210, + "args": { + "External id": 153777, "cbid": 200, "correlation": 289995105 + } + }, + { + "ph": "f", "id": 289995105, "pid": 5714, "tid": 6744, "ts": 6303771911710.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911731.251, "dur": 0.300, + "args": { + "External id": 153777, "cbid": 200, "correlation": 289995123 + } + }, + { + "ph": "f", "id": 289995123, "pid": 5714, "tid": 6744, "ts": 6303771911731.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771947631.211, "dur": 126.209, + "args": { + "External id": 153777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995124, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995124, "pid": 0, "tid": 7, "ts": 6303771947631.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911732.911, "dur": 10.229, + "args": { + "External id": 153777, "cbid": 211, "correlation": 289995124 + } + }, + { + "ph": "s", "id": 289995124, "pid": 5714, "tid": 6744, "ts": 6303771911732.911, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771911743.980, "dur": 1.051, + "args": { + "External id": 153777, "cbid": 273, "correlation": 289995126 + } + }, + { + "ph": "f", "id": 289995126, "pid": 5714, "tid": 6744, "ts": 6303771911743.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771947758.060, "dur": 1298.991, + "args": { + "External id": 153777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995127, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289995127, "pid": 0, "tid": 7, "ts": 6303771947758.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911745.411, "dur": 4.509, + "args": { + "External id": 153777, "cbid": 211, "correlation": 289995127 + } + }, + { + "ph": "s", "id": 289995127, "pid": 5714, "tid": 6744, "ts": 6303771911745.411, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771949066.779, "dur": 240.227, + "args": { + "External id": 153777, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995129, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289995129, "pid": 0, "tid": 7, "ts": 6303771949066.779, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771911750.500, "dur": 4.031, + "args": { + "External id": 153777, "cbid": 211, "correlation": 289995129 + } + }, + { + "ph": "s", "id": 289995129, "pid": 5714, "tid": 6744, "ts": 6303771911750.500, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771949310.494, "dur": 177.730, + "args": { + "External id": 153788, "device": 0, "context": 1, "stream": 7, "correlation": 289995151, "bytes": 25165824, "memory bandwidth (GB/s)": 141.5958138749789 + } + }, + { + "ph": "f", "id": 289995151, "pid": 0, "tid": 7, "ts": 6303771949310.494, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771911895.690, "dur": 18.550, + "args": { + "External id": 153788, "cbid": 41, "correlation": 289995151 + } + }, + { + "ph": "s", "id": 289995151, "pid": 5714, "tid": 6744, "ts": 6303771911895.690, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771949488.864, "dur": 193.218, + "args": { + "External id": 153785, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995169, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995169, "pid": 0, "tid": 7, "ts": 6303771949488.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912022.290, "dur": 9.800, + "args": { + "External id": 153785, "cbid": 307, "correlation": 289995169 + } + }, + { + "ph": "s", "id": 289995169, "pid": 5714, "tid": 6744, "ts": 6303771912022.290, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771949682.690, "dur": 166.883, + "args": { + "External id": 153795, "device": 0, "context": 1, "stream": 7, "correlation": 289995184, "bytes": 25165824, "memory bandwidth (GB/s)": 150.79920662979453 + } + }, + { + "ph": "f", "id": 289995184, "pid": 0, "tid": 7, "ts": 6303771949682.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771912100.350, "dur": 16.290, + "args": { + "External id": 153795, "cbid": 41, "correlation": 289995184 + } + }, + { + "ph": "s", "id": 289995184, "pid": 5714, "tid": 6744, "ts": 6303771912100.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771949850.213, "dur": 44.800, + "args": { + "External id": 153792, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995202, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995202, "pid": 0, "tid": 7, "ts": 6303771949850.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912217.550, "dur": 8.709, + "args": { + "External id": 153792, "cbid": 307, "correlation": 289995202 + } + }, + { + "ph": "s", "id": 289995202, "pid": 5714, "tid": 6744, "ts": 6303771912217.550, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771912368.189, "dur": 0.650, + "args": { + "External id": 153800, "cbid": 200, "correlation": 289995232 + } + }, + { + "ph": "f", "id": 289995232, "pid": 5714, "tid": 6744, "ts": 6303771912368.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771949901.605, "dur": 7.776, + "args": { + "External id": 153800, "device": 0, "context": 1, "stream": 7, "correlation": 289995235, "bytes": 576, "memory bandwidth (GB/s)": 0.07407407407407407 + } + }, + { + "ph": "f", "id": 289995235, "pid": 0, "tid": 7, "ts": 6303771949901.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771912370.839, "dur": 8.720, + "args": { + "External id": 153800, "cbid": 51, "correlation": 289995235 + } + }, + { + "ph": "s", "id": 289995235, "pid": 5714, "tid": 6744, "ts": 6303771912370.839, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771949917.093, "dur": 160.738, + "args": { + "External id": 153800, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995236, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995236, "pid": 0, "tid": 7, "ts": 6303771949917.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912379.839, "dur": 9.310, + "args": { + "External id": 153800, "cbid": 307, "correlation": 289995236 + } + }, + { + "ph": "s", "id": 289995236, "pid": 5714, "tid": 6744, "ts": 6303771912379.839, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771912419.309, "dur": 0.330, + "args": { + "External id": 153801, "cbid": 200, "correlation": 289995261 + } + }, + { + "ph": "f", "id": 289995261, "pid": 5714, "tid": 6744, "ts": 6303771912419.309, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771950087.911, "dur": 7.392, + "args": { + "External id": 153801, "device": 0, "context": 1, "stream": 7, "correlation": 289995264, "bytes": 576, "memory bandwidth (GB/s)": 0.07792207792207792 + } + }, + { + "ph": "f", "id": 289995264, "pid": 0, "tid": 7, "ts": 6303771950087.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771912420.829, "dur": 5.100, + "args": { + "External id": 153801, "cbid": 51, "correlation": 289995264 + } + }, + { + "ph": "s", "id": 289995264, 
"pid": 5714, "tid": 6744, "ts": 6303771912420.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771950102.759, "dur": 143.650, + "args": { + "External id": 153801, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995265, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995265, "pid": 0, "tid": 7, "ts": 6303771950102.759, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912426.079, "dur": 5.650, + "args": { + "External id": 153801, "cbid": 307, "correlation": 289995265 + } + }, + { + "ph": "s", "id": 289995265, "pid": 5714, "tid": 6744, "ts": 6303771912426.079, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771912457.029, "dur": 0.310, + "args": { + "External id": 153802, "cbid": 200, "correlation": 289995290 + } + }, + { + "ph": "f", "id": 289995290, "pid": 5714, "tid": 6744, "ts": 6303771912457.029, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771950247.881, "dur": 1.280, + "args": { + "External id": 153802, "device": 0, "context": 1, "stream": 7, "correlation": 289995293, "bytes": 576, "memory bandwidth (GB/s)": 0.45 + } + }, + { + "ph": "f", "id": 289995293, "pid": 0, "tid": 7, "ts": 6303771950247.881, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771912458.429, "dur": 4.640, + "args": { + "External id": 
153802, "cbid": 51, "correlation": 289995293 + } + }, + { + "ph": "s", "id": 289995293, "pid": 5714, "tid": 6744, "ts": 6303771912458.429, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771950250.697, "dur": 577.095, + "args": { + "External id": 153802, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995294, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995294, "pid": 0, "tid": 7, "ts": 6303771950250.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912463.239, "dur": 5.130, + "args": { + "External id": 153802, "cbid": 307, "correlation": 289995294 + } + }, + { + "ph": "s", "id": 289995294, "pid": 5714, "tid": 6744, "ts": 6303771912463.239, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771950828.400, "dur": 143.649, + "args": { + "External id": 153803, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995316, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995316, "pid": 0, "tid": 7, "ts": 6303771950828.400, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912495.279, "dur": 6.500, + "args": { + "External id": 153803, "cbid": 211, "correlation": 289995316 + } + }, + { + "ph": "s", "id": 289995316, "pid": 5714, "tid": 6744, "ts": 6303771912495.279, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771950972.721, "dur": 142.818, + "args": { + "External id": 153804, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995339, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995339, "pid": 0, "tid": 7, "ts": 6303771950972.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912522.659, "dur": 5.550, + "args": { + "External id": 153804, "cbid": 211, "correlation": 289995339 + } + }, + { + "ph": "s", "id": 289995339, "pid": 5714, "tid": 6744, "ts": 6303771912522.659, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771951116.211, "dur": 143.202, + "args": { + "External id": 153805, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995362, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995362, "pid": 0, "tid": 7, "ts": 6303771951116.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912547.419, "dur": 5.260, + "args": { + "External id": 153805, "cbid": 211, "correlation": 289995362 + } + }, + { + "ph": "s", "id": 289995362, "pid": 5714, "tid": 6744, "ts": 6303771912547.419, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303771951260.117, "dur": 81.409, + "args": { + "External id": 153806, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995370, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995370, "pid": 0, "tid": 7, "ts": 6303771951260.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912590.179, "dur": 6.190, + "args": { + "External id": 153806, "cbid": 307, "correlation": 289995370 + } + }, + { + "ph": "s", "id": 289995370, "pid": 5714, "tid": 6744, "ts": 6303771912590.179, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771951342.134, "dur": 47.008, + "args": { + "External id": 153821, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995399, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995399, "pid": 0, "tid": 7, "ts": 6303771951342.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912800.268, "dur": 11.190, + "args": { + "External id": 153821, "cbid": 307, "correlation": 289995399 + } + }, + { + "ph": "s", "id": 289995399, "pid": 5714, "tid": 6744, "ts": 6303771912800.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771951389.814, "dur": 3.873, + "args": { + "External id": 153822, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995407, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289995407, "pid": 0, "tid": 7, "ts": 6303771951389.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912837.988, "dur": 5.850, + "args": { + "External id": 153822, "cbid": 307, "correlation": 289995407 + } + }, + { + "ph": "s", "id": 289995407, "pid": 5714, "tid": 6744, "ts": 6303771912837.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771951394.327, "dur": 50.144, + "args": { + "External id": 153823, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995418, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995418, "pid": 0, "tid": 7, "ts": 6303771951394.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912872.898, "dur": 5.610, + "args": { + "External id": 153823, "cbid": 307, "correlation": 289995418 + } + }, + { + "ph": "s", "id": 289995418, "pid": 5714, "tid": 6744, "ts": 6303771912872.898, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771951445.143, "dur": 47.169, + "args": { + "External id": 153824, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995423, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995423, "pid": 0, "tid": 7, "ts": 6303771951445.143, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771912916.288, "dur": 7.240, + "args": { + "External id": 153824, "cbid": 211, "correlation": 289995423 + } + }, + { + "ph": "s", "id": 289995423, "pid": 5714, "tid": 6744, "ts": 6303771912916.288, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913099.868, "dur": 2.889, + "args": { + "External id": 153830, "cbid": 147, "correlation": 289995440 + } + }, + { + "ph": "s", "id": 289995440, "pid": 5714, "tid": 6744, "ts": 6303771913099.868, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771913209.757, "dur": 2.530, + "args": { + "External id": 153838, "cbid": 138, "correlation": 289995455 + } + }, + { + "ph": "f", "id": 289995455, "pid": 5714, "tid": 6744, "ts": 6303771913209.757, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771951498.712, "dur": 5.344, + "args": { + "External id": 153842, "device": 0, "context": 1, "stream": 7, "correlation": 289995466, "bytes": 28112, "memory bandwidth (GB/s)": 5.2604790419161676 + } + }, + { + "ph": "f", "id": 289995466, "pid": 0, "tid": 7, "ts": 6303771951498.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771913234.927, "dur": 13.070, + "args": { + "External id": 153842, "cbid": 41, "correlation": 289995466 + } + }, + { + "ph": "s", "id": 289995466, "pid": 5714, "tid": 6744, "ts": 6303771913234.927, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913252.327, "dur": 1.900, + "args": { + "External id": 153837, "cbid": 135, "correlation": 289995470 + } + }, + { + "ph": "f", "id": 289995470, "pid": 5714, "tid": 6744, "ts": 6303771913252.327, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303771951506.136, "dur": 38.112, + "args": { + "External id": 153837, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995474, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995474, "pid": 0, "tid": 7, "ts": 6303771951506.136, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771913257.427, "dur": 10.800, + "args": { + "External id": 153837, "cbid": 211, "correlation": 289995474 + } + }, + { + "ph": "s", "id": 289995474, "pid": 5714, "tid": 6744, "ts": 6303771913257.427, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913319.087, "dur": 1.210, + "args": { + "External id": 153830, "cbid": 135, "correlation": 289995485 + } + }, + { + "ph": "f", "id": 289995485, "pid": 5714, "tid": 6744, "ts": 6303771913319.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913322.377, "dur": 1.380, + "args": { + "External id": 153830, "cbid": 147, "correlation": 289995489 + } + }, + { + "ph": "s", "id": 289995489, "pid": 5714, "tid": 6744, "ts": 
6303771913322.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771913398.467, "dur": 1.040, + "args": { + "External id": 153846, "cbid": 317, "correlation": 289995509 + } + }, + { + "ph": "f", "id": 289995509, "pid": 5714, "tid": 6744, "ts": 6303771913398.467, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913401.427, "dur": 1.440, + "args": { + "External id": 153846, "cbid": 135, "correlation": 289995511 + } + }, + { + "ph": "f", "id": 289995511, "pid": 5714, "tid": 6744, "ts": 6303771913401.427, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913404.247, "dur": 0.900, + "args": { + "External id": 153846, "cbid": 147, "correlation": 289995515 + } + }, + { + "ph": "s", "id": 289995515, "pid": 5714, "tid": 6744, "ts": 6303771913404.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771913420.377, "dur": 0.770, + "args": { + "External id": 153846, "cbid": 409, "correlation": 289995518 + } + }, + { + "ph": "f", "id": 289995518, "pid": 5714, "tid": 6744, "ts": 6303771913420.377, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913425.547, "dur": 0.790, + "args": { + "External id": 153846, "cbid": 135, "correlation": 289995521 + } + }, + { + "ph": "f", "id": 289995521, "pid": 5714, "tid": 6744, "ts": 6303771913425.547, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913426.527, "dur": 1.020, + "args": { + "External id": 153846, 
"cbid": 147, "correlation": 289995522 + } + }, + { + "ph": "s", "id": 289995522, "pid": 5714, "tid": 6744, "ts": 6303771913426.527, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771955313.220, "dur": 8611.748, + "args": { + "External id": 153846, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289995524, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289995524, "pid": 0, "tid": 20, "ts": 6303771955313.220, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771913428.677, "dur": 11.060, + "args": { + "External id": 153846, "cbid": 430, "correlation": 289995524 + } + }, + { + "ph": "s", "id": 289995524, "pid": 5714, "tid": 6744, "ts": 6303771913428.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913440.847, "dur": 0.420, + "args": { + "External id": 153846, "cbid": 135, "correlation": 289995526 + } + }, + { + "ph": "f", "id": 289995526, "pid": 5714, "tid": 6744, "ts": 6303771913440.847, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913441.397, "dur": 0.610, + "args": { + "External id": 153846, "cbid": 147, "correlation": 289995527 + } + }, 
+ { + "ph": "s", "id": 289995527, "pid": 5714, "tid": 6744, "ts": 6303771913441.397, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913443.577, "dur": 0.910, + "args": { + "External id": 153846, "cbid": 135, "correlation": 289995530 + } + }, + { + "ph": "f", "id": 289995530, "pid": 5714, "tid": 6744, "ts": 6303771913443.577, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913452.557, "dur": 0.490, + "args": { + "External id": 153846, "cbid": 135, "correlation": 289995537 + } + }, + { + "ph": "f", "id": 289995537, "pid": 5714, "tid": 6744, "ts": 6303771913452.557, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913479.787, "dur": 0.970, + "args": { + "External id": 153848, "cbid": 147, "correlation": 289995542 + } + }, + { + "ph": "s", "id": 289995542, "pid": 5714, "tid": 6744, "ts": 6303771913479.787, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913498.167, "dur": 0.850, + "args": { + "External id": 153830, "cbid": 135, "correlation": 289995557 + } + }, + { + "ph": "f", "id": 289995557, "pid": 5714, "tid": 6744, "ts": 6303771913498.167, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771913719.496, "dur": 1.470, + "args": { + "External id": 153830, "cbid": 135, "correlation": 289995570 + } + }, + { + "ph": "f", "id": 289995570, "pid": 5714, "tid": 6744, "ts": 6303771913719.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771913844.796, 
"dur": 3.160, + "args": { + "External id": 153858, "cbid": 147, "correlation": 289995581 + } + }, + { + "ph": "s", "id": 289995581, "pid": 5714, "tid": 6744, "ts": 6303771913844.796, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771913953.626, "dur": 1.130, + "args": { + "External id": 153872, "cbid": 317, "correlation": 289995622 + } + }, + { + "ph": "f", "id": 289995622, "pid": 5714, "tid": 6744, "ts": 6303771913953.626, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771913962.406, "dur": 2.150, + "args": { + "External id": 153873, "cbid": 138, "correlation": 289995625 + } + }, + { + "ph": "f", "id": 289995625, "pid": 5714, "tid": 6744, "ts": 6303771913962.406, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771955316.516, "dur": 2.336, + "args": { + "External id": 153877, "device": 0, "context": 1, "stream": 7, "correlation": 289995636, "bytes": 7224, "memory bandwidth (GB/s)": 3.0924657534246576 + } + }, + { + "ph": "f", "id": 289995636, "pid": 0, "tid": 7, "ts": 6303771955316.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771913985.946, "dur": 11.980, + "args": { + "External id": 153877, "cbid": 41, "correlation": 289995636 + } + }, + { + "ph": "s", "id": 289995636, "pid": 5714, "tid": 6744, "ts": 6303771913985.946, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914002.146, "dur": 1.589, + "args": { + "External id": 153872, "cbid": 135, "correlation": 289995640 + } + }, + { + "ph": "f", "id": 289995640, "pid": 5714, "tid": 6744, "ts": 
6303771914002.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771955321.668, "dur": 12.704, + "args": { + "External id": 153872, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995644, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995644, "pid": 0, "tid": 7, "ts": 6303771955321.668, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914006.246, "dur": 10.569, + "args": { + "External id": 153872, "cbid": 211, "correlation": 289995644 + } + }, + { + "ph": "s", "id": 289995644, "pid": 5714, "tid": 6744, "ts": 6303771914006.246, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914104.225, "dur": 1.270, + "args": { + "External id": 153858, "cbid": 135, "correlation": 289995655 + } + }, + { + "ph": "f", "id": 289995655, "pid": 5714, "tid": 6744, "ts": 6303771914104.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771914108.455, "dur": 1.210, + "args": { + "External id": 153858, "cbid": 147, "correlation": 289995659 + } + }, + { + "ph": "s", "id": 289995659, "pid": 5714, "tid": 6744, "ts": 6303771914108.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771914111.455, "dur": 0.770, + "args": { + "External id": 153858, "cbid": 147, "correlation": 289995663 + } + }, + { + "ph": 
"s", "id": 289995663, "pid": 5714, "tid": 6744, "ts": 6303771914111.455, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771955367.524, "dur": 29.985, + "args": { + "External id": 153891, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289995687, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289995687, "pid": 0, "tid": 17, "ts": 6303771955367.524, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914256.065, "dur": 11.430, + "args": { + "External id": 153891, "cbid": 211, "correlation": 289995687 + } + }, + { + "ph": "s", "id": 289995687, "pid": 5714, "tid": 6744, "ts": 6303771914256.065, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771955406.661, "dur": 222.626, + "args": { + "External id": 153907, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289995700, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289995700, "pid": 0, "tid": 17, "ts": 6303771955406.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914376.995, "dur": 10.180, + "args": { + "External id": 153907, "cbid": 211, "correlation": 289995700 + } + }, + { + "ph": "s", "id": 289995700, "pid": 5714, "tid": 6744, "ts": 6303771914376.995, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914411.515, "dur": 1.330, + "args": { + "External id": 153858, "cbid": 135, "correlation": 289995710 + } + }, + { + "ph": "f", "id": 289995710, "pid": 5714, "tid": 6744, "ts": 6303771914411.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771914414.715, "dur": 1.140, + "args": { + "External id": 153858, "cbid": 147, "correlation": 289995714 + } + }, + { + "ph": "s", "id": 289995714, "pid": 5714, "tid": 6744, "ts": 6303771914414.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771914466.854, "dur": 0.920, + "args": { + "External id": 153909, "cbid": 317, "correlation": 289995727 + } + }, + { + "ph": "f", "id": 289995727, "pid": 5714, "tid": 6744, "ts": 6303771914466.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914469.674, "dur": 1.280, + "args": { + "External id": 153909, "cbid": 135, "correlation": 289995729 + } + }, + { + "ph": "f", "id": 289995729, "pid": 5714, "tid": 6744, "ts": 6303771914469.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771914472.414, "dur": 1.131, + "args": { + "External id": 153909, "cbid": 147, "correlation": 289995733 + } + }, + { + "ph": "s", "id": 289995733, "pid": 5714, "tid": 6744, "ts": 6303771914472.414, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771914487.865, "dur": 0.709, + "args": { + "External id": 153909, "cbid": 409, "correlation": 289995736 + } + }, + { + "ph": "f", "id": 289995736, "pid": 5714, "tid": 6744, "ts": 6303771914487.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914492.534, "dur": 0.871, + "args": { + "External id": 153909, "cbid": 135, "correlation": 289995739 + } + }, + { + "ph": "f", "id": 289995739, "pid": 5714, "tid": 6744, "ts": 6303771914492.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771914493.594, "dur": 0.880, + "args": { + "External id": 153909, "cbid": 147, "correlation": 289995740 + } + }, + { + "ph": "s", "id": 289995740, "pid": 5714, "tid": 6744, "ts": 6303771914493.594, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771963926.504, "dur": 5209.980, + "args": { + "External id": 153909, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289995742, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289995742, "pid": 0, "tid": 20, "ts": 6303771963926.504, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771914495.514, "dur": 10.151, + "args": { + "External id": 153909, "cbid": 430, "correlation": 289995742 + } + }, + { + "ph": "s", "id": 289995742, "pid": 5714, "tid": 6744, "ts": 6303771914495.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914506.714, "dur": 0.620, + "args": { + "External id": 153909, "cbid": 135, "correlation": 289995744 + } + }, + { + "ph": "f", "id": 289995744, "pid": 5714, "tid": 6744, "ts": 6303771914506.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771914507.454, "dur": 0.551, + "args": { + "External id": 153909, "cbid": 147, "correlation": 289995745 + } + }, + { + "ph": "s", "id": 289995745, "pid": 5714, "tid": 6744, "ts": 6303771914507.454, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914509.485, "dur": 0.680, + "args": { + "External id": 153909, "cbid": 135, "correlation": 289995748 + } + }, + { + "ph": "f", "id": 289995748, "pid": 5714, "tid": 6744, "ts": 6303771914509.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914518.374, "dur": 0.440, + "args": { + "External 
id": 153909, "cbid": 135, "correlation": 289995755 + } + }, + { + "ph": "f", "id": 289995755, "pid": 5714, "tid": 6744, "ts": 6303771914518.374, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771914543.724, "dur": 1.050, + "args": { + "External id": 153911, "cbid": 147, "correlation": 289995760 + } + }, + { + "ph": "s", "id": 289995760, "pid": 5714, "tid": 6744, "ts": 6303771914543.724, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771914561.154, "dur": 0.910, + "args": { + "External id": 153858, "cbid": 135, "correlation": 289995775 + } + }, + { + "ph": "f", "id": 289995775, "pid": 5714, "tid": 6744, "ts": 6303771914561.154, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771955334.980, "dur": 2291.195, + "args": { + "External id": 153913, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995800, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995800, "pid": 0, "tid": 7, "ts": 6303771955334.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914707.374, "dur": 11.470, + "args": { + "External id": 153913, "cbid": 211, "correlation": 289995800 + } + }, + { + "ph": "s", "id": 289995800, "pid": 5714, "tid": 6744, "ts": 6303771914707.374, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303771957688.031, "dur": 672.072, + "args": { + "External id": 153914, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995823, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289995823, "pid": 0, "tid": 7, "ts": 6303771957688.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914763.284, "dur": 5.740, + "args": { + "External id": 153914, "cbid": 307, "correlation": 289995823 + } + }, + { + "ph": "s", "id": 289995823, "pid": 5714, "tid": 6744, "ts": 6303771914763.284, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771914801.674, "dur": 0.550, + "args": { + "External id": 153915, "cbid": 200, "correlation": 289995846 + } + }, + { + "ph": "f", "id": 289995846, "pid": 5714, "tid": 6744, "ts": 6303771914801.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771958416.712, "dur": 57.601, + "args": { + "External id": 153915, "device": 0, "context": 1, "stream": 7, 
"correlation": 289995849, "bytes": 1536, "memory bandwidth (GB/s)": 0.026666203711741117 + } + }, + { + "ph": "f", "id": 289995849, "pid": 0, "tid": 7, "ts": 6303771958416.712, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771914803.874, "dur": 6.240, + "args": { + "External id": 153915, "cbid": 51, "correlation": 289995849 + } + }, + { + "ph": "s", "id": 289995849, "pid": 5714, "tid": 6744, "ts": 6303771914803.874, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771958541.641, "dur": 595.463, + "args": { + "External id": 153915, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995850, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995850, "pid": 0, "tid": 7, "ts": 6303771958541.641, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914810.304, "dur": 5.700, + "args": { + "External id": 153915, "cbid": 307, "correlation": 289995850 + } + }, + { + "ph": "s", "id": 289995850, "pid": 5714, "tid": 6744, "ts": 6303771914810.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771914843.324, "dur": 0.320, + "args": { + "External id": 153916, "cbid": 200, "correlation": 289995875 + } + }, + { + "ph": "f", "id": 289995875, "pid": 5714, "tid": 6744, "ts": 6303771914843.324, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771959138.448, "dur": 1.280, + "args": { + "External id": 153916, "device": 0, "context": 1, "stream": 7, "correlation": 289995878, "bytes": 1536, "memory bandwidth (GB/s)": 1.2 + } + }, + { + "ph": "f", "id": 289995878, "pid": 0, "tid": 7, "ts": 6303771959138.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771914844.634, "dur": 4.320, + "args": { + "External id": 153916, "cbid": 51, "correlation": 289995878 + } + }, + { + "ph": "s", "id": 289995878, "pid": 5714, "tid": 6744, "ts": 6303771914844.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771959141.072, "dur": 352.837, + "args": { + "External id": 153916, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995879, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995879, "pid": 0, "tid": 7, "ts": 6303771959141.072, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914849.084, "dur": 4.920, + "args": { + "External id": 153916, "cbid": 307, "correlation": 289995879 + } + }, + { + "ph": "s", "id": 289995879, "pid": 5714, "tid": 6744, "ts": 6303771914849.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771914875.724, "dur": 0.289, + "args": { + "External id": 153917, "cbid": 200, "correlation": 289995904 + } + }, + { + "ph": "f", "id": 289995904, "pid": 5714, "tid": 6744, "ts": 6303771914875.724, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771959494.549, "dur": 357.572, + "args": { + "External id": 153917, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995907, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995907, "pid": 0, "tid": 7, "ts": 6303771959494.549, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914877.073, "dur": 5.140, + "args": { + "External id": 153917, "cbid": 307, "correlation": 289995907 + } + }, + { + "ph": "s", "id": 289995907, "pid": 5714, "tid": 6744, "ts": 6303771914877.073, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771914902.684, "dur": 0.260, + "args": { + "External id": 153918, "cbid": 200, "correlation": 289995932 + } + }, + { + "ph": "f", "id": 289995932, "pid": 5714, "tid": 6744, "ts": 6303771914902.684, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771959853.369, "dur": 1.312, + "args": { + "External id": 153918, "device": 0, "context": 1, "stream": 7, "correlation": 289995935, "bytes": 1536, "memory bandwidth (GB/s)": 1.170731707317073 + } + }, + { + "ph": "f", "id": 289995935, "pid": 0, "tid": 7, "ts": 6303771959853.369, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771914903.853, "dur": 4.411, + "args": { + "External id": 153918, "cbid": 51, "correlation": 289995935 + } + }, + { + "ph": "s", "id": 289995935, "pid": 5714, "tid": 6744, "ts": 6303771914903.853, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771959855.929, "dur": 353.508, + "args": { + "External id": 153918, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995936, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995936, "pid": 0, "tid": 7, "ts": 6303771959855.929, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914908.404, "dur": 4.629, + "args": { + "External id": 153918, "cbid": 307, "correlation": 289995936 + } + }, + { + "ph": "s", "id": 289995936, "pid": 5714, "tid": 6744, "ts": 6303771914908.404, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771914934.004, "dur": 0.280, + "args": { + "External id": 153919, "cbid": 200, "correlation": 289995961 + } + }, + { + "ph": "f", "id": 289995961, "pid": 5714, "tid": 6744, "ts": 6303771914934.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771960210.109, "dur": 366.020, + "args": { + "External id": 153919, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995964, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289995964, "pid": 0, "tid": 7, "ts": 6303771960210.109, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914935.233, "dur": 4.831, + "args": { + "External id": 153919, "cbid": 307, "correlation": 289995964 + } + }, + { + "ph": "s", "id": 289995964, "pid": 5714, "tid": 6744, "ts": 6303771914935.233, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771960576.801, "dur": 90.017, + "args": { + "External id": 153920, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995977, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995977, "pid": 0, "tid": 7, "ts": 6303771960576.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771914974.363, "dur": 5.870, + "args": { + "External id": 153920, "cbid": 307, "correlation": 289995977 + } + }, + { + "ph": "s", "id": 289995977, "pid": 5714, "tid": 6744, "ts": 6303771914974.363, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303771960667.426, "dur": 3.808, + "args": { + "External id": 153921, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995985, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289995985, "pid": 0, "tid": 7, "ts": 6303771960667.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915003.733, "dur": 5.130, + "args": { + "External id": 153921, "cbid": 307, "correlation": 289995985 + } + }, + { + "ph": "s", "id": 289995985, "pid": 5714, "tid": 6744, "ts": 6303771915003.733, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303771960672.546, "dur": 115.073, + "args": { + "External id": 153922, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289995993, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289995993, "pid": 0, "tid": 7, "ts": 6303771960672.546, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915032.063, "dur": 4.860, + "args": { + "External id": 153922, "cbid": 307, "correlation": 289995993 + } + }, + { + "ph": "s", "id": 289995993, "pid": 5714, "tid": 6744, "ts": 6303771915032.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915202.213, "dur": 0.520, + "args": { + "External id": 153941, "cbid": 200, "correlation": 289996039 + } + }, + { + "ph": "f", "id": 289996039, "pid": 5714, "tid": 6744, "ts": 6303771915202.213, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771960788.835, "dur": 1.248, + "args": { + "External id": 153941, "device": 0, "context": 1, "stream": 7, 
"correlation": 289996042, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 289996042, "pid": 0, "tid": 7, "ts": 6303771960788.835, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771915204.313, "dur": 7.110, + "args": { + "External id": 153941, "cbid": 51, "correlation": 289996042 + } + }, + { + "ph": "s", "id": 289996042, "pid": 5714, "tid": 6744, "ts": 6303771915204.313, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771960791.491, "dur": 142.914, + "args": { + "External id": 153941, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996043, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996043, "pid": 0, "tid": 7, "ts": 6303771960791.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915211.643, "dur": 7.820, + "args": { + "External id": 153941, "cbid": 307, "correlation": 289996043 + } + }, + { + "ph": "s", "id": 289996043, "pid": 5714, "tid": 6744, "ts": 6303771915211.643, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771960935.077, "dur": 141.698, + "args": { + "External id": 153942, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996065, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996065, "pid": 0, "tid": 7, "ts": 6303771960935.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915246.143, "dur": 5.870, + "args": { + "External id": 153942, "cbid": 211, "correlation": 289996065 + } + }, + { + "ph": "s", "id": 289996065, "pid": 5714, "tid": 6744, "ts": 6303771915246.143, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915331.272, "dur": 0.480, + "args": { + "External id": 153943, "cbid": 200, "correlation": 289996083 + } + }, + { + "ph": "f", "id": 289996083, "pid": 5714, "tid": 6744, "ts": 6303771915331.272, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915331.863, "dur": 0.189, + "args": { + "External id": 153943, "cbid": 200, "correlation": 289996084 + } + }, + { + "ph": "f", "id": 289996084, "pid": 5714, "tid": 6744, "ts": 6303771915331.863, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915350.603, "dur": 0.249, + "args": { + "External id": 153943, "cbid": 200, "correlation": 289996102 + } + }, + { + "ph": "f", "id": 289996102, "pid": 5714, "tid": 6744, "ts": 6303771915350.603, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771961077.511, "dur": 93.825, + "args": { + "External id": 153943, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996103, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996103, "pid": 0, "tid": 7, "ts": 6303771961077.511, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915352.063, "dur": 9.300, + "args": { + "External id": 153943, "cbid": 211, "correlation": 289996103 + } + }, + { + "ph": "s", "id": 289996103, "pid": 5714, "tid": 6744, "ts": 6303771915352.063, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915362.123, "dur": 0.949, + "args": { + "External id": 153943, "cbid": 273, "correlation": 289996105 + } + }, + { + "ph": "f", "id": 289996105, "pid": 5714, "tid": 6744, "ts": 6303771915362.123, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771961171.944, "dur": 1244.526, + "args": { + "External id": 153943, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996106, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289996106, "pid": 0, "tid": 7, "ts": 6303771961171.944, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915363.403, "dur": 4.109, + "args": { + "External id": 153943, "cbid": 211, "correlation": 289996106 + } + }, + { + "ph": "s", "id": 289996106, "pid": 5714, "tid": 6744, "ts": 6303771915363.403, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771962417.142, "dur": 72.257, + "args": { + "External id": 153943, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996108, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289996108, "pid": 0, "tid": 7, "ts": 6303771962417.142, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915368.043, "dur": 3.669, + "args": { + "External id": 153943, "cbid": 211, "correlation": 289996108 + } + }, + { + "ph": "s", "id": 289996108, "pid": 5714, "tid": 6744, "ts": 6303771915368.043, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771962490.039, "dur": 48.385, + "args": { + "External id": 153954, "device": 0, "context": 1, "stream": 7, "correlation": 289996130, "bytes": 25165824, "memory bandwidth (GB/s)": 520.116234370156 + } + }, + { + "ph": "f", "id": 289996130, "pid": 0, "tid": 7, "ts": 6303771962490.039, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771915496.142, "dur": 17.130, + "args": { + "External id": 153954, "cbid": 41, "correlation": 289996130 + } + }, + { + "ph": "s", "id": 289996130, "pid": 5714, "tid": 6744, "ts": 6303771915496.142, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771962539.160, "dur": 33.152, + "args": { + "External id": 153951, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996148, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996148, "pid": 0, "tid": 7, "ts": 6303771962539.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915607.402, "dur": 8.480, + "args": { + "External id": 153951, "cbid": 307, "correlation": 289996148 + } + }, + { + "ph": "s", "id": 289996148, "pid": 5714, "tid": 6744, "ts": 6303771915607.402, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771962572.984, "dur": 39.521, + "args": { + "External id": 153961, "device": 0, "context": 1, "stream": 7, "correlation": 289996163, "bytes": 25165824, "memory bandwidth (GB/s)": 636.7709319096176 + } + }, + { + "ph": "f", "id": 289996163, "pid": 0, "tid": 7, "ts": 6303771962572.984, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771915673.862, "dur": 13.640, + "args": { + "External id": 153961, "cbid": 41, "correlation": 289996163 + } + }, + { + "ph": "s", "id": 289996163, "pid": 5714, "tid": 6744, "ts": 6303771915673.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771962613.177, "dur": 28.192, + "args": { + "External id": 153958, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996181, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996181, "pid": 0, "tid": 7, "ts": 6303771962613.177, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915771.322, "dur": 7.709, + "args": { + "External id": 153958, "cbid": 307, "correlation": 289996181 + } + }, + { + "ph": "s", "id": 289996181, "pid": 5714, "tid": 6744, "ts": 6303771915771.322, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915891.311, "dur": 0.530, + "args": { + "External id": 153966, "cbid": 200, "correlation": 289996211 + } + }, + { + "ph": "f", "id": 289996211, "pid": 5714, "tid": 6744, "ts": 6303771915891.311, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771962642.681, "dur": 0.800, + "args": { + "External id": 153966, "device": 0, "context": 1, "stream": 7, "correlation": 289996214, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 289996214, "pid": 0, "tid": 7, "ts": 6303771962642.681, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771915893.461, "dur": 7.520, + "args": { + "External id": 153966, "cbid": 51, "correlation": 289996214 + } + }, + { + "ph": "s", "id": 289996214, "pid": 5714, "tid": 6744, "ts": 6303771915893.461, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": 
"kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771962644.793, "dur": 384.293, + "args": { + "External id": 153966, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996215, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996215, "pid": 0, "tid": 7, "ts": 6303771962644.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915901.221, "dur": 7.570, + "args": { + "External id": 153966, "cbid": 307, "correlation": 289996215 + } + }, + { + "ph": "s", "id": 289996215, "pid": 5714, "tid": 6744, "ts": 6303771915901.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915934.181, "dur": 0.310, + "args": { + "External id": 153967, "cbid": 200, "correlation": 289996240 + } + }, + { + "ph": "f", "id": 289996240, "pid": 5714, "tid": 6744, "ts": 6303771915934.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771963100.798, "dur": 68.993, + "args": { + "External id": 153967, "device": 0, "context": 1, "stream": 7, "correlation": 289996243, "bytes": 576, "memory bandwidth (GB/s)": 0.008348673053788065 + } + }, + { + "ph": "f", "id": 289996243, "pid": 0, "tid": 7, "ts": 6303771963100.798, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771915935.521, "dur": 4.500, + "args": { + "External id": 153967, "cbid": 51, "correlation": 289996243 + } + }, + { + "ph": "s", "id": 289996243, "pid": 
5714, "tid": 6744, "ts": 6303771915935.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771963244.448, "dur": 388.068, + "args": { + "External id": 153967, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996244, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996244, "pid": 0, "tid": 7, "ts": 6303771963244.448, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915940.171, "dur": 4.840, + "args": { + "External id": 153967, "cbid": 307, "correlation": 289996244 + } + }, + { + "ph": "s", "id": 289996244, "pid": 5714, "tid": 6744, "ts": 6303771915940.171, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771915966.491, "dur": 0.270, + "args": { + "External id": 153968, "cbid": 200, "correlation": 289996269 + } + }, + { + "ph": "f", "id": 289996269, "pid": 5714, "tid": 6744, "ts": 6303771915966.491, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771963637.733, "dur": 7.680, + "args": { + "External id": 153968, "device": 0, "context": 1, "stream": 7, "correlation": 289996272, "bytes": 576, "memory bandwidth (GB/s)": 0.075 + } + }, + { + "ph": "f", "id": 289996272, "pid": 0, "tid": 7, "ts": 6303771963637.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771915967.741, "dur": 3.910, + "args": { + "External id": 153968, 
"cbid": 51, "correlation": 289996272 + } + }, + { + "ph": "s", "id": 289996272, "pid": 5714, "tid": 6744, "ts": 6303771915967.741, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771963653.637, "dur": 157.537, + "args": { + "External id": 153968, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996273, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996273, "pid": 0, "tid": 7, "ts": 6303771963653.637, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915971.801, "dur": 4.410, + "args": { + "External id": 153968, "cbid": 307, "correlation": 289996273 + } + }, + { + "ph": "s", "id": 289996273, "pid": 5714, "tid": 6744, "ts": 6303771915971.801, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771963811.846, "dur": 142.306, + "args": { + "External id": 153969, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996295, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996295, "pid": 0, "tid": 7, "ts": 6303771963811.846, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771915998.691, "dur": 5.630, + "args": { + "External id": 153969, "cbid": 211, "correlation": 289996295 + } + }, + { + "ph": "s", "id": 289996295, "pid": 5714, "tid": 6744, "ts": 6303771915998.691, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771963954.792, "dur": 141.794, + "args": { + "External id": 153970, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996318, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996318, "pid": 0, "tid": 7, "ts": 6303771963954.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916022.521, "dur": 4.480, + "args": { + "External id": 153970, "cbid": 211, "correlation": 289996318 + } + }, + { + "ph": "s", "id": 289996318, "pid": 5714, "tid": 6744, "ts": 6303771916022.521, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771964097.194, "dur": 552.646, + "args": { + "External id": 153971, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996341, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996341, "pid": 0, "tid": 7, "ts": 6303771964097.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916043.151, "dur": 4.310, + "args": { + "External id": 153971, "cbid": 211, "correlation": 289996341 + } + }, + { + "ph": "s", "id": 289996341, "pid": 5714, "tid": 6744, "ts": 6303771916043.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303771964650.512, "dur": 81.665, + "args": { + "External id": 153972, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996349, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996349, "pid": 0, "tid": 7, "ts": 6303771964650.512, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916079.631, "dur": 5.360, + "args": { + "External id": 153972, "cbid": 307, "correlation": 289996349 + } + }, + { + "ph": "s", "id": 289996349, "pid": 5714, "tid": 6744, "ts": 6303771916079.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771964732.849, "dur": 45.825, + "args": { + "External id": 153987, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996378, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996378, "pid": 0, "tid": 7, "ts": 6303771964732.849, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916230.721, "dur": 8.969, + "args": { + "External id": 153987, "cbid": 307, "correlation": 289996378 + } + }, + { + "ph": "s", "id": 289996378, "pid": 5714, "tid": 6744, "ts": 6303771916230.721, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771964779.314, "dur": 3.712, + "args": { + "External id": 153988, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996386, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289996386, "pid": 0, "tid": 7, "ts": 6303771964779.314, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916264.110, "dur": 5.091, + "args": { + "External id": 153988, "cbid": 307, "correlation": 289996386 + } + }, + { + "ph": "s", "id": 289996386, "pid": 5714, "tid": 6744, "ts": 6303771916264.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771964783.698, "dur": 49.728, + "args": { + "External id": 153989, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996397, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996397, "pid": 0, "tid": 7, "ts": 6303771964783.698, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916295.110, "dur": 12.560, + "args": { + "External id": 153989, "cbid": 307, "correlation": 289996397 + } + }, + { + "ph": "s", "id": 289996397, "pid": 5714, "tid": 6744, "ts": 6303771916295.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771964834.098, "dur": 48.001, + "args": { + "External id": 153990, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996402, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996402, "pid": 0, "tid": 7, "ts": 6303771964834.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916342.990, "dur": 6.790, + "args": { + "External id": 153990, "cbid": 211, "correlation": 289996402 + } + }, + { + "ph": "s", "id": 289996402, "pid": 5714, "tid": 6744, "ts": 6303771916342.990, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771916509.260, "dur": 2.640, + "args": { + "External id": 153996, "cbid": 147, "correlation": 289996419 + } + }, + { + "ph": "s", "id": 289996419, "pid": 5714, "tid": 6744, "ts": 6303771916509.260, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771916606.960, "dur": 2.480, + "args": { + "External id": 154004, "cbid": 138, "correlation": 289996434 + } + }, + { + "ph": "f", "id": 289996434, "pid": 5714, "tid": 6744, "ts": 6303771916606.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771964886.707, "dur": 5.184, + "args": { + "External id": 154008, "device": 0, "context": 1, "stream": 7, "correlation": 289996445, "bytes": 28112, "memory bandwidth (GB/s)": 5.422839506172839 + } + }, + { + "ph": "f", "id": 289996445, "pid": 0, "tid": 7, "ts": 6303771964886.707, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771916629.200, "dur": 11.430, + "args": { + "External id": 154008, "cbid": 41, "correlation": 289996445 + } + }, + { + "ph": "s", "id": 289996445, "pid": 5714, "tid": 6744, "ts": 6303771916629.200, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916644.780, "dur": 1.690, + "args": { + "External id": 154003, "cbid": 135, "correlation": 289996449 + } + }, + { + "ph": "f", "id": 289996449, "pid": 5714, "tid": 6744, "ts": 6303771916644.780, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303771964894.387, "dur": 37.537, + "args": { + "External id": 154003, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996453, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996453, "pid": 0, "tid": 7, "ts": 6303771964894.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771916649.360, "dur": 9.780, + "args": { + "External id": 154003, "cbid": 211, "correlation": 289996453 + } + }, + { + "ph": "s", "id": 289996453, "pid": 5714, "tid": 6744, "ts": 6303771916649.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916696.880, "dur": 0.929, + "args": { + "External id": 153996, "cbid": 135, "correlation": 289996464 + } + }, + { + "ph": "f", "id": 289996464, "pid": 5714, "tid": 6744, "ts": 6303771916696.880, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771916699.740, "dur": 1.220, + "args": { + "External id": 153996, "cbid": 147, "correlation": 289996468 + } + }, + { + "ph": "s", "id": 289996468, "pid": 5714, "tid": 6744, "ts": 
6303771916699.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771916765.569, "dur": 1.020, + "args": { + "External id": 154012, "cbid": 317, "correlation": 289996488 + } + }, + { + "ph": "f", "id": 289996488, "pid": 5714, "tid": 6744, "ts": 6303771916765.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916768.469, "dur": 1.190, + "args": { + "External id": 154012, "cbid": 135, "correlation": 289996490 + } + }, + { + "ph": "f", "id": 289996490, "pid": 5714, "tid": 6744, "ts": 6303771916768.469, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771916770.919, "dur": 0.850, + "args": { + "External id": 154012, "cbid": 147, "correlation": 289996494 + } + }, + { + "ph": "s", "id": 289996494, "pid": 5714, "tid": 6744, "ts": 6303771916770.919, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771916785.569, "dur": 0.690, + "args": { + "External id": 154012, "cbid": 409, "correlation": 289996497 + } + }, + { + "ph": "f", "id": 289996497, "pid": 5714, "tid": 6744, "ts": 6303771916785.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916790.069, "dur": 0.760, + "args": { + "External id": 154012, "cbid": 135, "correlation": 289996500 + } + }, + { + "ph": "f", "id": 289996500, "pid": 5714, "tid": 6744, "ts": 6303771916790.069, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771916791.009, "dur": 1.010, + "args": { + "External id": 154012, 
"cbid": 147, "correlation": 289996501 + } + }, + { + "ph": "s", "id": 289996501, "pid": 5714, "tid": 6744, "ts": 6303771916791.009, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771969139.556, "dur": 8915.112, + "args": { + "External id": 154012, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289996503, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289996503, "pid": 0, "tid": 20, "ts": 6303771969139.556, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771916793.049, "dur": 9.480, + "args": { + "External id": 154012, "cbid": 430, "correlation": 289996503 + } + }, + { + "ph": "s", "id": 289996503, "pid": 5714, "tid": 6744, "ts": 6303771916793.049, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916803.569, "dur": 0.390, + "args": { + "External id": 154012, "cbid": 135, "correlation": 289996505 + } + }, + { + "ph": "f", "id": 289996505, "pid": 5714, "tid": 6744, "ts": 6303771916803.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771916804.099, "dur": 0.490, + "args": { + "External id": 154012, "cbid": 147, "correlation": 289996506 + } + }, + 
{ + "ph": "s", "id": 289996506, "pid": 5714, "tid": 6744, "ts": 6303771916804.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916805.989, "dur": 0.690, + "args": { + "External id": 154012, "cbid": 135, "correlation": 289996509 + } + }, + { + "ph": "f", "id": 289996509, "pid": 5714, "tid": 6744, "ts": 6303771916805.989, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916813.879, "dur": 0.410, + "args": { + "External id": 154012, "cbid": 135, "correlation": 289996516 + } + }, + { + "ph": "f", "id": 289996516, "pid": 5714, "tid": 6744, "ts": 6303771916813.879, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771916838.799, "dur": 0.900, + "args": { + "External id": 154014, "cbid": 147, "correlation": 289996521 + } + }, + { + "ph": "s", "id": 289996521, "pid": 5714, "tid": 6744, "ts": 6303771916838.799, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771916856.229, "dur": 0.790, + "args": { + "External id": 153996, "cbid": 135, "correlation": 289996536 + } + }, + { + "ph": "f", "id": 289996536, "pid": 5714, "tid": 6744, "ts": 6303771916856.229, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917035.299, "dur": 1.910, + "args": { + "External id": 153996, "cbid": 135, "correlation": 289996549 + } + }, + { + "ph": "f", "id": 289996549, "pid": 5714, "tid": 6744, "ts": 6303771917035.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917137.228, "dur": 
3.071, + "args": { + "External id": 154024, "cbid": 147, "correlation": 289996560 + } + }, + { + "ph": "s", "id": 289996560, "pid": 5714, "tid": 6744, "ts": 6303771917137.228, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771917240.378, "dur": 1.090, + "args": { + "External id": 154038, "cbid": 317, "correlation": 289996601 + } + }, + { + "ph": "f", "id": 289996601, "pid": 5714, "tid": 6744, "ts": 6303771917240.378, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771917248.268, "dur": 2.160, + "args": { + "External id": 154039, "cbid": 138, "correlation": 289996604 + } + }, + { + "ph": "f", "id": 289996604, "pid": 5714, "tid": 6744, "ts": 6303771917248.268, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771969138.948, "dur": 2.144, + "args": { + "External id": 154043, "device": 0, "context": 1, "stream": 7, "correlation": 289996615, "bytes": 7224, "memory bandwidth (GB/s)": 3.3694029850746268 + } + }, + { + "ph": "f", "id": 289996615, "pid": 0, "tid": 7, "ts": 6303771969138.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771917270.508, "dur": 11.420, + "args": { + "External id": 154043, "cbid": 41, "correlation": 289996615 + } + }, + { + "ph": "s", "id": 289996615, "pid": 5714, "tid": 6744, "ts": 6303771917270.508, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917286.028, "dur": 1.840, + "args": { + "External id": 154038, "cbid": 135, "correlation": 289996619 + } + }, + { + "ph": "f", "id": 289996619, "pid": 5714, "tid": 6744, "ts": 
6303771917286.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771969143.332, "dur": 12.320, + "args": { + "External id": 154038, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996623, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996623, "pid": 0, "tid": 7, "ts": 6303771969143.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771917290.138, "dur": 18.990, + "args": { + "External id": 154038, "cbid": 211, "correlation": 289996623 + } + }, + { + "ph": "s", "id": 289996623, "pid": 5714, "tid": 6744, "ts": 6303771917290.138, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917395.068, "dur": 1.320, + "args": { + "External id": 154024, "cbid": 135, "correlation": 289996634 + } + }, + { + "ph": "f", "id": 289996634, "pid": 5714, "tid": 6744, "ts": 6303771917395.068, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917399.398, "dur": 1.140, + "args": { + "External id": 154024, "cbid": 147, "correlation": 289996638 + } + }, + { + "ph": "s", "id": 289996638, "pid": 5714, "tid": 6744, "ts": 6303771917399.398, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917402.178, "dur": 0.770, + "args": { + "External id": 154024, "cbid": 147, "correlation": 289996642 + } + }, + { + "ph": 
"s", "id": 289996642, "pid": 5714, "tid": 6744, "ts": 6303771917402.178, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771969189.381, "dur": 28.960, + "args": { + "External id": 154057, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289996666, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289996666, "pid": 0, "tid": 17, "ts": 6303771969189.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771917540.338, "dur": 11.000, + "args": { + "External id": 154057, "cbid": 211, "correlation": 289996666 + } + }, + { + "ph": "s", "id": 289996666, "pid": 5714, "tid": 6744, "ts": 6303771917540.338, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771969228.134, "dur": 10.432, + "args": { + "External id": 154073, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289996679, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289996679, "pid": 0, "tid": 17, "ts": 6303771969228.134, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771917646.827, "dur": 9.031, + "args": { + "External id": 154073, "cbid": 211, "correlation": 289996679 + } + }, + { + "ph": "s", "id": 289996679, "pid": 5714, "tid": 6744, "ts": 6303771917646.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917678.617, "dur": 1.180, + "args": { + "External id": 154024, "cbid": 135, "correlation": 289996689 + } + }, + { + "ph": "f", "id": 289996689, "pid": 5714, "tid": 6744, "ts": 6303771917678.617, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917681.577, "dur": 1.120, + "args": { + "External id": 154024, "cbid": 147, "correlation": 289996693 + } + }, + { + "ph": "s", "id": 289996693, "pid": 5714, "tid": 6744, "ts": 6303771917681.577, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771917729.667, "dur": 0.820, + "args": { + "External id": 154075, "cbid": 317, "correlation": 289996706 + } + }, + { + "ph": "f", "id": 289996706, "pid": 5714, "tid": 6744, "ts": 6303771917729.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917732.387, "dur": 1.070, + "args": { + "External id": 154075, "cbid": 135, "correlation": 289996708 + } + }, + { + "ph": "f", "id": 289996708, "pid": 5714, "tid": 6744, "ts": 6303771917732.387, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771917734.847, "dur": 1.140, + "args": { + "External id": 154075, "cbid": 147, "correlation": 289996712 + } + }, + { + "ph": "s", "id": 289996712, "pid": 5714, "tid": 6744, "ts": 6303771917734.847, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771917749.467, "dur": 0.650, + "args": { + "External id": 154075, "cbid": 409, "correlation": 289996715 + } + }, + { + "ph": "f", "id": 289996715, "pid": 5714, "tid": 6744, "ts": 6303771917749.467, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917753.937, "dur": 0.710, + "args": { + "External id": 154075, "cbid": 135, "correlation": 289996718 + } + }, + { + "ph": "f", "id": 289996718, "pid": 5714, "tid": 6744, "ts": 6303771917753.937, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917754.827, "dur": 0.810, + "args": { + "External id": 154075, "cbid": 147, "correlation": 289996719 + } + }, + { + "ph": "s", "id": 289996719, "pid": 5714, "tid": 6744, "ts": 6303771917754.827, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771978057.036, "dur": 5586.337, + "args": { + "External id": 154075, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289996721, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289996721, "pid": 0, "tid": 20, "ts": 6303771978057.036, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771917756.617, "dur": 9.260, + "args": { + "External id": 154075, "cbid": 430, "correlation": 289996721 + } + }, + { + "ph": "s", "id": 289996721, "pid": 5714, "tid": 6744, "ts": 6303771917756.617, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917766.867, "dur": 0.380, + "args": { + "External id": 154075, "cbid": 135, "correlation": 289996723 + } + }, + { + "ph": "f", "id": 289996723, "pid": 5714, "tid": 6744, "ts": 6303771917766.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917767.367, "dur": 0.480, + "args": { + "External id": 154075, "cbid": 147, "correlation": 289996724 + } + }, + { + "ph": "s", "id": 289996724, "pid": 5714, "tid": 6744, "ts": 6303771917767.367, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917769.317, "dur": 0.620, + "args": { + "External id": 154075, "cbid": 135, "correlation": 289996727 + } + }, + { + "ph": "f", "id": 289996727, "pid": 5714, "tid": 6744, "ts": 6303771917769.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917777.237, "dur": 0.410, + "args": { + "External id": 
154075, "cbid": 135, "correlation": 289996734 + } + }, + { + "ph": "f", "id": 289996734, "pid": 5714, "tid": 6744, "ts": 6303771917777.237, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771917801.357, "dur": 0.950, + "args": { + "External id": 154077, "cbid": 147, "correlation": 289996739 + } + }, + { + "ph": "s", "id": 289996739, "pid": 5714, "tid": 6744, "ts": 6303771917801.357, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771917818.017, "dur": 0.850, + "args": { + "External id": 154024, "cbid": 135, "correlation": 289996754 + } + }, + { + "ph": "f", "id": 289996754, "pid": 5714, "tid": 6744, "ts": 6303771917818.017, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771969156.228, "dur": 2438.717, + "args": { + "External id": 154079, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996779, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996779, "pid": 0, "tid": 7, "ts": 6303771969156.228, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771917954.597, "dur": 10.650, + "args": { + "External id": 154079, "cbid": 211, "correlation": 289996779 + } + }, + { + "ph": "s", "id": 289996779, "pid": 5714, "tid": 6744, "ts": 6303771917954.597, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303771971639.266, "dur": 603.462, + "args": { + "External id": 154080, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996802, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289996802, "pid": 0, "tid": 7, "ts": 6303771971639.266, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918011.197, "dur": 6.030, + "args": { + "External id": 154080, "cbid": 307, "correlation": 289996802 + } + }, + { + "ph": "s", "id": 289996802, "pid": 5714, "tid": 6744, "ts": 6303771918011.197, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918050.486, "dur": 0.551, + "args": { + "External id": 154081, "cbid": 200, "correlation": 289996825 + } + }, + { + "ph": "f", "id": 289996825, "pid": 5714, "tid": 6744, "ts": 6303771918050.486, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771972310.217, "dur": 51.841, + "args": { + "External id": 154081, "device": 0, "context": 1, "stream": 7, 
"correlation": 289996828, "bytes": 1536, "memory bandwidth (GB/s)": 0.02962905808144133 + } + }, + { + "ph": "f", "id": 289996828, "pid": 0, "tid": 7, "ts": 6303771972310.217, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771918052.657, "dur": 6.829, + "args": { + "External id": 154081, "cbid": 51, "correlation": 289996828 + } + }, + { + "ph": "s", "id": 289996828, "pid": 5714, "tid": 6744, "ts": 6303771918052.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771972421.578, "dur": 654.088, + "args": { + "External id": 154081, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996829, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996829, "pid": 0, "tid": 7, "ts": 6303771972421.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918059.726, "dur": 5.811, + "args": { + "External id": 154081, "cbid": 307, "correlation": 289996829 + } + }, + { + "ph": "s", "id": 289996829, "pid": 5714, "tid": 6744, "ts": 6303771918059.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918092.697, "dur": 0.289, + "args": { + "External id": 154082, "cbid": 200, "correlation": 289996854 + } + }, + { + "ph": "f", "id": 289996854, "pid": 5714, "tid": 6744, "ts": 6303771918092.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771973076.978, "dur": 1.184, + "args": { + "External id": 154082, "device": 0, "context": 1, "stream": 7, "correlation": 289996857, "bytes": 1536, "memory bandwidth (GB/s)": 1.2972972972972974 + } + }, + { + "ph": "f", "id": 289996857, "pid": 0, "tid": 7, "ts": 6303771973076.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771918093.997, "dur": 4.369, + "args": { + "External id": 154082, "cbid": 51, "correlation": 289996857 + } + }, + { + "ph": "s", "id": 289996857, "pid": 5714, "tid": 6744, "ts": 6303771918093.997, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771973079.698, "dur": 352.612, + "args": { + "External id": 154082, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996858, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996858, "pid": 0, "tid": 7, "ts": 6303771973079.698, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918098.497, "dur": 4.700, + "args": { + "External id": 154082, "cbid": 307, "correlation": 289996858 + } + }, + { + "ph": "s", "id": 289996858, "pid": 5714, "tid": 6744, "ts": 6303771918098.497, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918124.616, "dur": 0.270, + "args": { + "External id": 154083, "cbid": 200, "correlation": 289996883 + } + }, + { + "ph": "f", "id": 289996883, "pid": 5714, "tid": 6744, "ts": 6303771918124.616, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771973433.046, "dur": 358.597, + "args": { + "External id": 154083, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996886, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996886, "pid": 0, "tid": 7, "ts": 6303771973433.046, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918125.956, "dur": 4.920, + "args": { + "External id": 154083, "cbid": 307, "correlation": 289996886 + } + }, + { + "ph": "s", "id": 289996886, "pid": 5714, "tid": 6744, "ts": 6303771918125.956, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918152.216, "dur": 0.220, + "args": { + "External id": 154084, "cbid": 200, "correlation": 289996911 + } + }, + { + "ph": "f", "id": 289996911, "pid": 5714, "tid": 6744, "ts": 6303771918152.216, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771973793.051, "dur": 1.248, + "args": { + "External id": 154084, "device": 0, "context": 1, "stream": 7, "correlation": 289996914, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 289996914, "pid": 0, "tid": 7, "ts": 6303771973793.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771918153.416, "dur": 4.380, + "args": { + "External id": 154084, "cbid": 51, "correlation": 289996914 + } + }, + { + "ph": "s", "id": 289996914, "pid": 5714, "tid": 6744, "ts": 6303771918153.416, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771973795.931, "dur": 358.212, + "args": { + "External id": 154084, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996915, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996915, "pid": 0, "tid": 7, "ts": 6303771973795.931, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918157.926, "dur": 4.680, + "args": { + "External id": 154084, "cbid": 307, "correlation": 289996915 + } + }, + { + "ph": "s", "id": 289996915, "pid": 5714, "tid": 6744, "ts": 6303771918157.926, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918183.996, "dur": 0.290, + "args": { + "External id": 154085, "cbid": 200, "correlation": 289996940 + } + }, + { + "ph": "f", "id": 289996940, "pid": 5714, "tid": 6744, "ts": 6303771918183.996, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771974154.783, "dur": 359.908, + "args": { + "External id": 154085, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996943, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289996943, "pid": 0, "tid": 7, "ts": 6303771974154.783, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918185.266, "dur": 4.850, + "args": { + "External id": 154085, "cbid": 307, "correlation": 289996943 + } + }, + { + "ph": "s", "id": 289996943, "pid": 5714, "tid": 6744, "ts": 6303771918185.266, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771974515.395, "dur": 91.393, + "args": { + "External id": 154086, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996956, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996956, "pid": 0, "tid": 7, "ts": 6303771974515.395, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918225.526, "dur": 5.710, + "args": { + "External id": 154086, "cbid": 307, "correlation": 289996956 + } + }, + { + "ph": "s", "id": 289996956, "pid": 5714, "tid": 6744, "ts": 6303771918225.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303771974607.396, "dur": 3.936, + "args": { + "External id": 154087, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996964, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289996964, "pid": 0, "tid": 7, "ts": 6303771974607.396, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918254.726, "dur": 5.320, + "args": { + "External id": 154087, "cbid": 307, "correlation": 289996964 + } + }, + { + "ph": "s", "id": 289996964, "pid": 5714, "tid": 6744, "ts": 6303771918254.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303771974612.580, "dur": 113.217, + "args": { + "External id": 154088, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289996972, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289996972, "pid": 0, "tid": 7, "ts": 6303771974612.580, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918284.436, "dur": 4.930, + "args": { + "External id": 154088, "cbid": 307, "correlation": 289996972 + } + }, + { + "ph": "s", "id": 289996972, "pid": 5714, "tid": 6744, "ts": 6303771918284.436, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918462.946, "dur": 0.560, + "args": { + "External id": 154107, "cbid": 200, "correlation": 289997018 + } + }, + { + "ph": "f", "id": 289997018, "pid": 5714, "tid": 6744, "ts": 6303771918462.946, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771974727.013, "dur": 1.568, + "args": { + "External id": 154107, "device": 0, "context": 1, "stream": 7, 
"correlation": 289997021, "bytes": 576, "memory bandwidth (GB/s)": 0.3673469387755102 + } + }, + { + "ph": "f", "id": 289997021, "pid": 0, "tid": 7, "ts": 6303771974727.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771918465.096, "dur": 7.129, + "args": { + "External id": 154107, "cbid": 51, "correlation": 289997021 + } + }, + { + "ph": "s", "id": 289997021, "pid": 5714, "tid": 6744, "ts": 6303771918465.096, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771974729.989, "dur": 143.170, + "args": { + "External id": 154107, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997022, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997022, "pid": 0, "tid": 7, "ts": 6303771974729.989, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918472.445, "dur": 8.260, + "args": { + "External id": 154107, "cbid": 307, "correlation": 289997022 + } + }, + { + "ph": "s", "id": 289997022, "pid": 5714, "tid": 6744, "ts": 6303771918472.445, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771974873.895, "dur": 141.186, + "args": { + "External id": 154108, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997044, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997044, "pid": 0, "tid": 7, "ts": 6303771974873.895, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918507.096, "dur": 5.669, + "args": { + "External id": 154108, "cbid": 211, "correlation": 289997044 + } + }, + { + "ph": "s", "id": 289997044, "pid": 5714, "tid": 6744, "ts": 6303771918507.096, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918582.535, "dur": 0.430, + "args": { + "External id": 154109, "cbid": 200, "correlation": 289997062 + } + }, + { + "ph": "f", "id": 289997062, "pid": 5714, "tid": 6744, "ts": 6303771918582.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918583.075, "dur": 0.190, + "args": { + "External id": 154109, "cbid": 200, "correlation": 289997063 + } + }, + { + "ph": "f", "id": 289997063, "pid": 5714, "tid": 6744, "ts": 6303771918583.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918601.285, "dur": 0.230, + "args": { + "External id": 154109, "cbid": 200, "correlation": 289997081 + } + }, + { + "ph": "f", "id": 289997081, "pid": 5714, "tid": 6744, "ts": 6303771918601.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771975015.785, "dur": 92.160, + "args": { + "External id": 154109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997082, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997082, "pid": 0, "tid": 7, "ts": 6303771975015.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918602.685, "dur": 8.980, + "args": { + "External id": 154109, "cbid": 211, "correlation": 289997082 + } + }, + { + "ph": "s", "id": 289997082, "pid": 5714, "tid": 6744, "ts": 6303771918602.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771918612.455, "dur": 0.920, + "args": { + "External id": 154109, "cbid": 273, "correlation": 289997084 + } + }, + { + "ph": "f", "id": 289997084, "pid": 5714, "tid": 6744, "ts": 6303771918612.455, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771975108.649, "dur": 1445.073, + "args": { + "External id": 154109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997085, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289997085, "pid": 0, "tid": 7, "ts": 6303771975108.649, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918613.705, "dur": 4.180, + "args": { + "External id": 154109, "cbid": 211, "correlation": 289997085 + } + }, + { + "ph": "s", "id": 289997085, "pid": 5714, "tid": 6744, "ts": 6303771918613.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771976554.394, "dur": 73.569, + "args": { + "External id": 154109, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997087, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289997087, "pid": 0, "tid": 7, "ts": 6303771976554.394, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918618.425, "dur": 3.630, + "args": { + "External id": 154109, "cbid": 211, "correlation": 289997087 + } + }, + { + "ph": "s", "id": 289997087, "pid": 5714, "tid": 6744, "ts": 6303771918618.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771976628.571, "dur": 49.601, + "args": { + "External id": 154120, "device": 0, "context": 1, "stream": 7, "correlation": 289997109, "bytes": 25165824, "memory bandwidth (GB/s)": 507.3652547327675 + } + }, + { + "ph": "f", "id": 289997109, "pid": 0, "tid": 7, "ts": 6303771976628.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771918752.515, "dur": 16.630, + "args": { + "External id": 154120, "cbid": 41, "correlation": 289997109 + } + }, + { + "ph": "s", "id": 289997109, "pid": 5714, "tid": 6744, "ts": 6303771918752.515, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771976678.780, "dur": 34.464, + "args": { + "External id": 154117, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997127, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997127, "pid": 0, "tid": 7, "ts": 6303771976678.780, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771918864.345, "dur": 8.140, + "args": { + "External id": 154117, "cbid": 307, "correlation": 289997127 + } + }, + { + "ph": "s", "id": 289997127, "pid": 5714, "tid": 6744, "ts": 6303771918864.345, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771976713.948, "dur": 51.393, + "args": { + "External id": 154127, "device": 0, "context": 1, "stream": 7, "correlation": 289997142, "bytes": 25165824, "memory bandwidth (GB/s)": 489.6741579592551 + } + }, + { + "ph": "f", "id": 289997142, "pid": 0, "tid": 7, "ts": 6303771976713.948, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771918929.555, "dur": 13.100, + "args": { + "External id": 154127, "cbid": 41, "correlation": 289997142 + } + }, + { + "ph": "s", "id": 289997142, "pid": 5714, "tid": 6744, "ts": 6303771918929.555, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771976766.045, "dur": 184.066, + "args": { + "External id": 154124, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997160, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997160, "pid": 0, "tid": 7, "ts": 6303771976766.045, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919026.304, "dur": 7.390, + "args": { + "External id": 154124, "cbid": 307, "correlation": 289997160 + } + }, + { + "ph": "s", "id": 289997160, "pid": 5714, "tid": 6744, "ts": 6303771919026.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771919143.404, "dur": 0.530, + "args": { + "External id": 154132, "cbid": 200, "correlation": 289997190 + } + }, + { + "ph": "f", "id": 289997190, "pid": 5714, "tid": 6744, "ts": 6303771919143.404, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771976979.295, "dur": 23.680, + "args": { + "External id": 154132, "device": 0, "context": 1, "stream": 7, "correlation": 289997193, "bytes": 576, "memory bandwidth (GB/s)": 0.024324324324324326 + } + }, + { + "ph": "f", "id": 289997193, "pid": 0, "tid": 7, "ts": 6303771976979.295, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771919145.614, "dur": 7.160, + "args": { + "External id": 154132, "cbid": 51, "correlation": 289997193 + } + }, + { + "ph": "s", "id": 289997193, "pid": 5714, "tid": 6744, "ts": 6303771919145.614, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771977075.809, "dur": 538.565, + "args": { + "External id": 154132, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997194, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997194, "pid": 0, "tid": 7, "ts": 6303771977075.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919152.994, "dur": 7.400, + "args": { + "External id": 154132, "cbid": 307, "correlation": 289997194 + } + }, + { + "ph": "s", "id": 289997194, "pid": 5714, "tid": 6744, "ts": 6303771919152.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771919185.194, "dur": 0.300, + "args": { + "External id": 154133, "cbid": 200, "correlation": 289997219 + } + }, + { + "ph": "f", "id": 289997219, "pid": 5714, "tid": 6744, "ts": 6303771919185.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771977645.319, "dur": 51.841, + "args": { + "External id": 154133, "device": 0, "context": 1, "stream": 7, "correlation": 289997222, "bytes": 576, "memory bandwidth (GB/s)": 0.011110896780540499 + } + }, + { + "ph": "f", "id": 289997222, "pid": 0, "tid": 7, "ts": 6303771977645.319, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771919186.484, "dur": 4.510, + "args": { + "External id": 154133, "cbid": 51, "correlation": 289997222 + } + }, + { + "ph": "s", "id": 
289997222, "pid": 5714, "tid": 6744, "ts": 6303771919186.484, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771977733.960, "dur": 154.082, + "args": { + "External id": 154133, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997223, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997223, "pid": 0, "tid": 7, "ts": 6303771977733.960, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919191.134, "dur": 4.980, + "args": { + "External id": 154133, "cbid": 307, "correlation": 289997223 + } + }, + { + "ph": "s", "id": 289997223, "pid": 5714, "tid": 6744, "ts": 6303771919191.134, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771919217.864, "dur": 0.280, + "args": { + "External id": 154134, "cbid": 200, "correlation": 289997248 + } + }, + { + "ph": "f", "id": 289997248, "pid": 5714, "tid": 6744, "ts": 6303771919217.864, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771977895.306, "dur": 6.528, + "args": { + "External id": 154134, "device": 0, "context": 1, "stream": 7, "correlation": 289997251, "bytes": 576, "memory bandwidth (GB/s)": 0.08823529411764706 + } + }, + { + "ph": "f", "id": 289997251, "pid": 0, "tid": 7, "ts": 6303771977895.306, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771919219.074, "dur": 3.880, + 
"args": { + "External id": 154134, "cbid": 51, "correlation": 289997251 + } + }, + { + "ph": "s", "id": 289997251, "pid": 5714, "tid": 6744, "ts": 6303771919219.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771977908.714, "dur": 151.874, + "args": { + "External id": 154134, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997252, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997252, "pid": 0, "tid": 7, "ts": 6303771977908.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919223.084, "dur": 4.830, + "args": { + "External id": 154134, "cbid": 307, "correlation": 289997252 + } + }, + { + "ph": "s", "id": 289997252, "pid": 5714, "tid": 6744, "ts": 6303771919223.084, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771978061.260, "dur": 141.954, + "args": { + "External id": 154135, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997274, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997274, "pid": 0, "tid": 7, "ts": 6303771978061.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919249.894, "dur": 5.490, + "args": { + "External id": 154135, "cbid": 211, "correlation": 289997274 + } + }, + { + "ph": "s", "id": 289997274, "pid": 5714, "tid": 6744, "ts": 6303771919249.894, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771978203.854, "dur": 562.438, + "args": { + "External id": 154136, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997297, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997297, "pid": 0, "tid": 7, "ts": 6303771978203.854, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919273.074, "dur": 4.790, + "args": { + "External id": 154136, "cbid": 211, "correlation": 289997297 + } + }, + { + "ph": "s", "id": 289997297, "pid": 5714, "tid": 6744, "ts": 6303771919273.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771978766.900, "dur": 142.786, + "args": { + "External id": 154137, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997320, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997320, "pid": 0, "tid": 7, "ts": 6303771978766.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919294.034, "dur": 12.380, + "args": { + "External id": 154137, "cbid": 211, "correlation": 289997320 + } + }, + { + "ph": "s", "id": 289997320, "pid": 5714, "tid": 6744, "ts": 6303771919294.034, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303771978910.358, "dur": 82.273, + "args": { + "External id": 154138, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997328, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997328, "pid": 0, "tid": 7, "ts": 6303771978910.358, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919339.634, "dur": 5.590, + "args": { + "External id": 154138, "cbid": 307, "correlation": 289997328 + } + }, + { + "ph": "s", "id": 289997328, "pid": 5714, "tid": 6744, "ts": 6303771919339.634, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771978993.239, "dur": 46.464, + "args": { + "External id": 154153, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997357, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997357, "pid": 0, "tid": 7, "ts": 6303771978993.239, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919494.443, "dur": 8.900, + "args": { + "External id": 154153, "cbid": 307, "correlation": 289997357 + } + }, + { + "ph": "s", "id": 289997357, "pid": 5714, "tid": 6744, "ts": 6303771919494.443, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771979040.439, "dur": 4.128, + "args": { + "External id": 154154, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997365, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289997365, "pid": 0, "tid": 7, "ts": 6303771979040.439, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919526.503, "dur": 5.130, + "args": { + "External id": 154154, "cbid": 307, "correlation": 289997365 + } + }, + { + "ph": "s", "id": 289997365, "pid": 5714, "tid": 6744, "ts": 6303771919526.503, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771979045.207, "dur": 49.313, + "args": { + "External id": 154155, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997376, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997376, "pid": 0, "tid": 7, "ts": 6303771979045.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919556.603, "dur": 5.380, + "args": { + "External id": 154155, "cbid": 307, "correlation": 289997376 + } + }, + { + "ph": "s", "id": 289997376, "pid": 5714, "tid": 6744, "ts": 6303771919556.603, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771979095.160, "dur": 45.728, + "args": { + "External id": 154156, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997381, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997381, "pid": 0, "tid": 7, "ts": 6303771979095.160, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919594.863, "dur": 6.670, + "args": { + "External id": 154156, "cbid": 211, "correlation": 289997381 + } + }, + { + "ph": "s", "id": 289997381, "pid": 5714, "tid": 6744, "ts": 6303771919594.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771919760.183, "dur": 2.750, + "args": { + "External id": 154162, "cbid": 147, "correlation": 289997398 + } + }, + { + "ph": "s", "id": 289997398, "pid": 5714, "tid": 6744, "ts": 6303771919760.183, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771919858.042, "dur": 2.340, + "args": { + "External id": 154170, "cbid": 138, "correlation": 289997413 + } + }, + { + "ph": "f", "id": 289997413, "pid": 5714, "tid": 6744, "ts": 6303771919858.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771979150.296, "dur": 3.232, + "args": { + "External id": 154174, "device": 0, "context": 1, "stream": 7, "correlation": 289997424, "bytes": 28112, "memory bandwidth (GB/s)": 8.698019801980198 + } + }, + { + "ph": "f", "id": 289997424, "pid": 0, "tid": 7, "ts": 6303771979150.296, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771919880.153, "dur": 11.420, + "args": { + "External id": 154174, "cbid": 41, "correlation": 289997424 + } + }, + { + "ph": "s", "id": 289997424, "pid": 5714, "tid": 6744, "ts": 6303771919880.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771919895.533, "dur": 1.749, + "args": { + "External id": 154169, "cbid": 135, "correlation": 289997428 + } + }, + { + "ph": "f", "id": 289997428, "pid": 5714, "tid": 6744, "ts": 6303771919895.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303771979156.152, "dur": 39.265, + "args": { + "External id": 154169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997432, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997432, "pid": 0, "tid": 7, "ts": 6303771979156.152, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771919900.162, "dur": 9.820, + "args": { + "External id": 154169, "cbid": 211, "correlation": 289997432 + } + }, + { + "ph": "s", "id": 289997432, "pid": 5714, "tid": 6744, "ts": 6303771919900.162, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771919947.592, "dur": 1.100, + "args": { + "External id": 154162, "cbid": 135, "correlation": 289997443 + } + }, + { + "ph": "f", "id": 289997443, "pid": 5714, "tid": 6744, "ts": 6303771919947.592, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771919950.552, "dur": 1.200, + "args": { + "External id": 154162, "cbid": 147, "correlation": 289997447 + } + }, + { + "ph": "s", "id": 289997447, "pid": 5714, "tid": 6744, "ts": 
6303771919950.552, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771920016.832, "dur": 0.960, + "args": { + "External id": 154178, "cbid": 317, "correlation": 289997467 + } + }, + { + "ph": "f", "id": 289997467, "pid": 5714, "tid": 6744, "ts": 6303771920016.832, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920019.532, "dur": 1.400, + "args": { + "External id": 154178, "cbid": 135, "correlation": 289997469 + } + }, + { + "ph": "f", "id": 289997469, "pid": 5714, "tid": 6744, "ts": 6303771920019.532, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920022.192, "dur": 1.040, + "args": { + "External id": 154178, "cbid": 147, "correlation": 289997473 + } + }, + { + "ph": "s", "id": 289997473, "pid": 5714, "tid": 6744, "ts": 6303771920022.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771920036.772, "dur": 0.720, + "args": { + "External id": 154178, "cbid": 409, "correlation": 289997476 + } + }, + { + "ph": "f", "id": 289997476, "pid": 5714, "tid": 6744, "ts": 6303771920036.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920041.342, "dur": 0.750, + "args": { + "External id": 154178, "cbid": 135, "correlation": 289997479 + } + }, + { + "ph": "f", "id": 289997479, "pid": 5714, "tid": 6744, "ts": 6303771920041.342, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920042.262, "dur": 0.800, + "args": { + "External id": 154178, 
"cbid": 147, "correlation": 289997480 + } + }, + { + "ph": "s", "id": 289997480, "pid": 5714, "tid": 6744, "ts": 6303771920042.262, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771983646.093, "dur": 9172.842, + "args": { + "External id": 154178, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289997482, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289997482, "pid": 0, "tid": 20, "ts": 6303771983646.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771920044.062, "dur": 9.630, + "args": { + "External id": 154178, "cbid": 430, "correlation": 289997482 + } + }, + { + "ph": "s", "id": 289997482, "pid": 5714, "tid": 6744, "ts": 6303771920044.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920054.672, "dur": 0.390, + "args": { + "External id": 154178, "cbid": 135, "correlation": 289997484 + } + }, + { + "ph": "f", "id": 289997484, "pid": 5714, "tid": 6744, "ts": 6303771920054.672, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920055.192, "dur": 0.580, + "args": { + "External id": 154178, "cbid": 147, "correlation": 289997485 + } + }, + 
{ + "ph": "s", "id": 289997485, "pid": 5714, "tid": 6744, "ts": 6303771920055.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920057.172, "dur": 0.820, + "args": { + "External id": 154178, "cbid": 135, "correlation": 289997488 + } + }, + { + "ph": "f", "id": 289997488, "pid": 5714, "tid": 6744, "ts": 6303771920057.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920065.212, "dur": 0.470, + "args": { + "External id": 154178, "cbid": 135, "correlation": 289997495 + } + }, + { + "ph": "f", "id": 289997495, "pid": 5714, "tid": 6744, "ts": 6303771920065.212, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920089.862, "dur": 1.050, + "args": { + "External id": 154180, "cbid": 147, "correlation": 289997500 + } + }, + { + "ph": "s", "id": 289997500, "pid": 5714, "tid": 6744, "ts": 6303771920089.862, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920106.792, "dur": 0.840, + "args": { + "External id": 154162, "cbid": 135, "correlation": 289997515 + } + }, + { + "ph": "f", "id": 289997515, "pid": 5714, "tid": 6744, "ts": 6303771920106.792, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920287.021, "dur": 1.160, + "args": { + "External id": 154162, "cbid": 135, "correlation": 289997528 + } + }, + { + "ph": "f", "id": 289997528, "pid": 5714, "tid": 6744, "ts": 6303771920287.021, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920395.971, "dur": 
3.010, + "args": { + "External id": 154190, "cbid": 147, "correlation": 289997539 + } + }, + { + "ph": "s", "id": 289997539, "pid": 5714, "tid": 6744, "ts": 6303771920395.971, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771920502.691, "dur": 1.100, + "args": { + "External id": 154204, "cbid": 317, "correlation": 289997580 + } + }, + { + "ph": "f", "id": 289997580, "pid": 5714, "tid": 6744, "ts": 6303771920502.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771920510.821, "dur": 2.210, + "args": { + "External id": 154205, "cbid": 138, "correlation": 289997583 + } + }, + { + "ph": "f", "id": 289997583, "pid": 5714, "tid": 6744, "ts": 6303771920510.821, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771983647.117, "dur": 2.016, + "args": { + "External id": 154209, "device": 0, "context": 1, "stream": 7, "correlation": 289997594, "bytes": 7224, "memory bandwidth (GB/s)": 3.5833333333333335 + } + }, + { + "ph": "f", "id": 289997594, "pid": 0, "tid": 7, "ts": 6303771983647.117, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771920532.291, "dur": 11.630, + "args": { + "External id": 154209, "cbid": 41, "correlation": 289997594 + } + }, + { + "ph": "s", "id": 289997594, "pid": 5714, "tid": 6744, "ts": 6303771920532.291, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920548.331, "dur": 1.510, + "args": { + "External id": 154204, "cbid": 135, "correlation": 289997598 + } + }, + { + "ph": "f", "id": 289997598, "pid": 5714, "tid": 6744, "ts": 
6303771920548.331, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771983651.149, "dur": 12.128, + "args": { + "External id": 154204, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997602, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997602, "pid": 0, "tid": 7, "ts": 6303771983651.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771920552.111, "dur": 9.960, + "args": { + "External id": 154204, "cbid": 211, "correlation": 289997602 + } + }, + { + "ph": "s", "id": 289997602, "pid": 5714, "tid": 6744, "ts": 6303771920552.111, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920646.421, "dur": 1.300, + "args": { + "External id": 154190, "cbid": 135, "correlation": 289997613 + } + }, + { + "ph": "f", "id": 289997613, "pid": 5714, "tid": 6744, "ts": 6303771920646.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920650.761, "dur": 1.140, + "args": { + "External id": 154190, "cbid": 147, "correlation": 289997617 + } + }, + { + "ph": "s", "id": 289997617, "pid": 5714, "tid": 6744, "ts": 6303771920650.761, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920653.601, "dur": 0.790, + "args": { + "External id": 154190, "cbid": 147, "correlation": 289997621 + } + }, + { + "ph": 
"s", "id": 289997621, "pid": 5714, "tid": 6744, "ts": 6303771920653.601, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771983697.133, "dur": 28.577, + "args": { + "External id": 154223, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289997645, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289997645, "pid": 0, "tid": 17, "ts": 6303771983697.133, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771920792.040, "dur": 11.040, + "args": { + "External id": 154223, "cbid": 211, "correlation": 289997645 + } + }, + { + "ph": "s", "id": 289997645, "pid": 5714, "tid": 6744, "ts": 6303771920792.040, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771983741.390, "dur": 617.959, + "args": { + "External id": 154239, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289997658, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289997658, "pid": 0, "tid": 17, "ts": 6303771983741.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771920897.630, "dur": 8.930, + "args": { + "External id": 154239, "cbid": 211, "correlation": 289997658 + } + }, + { + "ph": "s", "id": 289997658, "pid": 5714, "tid": 6744, "ts": 6303771920897.630, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920929.850, "dur": 1.210, + "args": { + "External id": 154190, "cbid": 135, "correlation": 289997668 + } + }, + { + "ph": "f", "id": 289997668, "pid": 5714, "tid": 6744, "ts": 6303771920929.850, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771920932.800, "dur": 1.130, + "args": { + "External id": 154190, "cbid": 147, "correlation": 289997672 + } + }, + { + "ph": "s", "id": 289997672, "pid": 5714, "tid": 6744, "ts": 6303771920932.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771920982.390, "dur": 0.820, + "args": { + "External id": 154241, "cbid": 317, "correlation": 289997685 + } + }, + { + "ph": "f", "id": 289997685, "pid": 5714, "tid": 6744, "ts": 6303771920982.390, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771920985.020, "dur": 1.050, + "args": { + "External id": 154241, "cbid": 135, "correlation": 289997687 + } + }, + { + "ph": "f", "id": 289997687, "pid": 5714, "tid": 6744, "ts": 6303771920985.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771920987.380, "dur": 1.100, + "args": { + "External id": 154241, "cbid": 147, "correlation": 289997691 + } + }, + { + "ph": "s", "id": 289997691, "pid": 5714, "tid": 6744, "ts": 6303771920987.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771921002.020, "dur": 0.650, + "args": { + "External id": 154241, "cbid": 409, "correlation": 289997694 + } + }, + { + "ph": "f", "id": 289997694, "pid": 5714, "tid": 6744, "ts": 6303771921002.020, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771921006.440, "dur": 0.750, + "args": { + "External id": 154241, "cbid": 135, "correlation": 289997697 + } + }, + { + "ph": "f", "id": 289997697, "pid": 5714, "tid": 6744, "ts": 6303771921006.440, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771921007.360, "dur": 0.790, + "args": { + "External id": 154241, "cbid": 147, "correlation": 289997698 + } + }, + { + "ph": "s", "id": 289997698, "pid": 5714, "tid": 6744, "ts": 6303771921007.360, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771992821.207, "dur": 5231.965, + "args": { + "External id": 154241, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289997700, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289997700, "pid": 0, "tid": 20, "ts": 6303771992821.207, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771921009.140, "dur": 9.290, + "args": { + "External id": 154241, "cbid": 430, "correlation": 289997700 + } + }, + { + "ph": "s", "id": 289997700, "pid": 5714, "tid": 6744, "ts": 6303771921009.140, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771921019.520, "dur": 0.380, + "args": { + "External id": 154241, "cbid": 135, "correlation": 289997702 + } + }, + { + "ph": "f", "id": 289997702, "pid": 5714, "tid": 6744, "ts": 6303771921019.520, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771921020.020, "dur": 0.500, + "args": { + "External id": 154241, "cbid": 147, "correlation": 289997703 + } + }, + { + "ph": "s", "id": 289997703, "pid": 5714, "tid": 6744, "ts": 6303771921020.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771921021.900, "dur": 0.730, + "args": { + "External id": 154241, "cbid": 135, "correlation": 289997706 + } + }, + { + "ph": "f", "id": 289997706, "pid": 5714, "tid": 6744, "ts": 6303771921021.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771921029.890, "dur": 0.420, + "args": { + "External id": 
154241, "cbid": 135, "correlation": 289997713 + } + }, + { + "ph": "f", "id": 289997713, "pid": 5714, "tid": 6744, "ts": 6303771921029.890, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771921054.890, "dur": 0.920, + "args": { + "External id": 154243, "cbid": 147, "correlation": 289997718 + } + }, + { + "ph": "s", "id": 289997718, "pid": 5714, "tid": 6744, "ts": 6303771921054.890, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771921071.430, "dur": 0.880, + "args": { + "External id": 154190, "cbid": 135, "correlation": 289997733 + } + }, + { + "ph": "f", "id": 289997733, "pid": 5714, "tid": 6744, "ts": 6303771921071.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771983664.013, "dur": 2853.441, + "args": { + "External id": 154245, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997758, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997758, "pid": 0, "tid": 7, "ts": 6303771983664.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921207.979, "dur": 11.180, + "args": { + "External id": 154245, "cbid": 211, "correlation": 289997758 + } + }, + { + "ph": "s", "id": 289997758, "pid": 5714, "tid": 6744, "ts": 6303771921207.979, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303771986537.422, "dur": 677.896, + "args": { + "External id": 154246, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997781, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289997781, "pid": 0, "tid": 7, "ts": 6303771986537.422, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921264.719, "dur": 5.840, + "args": { + "External id": 154246, "cbid": 307, "correlation": 289997781 + } + }, + { + "ph": "s", "id": 289997781, "pid": 5714, "tid": 6744, "ts": 6303771921264.719, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921311.609, "dur": 0.490, + "args": { + "External id": 154247, "cbid": 200, "correlation": 289997804 + } + }, + { + "ph": "f", "id": 289997804, "pid": 5714, "tid": 6744, "ts": 6303771921311.609, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771987287.639, "dur": 76.641, + "args": { + "External id": 154247, "device": 0, "context": 1, "stream": 7, 
"correlation": 289997807, "bytes": 1536, "memory bandwidth (GB/s)": 0.02004149215172036 + } + }, + { + "ph": "f", "id": 289997807, "pid": 0, "tid": 7, "ts": 6303771987287.639, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771921313.729, "dur": 6.860, + "args": { + "External id": 154247, "cbid": 51, "correlation": 289997807 + } + }, + { + "ph": "s", "id": 289997807, "pid": 5714, "tid": 6744, "ts": 6303771921313.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771987433.209, "dur": 639.399, + "args": { + "External id": 154247, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997808, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997808, "pid": 0, "tid": 7, "ts": 6303771987433.209, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921320.779, "dur": 6.140, + "args": { + "External id": 154247, "cbid": 307, "correlation": 289997808 + } + }, + { + "ph": "s", "id": 289997808, "pid": 5714, "tid": 6744, "ts": 6303771921320.779, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921355.809, "dur": 0.310, + "args": { + "External id": 154248, "cbid": 200, "correlation": 289997833 + } + }, + { + "ph": "f", "id": 289997833, "pid": 5714, "tid": 6744, "ts": 6303771921355.809, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771988073.856, "dur": 1.344, + "args": { + "External id": 154248, "device": 0, "context": 1, "stream": 7, "correlation": 289997836, "bytes": 1536, "memory bandwidth (GB/s)": 1.1428571428571428 + } + }, + { + "ph": "f", "id": 289997836, "pid": 0, "tid": 7, "ts": 6303771988073.856, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771921357.099, "dur": 4.500, + "args": { + "External id": 154248, "cbid": 51, "correlation": 289997836 + } + }, + { + "ph": "s", "id": 289997836, "pid": 5714, "tid": 6744, "ts": 6303771921357.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771988076.480, "dur": 353.412, + "args": { + "External id": 154248, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997837, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997837, "pid": 0, "tid": 7, "ts": 6303771988076.480, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921361.729, "dur": 4.870, + "args": { + "External id": 154248, "cbid": 307, "correlation": 289997837 + } + }, + { + "ph": "s", "id": 289997837, "pid": 5714, "tid": 6744, "ts": 6303771921361.729, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921389.529, "dur": 0.290, + "args": { + "External id": 154249, "cbid": 200, "correlation": 289997862 + } + }, + { + "ph": "f", "id": 289997862, "pid": 5714, "tid": 6744, "ts": 6303771921389.529, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771988430.596, "dur": 357.764, + "args": { + "External id": 154249, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997865, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997865, "pid": 0, "tid": 7, "ts": 6303771988430.596, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921390.929, "dur": 5.600, + "args": { + "External id": 154249, "cbid": 307, "correlation": 289997865 + } + }, + { + "ph": "s", "id": 289997865, "pid": 5714, "tid": 6744, "ts": 6303771921390.929, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921417.409, "dur": 0.260, + "args": { + "External id": 154250, "cbid": 200, "correlation": 289997890 + } + }, + { + "ph": "f", "id": 289997890, "pid": 5714, "tid": 6744, "ts": 6303771921417.409, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771988789.640, "dur": 1.536, + "args": { + "External id": 154250, "device": 0, "context": 1, "stream": 7, "correlation": 289997893, "bytes": 1536, "memory bandwidth (GB/s)": 1 + } + }, + { + "ph": "f", "id": 289997893, "pid": 0, "tid": 7, "ts": 6303771988789.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771921418.639, "dur": 4.510, + "args": { + "External id": 154250, "cbid": 51, "correlation": 289997893 + } + }, + { + "ph": "s", "id": 289997893, "pid": 5714, "tid": 6744, "ts": 6303771921418.639, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771988792.456, "dur": 358.308, + "args": { + "External id": 154250, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997894, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 
28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997894, "pid": 0, "tid": 7, "ts": 6303771988792.456, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921423.299, "dur": 4.800, + "args": { + "External id": 154250, "cbid": 307, "correlation": 289997894 + } + }, + { + "ph": "s", "id": 289997894, "pid": 5714, "tid": 6744, "ts": 6303771921423.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921456.689, "dur": 0.290, + "args": { + "External id": 154251, "cbid": 200, "correlation": 289997919 + } + }, + { + "ph": "f", "id": 289997919, "pid": 5714, "tid": 6744, "ts": 6303771921456.689, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771989151.436, "dur": 360.197, + "args": { + "External id": 154251, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997922, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289997922, "pid": 0, "tid": 7, "ts": 6303771989151.436, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921457.959, "dur": 5.180, + "args": { + "External id": 154251, "cbid": 307, "correlation": 289997922 + } + }, + { + "ph": "s", "id": 289997922, "pid": 5714, "tid": 6744, "ts": 6303771921457.959, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771989512.241, "dur": 89.441, + "args": { + "External id": 154252, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997935, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997935, "pid": 0, "tid": 7, "ts": 6303771989512.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921498.929, "dur": 5.540, + "args": { + "External id": 154252, "cbid": 307, "correlation": 289997935 + } + }, + { + "ph": "s", "id": 289997935, "pid": 5714, "tid": 6744, "ts": 6303771921498.929, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303771989602.290, "dur": 3.584, + "args": { + "External id": 154253, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997943, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289997943, "pid": 0, "tid": 7, "ts": 6303771989602.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921527.689, "dur": 5.050, + "args": { + "External id": 154253, "cbid": 307, "correlation": 289997943 + } + }, + { + "ph": "s", "id": 289997943, "pid": 5714, "tid": 6744, "ts": 6303771921527.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303771989607.058, "dur": 113.729, + "args": { + "External id": 154254, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289997951, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289997951, "pid": 0, "tid": 7, "ts": 6303771989607.058, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921557.099, "dur": 4.780, + "args": { + "External id": 154254, "cbid": 307, "correlation": 289997951 + } + }, + { + "ph": "s", "id": 289997951, "pid": 5714, "tid": 6744, "ts": 6303771921557.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921725.928, "dur": 0.470, + "args": { + "External id": 154273, "cbid": 200, "correlation": 289997997 + } + }, + { + "ph": "f", "id": 289997997, "pid": 5714, "tid": 6744, "ts": 6303771921725.928, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771989722.099, "dur": 1.056, + "args": { + "External id": 154273, "device": 0, "context": 1, "stream": 7, 
"correlation": 289998000, "bytes": 576, "memory bandwidth (GB/s)": 0.5454545454545454 + } + }, + { + "ph": "f", "id": 289998000, "pid": 0, "tid": 7, "ts": 6303771989722.099, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771921727.968, "dur": 7.120, + "args": { + "External id": 154273, "cbid": 51, "correlation": 289998000 + } + }, + { + "ph": "s", "id": 289998000, "pid": 5714, "tid": 6744, "ts": 6303771921727.968, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771989724.691, "dur": 142.978, + "args": { + "External id": 154273, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998001, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998001, "pid": 0, "tid": 7, "ts": 6303771989724.691, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921735.348, "dur": 8.180, + "args": { + "External id": 154273, "cbid": 307, "correlation": 289998001 + } + }, + { + "ph": "s", "id": 289998001, "pid": 5714, "tid": 6744, "ts": 6303771921735.348, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771989868.341, "dur": 141.505, + "args": { + "External id": 154274, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998023, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998023, "pid": 0, "tid": 7, "ts": 6303771989868.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921769.008, "dur": 5.820, + "args": { + "External id": 154274, "cbid": 211, "correlation": 289998023 + } + }, + { + "ph": "s", "id": 289998023, "pid": 5714, "tid": 6744, "ts": 6303771921769.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921846.098, "dur": 0.440, + "args": { + "External id": 154275, "cbid": 200, "correlation": 289998041 + } + }, + { + "ph": "f", "id": 289998041, "pid": 5714, "tid": 6744, "ts": 6303771921846.098, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921846.648, "dur": 0.250, + "args": { + "External id": 154275, "cbid": 200, "correlation": 289998042 + } + }, + { + "ph": "f", "id": 289998042, "pid": 5714, "tid": 6744, "ts": 6303771921846.648, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921865.918, "dur": 0.240, + "args": { + "External id": 154275, "cbid": 200, "correlation": 289998060 + } + }, + { + "ph": "f", "id": 289998060, "pid": 5714, "tid": 6744, "ts": 6303771921865.918, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771990010.550, "dur": 92.514, + "args": { + "External id": 154275, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998061, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998061, "pid": 0, "tid": 7, "ts": 6303771990010.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921867.348, "dur": 9.150, + "args": { + "External id": 154275, "cbid": 211, "correlation": 289998061 + } + }, + { + "ph": "s", "id": 289998061, "pid": 5714, "tid": 6744, "ts": 6303771921867.348, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771921877.238, "dur": 0.930, + "args": { + "External id": 154275, "cbid": 273, "correlation": 289998063 + } + }, + { + "ph": "f", "id": 289998063, "pid": 5714, "tid": 6744, "ts": 6303771921877.238, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303771990103.672, "dur": 1276.591, + "args": { + "External id": 154275, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998064, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289998064, "pid": 0, "tid": 7, "ts": 6303771990103.672, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921878.458, "dur": 4.010, + "args": { + "External id": 154275, "cbid": 211, "correlation": 289998064 + } + }, + { + "ph": "s", "id": 289998064, "pid": 5714, "tid": 6744, "ts": 6303771921878.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303771991380.935, "dur": 73.664, + "args": { + "External id": 154275, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998066, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289998066, "pid": 0, "tid": 7, "ts": 6303771991380.935, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771921883.058, "dur": 3.590, + "args": { + "External id": 154275, "cbid": 211, "correlation": 289998066 + } + }, + { + "ph": "s", "id": 289998066, "pid": 5714, "tid": 6744, "ts": 6303771921883.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771991455.271, "dur": 49.729, + "args": { + "External id": 154286, "device": 0, "context": 1, "stream": 7, "correlation": 289998088, "bytes": 25165824, "memory bandwidth (GB/s)": 506.0593215226528 + } + }, + { + "ph": "f", "id": 289998088, "pid": 0, "tid": 7, "ts": 6303771991455.271, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771922011.118, "dur": 17.370, + "args": { + "External id": 154286, "cbid": 41, "correlation": 289998088 + } + }, + { + "ph": "s", "id": 289998088, "pid": 5714, "tid": 6744, "ts": 6303771922011.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771991505.704, "dur": 34.368, + "args": { + "External id": 154283, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998106, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998106, "pid": 0, "tid": 7, "ts": 6303771991505.704, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922122.268, "dur": 8.800, + "args": { + "External id": 154283, "cbid": 307, "correlation": 289998106 + } + }, + { + "ph": "s", "id": 289998106, "pid": 5714, "tid": 6744, "ts": 6303771922122.268, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303771991540.648, "dur": 40.321, + "args": { + "External id": 154293, "device": 0, "context": 1, "stream": 7, "correlation": 289998121, "bytes": 25165824, "memory bandwidth (GB/s)": 624.1369013665336 + } + }, + { + "ph": "f", "id": 289998121, "pid": 0, "tid": 7, "ts": 6303771991540.648, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771922188.717, "dur": 13.120, + "args": { + "External id": 154293, "cbid": 41, "correlation": 289998121 + } + }, + { + "ph": "s", "id": 289998121, "pid": 5714, "tid": 6744, "ts": 6303771922188.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303771991581.705, "dur": 91.777, + "args": { + "External id": 154290, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998139, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998139, "pid": 0, "tid": 7, "ts": 6303771991581.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922284.687, "dur": 7.400, + "args": { + "External id": 154290, "cbid": 307, "correlation": 289998139 + } + }, + { + "ph": "s", "id": 289998139, "pid": 5714, "tid": 6744, "ts": 6303771922284.687, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771922411.277, "dur": 0.530, + "args": { + "External id": 154298, "cbid": 200, "correlation": 289998169 + } + }, + { + "ph": "f", "id": 289998169, "pid": 5714, "tid": 6744, "ts": 6303771922411.277, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771991727.627, "dur": 68.192, + "args": { + "External id": 154298, "device": 0, "context": 1, "stream": 7, "correlation": 289998172, "bytes": 576, "memory bandwidth (GB/s)": 0.008446738620366025 + } + }, + { + "ph": "f", "id": 289998172, "pid": 0, "tid": 7, "ts": 6303771991727.627, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771922413.457, "dur": 7.360, + "args": { + "External id": 154298, "cbid": 51, "correlation": 289998172 + } + }, + { + "ph": "s", "id": 289998172, "pid": 5714, "tid": 6744, "ts": 6303771922413.457, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771991848.492, "dur": 487.654, + "args": { + "External id": 154298, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998173, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998173, "pid": 0, "tid": 7, "ts": 6303771991848.492, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922421.057, "dur": 7.760, + "args": { + "External id": 154298, "cbid": 307, "correlation": 289998173 + } + }, + { + "ph": "s", "id": 289998173, "pid": 5714, "tid": 6744, "ts": 6303771922421.057, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771922453.907, "dur": 0.290, + "args": { + "External id": 154299, "cbid": 200, "correlation": 289998198 + } + }, + { + "ph": "f", "id": 289998198, "pid": 5714, "tid": 6744, "ts": 6303771922453.907, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771992407.250, "dur": 58.337, + "args": { + "External id": 154299, "device": 0, "context": 1, "stream": 7, "correlation": 289998201, "bytes": 576, "memory bandwidth (GB/s)": 0.009873665083909012 + } + }, + { + "ph": "f", "id": 289998201, "pid": 0, "tid": 7, "ts": 6303771992407.250, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771922455.177, "dur": 4.330, + "args": { + "External id": 154299, "cbid": 51, "correlation": 289998201 + } + }, + { + "ph": "s", "id": 
289998201, "pid": 5714, "tid": 6744, "ts": 6303771922455.177, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771992512.308, "dur": 158.817, + "args": { + "External id": 154299, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998202, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998202, "pid": 0, "tid": 7, "ts": 6303771992512.308, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922459.637, "dur": 5.430, + "args": { + "External id": 154299, "cbid": 307, "correlation": 289998202 + } + }, + { + "ph": "s", "id": 289998202, "pid": 5714, "tid": 6744, "ts": 6303771922459.637, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771922486.077, "dur": 0.310, + "args": { + "External id": 154300, "cbid": 200, "correlation": 289998227 + } + }, + { + "ph": "f", "id": 289998227, "pid": 5714, "tid": 6744, "ts": 6303771922486.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303771992677.397, "dur": 5.312, + "args": { + "External id": 154300, "device": 0, "context": 1, "stream": 7, "correlation": 289998230, "bytes": 576, "memory bandwidth (GB/s)": 0.10843373493975904 + } + }, + { + "ph": "f", "id": 289998230, "pid": 0, "tid": 7, "ts": 6303771992677.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771922487.247, "dur": 3.840, + 
"args": { + "External id": 154300, "cbid": 51, "correlation": 289998230 + } + }, + { + "ph": "s", "id": 289998230, "pid": 5714, "tid": 6744, "ts": 6303771922487.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303771992689.942, "dur": 151.361, + "args": { + "External id": 154300, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998231, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998231, "pid": 0, "tid": 7, "ts": 6303771992689.942, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922491.217, "dur": 4.310, + "args": { + "External id": 154300, "cbid": 307, "correlation": 289998231 + } + }, + { + "ph": "s", "id": 289998231, "pid": 5714, "tid": 6744, "ts": 6303771922491.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771992842.007, "dur": 141.026, + "args": { + "External id": 154301, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998253, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998253, "pid": 0, "tid": 7, "ts": 6303771992842.007, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922517.947, "dur": 5.609, + "args": { + "External id": 154301, "cbid": 211, "correlation": 289998253 + } + }, + { + "ph": "s", "id": 289998253, "pid": 5714, "tid": 6744, "ts": 6303771922517.947, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771992983.705, "dur": 551.335, + "args": { + "External id": 154302, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998276, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998276, "pid": 0, "tid": 7, "ts": 6303771992983.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922540.207, "dur": 4.920, + "args": { + "External id": 154302, "cbid": 211, "correlation": 289998276 + } + }, + { + "ph": "s", "id": 289998276, "pid": 5714, "tid": 6744, "ts": 6303771922540.207, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303771993535.744, "dur": 142.657, + "args": { + "External id": 154303, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998299, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998299, "pid": 0, "tid": 7, "ts": 6303771993535.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922562.407, "dur": 4.280, + "args": { + "External id": 154303, "cbid": 211, "correlation": 289998299 + } + }, + { + "ph": "s", "id": 289998299, "pid": 5714, "tid": 6744, "ts": 6303771922562.407, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303771993679.041, "dur": 79.425, + "args": { + "External id": 154304, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998307, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998307, "pid": 0, "tid": 7, "ts": 6303771993679.041, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922597.467, "dur": 5.100, + "args": { + "External id": 154304, "cbid": 307, "correlation": 289998307 + } + }, + { + "ph": "s", "id": 289998307, "pid": 5714, "tid": 6744, "ts": 6303771922597.467, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303771993759.138, "dur": 45.984, + "args": { + "External id": 154319, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998336, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998336, "pid": 0, "tid": 7, "ts": 6303771993759.138, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922748.506, "dur": 8.640, + "args": { + "External id": 154319, "cbid": 307, "correlation": 289998336 + } + }, + { + "ph": "s", "id": 289998336, "pid": 5714, "tid": 6744, "ts": 6303771922748.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303771993805.858, "dur": 3.456, + "args": { + "External id": 154320, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998344, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289998344, "pid": 0, "tid": 7, "ts": 6303771993805.858, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922781.736, "dur": 5.070, + "args": { + "External id": 154320, "cbid": 307, "correlation": 289998344 + } + }, + { + "ph": "s", "id": 289998344, "pid": 5714, "tid": 6744, "ts": 6303771922781.736, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303771993809.954, "dur": 51.809, + "args": { + "External id": 154321, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998355, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998355, "pid": 0, "tid": 7, "ts": 6303771993809.954, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922812.916, "dur": 5.180, + "args": { + "External id": 154321, "cbid": 307, "correlation": 289998355 + } + }, + { + "ph": "s", "id": 289998355, "pid": 5714, "tid": 6744, "ts": 6303771922812.916, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303771993862.435, "dur": 46.945, + "args": { + "External id": 154322, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998360, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998360, "pid": 0, "tid": 7, "ts": 6303771993862.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771922851.616, "dur": 6.640, + "args": { + "External id": 154322, "cbid": 211, "correlation": 289998360 + } + }, + { + "ph": "s", "id": 289998360, "pid": 5714, "tid": 6744, "ts": 6303771922851.616, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923016.806, "dur": 2.800, + "args": { + "External id": 154328, "cbid": 147, "correlation": 289998377 + } + }, + { + "ph": "s", "id": 289998377, "pid": 5714, "tid": 6744, "ts": 6303771923016.806, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771923115.675, "dur": 2.510, + "args": { + "External id": 154336, "cbid": 138, "correlation": 289998392 + } + }, + { + "ph": "f", "id": 289998392, "pid": 5714, "tid": 6744, "ts": 6303771923115.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771993919.620, "dur": 2.848, + "args": { + "External id": 154340, "device": 0, "context": 1, "stream": 7, "correlation": 289998403, "bytes": 28112, "memory bandwidth (GB/s)": 9.870786516853933 + } + }, + { + "ph": "f", "id": 289998403, "pid": 0, "tid": 7, "ts": 6303771993919.620, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771923138.085, "dur": 12.040, + "args": { + "External id": 154340, "cbid": 41, "correlation": 289998403 + } + }, + { + "ph": "s", "id": 289998403, "pid": 5714, "tid": 6744, "ts": 6303771923138.085, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923154.075, "dur": 1.710, + "args": { + "External id": 154335, "cbid": 135, "correlation": 289998407 + } + }, + { + "ph": "f", "id": 289998407, "pid": 5714, "tid": 6744, "ts": 6303771923154.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303771993925.060, "dur": 37.536, + "args": { + "External id": 154335, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998411, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998411, "pid": 0, "tid": 7, "ts": 6303771993925.060, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771923158.675, "dur": 9.630, + "args": { + "External id": 154335, "cbid": 211, "correlation": 289998411 + } + }, + { + "ph": "s", "id": 289998411, "pid": 5714, "tid": 6744, "ts": 6303771923158.675, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923205.475, "dur": 0.980, + "args": { + "External id": 154328, "cbid": 135, "correlation": 289998422 + } + }, + { + "ph": "f", "id": 289998422, "pid": 5714, "tid": 6744, "ts": 6303771923205.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923208.425, "dur": 1.230, + "args": { + "External id": 154328, "cbid": 147, "correlation": 289998426 + } + }, + { + "ph": "s", "id": 289998426, "pid": 5714, "tid": 6744, "ts": 
6303771923208.425, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771923274.385, "dur": 0.960, + "args": { + "External id": 154344, "cbid": 317, "correlation": 289998446 + } + }, + { + "ph": "f", "id": 289998446, "pid": 5714, "tid": 6744, "ts": 6303771923274.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923277.225, "dur": 1.340, + "args": { + "External id": 154344, "cbid": 135, "correlation": 289998448 + } + }, + { + "ph": "f", "id": 289998448, "pid": 5714, "tid": 6744, "ts": 6303771923277.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923279.855, "dur": 0.980, + "args": { + "External id": 154344, "cbid": 147, "correlation": 289998452 + } + }, + { + "ph": "s", "id": 289998452, "pid": 5714, "tid": 6744, "ts": 6303771923279.855, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771923294.425, "dur": 0.720, + "args": { + "External id": 154344, "cbid": 409, "correlation": 289998455 + } + }, + { + "ph": "f", "id": 289998455, "pid": 5714, "tid": 6744, "ts": 6303771923294.425, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923305.945, "dur": 0.870, + "args": { + "External id": 154344, "cbid": 135, "correlation": 289998458 + } + }, + { + "ph": "f", "id": 289998458, "pid": 5714, "tid": 6744, "ts": 6303771923305.945, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923307.005, "dur": 0.850, + "args": { + "External id": 154344, 
"cbid": 147, "correlation": 289998459 + } + }, + { + "ph": "s", "id": 289998459, "pid": 5714, "tid": 6744, "ts": 6303771923307.005, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303771998054.644, "dur": 8670.852, + "args": { + "External id": 154344, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289998461, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289998461, "pid": 0, "tid": 20, "ts": 6303771998054.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771923309.055, "dur": 9.880, + "args": { + "External id": 154344, "cbid": 430, "correlation": 289998461 + } + }, + { + "ph": "s", "id": 289998461, "pid": 5714, "tid": 6744, "ts": 6303771923309.055, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923319.955, "dur": 0.390, + "args": { + "External id": 154344, "cbid": 135, "correlation": 289998463 + } + }, + { + "ph": "f", "id": 289998463, "pid": 5714, "tid": 6744, "ts": 6303771923319.955, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923320.465, "dur": 0.540, + "args": { + "External id": 154344, "cbid": 147, "correlation": 289998464 + } + }, + 
{ + "ph": "s", "id": 289998464, "pid": 5714, "tid": 6744, "ts": 6303771923320.465, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923322.655, "dur": 0.840, + "args": { + "External id": 154344, "cbid": 135, "correlation": 289998467 + } + }, + { + "ph": "f", "id": 289998467, "pid": 5714, "tid": 6744, "ts": 6303771923322.655, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923331.135, "dur": 0.470, + "args": { + "External id": 154344, "cbid": 135, "correlation": 289998474 + } + }, + { + "ph": "f", "id": 289998474, "pid": 5714, "tid": 6744, "ts": 6303771923331.135, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923357.025, "dur": 0.900, + "args": { + "External id": 154346, "cbid": 147, "correlation": 289998479 + } + }, + { + "ph": "s", "id": 289998479, "pid": 5714, "tid": 6744, "ts": 6303771923357.025, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923373.635, "dur": 0.830, + "args": { + "External id": 154328, "cbid": 135, "correlation": 289998494 + } + }, + { + "ph": "f", "id": 289998494, "pid": 5714, "tid": 6744, "ts": 6303771923373.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923550.444, "dur": 1.130, + "args": { + "External id": 154328, "cbid": 135, "correlation": 289998507 + } + }, + { + "ph": "f", "id": 289998507, "pid": 5714, "tid": 6744, "ts": 6303771923550.444, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923650.024, "dur": 
2.840, + "args": { + "External id": 154356, "cbid": 147, "correlation": 289998518 + } + }, + { + "ph": "s", "id": 289998518, "pid": 5714, "tid": 6744, "ts": 6303771923650.024, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771923752.744, "dur": 1.080, + "args": { + "External id": 154370, "cbid": 317, "correlation": 289998559 + } + }, + { + "ph": "f", "id": 289998559, "pid": 5714, "tid": 6744, "ts": 6303771923752.744, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771923760.754, "dur": 2.100, + "args": { + "External id": 154371, "cbid": 138, "correlation": 289998562 + } + }, + { + "ph": "f", "id": 289998562, "pid": 5714, "tid": 6744, "ts": 6303771923760.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303771998054.004, "dur": 1.728, + "args": { + "External id": 154375, "device": 0, "context": 1, "stream": 7, "correlation": 289998573, "bytes": 7224, "memory bandwidth (GB/s)": 4.180555555555555 + } + }, + { + "ph": "f", "id": 289998573, "pid": 0, "tid": 7, "ts": 6303771998054.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771923782.644, "dur": 11.360, + "args": { + "External id": 154375, "cbid": 41, "correlation": 289998573 + } + }, + { + "ph": "s", "id": 289998573, "pid": 5714, "tid": 6744, "ts": 6303771923782.644, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923798.254, "dur": 1.590, + "args": { + "External id": 154370, "cbid": 135, "correlation": 289998577 + } + }, + { + "ph": "f", "id": 289998577, "pid": 5714, "tid": 6744, "ts": 
6303771923798.254, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303771998058.388, "dur": 13.792, + "args": { + "External id": 154370, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998581, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998581, "pid": 0, "tid": 7, "ts": 6303771998058.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771923802.104, "dur": 10.100, + "args": { + "External id": 154370, "cbid": 211, "correlation": 289998581 + } + }, + { + "ph": "s", "id": 289998581, "pid": 5714, "tid": 6744, "ts": 6303771923802.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771923896.973, "dur": 1.260, + "args": { + "External id": 154356, "cbid": 135, "correlation": 289998592 + } + }, + { + "ph": "f", "id": 289998592, "pid": 5714, "tid": 6744, "ts": 6303771923896.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923901.093, "dur": 1.111, + "args": { + "External id": 154356, "cbid": 147, "correlation": 289998596 + } + }, + { + "ph": "s", "id": 289998596, "pid": 5714, "tid": 6744, "ts": 6303771923901.093, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771923903.753, "dur": 0.671, + "args": { + "External id": 154356, "cbid": 147, "correlation": 289998600 + } + }, + { + "ph": 
"s", "id": 289998600, "pid": 5714, "tid": 6744, "ts": 6303771923903.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771998105.460, "dur": 28.801, + "args": { + "External id": 154389, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289998624, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289998624, "pid": 0, "tid": 17, "ts": 6303771998105.460, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924039.653, "dur": 11.020, + "args": { + "External id": 154389, "cbid": 211, "correlation": 289998624 + } + }, + { + "ph": "s", "id": 289998624, "pid": 5714, "tid": 6744, "ts": 6303771924039.653, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303771998188.534, "dur": 615.559, + "args": { + "External id": 154405, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289998637, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289998637, "pid": 0, "tid": 17, "ts": 6303771998188.534, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924144.743, "dur": 9.140, + "args": { + "External id": 154405, "cbid": 211, "correlation": 289998637 + } + }, + { + "ph": "s", "id": 289998637, "pid": 5714, "tid": 6744, "ts": 6303771924144.743, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924176.743, "dur": 1.220, + "args": { + "External id": 154356, "cbid": 135, "correlation": 289998647 + } + }, + { + "ph": "f", "id": 289998647, "pid": 5714, "tid": 6744, "ts": 6303771924176.743, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771924179.733, "dur": 1.060, + "args": { + "External id": 154356, "cbid": 147, "correlation": 289998651 + } + }, + { + "ph": "s", "id": 289998651, "pid": 5714, "tid": 6744, "ts": 6303771924179.733, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771924229.073, "dur": 0.860, + "args": { + "External id": 154407, "cbid": 317, "correlation": 289998664 + } + }, + { + "ph": "f", "id": 289998664, "pid": 5714, "tid": 6744, "ts": 6303771924229.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924231.723, "dur": 1.050, + "args": { + "External id": 154407, "cbid": 135, "correlation": 289998666 + } + }, + { + "ph": "f", "id": 289998666, "pid": 5714, "tid": 6744, "ts": 6303771924231.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771924234.163, "dur": 1.060, + "args": { + "External id": 154407, "cbid": 147, "correlation": 289998670 + } + }, + { + "ph": "s", "id": 289998670, "pid": 5714, "tid": 6744, "ts": 6303771924234.163, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771924248.493, "dur": 0.610, + "args": { + "External id": 154407, "cbid": 409, "correlation": 289998673 + } + }, + { + "ph": "f", "id": 289998673, "pid": 5714, "tid": 6744, "ts": 6303771924248.493, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924252.923, "dur": 0.770, + "args": { + "External id": 154407, "cbid": 135, "correlation": 289998676 + } + }, + { + "ph": "f", "id": 289998676, "pid": 5714, "tid": 6744, "ts": 6303771924252.923, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771924253.863, "dur": 0.810, + "args": { + "External id": 154407, "cbid": 147, "correlation": 289998677 + } + }, + { + "ph": "s", "id": 289998677, "pid": 5714, "tid": 6744, "ts": 6303771924253.863, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772006846.490, "dur": 5631.041, + "args": { + "External id": 154407, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289998679, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289998679, "pid": 0, "tid": 20, "ts": 6303772006846.490, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771924255.783, "dur": 9.190, + "args": { + "External id": 154407, "cbid": 430, "correlation": 289998679 + } + }, + { + "ph": "s", "id": 289998679, "pid": 5714, "tid": 6744, "ts": 6303771924255.783, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924265.953, "dur": 0.390, + "args": { + "External id": 154407, "cbid": 135, "correlation": 289998681 + } + }, + { + "ph": "f", "id": 289998681, "pid": 5714, "tid": 6744, "ts": 6303771924265.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771924266.453, "dur": 0.430, + "args": { + "External id": 154407, "cbid": 147, "correlation": 289998682 + } + }, + { + "ph": "s", "id": 289998682, "pid": 5714, "tid": 6744, "ts": 6303771924266.453, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924268.233, "dur": 0.760, + "args": { + "External id": 154407, "cbid": 135, "correlation": 289998685 + } + }, + { + "ph": "f", "id": 289998685, "pid": 5714, "tid": 6744, "ts": 6303771924268.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924276.073, "dur": 0.440, + "args": { + "External id": 
154407, "cbid": 135, "correlation": 289998692 + } + }, + { + "ph": "f", "id": 289998692, "pid": 5714, "tid": 6744, "ts": 6303771924276.073, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771924308.753, "dur": 0.990, + "args": { + "External id": 154409, "cbid": 147, "correlation": 289998697 + } + }, + { + "ph": "s", "id": 289998697, "pid": 5714, "tid": 6744, "ts": 6303771924308.753, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771924325.683, "dur": 0.980, + "args": { + "External id": 154356, "cbid": 135, "correlation": 289998712 + } + }, + { + "ph": "f", "id": 289998712, "pid": 5714, "tid": 6744, "ts": 6303771924325.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303771998072.884, "dur": 2609.694, + "args": { + "External id": 154411, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998737, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998737, "pid": 0, "tid": 7, "ts": 6303771998072.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924461.992, "dur": 10.980, + "args": { + "External id": 154411, "cbid": 211, "correlation": 289998737 + } + }, + { + "ph": "s", "id": 289998737, "pid": 5714, "tid": 6744, "ts": 6303771924461.992, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303772000715.347, "dur": 569.670, + "args": { + "External id": 154412, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998760, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289998760, "pid": 0, "tid": 7, "ts": 6303772000715.347, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924517.872, "dur": 5.870, + "args": { + "External id": 154412, "cbid": 307, "correlation": 289998760 + } + }, + { + "ph": "s", "id": 289998760, "pid": 5714, "tid": 6744, "ts": 6303771924517.872, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771924556.122, "dur": 0.480, + "args": { + "External id": 154413, "cbid": 200, "correlation": 289998783 + } + }, + { + "ph": "f", "id": 289998783, "pid": 5714, "tid": 6744, "ts": 6303771924556.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772001356.794, "dur": 69.633, + "args": { + "External id": 154413, "device": 0, "context": 1, "stream": 7, 
"correlation": 289998786, "bytes": 1536, "memory bandwidth (GB/s)": 0.022058506742492785 + } + }, + { + "ph": "f", "id": 289998786, "pid": 0, "tid": 7, "ts": 6303772001356.794, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771924558.252, "dur": 6.620, + "args": { + "External id": 154413, "cbid": 51, "correlation": 289998786 + } + }, + { + "ph": "s", "id": 289998786, "pid": 5714, "tid": 6744, "ts": 6303771924558.252, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772001484.412, "dur": 522.662, + "args": { + "External id": 154413, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998787, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998787, "pid": 0, "tid": 7, "ts": 6303772001484.412, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924565.062, "dur": 5.870, + "args": { + "External id": 154413, "cbid": 307, "correlation": 289998787 + } + }, + { + "ph": "s", "id": 289998787, "pid": 5714, "tid": 6744, "ts": 6303771924565.062, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771924597.382, "dur": 0.280, + "args": { + "External id": 154414, "cbid": 200, "correlation": 289998812 + } + }, + { + "ph": "f", "id": 289998812, "pid": 5714, "tid": 6744, "ts": 6303771924597.382, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772002008.226, "dur": 1.248, + "args": { + "External id": 154414, "device": 0, "context": 1, "stream": 7, "correlation": 289998815, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 289998815, "pid": 0, "tid": 7, "ts": 6303772002008.226, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771924598.622, "dur": 4.310, + "args": { + "External id": 154414, "cbid": 51, "correlation": 289998815 + } + }, + { + "ph": "s", "id": 289998815, "pid": 5714, "tid": 6744, "ts": 6303771924598.622, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772002010.882, "dur": 353.028, + "args": { + "External id": 154414, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998816, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998816, "pid": 0, "tid": 7, "ts": 6303772002010.882, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924603.072, "dur": 4.740, + "args": { + "External id": 154414, "cbid": 307, "correlation": 289998816 + } + }, + { + "ph": "s", "id": 289998816, "pid": 5714, "tid": 6744, "ts": 6303771924603.072, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771924629.732, "dur": 0.300, + "args": { + "External id": 154415, "cbid": 200, "correlation": 289998841 + } + }, + { + "ph": "f", "id": 289998841, "pid": 5714, "tid": 6744, "ts": 6303771924629.732, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772002364.550, "dur": 359.652, + "args": { + "External id": 154415, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998844, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998844, "pid": 0, "tid": 7, "ts": 6303772002364.550, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924631.092, "dur": 5.280, + "args": { + "External id": 154415, "cbid": 307, "correlation": 289998844 + } + }, + { + "ph": "s", "id": 289998844, "pid": 5714, "tid": 6744, "ts": 6303771924631.092, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771924656.562, "dur": 0.240, + "args": { + "External id": 154416, "cbid": 200, "correlation": 289998869 + } + }, + { + "ph": "f", "id": 289998869, "pid": 5714, "tid": 6744, "ts": 6303771924656.562, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772002725.482, "dur": 1.248, + "args": { + "External id": 154416, "device": 0, "context": 1, "stream": 7, "correlation": 289998872, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 289998872, "pid": 0, "tid": 7, "ts": 6303772002725.482, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771924657.722, "dur": 4.400, + "args": { + "External id": 154416, "cbid": 51, "correlation": 289998872 + } + }, + { + "ph": "s", "id": 289998872, "pid": 5714, "tid": 6744, "ts": 6303771924657.722, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772002728.042, "dur": 357.092, + "args": { + "External id": 154416, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998873, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998873, "pid": 0, "tid": 7, "ts": 6303772002728.042, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924662.282, "dur": 4.580, + "args": { + "External id": 154416, "cbid": 307, "correlation": 289998873 + } + }, + { + "ph": "s", "id": 289998873, "pid": 5714, "tid": 6744, "ts": 6303771924662.282, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771924688.322, "dur": 0.310, + "args": { + "External id": 154417, "cbid": 200, "correlation": 289998898 + } + }, + { + "ph": "f", "id": 289998898, "pid": 5714, "tid": 6744, "ts": 6303771924688.322, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772003085.838, "dur": 358.660, + "args": { + "External id": 154417, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998901, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998901, "pid": 0, "tid": 7, "ts": 6303772003085.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924689.602, "dur": 4.890, + "args": { + "External id": 154417, "cbid": 307, "correlation": 289998901 + } + }, + { + "ph": "s", "id": 289998901, "pid": 5714, "tid": 6744, "ts": 6303771924689.602, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772003445.170, "dur": 90.242, + "args": { + "External id": 154418, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998914, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998914, "pid": 0, "tid": 7, "ts": 6303772003445.170, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924729.802, "dur": 5.580, + "args": { + "External id": 154418, "cbid": 307, "correlation": 289998914 + } + }, + { + "ph": "s", "id": 289998914, "pid": 5714, "tid": 6744, "ts": 6303771924729.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303772003536.116, "dur": 3.360, + "args": { + "External id": 154419, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998922, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289998922, "pid": 0, "tid": 7, "ts": 6303772003536.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924758.562, "dur": 4.770, + "args": { + "External id": 154419, "cbid": 307, "correlation": 289998922 + } + }, + { + "ph": "s", "id": 289998922, "pid": 5714, "tid": 6744, "ts": 6303771924758.562, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303772003540.756, "dur": 114.337, + "args": { + "External id": 154420, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998930, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289998930, "pid": 0, "tid": 7, "ts": 6303772003540.756, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924787.371, "dur": 4.891, + "args": { + "External id": 154420, "cbid": 307, "correlation": 289998930 + } + }, + { + "ph": "s", "id": 289998930, "pid": 5714, "tid": 6744, "ts": 6303771924787.371, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771924955.001, "dur": 0.480, + "args": { + "External id": 154439, "cbid": 200, "correlation": 289998976 + } + }, + { + "ph": "f", "id": 289998976, "pid": 5714, "tid": 6744, "ts": 6303771924955.001, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772003656.373, "dur": 0.928, + "args": { + "External id": 154439, "device": 0, "context": 1, "stream": 7, 
"correlation": 289998979, "bytes": 576, "memory bandwidth (GB/s)": 0.6206896551724138 + } + }, + { + "ph": "f", "id": 289998979, "pid": 0, "tid": 7, "ts": 6303772003656.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771924957.011, "dur": 6.800, + "args": { + "External id": 154439, "cbid": 51, "correlation": 289998979 + } + }, + { + "ph": "s", "id": 289998979, "pid": 5714, "tid": 6744, "ts": 6303771924957.011, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772003659.189, "dur": 143.361, + "args": { + "External id": 154439, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289998980, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289998980, "pid": 0, "tid": 7, "ts": 6303772003659.189, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924964.041, "dur": 7.920, + "args": { + "External id": 154439, "cbid": 307, "correlation": 289998980 + } + }, + { + "ph": "s", "id": 289998980, "pid": 5714, "tid": 6744, "ts": 6303771924964.041, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772003803.190, "dur": 141.218, + "args": { + "External id": 154440, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999002, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999002, "pid": 0, "tid": 7, "ts": 6303772003803.190, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771924997.481, "dur": 5.660, + "args": { + "External id": 154440, "cbid": 211, "correlation": 289999002 + } + }, + { + "ph": "s", "id": 289999002, "pid": 5714, "tid": 6744, "ts": 6303771924997.481, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925071.721, "dur": 0.450, + "args": { + "External id": 154441, "cbid": 200, "correlation": 289999020 + } + }, + { + "ph": "f", "id": 289999020, "pid": 5714, "tid": 6744, "ts": 6303771925071.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925072.341, "dur": 0.190, + "args": { + "External id": 154441, "cbid": 200, "correlation": 289999021 + } + }, + { + "ph": "f", "id": 289999021, "pid": 5714, "tid": 6744, "ts": 6303771925072.341, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925091.201, "dur": 0.230, + "args": { + "External id": 154441, "cbid": 200, "correlation": 289999039 + } + }, + { + "ph": "f", "id": 289999039, "pid": 5714, "tid": 6744, "ts": 6303771925091.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772003945.112, "dur": 92.290, + "args": { + "External id": 154441, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999040, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999040, "pid": 0, "tid": 7, "ts": 6303772003945.112, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925092.591, "dur": 8.950, + "args": { + "External id": 154441, "cbid": 211, "correlation": 289999040 + } + }, + { + "ph": "s", "id": 289999040, "pid": 5714, "tid": 6744, "ts": 6303771925092.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925102.291, "dur": 0.960, + "args": { + "External id": 154441, "cbid": 273, "correlation": 289999042 + } + }, + { + "ph": "f", "id": 289999042, "pid": 5714, "tid": 6744, "ts": 6303771925102.291, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772004038.010, "dur": 1046.667, + "args": { + "External id": 154441, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999043, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 289999043, "pid": 0, "tid": 7, "ts": 6303772004038.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925103.551, "dur": 4.150, + "args": { + "External id": 154441, "cbid": 211, "correlation": 289999043 + } + }, + { + "ph": "s", "id": 289999043, "pid": 5714, "tid": 6744, "ts": 6303771925103.551, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303772005085.285, "dur": 73.602, + "args": { + "External id": 154441, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999045, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289999045, "pid": 0, "tid": 7, "ts": 6303772005085.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925108.221, "dur": 3.730, + "args": { + "External id": 154441, "cbid": 211, "correlation": 289999045 + } + }, + { + "ph": "s", "id": 289999045, "pid": 5714, "tid": 6744, "ts": 6303771925108.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772005159.559, "dur": 49.920, + "args": { + "External id": 154452, "device": 0, "context": 1, "stream": 7, "correlation": 289999067, "bytes": 25165824, "memory bandwidth (GB/s)": 504.12307692307695 + } + }, + { + "ph": "f", "id": 289999067, "pid": 0, "tid": 7, "ts": 6303772005159.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771925236.221, "dur": 16.420, + "args": { + "External id": 154452, "cbid": 41, "correlation": 289999067 + } + }, + { + "ph": "s", "id": 289999067, "pid": 5714, "tid": 6744, "ts": 6303771925236.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772005210.087, "dur": 33.024, + "args": { + "External id": 154449, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999085, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999085, "pid": 0, "tid": 7, "ts": 6303772005210.087, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925351.740, "dur": 8.370, + "args": { + "External id": 154449, "cbid": 307, "correlation": 289999085 + } + }, + { + "ph": "s", "id": 289999085, "pid": 5714, "tid": 6744, "ts": 6303771925351.740, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772005243.815, "dur": 40.577, + "args": { + "External id": 154459, "device": 0, "context": 1, "stream": 7, "correlation": 289999100, "bytes": 25165824, "memory bandwidth (GB/s)": 620.1992261626044 + } + }, + { + "ph": "f", "id": 289999100, "pid": 0, "tid": 7, "ts": 6303772005243.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771925418.390, "dur": 13.310, + "args": { + "External id": 154459, "cbid": 41, "correlation": 289999100 + } + }, + { + "ph": "s", "id": 289999100, "pid": 5714, "tid": 6744, "ts": 6303771925418.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772005285.000, "dur": 28.992, + "args": { + "External id": 154456, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999118, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999118, "pid": 0, "tid": 7, "ts": 6303772005285.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925515.020, "dur": 7.330, + "args": { + "External id": 154456, "cbid": 307, "correlation": 289999118 + } + }, + { + "ph": "s", "id": 289999118, "pid": 5714, "tid": 6744, "ts": 6303771925515.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925633.810, "dur": 0.530, + "args": { + "External id": 154464, "cbid": 200, "correlation": 289999148 + } + }, + { + "ph": "f", "id": 289999148, "pid": 5714, "tid": 6744, "ts": 6303771925633.810, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772005315.208, "dur": 1.248, + "args": { + "External id": 154464, "device": 0, "context": 1, "stream": 7, "correlation": 289999151, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 289999151, "pid": 0, "tid": 7, "ts": 6303772005315.208, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771925636.020, "dur": 7.300, + "args": { + "External id": 154464, "cbid": 51, "correlation": 289999151 + } + }, + { + "ph": "s", "id": 289999151, "pid": 5714, "tid": 6744, "ts": 6303771925636.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772005317.800, "dur": 144.450, + "args": { + "External id": 154464, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999152, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999152, "pid": 0, "tid": 7, "ts": 6303772005317.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925643.540, "dur": 7.690, + "args": { + "External id": 154464, "cbid": 307, "correlation": 289999152 + } + }, + { + "ph": "s", "id": 289999152, "pid": 5714, "tid": 6744, "ts": 6303771925643.540, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925676.149, "dur": 0.300, + "args": { + "External id": 154465, "cbid": 200, "correlation": 289999177 + } + }, + { + "ph": "f", "id": 289999177, "pid": 5714, "tid": 6744, "ts": 6303771925676.149, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772005463.530, "dur": 1.248, + "args": { + "External id": 154465, "device": 0, "context": 1, "stream": 7, "correlation": 289999180, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 289999180, "pid": 0, "tid": 7, "ts": 6303772005463.530, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771925677.449, "dur": 4.160, + "args": { + "External id": 154465, "cbid": 51, "correlation": 289999180 + } + }, + { + "ph": "s", "id": 289999180, 
"pid": 5714, "tid": 6744, "ts": 6303771925677.449, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772005466.090, "dur": 355.844, + "args": { + "External id": 154465, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999181, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999181, "pid": 0, "tid": 7, "ts": 6303772005466.090, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925681.760, "dur": 4.969, + "args": { + "External id": 154465, "cbid": 307, "correlation": 289999181 + } + }, + { + "ph": "s", "id": 289999181, "pid": 5714, "tid": 6744, "ts": 6303771925681.760, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771925707.260, "dur": 0.300, + "args": { + "External id": 154466, "cbid": 200, "correlation": 289999206 + } + }, + { + "ph": "f", "id": 289999206, "pid": 5714, "tid": 6744, "ts": 6303771925707.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772005894.031, "dur": 71.009, + "args": { + "External id": 154466, "device": 0, "context": 1, "stream": 7, "correlation": 289999209, "bytes": 576, "memory bandwidth (GB/s)": 0.008111647819290512 + } + }, + { + "ph": "f", "id": 289999209, "pid": 0, "tid": 7, "ts": 6303772005894.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771925708.469, "dur": 3.791, + "args": { + 
"External id": 154466, "cbid": 51, "correlation": 289999209 + } + }, + { + "ph": "s", "id": 289999209, "pid": 5714, "tid": 6744, "ts": 6303771925708.469, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772006021.680, "dur": 409.349, + "args": { + "External id": 154466, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999210, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999210, "pid": 0, "tid": 7, "ts": 6303772006021.680, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925712.389, "dur": 4.560, + "args": { + "External id": 154466, "cbid": 307, "correlation": 289999210 + } + }, + { + "ph": "s", "id": 289999210, "pid": 5714, "tid": 6744, "ts": 6303771925712.389, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772006431.669, "dur": 152.194, + "args": { + "External id": 154467, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999232, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999232, "pid": 0, "tid": 7, "ts": 6303772006431.669, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925738.660, "dur": 5.440, + "args": { + "External id": 154467, "cbid": 211, "correlation": 289999232 + } + }, + { + "ph": "s", "id": 289999232, "pid": 5714, "tid": 6744, "ts": 6303771925738.660, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772006584.503, "dur": 141.217, + "args": { + "External id": 154468, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999255, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999255, "pid": 0, "tid": 7, "ts": 6303772006584.503, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925761.389, "dur": 4.471, + "args": { + "External id": 154468, "cbid": 211, "correlation": 289999255 + } + }, + { + "ph": "s", "id": 289999255, "pid": 5714, "tid": 6744, "ts": 6303771925761.389, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772006726.360, "dur": 122.114, + "args": { + "External id": 154469, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999278, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999278, "pid": 0, "tid": 7, "ts": 6303772006726.360, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925782.359, "dur": 4.360, + "args": { + "External id": 154469, "cbid": 211, "correlation": 289999278 + } + }, + { + "ph": "s", "id": 289999278, "pid": 5714, "tid": 6744, "ts": 6303771925782.359, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303772006849.146, "dur": 271.107, + "args": { + "External id": 154470, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999286, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999286, "pid": 0, "tid": 7, "ts": 6303772006849.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925818.809, "dur": 5.050, + "args": { + "External id": 154470, "cbid": 307, "correlation": 289999286 + } + }, + { + "ph": "s", "id": 289999286, "pid": 5714, "tid": 6744, "ts": 6303771925818.809, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303772007180.158, "dur": 183.362, + "args": { + "External id": 154485, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999315, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999315, "pid": 0, "tid": 7, "ts": 6303772007180.158, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771925971.339, "dur": 8.810, + "args": { + "External id": 154485, "cbid": 307, "correlation": 289999315 + } + }, + { + "ph": "s", "id": 289999315, "pid": 5714, "tid": 6744, "ts": 6303771925971.339, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772007364.128, "dur": 3.872, + "args": { + "External id": 154486, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999323, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289999323, "pid": 0, "tid": 7, "ts": 6303772007364.128, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771926005.539, "dur": 5.210, + "args": { + "External id": 154486, "cbid": 307, "correlation": 289999323 + } + }, + { + "ph": "s", "id": 289999323, "pid": 5714, "tid": 6744, "ts": 6303771926005.539, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303772007368.640, "dur": 51.201, + "args": { + "External id": 154487, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999334, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999334, "pid": 0, "tid": 7, "ts": 6303772007368.640, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771926036.299, "dur": 5.110, + "args": { + "External id": 154487, "cbid": 307, "correlation": 289999334 + } + }, + { + "ph": "s", "id": 289999334, "pid": 5714, "tid": 6744, "ts": 6303771926036.299, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772007420.545, "dur": 47.648, + "args": { + "External id": 154488, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999339, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999339, "pid": 0, "tid": 7, "ts": 6303772007420.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771926075.089, "dur": 7.080, + "args": { + "External id": 154488, "cbid": 211, "correlation": 289999339 + } + }, + { + "ph": "s", "id": 289999339, "pid": 5714, "tid": 6744, "ts": 6303771926075.089, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926238.388, "dur": 2.680, + "args": { + "External id": 154494, "cbid": 147, "correlation": 289999356 + } + }, + { + "ph": "s", "id": 289999356, "pid": 5714, "tid": 6744, "ts": 6303771926238.388, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771926343.028, "dur": 2.360, + "args": { + "External id": 154502, "cbid": 138, "correlation": 289999371 + } + }, + { + "ph": "f", "id": 289999371, "pid": 5714, "tid": 6744, "ts": 6303771926343.028, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772007477.537, "dur": 3.072, + "args": { + "External id": 154506, "device": 0, "context": 1, "stream": 7, "correlation": 289999382, "bytes": 28112, "memory bandwidth (GB/s)": 9.151041666666666 + } + }, + { + "ph": "f", "id": 289999382, "pid": 0, "tid": 7, "ts": 6303772007477.537, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771926366.258, "dur": 11.650, + "args": { + "External id": 154506, "cbid": 41, "correlation": 289999382 + } + }, + { + "ph": "s", "id": 289999382, "pid": 5714, "tid": 6744, "ts": 6303771926366.258, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926381.958, "dur": 1.890, + "args": { + "External id": 154501, "cbid": 135, "correlation": 289999386 + } + }, + { + "ph": "f", "id": 289999386, "pid": 5714, "tid": 6744, "ts": 6303771926381.958, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303772007483.233, "dur": 52.001, + "args": { + "External id": 154501, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999390, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999390, "pid": 0, "tid": 7, "ts": 6303772007483.233, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771926387.008, "dur": 9.710, + "args": { + "External id": 154501, "cbid": 211, "correlation": 289999390 + } + }, + { + "ph": "s", "id": 289999390, "pid": 5714, "tid": 6744, "ts": 6303771926387.008, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926434.388, "dur": 1.090, + "args": { + "External id": 154494, "cbid": 135, "correlation": 289999401 + } + }, + { + "ph": "f", "id": 289999401, "pid": 5714, "tid": 6744, "ts": 6303771926434.388, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926437.468, "dur": 1.320, + "args": { + "External id": 154494, "cbid": 147, "correlation": 289999405 + } + }, + { + "ph": "s", "id": 289999405, "pid": 5714, "tid": 6744, "ts": 
6303771926437.468, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771926503.568, "dur": 0.990, + "args": { + "External id": 154510, "cbid": 317, "correlation": 289999425 + } + }, + { + "ph": "f", "id": 289999425, "pid": 5714, "tid": 6744, "ts": 6303771926503.568, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926506.488, "dur": 1.230, + "args": { + "External id": 154510, "cbid": 135, "correlation": 289999427 + } + }, + { + "ph": "f", "id": 289999427, "pid": 5714, "tid": 6744, "ts": 6303771926506.488, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926509.018, "dur": 1.030, + "args": { + "External id": 154510, "cbid": 147, "correlation": 289999431 + } + }, + { + "ph": "s", "id": 289999431, "pid": 5714, "tid": 6744, "ts": 6303771926509.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771926523.618, "dur": 0.690, + "args": { + "External id": 154510, "cbid": 409, "correlation": 289999434 + } + }, + { + "ph": "f", "id": 289999434, "pid": 5714, "tid": 6744, "ts": 6303771926523.618, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926528.198, "dur": 0.810, + "args": { + "External id": 154510, "cbid": 135, "correlation": 289999437 + } + }, + { + "ph": "f", "id": 289999437, "pid": 5714, "tid": 6744, "ts": 6303771926528.198, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926529.158, "dur": 0.800, + "args": { + "External id": 154510, 
"cbid": 147, "correlation": 289999438 + } + }, + { + "ph": "s", "id": 289999438, "pid": 5714, "tid": 6744, "ts": 6303771926529.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772012479.003, "dur": 8559.172, + "args": { + "External id": 154510, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289999440, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289999440, "pid": 0, "tid": 20, "ts": 6303772012479.003, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771926531.058, "dur": 9.540, + "args": { + "External id": 154510, "cbid": 430, "correlation": 289999440 + } + }, + { + "ph": "s", "id": 289999440, "pid": 5714, "tid": 6744, "ts": 6303771926531.058, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926541.578, "dur": 0.380, + "args": { + "External id": 154510, "cbid": 135, "correlation": 289999442 + } + }, + { + "ph": "f", "id": 289999442, "pid": 5714, "tid": 6744, "ts": 6303771926541.578, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926542.098, "dur": 0.600, + "args": { + "External id": 154510, "cbid": 147, "correlation": 289999443 + } + }, + 
{ + "ph": "s", "id": 289999443, "pid": 5714, "tid": 6744, "ts": 6303771926542.098, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926544.318, "dur": 0.680, + "args": { + "External id": 154510, "cbid": 135, "correlation": 289999446 + } + }, + { + "ph": "f", "id": 289999446, "pid": 5714, "tid": 6744, "ts": 6303771926544.318, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926553.038, "dur": 0.450, + "args": { + "External id": 154510, "cbid": 135, "correlation": 289999453 + } + }, + { + "ph": "f", "id": 289999453, "pid": 5714, "tid": 6744, "ts": 6303771926553.038, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926578.018, "dur": 0.949, + "args": { + "External id": 154512, "cbid": 147, "correlation": 289999458 + } + }, + { + "ph": "s", "id": 289999458, "pid": 5714, "tid": 6744, "ts": 6303771926578.018, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926594.707, "dur": 0.751, + "args": { + "External id": 154494, "cbid": 135, "correlation": 289999473 + } + }, + { + "ph": "f", "id": 289999473, "pid": 5714, "tid": 6744, "ts": 6303771926594.707, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771926773.397, "dur": 1.080, + "args": { + "External id": 154494, "cbid": 135, "correlation": 289999486 + } + }, + { + "ph": "f", "id": 289999486, "pid": 5714, "tid": 6744, "ts": 6303771926773.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771926873.547, "dur": 
2.820, + "args": { + "External id": 154522, "cbid": 147, "correlation": 289999497 + } + }, + { + "ph": "s", "id": 289999497, "pid": 5714, "tid": 6744, "ts": 6303771926873.547, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771926977.727, "dur": 1.130, + "args": { + "External id": 154536, "cbid": 317, "correlation": 289999538 + } + }, + { + "ph": "f", "id": 289999538, "pid": 5714, "tid": 6744, "ts": 6303771926977.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771926985.487, "dur": 2.130, + "args": { + "External id": 154537, "cbid": 138, "correlation": 289999541 + } + }, + { + "ph": "f", "id": 289999541, "pid": 5714, "tid": 6744, "ts": 6303771926985.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772012484.027, "dur": 1.793, + "args": { + "External id": 154541, "device": 0, "context": 1, "stream": 7, "correlation": 289999552, "bytes": 7224, "memory bandwidth (GB/s)": 4.029001673173452 + } + }, + { + "ph": "f", "id": 289999552, "pid": 0, "tid": 7, "ts": 6303772012484.027, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771927007.137, "dur": 11.380, + "args": { + "External id": 154541, "cbid": 41, "correlation": 289999552 + } + }, + { + "ph": "s", "id": 289999552, "pid": 5714, "tid": 6744, "ts": 6303771927007.137, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927022.906, "dur": 1.480, + "args": { + "External id": 154536, "cbid": 135, "correlation": 289999556 + } + }, + { + "ph": "f", "id": 289999556, "pid": 5714, "tid": 6744, "ts": 
6303771927022.906, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303772012488.700, "dur": 12.608, + "args": { + "External id": 154536, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999560, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999560, "pid": 0, "tid": 7, "ts": 6303772012488.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927026.826, "dur": 10.000, + "args": { + "External id": 154536, "cbid": 211, "correlation": 289999560 + } + }, + { + "ph": "s", "id": 289999560, "pid": 5714, "tid": 6744, "ts": 6303771927026.826, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927122.366, "dur": 1.310, + "args": { + "External id": 154522, "cbid": 135, "correlation": 289999571 + } + }, + { + "ph": "f", "id": 289999571, "pid": 5714, "tid": 6744, "ts": 6303771927122.366, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771927126.726, "dur": 1.170, + "args": { + "External id": 154522, "cbid": 147, "correlation": 289999575 + } + }, + { + "ph": "s", "id": 289999575, "pid": 5714, "tid": 6744, "ts": 6303771927126.726, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771927129.496, "dur": 0.670, + "args": { + "External id": 154522, "cbid": 147, "correlation": 289999579 + } + }, + { + "ph": 
"s", "id": 289999579, "pid": 5714, "tid": 6744, "ts": 6303771927129.496, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303772012535.132, "dur": 27.328, + "args": { + "External id": 154555, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289999603, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289999603, "pid": 0, "tid": 17, "ts": 6303772012535.132, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927267.466, "dur": 11.000, + "args": { + "External id": 154555, "cbid": 211, "correlation": 289999603 + } + }, + { + "ph": "s", "id": 289999603, "pid": 5714, "tid": 6744, "ts": 6303771927267.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303772012573.084, "dur": 10.817, + "args": { + "External id": 154571, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 289999616, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 289999616, "pid": 0, "tid": 17, "ts": 6303772012573.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927397.526, "dur": 9.530, + "args": { + "External id": 154571, "cbid": 211, "correlation": 289999616 + } + }, + { + "ph": "s", "id": 289999616, "pid": 5714, "tid": 6744, "ts": 6303771927397.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927430.506, "dur": 1.260, + "args": { + "External id": 154522, "cbid": 135, "correlation": 289999626 + } + }, + { + "ph": "f", "id": 289999626, "pid": 5714, "tid": 6744, "ts": 6303771927430.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771927433.646, "dur": 1.130, + "args": { + "External id": 154522, "cbid": 147, "correlation": 289999630 + } + }, + { + "ph": "s", "id": 289999630, "pid": 5714, "tid": 6744, "ts": 6303771927433.646, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771927484.305, "dur": 0.851, + "args": { + "External id": 154573, "cbid": 317, "correlation": 289999643 + } + }, + { + "ph": "f", "id": 289999643, "pid": 5714, "tid": 6744, "ts": 6303771927484.305, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927486.965, "dur": 1.091, + "args": { + "External id": 154573, "cbid": 135, "correlation": 289999645 + } + }, + { + "ph": "f", "id": 289999645, "pid": 5714, "tid": 6744, "ts": 6303771927486.965, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771927489.405, "dur": 1.091, + "args": { + "External id": 154573, "cbid": 147, "correlation": 289999649 + } + }, + { + "ph": "s", "id": 289999649, "pid": 5714, "tid": 6744, "ts": 6303771927489.405, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771927503.916, "dur": 0.669, + "args": { + "External id": 154573, "cbid": 409, "correlation": 289999652 + } + }, + { + "ph": "f", "id": 289999652, "pid": 5714, "tid": 6744, "ts": 6303771927503.916, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927508.325, "dur": 0.760, + "args": { + "External id": 154573, "cbid": 135, "correlation": 289999655 + } + }, + { + "ph": "f", "id": 289999655, "pid": 5714, "tid": 6744, "ts": 6303771927508.325, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771927509.256, "dur": 0.789, + "args": { + "External id": 154573, "cbid": 147, "correlation": 289999656 + } + }, + { + "ph": "s", "id": 289999656, "pid": 5714, "tid": 6744, "ts": 6303771927509.256, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772021039.391, "dur": 5234.429, + "args": { + "External id": 154573, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 289999658, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 289999658, "pid": 0, "tid": 20, "ts": 6303772021039.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771927511.105, "dur": 9.460, + "args": { + "External id": 154573, "cbid": 430, "correlation": 289999658 + } + }, + { + "ph": "s", "id": 289999658, "pid": 5714, "tid": 6744, "ts": 6303771927511.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927521.545, "dur": 0.380, + "args": { + "External id": 154573, "cbid": 135, "correlation": 289999660 + } + }, + { + "ph": "f", "id": 289999660, "pid": 5714, "tid": 6744, "ts": 6303771927521.545, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771927522.045, "dur": 0.471, + "args": { + "External id": 154573, "cbid": 147, "correlation": 289999661 + } + }, + { + "ph": "s", "id": 289999661, "pid": 5714, "tid": 6744, "ts": 6303771927522.045, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927523.905, "dur": 0.731, + "args": { + "External id": 154573, "cbid": 135, "correlation": 289999664 + } + }, + { + "ph": "f", "id": 289999664, "pid": 5714, "tid": 6744, "ts": 6303771927523.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927531.905, "dur": 0.431, + "args": { + "External id": 
154573, "cbid": 135, "correlation": 289999671 + } + }, + { + "ph": "f", "id": 289999671, "pid": 5714, "tid": 6744, "ts": 6303771927531.905, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771927556.796, "dur": 1.009, + "args": { + "External id": 154575, "cbid": 147, "correlation": 289999676 + } + }, + { + "ph": "s", "id": 289999676, "pid": 5714, "tid": 6744, "ts": 6303771927556.796, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771927573.385, "dur": 0.860, + "args": { + "External id": 154522, "cbid": 135, "correlation": 289999691 + } + }, + { + "ph": "f", "id": 289999691, "pid": 5714, "tid": 6744, "ts": 6303771927573.385, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303772012501.980, "dur": 2298.362, + "args": { + "External id": 154577, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999716, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999716, "pid": 0, "tid": 7, "ts": 6303772012501.980, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927711.355, "dur": 10.630, + "args": { + "External id": 154577, "cbid": 211, "correlation": 289999716 + } + }, + { + "ph": "s", "id": 289999716, "pid": 5714, "tid": 6744, "ts": 6303771927711.355, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303772014841.399, "dur": 572.935, + "args": { + "External id": 154578, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999739, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 289999739, "pid": 0, "tid": 7, "ts": 6303772014841.399, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927767.905, "dur": 6.020, + "args": { + "External id": 154578, "cbid": 307, "correlation": 289999739 + } + }, + { + "ph": "s", "id": 289999739, "pid": 5714, "tid": 6744, "ts": 6303771927767.905, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771927806.605, "dur": 0.500, + "args": { + "External id": 154579, "cbid": 200, "correlation": 289999762 + } + }, + { + "ph": "f", "id": 289999762, "pid": 5714, "tid": 6744, "ts": 6303771927806.605, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772015490.430, "dur": 64.769, + "args": { + "External id": 154579, "device": 0, "context": 1, "stream": 7, 
"correlation": 289999765, "bytes": 1536, "memory bandwidth (GB/s)": 0.02371504886596983 + } + }, + { + "ph": "f", "id": 289999765, "pid": 0, "tid": 7, "ts": 6303772015490.430, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771927808.685, "dur": 6.670, + "args": { + "External id": 154579, "cbid": 51, "correlation": 289999765 + } + }, + { + "ph": "s", "id": 289999765, "pid": 5714, "tid": 6744, "ts": 6303771927808.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772015596.063, "dur": 789.706, + "args": { + "External id": 154579, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999766, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999766, "pid": 0, "tid": 7, "ts": 6303772015596.063, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927815.575, "dur": 5.820, + "args": { + "External id": 154579, "cbid": 307, "correlation": 289999766 + } + }, + { + "ph": "s", "id": 289999766, "pid": 5714, "tid": 6744, "ts": 6303771927815.575, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771927847.365, "dur": 0.320, + "args": { + "External id": 154580, "cbid": 200, "correlation": 289999791 + } + }, + { + "ph": "f", "id": 289999791, "pid": 5714, "tid": 6744, "ts": 6303771927847.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772016387.145, "dur": 1.248, + "args": { + "External id": 154580, "device": 0, "context": 1, "stream": 7, "correlation": 289999794, "bytes": 1536, "memory bandwidth (GB/s)": 1.2307692307692308 + } + }, + { + "ph": "f", "id": 289999794, "pid": 0, "tid": 7, "ts": 6303772016387.145, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771927848.725, "dur": 4.480, + "args": { + "External id": 154580, "cbid": 51, "correlation": 289999794 + } + }, + { + "ph": "s", "id": 289999794, "pid": 5714, "tid": 6744, "ts": 6303771927848.725, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772016389.961, "dur": 354.756, + "args": { + "External id": 154580, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999795, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999795, "pid": 0, "tid": 7, "ts": 6303772016389.961, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927853.325, "dur": 4.970, + "args": { + "External id": 154580, "cbid": 307, "correlation": 289999795 + } + }, + { + "ph": "s", "id": 289999795, "pid": 5714, "tid": 6744, "ts": 6303771927853.325, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771927880.765, "dur": 0.270, + "args": { + "External id": 154581, "cbid": 200, "correlation": 289999820 + } + }, + { + "ph": "f", "id": 289999820, "pid": 5714, "tid": 6744, "ts": 6303771927880.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772016745.421, "dur": 357.476, + "args": { + "External id": 154581, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999823, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999823, "pid": 0, "tid": 7, "ts": 6303772016745.421, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927882.165, "dur": 5.070, + "args": { + "External id": 154581, "cbid": 307, "correlation": 289999823 + } + }, + { + "ph": "s", "id": 289999823, "pid": 5714, "tid": 6744, "ts": 6303771927882.165, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771927908.035, "dur": 0.210, + "args": { + "External id": 154582, "cbid": 200, "correlation": 289999848 + } + }, + { + "ph": "f", "id": 289999848, "pid": 5714, "tid": 6744, "ts": 6303771927908.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772017104.145, "dur": 1.472, + "args": { + "External id": 154582, "device": 0, "context": 1, "stream": 7, "correlation": 289999851, "bytes": 1536, "memory bandwidth (GB/s)": 1.0434782608695652 + } + }, + { + "ph": "f", "id": 289999851, "pid": 0, "tid": 7, "ts": 6303772017104.145, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771927909.125, "dur": 4.370, + "args": { + "External id": 154582, "cbid": 51, "correlation": 289999851 + } + }, + { + "ph": "s", "id": 289999851, "pid": 5714, "tid": 6744, "ts": 6303771927909.125, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772017106.897, "dur": 355.364, + "args": { + "External id": 154582, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999852, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999852, "pid": 0, "tid": 7, "ts": 6303772017106.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927913.635, "dur": 4.720, + "args": { + "External id": 154582, "cbid": 307, "correlation": 289999852 + } + }, + { + "ph": "s", "id": 289999852, "pid": 5714, "tid": 6744, "ts": 6303771927913.635, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771927940.064, "dur": 0.300, + "args": { + "External id": 154583, "cbid": 200, "correlation": 289999877 + } + }, + { + "ph": "f", "id": 289999877, "pid": 5714, "tid": 6744, "ts": 6303771927940.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772017462.901, "dur": 359.013, + "args": { + "External id": 154583, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999880, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999880, "pid": 0, "tid": 7, "ts": 6303772017462.901, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927941.335, "dur": 4.889, + "args": { + "External id": 154583, "cbid": 307, "correlation": 289999880 + } + }, + { + "ph": "s", "id": 289999880, "pid": 5714, "tid": 6744, "ts": 6303771927941.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772017822.618, "dur": 89.888, + "args": { + "External id": 154584, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999893, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999893, "pid": 0, "tid": 7, "ts": 6303772017822.618, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771927981.924, "dur": 5.711, + "args": { + "External id": 154584, "cbid": 307, "correlation": 289999893 + } + }, + { + "ph": "s", "id": 289999893, "pid": 5714, "tid": 6744, "ts": 6303771927981.924, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303772017913.146, "dur": 3.905, + "args": { + "External id": 154585, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999901, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 289999901, "pid": 0, "tid": 7, "ts": 6303772017913.146, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928011.975, "dur": 5.039, + "args": { + "External id": 154585, "cbid": 307, "correlation": 289999901 + } + }, + { + "ph": "s", "id": 289999901, "pid": 5714, "tid": 6744, "ts": 6303771928011.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303772017918.171, "dur": 114.145, + "args": { + "External id": 154586, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999909, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 289999909, "pid": 0, "tid": 7, "ts": 6303772017918.171, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928042.504, "dur": 5.880, + "args": { + "External id": 154586, "cbid": 307, "correlation": 289999909 + } + }, + { + "ph": "s", "id": 289999909, "pid": 5714, "tid": 6744, "ts": 6303771928042.504, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928211.834, "dur": 0.490, + "args": { + "External id": 154605, "cbid": 200, "correlation": 289999955 + } + }, + { + "ph": "f", "id": 289999955, "pid": 5714, "tid": 6744, "ts": 6303771928211.834, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772018033.788, "dur": 0.800, + "args": { + "External id": 154605, "device": 0, "context": 1, "stream": 7, 
"correlation": 289999958, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 289999958, "pid": 0, "tid": 7, "ts": 6303772018033.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771928213.904, "dur": 6.970, + "args": { + "External id": 154605, "cbid": 51, "correlation": 289999958 + } + }, + { + "ph": "s", "id": 289999958, "pid": 5714, "tid": 6744, "ts": 6303771928213.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772018035.772, "dur": 144.002, + "args": { + "External id": 154605, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999959, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999959, "pid": 0, "tid": 7, "ts": 6303772018035.772, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928221.104, "dur": 8.030, + "args": { + "External id": 154605, "cbid": 307, "correlation": 289999959 + } + }, + { + "ph": "s", "id": 289999959, "pid": 5714, "tid": 6744, "ts": 6303771928221.104, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772018180.478, "dur": 141.761, + "args": { + "External id": 154606, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 289999981, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 289999981, "pid": 0, "tid": 7, "ts": 6303772018180.478, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928254.904, "dur": 5.760, + "args": { + "External id": 154606, "cbid": 211, "correlation": 289999981 + } + }, + { + "ph": "s", "id": 289999981, "pid": 5714, "tid": 6744, "ts": 6303771928254.904, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928339.644, "dur": 0.430, + "args": { + "External id": 154607, "cbid": 200, "correlation": 289999999 + } + }, + { + "ph": "f", "id": 289999999, "pid": 5714, "tid": 6744, "ts": 6303771928339.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928340.194, "dur": 0.210, + "args": { + "External id": 154607, "cbid": 200, "correlation": 290000000 + } + }, + { + "ph": "f", "id": 290000000, "pid": 5714, "tid": 6744, "ts": 6303771928340.194, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928359.674, "dur": 0.230, + "args": { + "External id": 154607, "cbid": 200, "correlation": 290000018 + } + }, + { + "ph": "f", "id": 290000018, "pid": 5714, "tid": 6744, "ts": 6303771928359.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772018322.911, "dur": 92.417, + "args": { + "External id": 154607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000019, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000019, "pid": 0, "tid": 7, "ts": 6303772018322.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928361.074, "dur": 9.449, + "args": { + "External id": 154607, "cbid": 211, "correlation": 290000019 + } + }, + { + "ph": "s", "id": 290000019, "pid": 5714, "tid": 6744, "ts": 6303771928361.074, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928371.294, "dur": 0.909, + "args": { + "External id": 154607, "cbid": 273, "correlation": 290000021 + } + }, + { + "ph": "f", "id": 290000021, "pid": 5714, "tid": 6744, "ts": 6303771928371.294, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772018416.064, "dur": 1335.696, + "args": { + "External id": 154607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000022, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290000022, "pid": 0, "tid": 7, "ts": 6303772018416.064, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928372.514, "dur": 4.080, + "args": { + "External id": 154607, "cbid": 211, "correlation": 290000022 + } + }, + { + "ph": "s", "id": 290000022, "pid": 5714, "tid": 6744, "ts": 6303771928372.514, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303772019752.496, "dur": 161.698, + "args": { + "External id": 154607, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000024, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290000024, "pid": 0, "tid": 7, "ts": 6303772019752.496, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928377.114, "dur": 3.689, + "args": { + "External id": 154607, "cbid": 211, "correlation": 290000024 + } + }, + { + "ph": "s", "id": 290000024, "pid": 5714, "tid": 6744, "ts": 6303771928377.114, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772019914.930, "dur": 175.138, + "args": { + "External id": 154618, "device": 0, "context": 1, "stream": 7, "correlation": 290000046, "bytes": 25165824, "memory bandwidth (GB/s)": 143.6913976407176 + } + }, + { + "ph": "f", "id": 290000046, "pid": 0, "tid": 7, "ts": 6303772019914.930, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771928504.843, "dur": 16.840, + "args": { + "External id": 154618, "cbid": 41, "correlation": 290000046 + } + }, + { + "ph": "s", "id": 290000046, "pid": 5714, "tid": 6744, "ts": 6303771928504.843, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772020093.748, "dur": 158.753, + "args": { + "External id": 154615, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000064, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000064, "pid": 0, "tid": 7, "ts": 6303772020093.748, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928613.723, "dur": 8.110, + "args": { + "External id": 154615, "cbid": 307, "correlation": 290000064 + } + }, + { + "ph": "s", "id": 290000064, "pid": 5714, "tid": 6744, "ts": 6303771928613.723, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772020253.141, "dur": 205.923, + "args": { + "External id": 154625, "device": 0, "context": 1, "stream": 7, "correlation": 290000079, "bytes": 25165824, "memory bandwidth (GB/s)": 122.2098745647645 + } + }, + { + "ph": "f", "id": 290000079, "pid": 0, "tid": 7, "ts": 6303772020253.141, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771928686.483, "dur": 13.520, + "args": { + "External id": 154625, "cbid": 41, "correlation": 290000079 + } + }, + { + "ph": "s", "id": 290000079, "pid": 5714, "tid": 6744, "ts": 6303771928686.483, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772020459.736, "dur": 153.890, + "args": { + "External id": 154622, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000097, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000097, "pid": 0, "tid": 7, "ts": 6303772020459.736, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928783.923, "dur": 7.220, + "args": { + "External id": 154622, "cbid": 307, "correlation": 290000097 + } + }, + { + "ph": "s", "id": 290000097, "pid": 5714, "tid": 6744, "ts": 6303771928783.923, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928901.102, "dur": 0.500, + "args": { + "External id": 154630, "cbid": 200, "correlation": 290000127 + } + }, + { + "ph": "f", "id": 290000127, "pid": 5714, "tid": 6744, "ts": 6303771928901.102, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772020683.867, "dur": 34.752, + "args": { + "External id": 154630, "device": 0, "context": 1, "stream": 7, "correlation": 290000130, "bytes": 576, "memory bandwidth (GB/s)": 0.016574585635359115 + } + }, + { + "ph": "f", "id": 290000130, "pid": 0, "tid": 7, "ts": 6303772020683.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771928903.262, "dur": 7.071, + "args": { + "External id": 154630, "cbid": 51, "correlation": 290000130 + } + }, + { + "ph": "s", "id": 290000130, "pid": 5714, "tid": 6744, "ts": 6303771928903.262, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772020732.379, "dur": 152.130, + "args": { + "External id": 154630, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000131, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000131, "pid": 0, "tid": 7, "ts": 6303772020732.379, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928910.542, "dur": 8.040, + "args": { + "External id": 154630, "cbid": 307, "correlation": 290000131 + } + }, + { + "ph": "s", "id": 290000131, "pid": 5714, "tid": 6744, "ts": 6303771928910.542, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928944.232, "dur": 0.300, + "args": { + "External id": 154631, "cbid": 200, "correlation": 290000156 + } + }, + { + "ph": "f", "id": 290000156, "pid": 5714, "tid": 6744, "ts": 6303771928944.232, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772020889.181, "dur": 6.752, + "args": { + "External id": 154631, "device": 0, "context": 1, "stream": 7, "correlation": 290000159, "bytes": 576, "memory bandwidth (GB/s)": 0.08530805687203792 + } + }, + { + "ph": "f", "id": 290000159, "pid": 0, "tid": 7, "ts": 6303772020889.181, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771928945.482, "dur": 4.360, + "args": { + "External id": 154631, "cbid": 51, "correlation": 290000159 + } + }, + { + "ph": "s", "id": 290000159, 
"pid": 5714, "tid": 6744, "ts": 6303771928945.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772020901.661, "dur": 144.738, + "args": { + "External id": 154631, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000160, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000160, "pid": 0, "tid": 7, "ts": 6303772020901.661, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928949.972, "dur": 4.830, + "args": { + "External id": 154631, "cbid": 307, "correlation": 290000160 + } + }, + { + "ph": "s", "id": 290000160, "pid": 5714, "tid": 6744, "ts": 6303771928949.972, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771928976.442, "dur": 0.230, + "args": { + "External id": 154632, "cbid": 200, "correlation": 290000185 + } + }, + { + "ph": "f", "id": 290000185, "pid": 5714, "tid": 6744, "ts": 6303771928976.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772021047.231, "dur": 0.800, + "args": { + "External id": 154632, "device": 0, "context": 1, "stream": 7, "correlation": 290000188, "bytes": 576, "memory bandwidth (GB/s)": 0.72 + } + }, + { + "ph": "f", "id": 290000188, "pid": 0, "tid": 7, "ts": 6303772021047.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771928977.582, "dur": 3.970, + "args": { + "External id": 
154632, "cbid": 51, "correlation": 290000188 + } + }, + { + "ph": "s", "id": 290000188, "pid": 5714, "tid": 6744, "ts": 6303771928977.582, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772021049.215, "dur": 142.785, + "args": { + "External id": 154632, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000189, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000189, "pid": 0, "tid": 7, "ts": 6303772021049.215, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771928981.682, "dur": 4.620, + "args": { + "External id": 154632, "cbid": 307, "correlation": 290000189 + } + }, + { + "ph": "s", "id": 290000189, "pid": 5714, "tid": 6744, "ts": 6303771928981.682, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772021192.672, "dur": 553.607, + "args": { + "External id": 154633, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000211, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000211, "pid": 0, "tid": 7, "ts": 6303772021192.672, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929008.802, "dur": 5.240, + "args": { + "External id": 154633, "cbid": 211, "correlation": 290000211 + } + }, + { + "ph": "s", "id": 290000211, "pid": 5714, "tid": 6744, "ts": 6303771929008.802, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772021746.919, "dur": 141.634, + "args": { + "External id": 154634, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000234, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000234, "pid": 0, "tid": 7, "ts": 6303772021746.919, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929031.342, "dur": 4.430, + "args": { + "External id": 154634, "cbid": 211, "correlation": 290000234 + } + }, + { + "ph": "s", "id": 290000234, "pid": 5714, "tid": 6744, "ts": 6303771929031.342, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772021889.256, "dur": 143.202, + "args": { + "External id": 154635, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000257, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000257, "pid": 0, "tid": 7, "ts": 6303772021889.256, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929051.732, "dur": 4.410, + "args": { + "External id": 154635, "cbid": 211, "correlation": 290000257 + } + }, + { + "ph": "s", "id": 290000257, "pid": 5714, "tid": 6744, "ts": 6303771929051.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303772022033.162, "dur": 79.553, + "args": { + "External id": 154636, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000265, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000265, "pid": 0, "tid": 7, "ts": 6303772022033.162, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929086.672, "dur": 5.170, + "args": { + "External id": 154636, "cbid": 307, "correlation": 290000265 + } + }, + { + "ph": "s", "id": 290000265, "pid": 5714, "tid": 6744, "ts": 6303771929086.672, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303772022113.419, "dur": 48.321, + "args": { + "External id": 154651, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000294, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000294, "pid": 0, "tid": 7, "ts": 6303772022113.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929239.512, "dur": 8.770, + "args": { + "External id": 154651, "cbid": 307, "correlation": 290000294 + } + }, + { + "ph": "s", "id": 290000294, "pid": 5714, "tid": 6744, "ts": 6303771929239.512, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772022162.348, "dur": 4.000, + "args": { + "External id": 154652, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000302, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 290000302, "pid": 0, "tid": 7, "ts": 6303772022162.348, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929272.981, "dur": 5.180, + "args": { + "External id": 154652, "cbid": 307, "correlation": 290000302 + } + }, + { + "ph": "s", "id": 290000302, "pid": 5714, "tid": 6744, "ts": 6303771929272.981, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303772022166.988, "dur": 50.016, + "args": { + "External id": 154653, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000313, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000313, "pid": 0, "tid": 7, "ts": 6303772022166.988, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929312.041, "dur": 5.580, + "args": { + "External id": 154653, "cbid": 307, "correlation": 290000313 + } + }, + { + "ph": "s", "id": 290000313, "pid": 5714, "tid": 6744, "ts": 6303771929312.041, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772022217.740, "dur": 47.169, + "args": { + "External id": 154654, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000318, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000318, "pid": 0, "tid": 7, "ts": 6303772022217.740, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929352.192, "dur": 7.180, + "args": { + "External id": 154654, "cbid": 211, "correlation": 290000318 + } + }, + { + "ph": "s", "id": 290000318, "pid": 5714, "tid": 6744, "ts": 6303771929352.192, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771929518.221, "dur": 2.820, + "args": { + "External id": 154660, "cbid": 147, "correlation": 290000335 + } + }, + { + "ph": "s", "id": 290000335, "pid": 5714, "tid": 6744, "ts": 6303771929518.221, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771929617.801, "dur": 2.540, + "args": { + "External id": 154668, "cbid": 138, "correlation": 290000350 + } + }, + { + "ph": "f", "id": 290000350, "pid": 5714, "tid": 6744, "ts": 6303771929617.801, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771929620.671, "dur": 0.600, + "args": { + "External id": 154668, "cbid": 138, "correlation": 290000351 + } + }, + { + "ph": "f", "id": 290000351, "pid": 5714, "tid": 6744, "ts": 6303771929620.671, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771929621.571, "dur": 0.630, + "args": { + "External id": 154668, "cbid": 138, "correlation": 290000352 + } + }, + { + "ph": "f", "id": 290000352, "pid": 5714, "tid": 6744, "ts": 6303771929621.571, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + 
"ts": 6303772022268.365, "dur": 5.248, + "args": { + "External id": 154672, "device": 0, "context": 1, "stream": 7, "correlation": 290000363, "bytes": 28112, "memory bandwidth (GB/s)": 5.3567073170731705 + } + }, + { + "ph": "f", "id": 290000363, "pid": 0, "tid": 7, "ts": 6303772022268.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771929641.841, "dur": 11.340, + "args": { + "External id": 154672, "cbid": 41, "correlation": 290000363 + } + }, + { + "ph": "s", "id": 290000363, "pid": 5714, "tid": 6744, "ts": 6303771929641.841, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929656.971, "dur": 1.640, + "args": { + "External id": 154667, "cbid": 135, "correlation": 290000367 + } + }, + { + "ph": "f", "id": 290000367, "pid": 5714, "tid": 6744, "ts": 6303771929656.971, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303772022276.109, "dur": 50.945, + "args": { + "External id": 154667, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000371, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000371, "pid": 0, "tid": 7, "ts": 6303772022276.109, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771929661.511, "dur": 9.790, + "args": { + "External id": 154667, "cbid": 211, "correlation": 290000371 + } + }, + { + "ph": "s", "id": 290000371, "pid": 5714, "tid": 6744, "ts": 6303771929661.511, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929708.631, "dur": 0.880, + "args": { + "External id": 154660, "cbid": 135, "correlation": 290000382 + } + }, + { + "ph": "f", "id": 290000382, "pid": 5714, "tid": 6744, "ts": 6303771929708.631, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771929711.441, "dur": 1.260, + "args": { + "External id": 154660, "cbid": 147, "correlation": 290000386 + } + }, + { + "ph": "s", "id": 290000386, "pid": 5714, "tid": 6744, "ts": 6303771929711.441, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771929778.100, "dur": 1.011, + "args": { + "External id": 154676, "cbid": 317, "correlation": 290000406 + } + }, + { + "ph": "f", "id": 290000406, "pid": 5714, "tid": 6744, "ts": 6303771929778.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929780.940, "dur": 1.211, + "args": { + "External id": 154676, "cbid": 135, "correlation": 290000408 + } + }, + { + "ph": "f", "id": 290000408, "pid": 5714, "tid": 6744, "ts": 6303771929780.940, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771929783.500, "dur": 1.011, + "args": { + "External id": 154676, "cbid": 147, "correlation": 290000412 + } + }, + { + "ph": "s", "id": 290000412, "pid": 5714, "tid": 6744, "ts": 6303771929783.500, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771929798.040, "dur": 0.671, + "args": { + "External id": 154676, "cbid": 409, "correlation": 290000415 + } + }, + { + "ph": "f", "id": 290000415, "pid": 5714, "tid": 6744, "ts": 6303771929798.040, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929802.591, "dur": 0.840, + "args": { + "External id": 154676, "cbid": 135, "correlation": 290000418 + } + }, + { + "ph": "f", "id": 290000418, "pid": 5714, "tid": 6744, "ts": 6303771929802.591, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771929803.611, "dur": 0.809, + "args": { + "External id": 154676, "cbid": 147, "correlation": 290000419 + } + }, + { + "ph": "s", "id": 290000419, "pid": 5714, "tid": 6744, "ts": 6303771929803.611, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772026277.116, "dur": 8405.889, + "args": { + "External id": 154676, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 290000421, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 290000421, "pid": 0, "tid": 20, "ts": 6303772026277.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771929805.500, "dur": 9.251, + "args": { + "External id": 154676, "cbid": 430, "correlation": 290000421 + } + }, + { + "ph": "s", "id": 290000421, "pid": 5714, "tid": 6744, "ts": 6303771929805.500, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929815.790, "dur": 0.400, + "args": { + "External id": 154676, "cbid": 135, "correlation": 290000423 + } + }, + { + "ph": "f", "id": 290000423, "pid": 5714, "tid": 6744, "ts": 6303771929815.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771929816.380, "dur": 0.530, + "args": { + "External id": 154676, "cbid": 147, "correlation": 290000424 + } + }, + { + "ph": "s", "id": 290000424, "pid": 5714, "tid": 6744, "ts": 6303771929816.380, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929818.270, "dur": 0.740, + "args": { + "External id": 154676, "cbid": 135, "correlation": 290000427 + } + }, + { + "ph": "f", "id": 290000427, "pid": 5714, "tid": 6744, "ts": 6303771929818.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929826.270, "dur": 0.440, + "args": { + "External 
id": 154676, "cbid": 135, "correlation": 290000434 + } + }, + { + "ph": "f", "id": 290000434, "pid": 5714, "tid": 6744, "ts": 6303771929826.270, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771929851.310, "dur": 0.940, + "args": { + "External id": 154678, "cbid": 147, "correlation": 290000439 + } + }, + { + "ph": "s", "id": 290000439, "pid": 5714, "tid": 6744, "ts": 6303771929851.310, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771929867.590, "dur": 0.820, + "args": { + "External id": 154660, "cbid": 135, "correlation": 290000454 + } + }, + { + "ph": "f", "id": 290000454, "pid": 5714, "tid": 6744, "ts": 6303771929867.590, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930038.800, "dur": 1.240, + "args": { + "External id": 154660, "cbid": 135, "correlation": 290000467 + } + }, + { + "ph": "f", "id": 290000467, "pid": 5714, "tid": 6744, "ts": 6303771930038.800, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930139.560, "dur": 3.100, + "args": { + "External id": 154688, "cbid": 147, "correlation": 290000478 + } + }, + { + "ph": "s", "id": 290000478, "pid": 5714, "tid": 6744, "ts": 6303771930139.560, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771930246.339, "dur": 1.160, + "args": { + "External id": 154702, "cbid": 317, "correlation": 290000519 + } + }, + { + "ph": "f", "id": 290000519, "pid": 5714, "tid": 6744, "ts": 6303771930246.339, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771930254.450, "dur": 2.440, + "args": { + "External id": 154703, "cbid": 138, "correlation": 290000522 + } + }, + { + "ph": "f", "id": 290000522, "pid": 5714, "tid": 6744, "ts": 6303771930254.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772026277.884, "dur": 2.400, + "args": { + "External id": 154707, "device": 0, "context": 1, "stream": 7, "correlation": 290000533, "bytes": 7224, "memory bandwidth (GB/s)": 3.01 + } + }, + { + "ph": "f", "id": 290000533, "pid": 0, "tid": 7, "ts": 6303772026277.884, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771930276.149, "dur": 11.370, + "args": { + "External id": 154707, "cbid": 41, "correlation": 290000533 + } + }, + { + "ph": "s", "id": 290000533, "pid": 5714, "tid": 6744, "ts": 6303771930276.149, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930291.599, "dur": 1.590, + "args": { + "External id": 154702, "cbid": 135, "correlation": 290000537 + } + }, + { + "ph": "f", "id": 290000537, "pid": 5714, "tid": 6744, "ts": 6303771930291.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303772026282.332, "dur": 13.056, + "args": { + "External id": 154702, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000541, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000541, "pid": 0, "tid": 7, "ts": 6303772026282.332, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771930295.519, "dur": 17.420, + "args": { + "External id": 154702, "cbid": 211, "correlation": 290000541 + } + }, + { + "ph": "s", "id": 290000541, "pid": 5714, "tid": 6744, "ts": 6303771930295.519, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930400.599, "dur": 1.220, + "args": { + "External id": 154688, "cbid": 135, "correlation": 290000552 + } + }, + { + "ph": "f", "id": 290000552, "pid": 5714, "tid": 6744, "ts": 6303771930400.599, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930404.849, "dur": 1.130, + "args": { + "External id": 154688, "cbid": 147, "correlation": 290000556 + } + }, + { + "ph": "s", "id": 290000556, "pid": 5714, "tid": 6744, "ts": 6303771930404.849, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930407.689, "dur": 0.780, + "args": { + "External id": 154688, "cbid": 147, "correlation": 290000560 + } + }, + { + "ph": "s", "id": 290000560, "pid": 5714, "tid": 6744, "ts": 6303771930407.689, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::CopyFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303772026328.700, "dur": 27.264, + "args": { + "External id": 154721, "queued": 0, "device": 0, "context": 1, "stream": 
17, "correlation": 290000584, "registers per thread": 40, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 290000584, "pid": 0, "tid": 17, "ts": 6303772026328.700, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771930547.269, "dur": 10.990, + "args": { + "External id": 154721, "cbid": 211, "correlation": 290000584 + } + }, + { + "ph": "s", "id": 290000584, "pid": 5714, "tid": 6744, "ts": 6303771930547.269, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy >(at::native::(anonymous namespace)::TensorListMetadata<2>, at::native::(anonymous namespace)::UnaryOpFunctor, at::native::Copy)", "pid": 0, "tid": 17, + "ts": 6303772026371.261, "dur": 333.891, + "args": { + "External id": 154737, "queued": 0, "device": 0, "context": 1, "stream": 17, "correlation": 290000597, "registers per thread": 36, "shared memory": 0, "blocks per SM": 0.250000, "warps per SM": 4.000000, "grid": [32, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 290000597, "pid": 0, "tid": 17, "ts": 6303772026371.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771930652.549, "dur": 9.160, + "args": { + "External id": 154737, "cbid": 211, "correlation": 290000597 + } + }, + { + "ph": "s", "id": 290000597, "pid": 5714, "tid": 6744, "ts": 6303771930652.549, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930683.978, "dur": 1.220, + "args": { + "External id": 154688, "cbid": 135, "correlation": 290000607 + } + }, + { + "ph": "f", "id": 290000607, "pid": 5714, "tid": 6744, "ts": 6303771930683.978, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930687.009, "dur": 1.100, + "args": { + "External id": 154688, "cbid": 147, "correlation": 290000611 + } + }, + { + "ph": "s", "id": 290000611, "pid": 5714, "tid": 6744, "ts": 6303771930687.009, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771930736.108, "dur": 0.840, + "args": { + "External id": 154739, "cbid": 317, "correlation": 290000624 + } + }, + { + "ph": "f", "id": 290000624, "pid": 5714, "tid": 6744, "ts": 6303771930736.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930738.698, "dur": 1.030, + "args": { + "External id": 154739, "cbid": 135, "correlation": 290000626 + } + }, + { + "ph": "f", "id": 290000626, "pid": 5714, "tid": 6744, "ts": 6303771930738.698, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 
6744, + "ts": 6303771930741.158, "dur": 1.040, + "args": { + "External id": 154739, "cbid": 147, "correlation": 290000630 + } + }, + { + "ph": "s", "id": 290000630, "pid": 5714, "tid": 6744, "ts": 6303771930741.158, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771930755.298, "dur": 0.650, + "args": { + "External id": 154739, "cbid": 409, "correlation": 290000633 + } + }, + { + "ph": "f", "id": 290000633, "pid": 5714, "tid": 6744, "ts": 6303771930755.298, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930759.678, "dur": 0.690, + "args": { + "External id": 154739, "cbid": 135, "correlation": 290000636 + } + }, + { + "ph": "f", "id": 290000636, "pid": 5714, "tid": 6744, "ts": 6303771930759.678, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930760.538, "dur": 0.800, + "args": { + "External id": 154739, "cbid": 147, "correlation": 290000637 + } + }, + { + "ph": "s", "id": 290000637, "pid": 5714, "tid": 6744, "ts": 6303771930760.538, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllGather_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772034804.062, "dur": 5113.660, + "args": { + "External id": 154739, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 290000639, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "_allgather_base", "In msg nelems": 1769856, "Out msg nelems": 7079424, "Group size": 4, "dtype": "BFloat16", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 290000639, "pid": 0, "tid": 20, "ts": 6303772034804.062, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771930762.378, "dur": 9.300, + "args": { + "External id": 154739, "cbid": 430, "correlation": 290000639 + } + }, + { + "ph": "s", "id": 290000639, "pid": 5714, "tid": 6744, "ts": 6303771930762.378, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930772.898, "dur": 0.410, + "args": { + "External id": 154739, "cbid": 135, "correlation": 290000641 + } + }, + { + "ph": "f", "id": 290000641, "pid": 5714, "tid": 6744, "ts": 6303771930772.898, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930773.478, "dur": 0.480, + "args": { + "External id": 154739, "cbid": 147, "correlation": 290000642 + } + }, + { + "ph": "s", "id": 290000642, "pid": 5714, "tid": 6744, "ts": 6303771930773.478, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930775.338, "dur": 0.850, + "args": { + "External id": 154739, "cbid": 135, "correlation": 290000645 + } + }, + { + "ph": "f", "id": 290000645, "pid": 5714, "tid": 6744, "ts": 6303771930775.338, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930783.078, "dur": 0.440, + "args": { + "External id": 
154739, "cbid": 135, "correlation": 290000652 + } + }, + { + "ph": "f", "id": 290000652, "pid": 5714, "tid": 6744, "ts": 6303771930783.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771930807.438, "dur": 0.960, + "args": { + "External id": 154741, "cbid": 147, "correlation": 290000657 + } + }, + { + "ph": "s", "id": 290000657, "pid": 5714, "tid": 6744, "ts": 6303771930807.438, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771930823.498, "dur": 0.820, + "args": { + "External id": 154688, "cbid": 135, "correlation": 290000672 + } + }, + { + "ph": "f", "id": 290000672, "pid": 5714, "tid": 6744, "ts": 6303771930823.498, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303772026295.964, "dur": 2464.476, + "args": { + "External id": 154743, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000697, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000697, "pid": 0, "tid": 7, "ts": 6303772026295.964, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771930958.118, "dur": 10.820, + "args": { + "External id": 154743, "cbid": 211, "correlation": 290000697 + } + }, + { + "ph": "s", "id": 290000697, "pid": 5714, "tid": 6744, "ts": 6303771930958.118, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303772028804.697, "dur": 617.095, + "args": { + "External id": 154744, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000720, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290000720, "pid": 0, "tid": 7, "ts": 6303772028804.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931013.988, "dur": 6.250, + "args": { + "External id": 154744, "cbid": 307, "correlation": 290000720 + } + }, + { + "ph": "s", "id": 290000720, "pid": 5714, "tid": 6744, "ts": 6303771931013.988, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931052.728, "dur": 0.540, + "args": { + "External id": 154745, "cbid": 200, "correlation": 290000743 + } + }, + { + "ph": "f", "id": 290000743, "pid": 5714, "tid": 6744, "ts": 6303771931052.728, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772029493.889, "dur": 69.697, + "args": { + "External id": 154745, "device": 0, "context": 1, "stream": 7, 
"correlation": 290000746, "bytes": 1536, "memory bandwidth (GB/s)": 0.02203825128771683 + } + }, + { + "ph": "f", "id": 290000746, "pid": 0, "tid": 7, "ts": 6303772029493.889, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771931054.858, "dur": 6.660, + "args": { + "External id": 154745, "cbid": 51, "correlation": 290000746 + } + }, + { + "ph": "s", "id": 290000746, "pid": 5714, "tid": 6744, "ts": 6303771931054.858, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772029645.986, "dur": 566.567, + "args": { + "External id": 154745, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000747, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000747, "pid": 0, "tid": 7, "ts": 6303772029645.986, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931061.728, "dur": 5.960, + "args": { + "External id": 154745, "cbid": 307, "correlation": 290000747 + } + }, + { + "ph": "s", "id": 290000747, "pid": 5714, "tid": 6744, "ts": 6303771931061.728, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931093.717, "dur": 0.291, + "args": { + "External id": 154746, "cbid": 200, "correlation": 290000772 + } + }, + { + "ph": "f", "id": 290000772, "pid": 5714, "tid": 6744, "ts": 6303771931093.717, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772030213.769, "dur": 1.376, + "args": { + "External id": 154746, "device": 0, "context": 1, "stream": 7, "correlation": 290000775, "bytes": 1536, "memory bandwidth (GB/s)": 1.1162790697674418 + } + }, + { + "ph": "f", "id": 290000775, "pid": 0, "tid": 7, "ts": 6303772030213.769, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771931095.017, "dur": 4.511, + "args": { + "External id": 154746, "cbid": 51, "correlation": 290000775 + } + }, + { + "ph": "s", "id": 290000775, "pid": 5714, "tid": 6744, "ts": 6303771931095.017, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772030216.329, "dur": 353.604, + "args": { + "External id": 154746, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000776, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000776, "pid": 0, "tid": 7, "ts": 6303772030216.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931099.677, "dur": 4.980, + "args": { + "External id": 154746, "cbid": 307, "correlation": 290000776 + } + }, + { + "ph": "s", "id": 290000776, "pid": 5714, "tid": 6744, "ts": 6303771931099.677, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931126.728, "dur": 0.269, + "args": { + "External id": 154747, "cbid": 200, "correlation": 290000801 + } + }, + { + "ph": "f", "id": 290000801, "pid": 5714, "tid": 6744, "ts": 6303771931126.728, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772030570.573, "dur": 355.908, + "args": { + "External id": 154747, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000804, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000804, "pid": 0, "tid": 7, "ts": 6303772030570.573, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931128.077, "dur": 5.080, + "args": { + "External id": 154747, "cbid": 307, "correlation": 290000804 + } + }, + { + "ph": "s", "id": 290000804, "pid": 5714, "tid": 6744, "ts": 6303771931128.077, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931153.437, "dur": 0.251, + "args": { + "External id": 154748, "cbid": 200, "correlation": 290000829 + } + }, + { + "ph": "f", "id": 290000829, "pid": 5714, "tid": 6744, "ts": 6303771931153.437, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772030927.761, "dur": 1.440, + "args": { + "External id": 154748, "device": 0, "context": 1, "stream": 7, "correlation": 290000832, "bytes": 1536, "memory bandwidth (GB/s)": 1.0666666666666667 + } + }, + { + "ph": "f", "id": 290000832, "pid": 0, "tid": 7, "ts": 6303772030927.761, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771931154.608, "dur": 4.220, + "args": { + "External id": 154748, "cbid": 51, "correlation": 290000832 + } + }, + { + "ph": "s", "id": 290000832, "pid": 5714, "tid": 6744, "ts": 6303771931154.608, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772030930.449, "dur": 355.588, + "args": { + "External id": 154748, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000833, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000833, "pid": 0, "tid": 7, "ts": 6303772030930.449, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931158.977, "dur": 4.740, + "args": { + "External id": 154748, "cbid": 307, "correlation": 290000833 + } + }, + { + "ph": "s", "id": 290000833, "pid": 5714, "tid": 6744, "ts": 6303771931158.977, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931185.727, "dur": 0.270, + "args": { + "External id": 154749, "cbid": 200, "correlation": 290000858 + } + }, + { + "ph": "f", "id": 290000858, "pid": 5714, "tid": 6744, "ts": 6303771931185.727, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772031286.646, "dur": 359.556, + "args": { + "External id": 154749, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000861, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000861, "pid": 0, "tid": 7, "ts": 6303772031286.646, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931186.987, "dur": 5.010, + "args": { + "External id": 154749, "cbid": 307, "correlation": 290000861 + } + }, + { + "ph": "s", "id": 290000861, "pid": 5714, "tid": 6744, "ts": 6303771931186.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772031646.842, "dur": 85.409, + "args": { + "External id": 154750, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000874, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000874, "pid": 0, "tid": 7, "ts": 6303772031646.842, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931226.157, "dur": 5.630, + "args": { + "External id": 154750, "cbid": 307, "correlation": 290000874 + } + }, + { + "ph": "s", "id": 290000874, "pid": 5714, "tid": 6744, "ts": 6303771931226.157, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303772031732.891, "dur": 3.424, + "args": { + "External id": 154751, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000882, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 290000882, "pid": 0, "tid": 7, "ts": 6303772031732.891, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931255.497, "dur": 4.840, + "args": { + "External id": 154751, "cbid": 307, "correlation": 290000882 + } + }, + { + "ph": "s", "id": 290000882, "pid": 5714, "tid": 6744, "ts": 6303771931255.497, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303772031737.467, "dur": 114.017, + "args": { + "External id": 154752, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000890, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290000890, "pid": 0, "tid": 7, "ts": 6303772031737.467, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931284.317, "dur": 4.720, + "args": { + "External id": 154752, "cbid": 307, "correlation": 290000890 + } + }, + { + "ph": "s", "id": 290000890, "pid": 5714, "tid": 6744, "ts": 6303771931284.317, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931466.217, "dur": 0.490, + "args": { + "External id": 154771, "cbid": 200, "correlation": 290000936 + } + }, + { + "ph": "f", "id": 290000936, "pid": 5714, "tid": 6744, "ts": 6303771931466.217, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772031852.732, "dur": 1.152, + "args": { + "External id": 154771, "device": 0, "context": 1, "stream": 7, 
"correlation": 290000939, "bytes": 576, "memory bandwidth (GB/s)": 0.5 + } + }, + { + "ph": "f", "id": 290000939, "pid": 0, "tid": 7, "ts": 6303772031852.732, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771931468.377, "dur": 7.380, + "args": { + "External id": 154771, "cbid": 51, "correlation": 290000939 + } + }, + { + "ph": "s", "id": 290000939, "pid": 5714, "tid": 6744, "ts": 6303771931468.377, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772031855.356, "dur": 144.290, + "args": { + "External id": 154771, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000940, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000940, "pid": 0, "tid": 7, "ts": 6303772031855.356, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931475.987, "dur": 8.140, + "args": { + "External id": 154771, "cbid": 307, "correlation": 290000940 + } + }, + { + "ph": "s", "id": 290000940, "pid": 5714, "tid": 6744, "ts": 6303771931475.987, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772032000.350, "dur": 259.875, + "args": { + "External id": 154772, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290000962, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290000962, "pid": 0, "tid": 7, "ts": 6303772032000.350, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931508.717, "dur": 5.780, + "args": { + "External id": 154772, "cbid": 211, "correlation": 290000962 + } + }, + { + "ph": "s", "id": 290000962, "pid": 5714, "tid": 6744, "ts": 6303771931508.717, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931583.936, "dur": 0.420, + "args": { + "External id": 154773, "cbid": 200, "correlation": 290000980 + } + }, + { + "ph": "f", "id": 290000980, "pid": 5714, "tid": 6744, "ts": 6303771931583.936, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931584.487, "dur": 0.189, + "args": { + "External id": 154773, "cbid": 200, "correlation": 290000981 + } + }, + { + "ph": "f", "id": 290000981, "pid": 5714, "tid": 6744, "ts": 6303771931584.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931602.867, "dur": 0.209, + "args": { + "External id": 154773, "cbid": 200, "correlation": 290000999 + } + }, + { + "ph": "f", "id": 290000999, "pid": 5714, "tid": 6744, "ts": 6303771931602.867, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772032260.865, "dur": 208.130, + "args": { + "External id": 154773, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001000, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001000, "pid": 0, "tid": 7, "ts": 6303772032260.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931604.216, "dur": 8.840, + "args": { + "External id": 154773, "cbid": 211, "correlation": 290001000 + } + }, + { + "ph": "s", "id": 290001000, "pid": 5714, "tid": 6744, "ts": 6303771931604.216, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771931613.796, "dur": 0.930, + "args": { + "External id": 154773, "cbid": 273, "correlation": 290001002 + } + }, + { + "ph": "f", "id": 290001002, "pid": 5714, "tid": 6744, "ts": 6303771931613.796, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772032469.667, "dur": 1473.265, + "args": { + "External id": 154773, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001003, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290001003, "pid": 0, "tid": 7, "ts": 6303772032469.667, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931615.026, "dur": 4.220, + "args": { + "External id": 154773, "cbid": 211, "correlation": 290001003 + } + }, + { + "ph": "s", "id": 290001003, "pid": 5714, "tid": 6744, "ts": 6303771931615.026, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303772033944.116, "dur": 204.643, + "args": { + "External id": 154773, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001005, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290001005, "pid": 0, "tid": 7, "ts": 6303772033944.116, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931619.766, "dur": 3.530, + "args": { + "External id": 154773, "cbid": 211, "correlation": 290001005 + } + }, + { + "ph": "s", "id": 290001005, "pid": 5714, "tid": 6744, "ts": 6303771931619.766, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772034149.367, "dur": 174.658, + "args": { + "External id": 154784, "device": 0, "context": 1, "stream": 7, "correlation": 290001027, "bytes": 25165824, "memory bandwidth (GB/s)": 144.08629435811702 + } + }, + { + "ph": "f", "id": 290001027, "pid": 0, "tid": 7, "ts": 6303772034149.367, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, 
+ "ts": 6303771931748.456, "dur": 16.330, + "args": { + "External id": 154784, "cbid": 41, "correlation": 290001027 + } + }, + { + "ph": "s", "id": 290001027, "pid": 5714, "tid": 6744, "ts": 6303771931748.456, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772034324.697, "dur": 84.257, + "args": { + "External id": 154781, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001045, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001045, "pid": 0, "tid": 7, "ts": 6303772034324.697, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771931856.516, "dur": 8.370, + "args": { + "External id": 154781, "cbid": 307, "correlation": 290001045 + } + }, + { + "ph": "s", "id": 290001045, "pid": 5714, "tid": 6744, "ts": 6303771931856.516, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772034409.562, "dur": 57.664, + "args": { + "External id": 154791, "device": 0, "context": 1, "stream": 7, "correlation": 290001060, "bytes": 25165824, "memory bandwidth (GB/s)": 436.4217536071032 + } + }, + { + "ph": "f", "id": 290001060, "pid": 0, "tid": 7, "ts": 6303772034409.562, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771931922.546, "dur": 12.970, + "args": { + "External id": 154791, "cbid": 41, "correlation": 290001060 + } + }, + { + "ph": "s", "id": 290001060, "pid": 5714, "tid": 6744, "ts": 6303771931922.546, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772034469.083, "dur": 43.360, + "args": { + "External id": 154788, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001078, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001078, "pid": 0, "tid": 7, "ts": 6303772034469.083, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932018.175, "dur": 7.340, + "args": { + "External id": 154788, "cbid": 307, "correlation": 290001078 + } + }, + { + "ph": "s", "id": 290001078, "pid": 5714, "tid": 6744, "ts": 6303771932018.175, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771932137.755, "dur": 0.520, + "args": { + "External id": 154796, "cbid": 200, "correlation": 290001108 + } + }, + { + "ph": "f", "id": 290001108, "pid": 5714, "tid": 6744, "ts": 6303771932137.755, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772034519.419, "dur": 5.664, + "args": { + "External id": 154796, "device": 0, "context": 1, "stream": 7, "correlation": 290001111, "bytes": 576, "memory bandwidth (GB/s)": 0.1016949152542373 + } + }, + { + "ph": "f", "id": 290001111, "pid": 0, "tid": 7, "ts": 6303772034519.419, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771932139.915, "dur": 7.200, + "args": { + "External id": 154796, "cbid": 51, "correlation": 290001111 + } + }, + { + "ph": "s", "id": 290001111, "pid": 5714, "tid": 6744, "ts": 6303771932139.915, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772034529.211, "dur": 148.898, + "args": { + "External id": 154796, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001112, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001112, "pid": 0, "tid": 7, "ts": 6303772034529.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932147.335, "dur": 7.630, + "args": { + "External id": 154796, "cbid": 307, "correlation": 290001112 + } + }, + { + "ph": "s", "id": 290001112, "pid": 5714, "tid": 6744, "ts": 6303771932147.335, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771932179.475, "dur": 0.300, + "args": { + "External id": 154797, "cbid": 200, "correlation": 290001137 + } + }, + { + "ph": "f", "id": 290001137, "pid": 5714, "tid": 6744, "ts": 6303771932179.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772034678.973, "dur": 0.768, + "args": { + "External id": 154797, "device": 0, "context": 1, "stream": 7, "correlation": 290001140, "bytes": 576, "memory bandwidth (GB/s)": 0.75 + } + }, + { + "ph": "f", "id": 290001140, "pid": 0, "tid": 7, "ts": 6303772034678.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771932180.715, "dur": 4.250, + "args": { + "External id": 154797, "cbid": 51, "correlation": 290001140 + } + }, + { + "ph": "s", "id": 290001140, "pid": 5714, "tid": 
6744, "ts": 6303771932180.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772034680.957, "dur": 137.665, + "args": { + "External id": 154797, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001141, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001141, "pid": 0, "tid": 7, "ts": 6303772034680.957, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932185.105, "dur": 5.000, + "args": { + "External id": 154797, "cbid": 307, "correlation": 290001141 + } + }, + { + "ph": "s", "id": 290001141, "pid": 5714, "tid": 6744, "ts": 6303771932185.105, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771932210.615, "dur": 0.270, + "args": { + "External id": 154798, "cbid": 200, "correlation": 290001166 + } + }, + { + "ph": "f", "id": 290001166, "pid": 5714, "tid": 6744, "ts": 6303771932210.615, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772034819.838, "dur": 1.248, + "args": { + "External id": 154798, "device": 0, "context": 1, "stream": 7, "correlation": 290001169, "bytes": 576, "memory bandwidth (GB/s)": 0.46153846153846156 + } + }, + { + "ph": "f", "id": 290001169, "pid": 0, "tid": 7, "ts": 6303772034819.838, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771932211.715, "dur": 4.110, + "args": { + "External id": 154798, 
"cbid": 51, "correlation": 290001169 + } + }, + { + "ph": "s", "id": 290001169, "pid": 5714, "tid": 6744, "ts": 6303771932211.715, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772034822.559, "dur": 393.092, + "args": { + "External id": 154798, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001170, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001170, "pid": 0, "tid": 7, "ts": 6303772034822.559, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932215.975, "dur": 4.760, + "args": { + "External id": 154798, "cbid": 307, "correlation": 290001170 + } + }, + { + "ph": "s", "id": 290001170, "pid": 5714, "tid": 6744, "ts": 6303771932215.975, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772035284.516, "dur": 235.587, + "args": { + "External id": 154799, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001192, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001192, "pid": 0, "tid": 7, "ts": 6303772035284.516, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932242.225, "dur": 5.360, + "args": { + "External id": 154799, "cbid": 211, "correlation": 290001192 + } + }, + { + "ph": "s", "id": 290001192, "pid": 5714, "tid": 6744, "ts": 6303771932242.225, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772035520.711, "dur": 141.826, + "args": { + "External id": 154800, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001215, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001215, "pid": 0, "tid": 7, "ts": 6303772035520.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932265.025, "dur": 4.630, + "args": { + "External id": 154800, "cbid": 211, "correlation": 290001215 + } + }, + { + "ph": "s", "id": 290001215, "pid": 5714, "tid": 6744, "ts": 6303771932265.025, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772035663.241, "dur": 142.561, + "args": { + "External id": 154801, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001238, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001238, "pid": 0, "tid": 7, "ts": 6303772035663.241, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932285.705, "dur": 4.460, + "args": { + "External id": 154801, "cbid": 211, "correlation": 290001238 + } + }, + { + "ph": "s", "id": 290001238, "pid": 5714, "tid": 6744, "ts": 6303771932285.705, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303772035806.442, "dur": 81.793, + "args": { + "External id": 154802, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001246, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001246, "pid": 0, "tid": 7, "ts": 6303772035806.442, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932329.445, "dur": 5.970, + "args": { + "External id": 154802, "cbid": 307, "correlation": 290001246 + } + }, + { + "ph": "s", "id": 290001246, "pid": 5714, "tid": 6744, "ts": 6303771932329.445, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303772035888.907, "dur": 46.592, + "args": { + "External id": 154817, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001275, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001275, "pid": 0, "tid": 7, "ts": 6303772035888.907, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932482.485, "dur": 8.780, + "args": { + "External id": 154817, "cbid": 307, "correlation": 290001275 + } + }, + { + "ph": "s", "id": 290001275, "pid": 5714, "tid": 6744, "ts": 6303771932482.485, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772035936.107, "dur": 3.776, + "args": { + "External id": 154818, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001283, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 290001283, "pid": 0, "tid": 7, "ts": 6303772035936.107, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932515.154, "dur": 4.870, + "args": { + "External id": 154818, "cbid": 307, "correlation": 290001283 + } + }, + { + "ph": "s", "id": 290001283, "pid": 5714, "tid": 6744, "ts": 6303771932515.154, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303772035940.619, "dur": 51.777, + "args": { + "External id": 154819, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001294, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001294, "pid": 0, "tid": 7, "ts": 6303772035940.619, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932546.204, "dur": 5.170, + "args": { + "External id": 154819, "cbid": 307, "correlation": 290001294 + } + }, + { + "ph": "s", "id": 290001294, "pid": 5714, "tid": 6744, "ts": 6303771932546.204, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772035993.100, "dur": 46.209, + "args": { + "External id": 154820, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001299, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001299, "pid": 0, "tid": 7, "ts": 6303772035993.100, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932584.654, "dur": 6.680, + "args": { + "External id": 154820, "cbid": 211, "correlation": 290001299 + } + }, + { + "ph": "s", "id": 290001299, "pid": 5714, "tid": 6744, "ts": 6303771932584.654, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771932745.464, "dur": 2.780, + "args": { + "External id": 154826, "cbid": 147, "correlation": 290001316 + } + }, + { + "ph": "s", "id": 290001316, "pid": 5714, "tid": 6744, "ts": 6303771932745.464, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771932841.054, "dur": 2.430, + "args": { + "External id": 154834, "cbid": 138, "correlation": 290001331 + } + }, + { + "ph": "f", "id": 290001331, "pid": 5714, "tid": 6744, "ts": 6303771932841.054, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772036043.373, "dur": 3.136, + "args": { + "External id": 154838, "device": 0, "context": 1, "stream": 7, "correlation": 290001342, "bytes": 28112, "memory bandwidth (GB/s)": 8.964285714285714 + } + }, + { + "ph": "f", "id": 290001342, "pid": 0, "tid": 7, "ts": 6303772036043.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771932864.014, "dur": 11.250, + "args": { + "External id": 154838, "cbid": 41, "correlation": 290001342 + } + }, + { + "ph": "s", "id": 290001342, "pid": 5714, "tid": 6744, "ts": 6303771932864.014, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771932879.084, "dur": 1.829, + "args": { + "External id": 154833, "cbid": 135, "correlation": 290001346 + } + }, + { + "ph": "f", "id": 290001346, "pid": 5714, "tid": 6744, "ts": 6303771932879.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303772036049.101, "dur": 50.368, + "args": { + "External id": 154833, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001350, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001350, "pid": 0, "tid": 7, "ts": 6303772036049.101, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771932883.744, "dur": 9.669, + "args": { + "External id": 154833, "cbid": 211, "correlation": 290001350 + } + }, + { + "ph": "s", "id": 290001350, "pid": 5714, "tid": 6744, "ts": 6303771932883.744, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771932930.973, "dur": 0.960, + "args": { + "External id": 154826, "cbid": 135, "correlation": 290001361 + } + }, + { + "ph": "f", "id": 290001361, "pid": 5714, "tid": 6744, "ts": 6303771932930.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771932933.844, "dur": 1.220, + "args": { + "External id": 154826, "cbid": 147, "correlation": 290001365 + } + }, + { + "ph": "s", "id": 290001365, "pid": 5714, "tid": 6744, "ts": 
6303771932933.844, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771932999.993, "dur": 0.970, + "args": { + "External id": 154842, "cbid": 317, "correlation": 290001385 + } + }, + { + "ph": "f", "id": 290001385, "pid": 5714, "tid": 6744, "ts": 6303771932999.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933002.713, "dur": 1.290, + "args": { + "External id": 154842, "cbid": 135, "correlation": 290001387 + } + }, + { + "ph": "f", "id": 290001387, "pid": 5714, "tid": 6744, "ts": 6303771933002.713, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933005.273, "dur": 1.040, + "args": { + "External id": 154842, "cbid": 147, "correlation": 290001391 + } + }, + { + "ph": "s", "id": 290001391, "pid": 5714, "tid": 6744, "ts": 6303771933005.273, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771933019.853, "dur": 0.700, + "args": { + "External id": 154842, "cbid": 409, "correlation": 290001394 + } + }, + { + "ph": "f", "id": 290001394, "pid": 5714, "tid": 6744, "ts": 6303771933019.853, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933024.373, "dur": 0.800, + "args": { + "External id": 154842, "cbid": 135, "correlation": 290001397 + } + }, + { + "ph": "f", "id": 290001397, "pid": 5714, "tid": 6744, "ts": 6303771933024.373, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933025.343, "dur": 0.750, + "args": { + "External id": 154842, 
"cbid": 147, "correlation": 290001398 + } + }, + { + "ph": "s", "id": 290001398, "pid": 5714, "tid": 6744, "ts": 6303771933025.343, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772039919.738, "dur": 9082.377, + "args": { + "External id": 154842, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 290001400, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 290001400, "pid": 0, "tid": 20, "ts": 6303772039919.738, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771933027.153, "dur": 9.400, + "args": { + "External id": 154842, "cbid": 430, "correlation": 290001400 + } + }, + { + "ph": "s", "id": 290001400, "pid": 5714, "tid": 6744, "ts": 6303771933027.153, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933037.493, "dur": 0.380, + "args": { + "External id": 154842, "cbid": 135, "correlation": 290001402 + } + }, + { + "ph": "f", "id": 290001402, "pid": 5714, "tid": 6744, "ts": 6303771933037.493, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933037.973, "dur": 0.610, + "args": { + "External id": 154842, "cbid": 147, "correlation": 290001403 + } + }, + 
{ + "ph": "s", "id": 290001403, "pid": 5714, "tid": 6744, "ts": 6303771933037.973, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933039.973, "dur": 0.780, + "args": { + "External id": 154842, "cbid": 135, "correlation": 290001406 + } + }, + { + "ph": "f", "id": 290001406, "pid": 5714, "tid": 6744, "ts": 6303771933039.973, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933048.013, "dur": 0.460, + "args": { + "External id": 154842, "cbid": 135, "correlation": 290001413 + } + }, + { + "ph": "f", "id": 290001413, "pid": 5714, "tid": 6744, "ts": 6303771933048.013, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933072.433, "dur": 0.980, + "args": { + "External id": 154844, "cbid": 147, "correlation": 290001418 + } + }, + { + "ph": "s", "id": 290001418, "pid": 5714, "tid": 6744, "ts": 6303771933072.433, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933088.733, "dur": 0.790, + "args": { + "External id": 154826, "cbid": 135, "correlation": 290001433 + } + }, + { + "ph": "f", "id": 290001433, "pid": 5714, "tid": 6744, "ts": 6303771933088.733, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933262.533, "dur": 1.180, + "args": { + "External id": 154826, "cbid": 135, "correlation": 290001446 + } + }, + { + "ph": "f", "id": 290001446, "pid": 5714, "tid": 6744, "ts": 6303771933262.533, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933371.903, "dur": 
2.900, + "args": { + "External id": 154854, "cbid": 147, "correlation": 290001457 + } + }, + { + "ph": "s", "id": 290001457, "pid": 5714, "tid": 6744, "ts": 6303771933371.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771933475.172, "dur": 1.140, + "args": { + "External id": 154868, "cbid": 317, "correlation": 290001498 + } + }, + { + "ph": "f", "id": 290001498, "pid": 5714, "tid": 6744, "ts": 6303771933475.172, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771933483.122, "dur": 2.140, + "args": { + "External id": 154869, "cbid": 138, "correlation": 290001501 + } + }, + { + "ph": "f", "id": 290001501, "pid": 5714, "tid": 6744, "ts": 6303771933483.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772039921.690, "dur": 1.792, + "args": { + "External id": 154873, "device": 0, "context": 1, "stream": 7, "correlation": 290001512, "bytes": 7224, "memory bandwidth (GB/s)": 4.03125 + } + }, + { + "ph": "f", "id": 290001512, "pid": 0, "tid": 7, "ts": 6303772039921.690, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771933503.942, "dur": 11.370, + "args": { + "External id": 154873, "cbid": 41, "correlation": 290001512 + } + }, + { + "ph": "s", "id": 290001512, "pid": 5714, "tid": 6744, "ts": 6303771933503.942, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933519.612, "dur": 1.820, + "args": { + "External id": 154868, "cbid": 135, "correlation": 290001516 + } + }, + { + "ph": "f", "id": 290001516, "pid": 5714, "tid": 6744, "ts": 6303771933519.612, + 
"cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "at::native::detail::split_with_sizes_copy_out_contiguous_no_cast_kernel(char**, char**, long*, long*, long*, long, long)", "pid": 0, "tid": 7, + "ts": 6303772039925.114, "dur": 12.480, + "args": { + "External id": 154868, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001520, "registers per thread": 38, "shared memory": 0, "blocks per SM": 13.531250, "warps per SM": 54.125000, "grid": [866, 2, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001520, "pid": 0, "tid": 7, "ts": 6303772039925.114, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933523.652, "dur": 9.780, + "args": { + "External id": 154868, "cbid": 211, "correlation": 290001520 + } + }, + { + "ph": "s", "id": 290001520, "pid": 5714, "tid": 6744, "ts": 6303771933523.652, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771933621.602, "dur": 1.170, + "args": { + "External id": 154854, "cbid": 135, "correlation": 290001531 + } + }, + { + "ph": "f", "id": 290001531, "pid": 5714, "tid": 6744, "ts": 6303771933621.602, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933625.712, "dur": 1.140, + "args": { + "External id": 154854, "cbid": 147, "correlation": 290001535 + } + }, + { + "ph": "s", "id": 290001535, "pid": 5714, "tid": 6744, "ts": 6303771933625.712, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771933628.482, "dur": 0.730, + "args": { + "External id": 154854, "cbid": 147, "correlation": 290001539 + } + }, + { + "ph": "s", "id": 290001539, 
"pid": 5714, "tid": 6744, "ts": 6303771933628.482, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s16816gemm_bf16_128x64_ldg8_f2f_nn", "pid": 0, "tid": 7, + "ts": 6303772039938.266, "dur": 2282.331, + "args": { + "External id": 154875, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001571, "registers per thread": 213, "shared memory": 24576, "blocks per SM": 32.000000, "warps per SM": 64.000000, "grid": [16, 256, 1], "block": [64, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001571, "pid": 0, "tid": 7, "ts": 6303772039938.266, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933734.492, "dur": 10.240, + "args": { + "External id": 154875, "cbid": 211, "correlation": 290001571 + } + }, + { + "ph": "s", "id": 290001571, "pid": 5714, "tid": 6744, "ts": 6303771933734.492, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused__to_copy_add_mul_rsub_silu_0", "pid": 0, "tid": 7, + "ts": 6303772042273.877, "dur": 530.054, + "args": { + "External id": 154876, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001594, "registers per thread": 44, "shared memory": 0, "blocks per SM": 256.000000, "warps per SM": 1024.000000, "grid": [32768, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290001594, "pid": 0, "tid": 7, "ts": 6303772042273.877, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933788.531, "dur": 5.991, + "args": { + "External id": 154876, "cbid": 307, "correlation": 290001594 + } + }, + { + "ph": "s", "id": 290001594, "pid": 5714, "tid": 6744, "ts": 6303771933788.531, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771933826.762, "dur": 0.520, + "args": { + "External id": 154877, "cbid": 200, "correlation": 290001617 + } + }, + { + "ph": "f", "id": 290001617, "pid": 5714, "tid": 6744, "ts": 6303771933826.762, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772042851.836, "dur": 57.504, + "args": { + "External id": 154877, "device": 0, "context": 1, "stream": 7, "correlation": 290001620, "bytes": 1536, "memory bandwidth (GB/s)": 0.02671118530884808 + } + }, + { + "ph": "f", "id": 290001620, "pid": 0, "tid": 7, "ts": 6303772042851.836, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771933828.891, "dur": 6.251, + "args": { + "External id": 154877, "cbid": 51, "correlation": 290001620 + } + }, + { + "ph": "s", "id": 290001620, "pid": 5714, "tid": 6744, "ts": 6303771933828.891, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772043119.871, "dur": 941.003, + "args": { + "External id": 154877, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001621, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 21.000000, "warps per SM": 84.000000, "grid": [96, 4, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001621, "pid": 0, "tid": 7, "ts": 6303772043119.871, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933835.331, "dur": 5.660, + "args": { + "External id": 154877, "cbid": 307, "correlation": 290001621 + } + }, + { + "ph": "s", "id": 290001621, "pid": 5714, "tid": 6744, "ts": 6303771933835.331, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771933867.391, "dur": 0.280, + "args": { + "External id": 154878, "cbid": 200, "correlation": 290001646 + } + }, + { + "ph": "f", "id": 290001646, "pid": 5714, "tid": 6744, "ts": 6303771933867.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772044075.018, "dur": 1.152, + "args": { + "External id": 154878, "device": 0, "context": 1, "stream": 7, "correlation": 290001649, "bytes": 1536, "memory bandwidth (GB/s)": 1.3333333333333333 + } + }, + { + "ph": "f", "id": 290001649, "pid": 0, "tid": 7, "ts": 6303772044075.018, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771933868.591, "dur": 4.520, + "args": { + "External id": 154878, "cbid": 51, "correlation": 290001649 + } + }, + { + "ph": "s", "id": 290001649, "pid": 5714, "tid": 6744, "ts": 6303771933868.591, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772044077.610, "dur": 354.340, + "args": { + "External id": 154878, "queued": 0, "device": 0, 
"context": 1, "stream": 7, "correlation": 290001650, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001650, "pid": 0, "tid": 7, "ts": 6303772044077.610, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933873.261, "dur": 4.910, + "args": { + "External id": 154878, "cbid": 307, "correlation": 290001650 + } + }, + { + "ph": "s", "id": 290001650, "pid": 5714, "tid": 6744, "ts": 6303771933873.261, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771933900.251, "dur": 0.300, + "args": { + "External id": 154879, "cbid": 200, "correlation": 290001675 + } + }, + { + "ph": "f", "id": 290001675, "pid": 5714, "tid": 6744, "ts": 6303771933900.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772044432.654, "dur": 457.542, + "args": { + "External id": 154879, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001678, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001678, "pid": 0, "tid": 7, "ts": 6303772044432.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933901.631, "dur": 5.140, + "args": { + "External id": 154879, "cbid": 307, "correlation": 290001678 + } + }, + { + "ph": "s", "id": 290001678, "pid": 5714, "tid": 6744, "ts": 6303771933901.631, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771933927.231, "dur": 0.250, + "args": { + "External id": 154880, "cbid": 200, "correlation": 290001703 + } + }, + { + "ph": "f", "id": 290001703, "pid": 5714, "tid": 6744, "ts": 6303771933927.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772044891.572, "dur": 1.216, + "args": { + "External id": 154880, "device": 0, "context": 1, "stream": 7, "correlation": 290001706, "bytes": 1536, "memory bandwidth (GB/s)": 1.263157894736842 + } + }, + { + "ph": "f", "id": 290001706, "pid": 0, "tid": 7, "ts": 6303772044891.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771933928.451, "dur": 4.450, + "args": { + "External id": 154880, "cbid": 51, "correlation": 290001706 + } + }, + { + "ph": "s", "id": 290001706, "pid": 5714, "tid": 6744, "ts": 6303771933928.451, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772044894.004, "dur": 358.244, + "args": { + "External id": 154880, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001707, "registers per thread": 92, "shared memory": 49152, 
"blocks per SM": 28.000000, "warps per SM": 112.000000, "grid": [256, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001707, "pid": 0, "tid": 7, "ts": 6303772044894.004, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933933.021, "dur": 4.730, + "args": { + "External id": 154880, "cbid": 307, "correlation": 290001707 + } + }, + { + "ph": "s", "id": 290001707, "pid": 5714, "tid": 6744, "ts": 6303771933933.021, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771933959.321, "dur": 0.300, + "args": { + "External id": 154881, "cbid": 200, "correlation": 290001732 + } + }, + { + "ph": "f", "id": 290001732, "pid": 5714, "tid": 6744, "ts": 6303771933959.321, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nn_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772045252.888, "dur": 356.644, + "args": { + "External id": 154881, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001735, "registers per thread": 94, "shared memory": 49152, "blocks per SM": 32.000000, "warps per SM": 128.000000, "grid": [2048, 2, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001735, "pid": 0, "tid": 7, "ts": 6303772045252.888, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771933960.641, "dur": 4.870, + "args": { + "External id": 154881, "cbid": 307, "correlation": 290001735 + } + }, + { + "ph": "s", "id": 290001735, "pid": 5714, "tid": 6744, "ts": 6303771933960.641, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772045610.204, "dur": 89.249, + "args": { + "External id": 154882, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001748, "registers per thread": 40, "shared memory": 3072, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001748, "pid": 0, "tid": 7, "ts": 6303772045610.204, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934000.781, "dur": 5.840, + "args": { + "External id": 154882, "cbid": 307, "correlation": 290001748 + } + }, + { + "ph": "s", "id": 290001748, "pid": 5714, "tid": 6744, "ts": 6303771934000.781, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_add_mul_sum_2", "pid": 0, "tid": 7, + "ts": 6303772045700.093, "dur": 3.680, + "args": { + "External id": 154883, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001756, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 290001756, "pid": 0, "tid": 7, "ts": 6303772045700.093, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934029.881, "dur": 4.940, + "args": { + "External id": 154883, "cbid": 307, "correlation": 290001756 + } + }, + { + "ph": "s", "id": 290001756, "pid": 5714, "tid": 6744, "ts": 6303771934029.881, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_3", "pid": 0, "tid": 7, + "ts": 6303772045705.021, "dur": 112.897, + "args": { + "External id": 154884, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001764, "registers per thread": 27, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001764, "pid": 0, "tid": 7, "ts": 6303772045705.021, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934059.261, "dur": 4.900, + "args": { + "External id": 154884, "cbid": 307, "correlation": 290001764 + } + }, + { + "ph": "s", "id": 290001764, "pid": 5714, "tid": 6744, "ts": 6303771934059.261, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771934226.981, "dur": 0.520, + "args": { + "External id": 154903, "cbid": 200, "correlation": 290001810 + } + }, + { + "ph": "f", "id": 290001810, "pid": 5714, "tid": 6744, "ts": 6303771934226.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772045819.230, "dur": 1.408, + "args": { + "External id": 154903, "device": 0, "context": 1, "stream": 7, 
"correlation": 290001813, "bytes": 576, "memory bandwidth (GB/s)": 0.4090909090909091 + } + }, + { + "ph": "f", "id": 290001813, "pid": 0, "tid": 7, "ts": 6303772045819.230, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771934229.110, "dur": 7.040, + "args": { + "External id": 154903, "cbid": 51, "correlation": 290001813 + } + }, + { + "ph": "s", "id": 290001813, "pid": 5714, "tid": 6744, "ts": 6303771934229.110, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772045822.078, "dur": 143.938, + "args": { + "External id": 154903, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001814, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001814, "pid": 0, "tid": 7, "ts": 6303772045822.078, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934236.390, "dur": 7.660, + "args": { + "External id": 154903, "cbid": 307, "correlation": 290001814 + } + }, + { + "ph": "s", "id": 290001814, "pid": 5714, "tid": 6744, "ts": 6303771934236.390, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772045966.656, "dur": 141.442, + "args": { + "External id": 154904, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001836, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001836, "pid": 0, "tid": 7, "ts": 6303772045966.656, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934268.970, "dur": 5.840, + "args": { + "External id": 154904, "cbid": 211, "correlation": 290001836 + } + }, + { + "ph": "s", "id": 290001836, "pid": 5714, "tid": 6744, "ts": 6303771934268.970, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771934353.290, "dur": 0.420, + "args": { + "External id": 154905, "cbid": 200, "correlation": 290001854 + } + }, + { + "ph": "f", "id": 290001854, "pid": 5714, "tid": 6744, "ts": 6303771934353.290, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771934353.840, "dur": 0.240, + "args": { + "External id": 154905, "cbid": 200, "correlation": 290001855 + } + }, + { + "ph": "f", "id": 290001855, "pid": 5714, "tid": 6744, "ts": 6303771934353.840, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771934372.790, "dur": 0.280, + "args": { + "External id": 154905, "cbid": 200, "correlation": 290001873 + } + }, + { + "ph": "f", "id": 290001873, "pid": 5714, "tid": 6744, "ts": 6303771934372.790, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dot_do_o_kernel > >(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772046108.802, "dur": 92.161, + "args": { + "External id": 154905, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001874, "registers per thread": 32, "shared memory": 0, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], 
"block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001874, "pid": 0, "tid": 7, "ts": 6303772046108.802, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934374.220, "dur": 9.290, + "args": { + "External id": 154905, "cbid": 211, "correlation": 290001874 + } + }, + { + "ph": "s", "id": 290001874, "pid": 5714, "tid": 6744, "ts": 6303771934374.220, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaFuncSetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771934384.260, "dur": 0.890, + "args": { + "External id": 154905, "cbid": 273, "correlation": 290001876 + } + }, + { + "ph": "f", "id": 290001876, "pid": 5714, "tid": 6744, "ts": 6303771934384.260, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_dq_dk_dv_loop_seqk_parallel_kernel >, false, true, false, false, true, true, false>(flash::Flash_bwd_params)", "pid": 0, "tid": 7, + "ts": 6303772046201.699, "dur": 1434.896, + "args": { + "External id": 154905, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001877, "registers per thread": 255, "shared memory": 73728, "blocks per SM": 12.000000, "warps per SM": 96.000000, "grid": [16, 8, 12], "block": [256, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290001877, "pid": 0, "tid": 7, "ts": 6303772046201.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934385.440, "dur": 4.000, + "args": { + "External id": 154905, "cbid": 211, "correlation": 290001877 + } + }, + { + "ph": "s", "id": 290001877, "pid": 5714, "tid": 6744, "ts": 6303771934385.440, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void flash::flash_bwd_convert_dq_kernel > >(flash::Flash_bwd_params, int)", "pid": 0, "tid": 7, + "ts": 6303772047637.299, "dur": 73.633, + "args": { + "External id": 154905, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001879, "registers per thread": 48, "shared memory": 8192, "blocks per SM": 24.000000, "warps per SM": 192.000000, "grid": [32, 8, 12], "block": [256, 1, 1], "est. achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290001879, "pid": 0, "tid": 7, "ts": 6303772047637.299, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934389.960, "dur": 3.630, + "args": { + "External id": 154905, "cbid": 211, "correlation": 290001879 + } + }, + { + "ph": "s", "id": 290001879, "pid": 5714, "tid": 6744, "ts": 6303771934389.960, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772047711.572, "dur": 104.706, + "args": { + "External id": 154916, "device": 0, "context": 1, "stream": 7, "correlation": 290001901, "bytes": 25165824, "memory bandwidth (GB/s)": 240.34748725001432 + } + }, + { + "ph": "f", "id": 290001901, "pid": 0, "tid": 7, "ts": 6303772047711.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + 
"ts": 6303771934518.300, "dur": 16.930, + "args": { + "External id": 154916, "cbid": 41, "correlation": 290001901 + } + }, + { + "ph": "s", "id": 290001901, "pid": 5714, "tid": 6744, "ts": 6303771934518.300, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772047816.886, "dur": 203.682, + "args": { + "External id": 154913, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001919, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001919, "pid": 0, "tid": 7, "ts": 6303772047816.886, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934637.820, "dur": 9.930, + "args": { + "External id": 154913, "cbid": 307, "correlation": 290001919 + } + }, + { + "ph": "s", "id": 290001919, "pid": 5714, "tid": 6744, "ts": 6303771934637.820, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772048021.176, "dur": 198.274, + "args": { + "External id": 154923, "device": 0, "context": 1, "stream": 7, "correlation": 290001934, "bytes": 25165824, "memory bandwidth (GB/s)": 126.92447824727398 + } + }, + { + "ph": "f", "id": 290001934, "pid": 0, "tid": 7, "ts": 6303772048021.176, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771934721.229, "dur": 15.731, + "args": { + "External id": 154923, "cbid": 41, "correlation": 290001934 + } + }, + { + "ph": "s", "id": 290001934, "pid": 5714, "tid": 6744, "ts": 6303771934721.229, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": 
"rotary_embedding_kernel", "pid": 0, "tid": 7, + "ts": 6303772048220.122, "dur": 162.946, + "args": { + "External id": 154920, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001952, "registers per thread": 19, "shared memory": 0, "blocks per SM": 96.000000, "warps per SM": 1536.000000, "grid": [128, 8, 12], "block": [512, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290001952, "pid": 0, "tid": 7, "ts": 6303772048220.122, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934831.829, "dur": 7.790, + "args": { + "External id": 154920, "cbid": 307, "correlation": 290001952 + } + }, + { + "ph": "s", "id": 290001952, "pid": 5714, "tid": 6744, "ts": 6303771934831.829, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771934958.359, "dur": 0.560, + "args": { + "External id": 154928, "cbid": 200, "correlation": 290001982 + } + }, + { + "ph": "f", "id": 290001982, "pid": 5714, "tid": 6744, "ts": 6303771934958.359, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772048455.005, "dur": 70.561, + "args": { + "External id": 154928, "device": 0, "context": 1, "stream": 7, "correlation": 290001985, "bytes": 576, "memory bandwidth (GB/s)": 0.008163149615226541 + } + }, + { + "ph": "f", "id": 290001985, "pid": 0, "tid": 7, "ts": 6303772048455.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771934960.739, "dur": 7.690, + "args": { + "External id": 154928, "cbid": 51, "correlation": 290001985 + } + }, + { + "ph": "s", "id": 290001985, "pid": 5714, "tid": 6744, "ts": 6303771934960.739, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": 
"X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772048582.750, "dur": 159.266, + "args": { + "External id": 154928, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290001986, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290001986, "pid": 0, "tid": 7, "ts": 6303772048582.750, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771934968.679, "dur": 8.030, + "args": { + "External id": 154928, "cbid": 307, "correlation": 290001986 + } + }, + { + "ph": "s", "id": 290001986, "pid": 5714, "tid": 6744, "ts": 6303771934968.679, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771935003.289, "dur": 0.310, + "args": { + "External id": 154929, "cbid": 200, "correlation": 290002011 + } + }, + { + "ph": "f", "id": 290002011, "pid": 5714, "tid": 6744, "ts": 6303771935003.289, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772048749.632, "dur": 7.008, + "args": { + "External id": 154929, "device": 0, "context": 1, "stream": 7, "correlation": 290002014, "bytes": 576, "memory bandwidth (GB/s)": 0.0821917808219178 + } + }, + { + "ph": "f", "id": 290002014, "pid": 0, "tid": 7, "ts": 6303772048749.632, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771935004.639, "dur": 4.800, + "args": { + "External id": 154929, "cbid": 51, "correlation": 290002014 + } + }, + { + "ph": "s", "id": 290002014, 
"pid": 5714, "tid": 6744, "ts": 6303771935004.639, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772048762.624, "dur": 155.682, + "args": { + "External id": 154929, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002015, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290002015, "pid": 0, "tid": 7, "ts": 6303772048762.624, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935009.629, "dur": 5.150, + "args": { + "External id": 154929, "cbid": 307, "correlation": 290002015 + } + }, + { + "ph": "s", "id": 290002015, "pid": 5714, "tid": 6744, "ts": 6303771935009.629, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceGetAttribute", "pid": 5714, "tid": 6744, + "ts": 6303771935037.519, "dur": 0.290, + "args": { + "External id": 154930, "cbid": 200, "correlation": 290002040 + } + }, + { + "ph": "f", "id": 290002040, "pid": 5714, "tid": 6744, "ts": 6303771935037.519, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memset", "name": "Memset (Device)", "pid": 0, "tid": 7, + "ts": 6303772048925.506, "dur": 6.369, + "args": { + "External id": 154930, "device": 0, "context": 1, "stream": 7, "correlation": 290002043, "bytes": 576, "memory bandwidth (GB/s)": 0.09043805934997645 + } + }, + { + "ph": "f", "id": 290002043, "pid": 0, "tid": 7, "ts": 6303772048925.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemsetAsync", "pid": 5714, "tid": 6744, + "ts": 6303771935038.759, "dur": 4.270, + "args": { + 
"External id": 154930, "cbid": 51, "correlation": 290002043 + } + }, + { + "ph": "s", "id": 290002043, "pid": 5714, "tid": 6744, "ts": 6303771935038.759, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void cutlass::Kernel2(cutlass_80_tensorop_bf16_s16816gemm_relu_bf16_64x64_32x6_nt_align8::Params)", "pid": 0, "tid": 7, + "ts": 6303772048939.523, "dur": 138.913, + "args": { + "External id": 154930, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002044, "registers per thread": 92, "shared memory": 49152, "blocks per SM": 10.500000, "warps per SM": 42.000000, "grid": [96, 2, 7], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290002044, "pid": 0, "tid": 7, "ts": 6303772048939.523, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935043.169, "dur": 4.910, + "args": { + "External id": 154930, "cbid": 307, "correlation": 290002044 + } + }, + { + "ph": "s", "id": 290002044, "pid": 5714, "tid": 6744, "ts": 6303771935043.169, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772049079.108, "dur": 120.642, + "args": { + "External id": 154931, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002066, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290002066, "pid": 0, "tid": 7, "ts": 6303772049079.108, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935072.149, "dur": 5.710, + "args": { + "External id": 154931, "cbid": 211, "correlation": 290002066 + } + }, + { + "ph": "s", "id": 290002066, "pid": 5714, "tid": 6744, "ts": 6303771935072.149, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772049200.454, "dur": 121.889, + "args": { + "External id": 154932, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002089, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290002089, "pid": 0, "tid": 7, "ts": 6303772049200.454, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935096.099, "dur": 4.950, + "args": { + "External id": 154932, "cbid": 211, "correlation": 290002089 + } + }, + { + "ph": "s", "id": 290002089, "pid": 5714, "tid": 6744, "ts": 6303771935096.099, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ampere_bf16_s1688gemm_bf16_128x128_ldg8_f2f_stages_32x1_nn", "pid": 0, "tid": 7, + "ts": 6303772049322.983, "dur": 122.562, + "args": { + "External id": 154933, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002112, "registers per thread": 234, "shared memory": 32768, "blocks per SM": 6.000000, "warps per SM": 24.000000, "grid": [6, 128, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 17 + } + }, + { + "ph": "f", "id": 290002112, "pid": 0, "tid": 7, "ts": 6303772049322.983, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935118.319, "dur": 4.700, + "args": { + "External id": 154933, "cbid": 211, "correlation": 290002112 + } + }, + { + "ph": "s", "id": 290002112, "pid": 5714, "tid": 6744, "ts": 6303771935118.319, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_add_0", "pid": 0, "tid": 7, + "ts": 6303772049446.281, "dur": 79.904, + "args": { + "External id": 154934, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002120, "registers per thread": 16, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002120, "pid": 0, "tid": 7, "ts": 6303772049446.281, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935155.748, "dur": 5.580, + "args": { + "External id": 154934, "cbid": 307, "correlation": 290002120 + } + }, + { + "ph": "s", "id": 290002120, "pid": 5714, "tid": 6744, "ts": 6303771935155.748, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_0", "pid": 0, "tid": 7, + "ts": 6303772049526.793, "dur": 42.145, + "args": { + "External id": 154949, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002149, "registers per thread": 40, "shared memory": 1536, "blocks per SM": 12.000000, "warps per SM": 48.000000, "grid": [1536, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002149, "pid": 0, "tid": 7, "ts": 6303772049526.793, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935324.688, "dur": 9.980, + "args": { + "External id": 154949, "cbid": 307, "correlation": 290002149 + } + }, + { + "ph": "s", "id": 290002149, "pid": 5714, "tid": 6744, "ts": 6303771935324.688, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_red_fused__to_copy_mul_sum_1", "pid": 0, "tid": 7, + "ts": 6303772049569.674, "dur": 1.824, + "args": { + "External id": 154950, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002157, "registers per thread": 39, "shared memory": 1024, "blocks per SM": 0.093750, "warps per SM": 0.375000, "grid": [12, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 1 + } + }, + { + "ph": "f", "id": 290002157, "pid": 0, "tid": 7, "ts": 6303772049569.674, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935361.258, "dur": 5.550, + "args": { + "External id": 154950, "cbid": 307, "correlation": 290002157 + } + }, + { + "ph": "s", "id": 290002157, "pid": 5714, "tid": 6744, "ts": 6303771935361.258, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_per_fused__to_copy_add_div_mul_pow_sum_2", "pid": 0, "tid": 7, + "ts": 6303772049572.234, "dur": 50.848, + "args": { + "External id": 154951, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002168, "registers per thread": 24, "shared memory": 32, "blocks per SM": 128.000000, "warps per SM": 1024.000000, "grid": [16384, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002168, "pid": 0, "tid": 7, "ts": 6303772049572.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935394.208, "dur": 5.220, + "args": { + "External id": 154951, "cbid": 307, "correlation": 290002168 + } + }, + { + "ph": "s", "id": 290002168, "pid": 5714, "tid": 6744, "ts": 6303771935394.208, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772049623.722, "dur": 44.385, + "args": { + "External id": 154952, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002173, "registers per thread": 23, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 768.000000, "grid": [24576, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002173, "pid": 0, "tid": 7, "ts": 6303772049623.722, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935434.318, "dur": 6.930, + "args": { + "External id": 154952, "cbid": 211, "correlation": 290002173 + } + }, + { + "ph": "s", "id": 290002173, "pid": 5714, "tid": 6744, "ts": 6303771935434.318, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771935614.458, "dur": 3.400, + "args": { + "External id": 154958, "cbid": 147, "correlation": 290002190 + } + }, + { + "ph": "s", "id": 290002190, "pid": 5714, "tid": 6744, "ts": 6303771935614.458, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771935740.897, "dur": 3.160, + "args": { + "External id": 154966, "cbid": 138, "correlation": 290002205 + } + }, + { + "ph": "f", "id": 290002205, "pid": 5714, "tid": 6744, "ts": 6303771935740.897, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772049675.019, "dur": 2.368, + "args": { + "External id": 154970, "device": 0, "context": 1, "stream": 7, "correlation": 290002216, "bytes": 28112, "memory bandwidth (GB/s)": 11.871621621621621 + } + }, + { + "ph": "f", "id": 290002216, "pid": 0, "tid": 7, "ts": 6303772049675.019, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771935771.087, "dur": 14.820, + "args": { + "External id": 154970, "cbid": 41, "correlation": 290002216 + } + }, + { + "ph": "s", "id": 290002216, "pid": 5714, "tid": 6744, "ts": 6303771935771.087, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935790.507, "dur": 2.170, + "args": { + "External id": 154965, "cbid": 135, "correlation": 290002220 + } + }, + { + "ph": "f", "id": 290002220, "pid": 5714, "tid": 6744, "ts": 6303771935790.507, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303772049679.051, "dur": 48.129, + "args": { + "External id": 154965, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002224, "registers per thread": 32, "shared memory": 0, "blocks per SM": 108.062500, "warps per SM": 432.250000, "grid": [3458, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002224, "pid": 0, "tid": 7, "ts": 6303772049679.051, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771935796.017, "dur": 10.960, + "args": { + "External id": 154965, "cbid": 211, "correlation": 290002224 + } + }, + { + "ph": "s", "id": 290002224, "pid": 5714, "tid": 6744, "ts": 6303771935796.017, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935847.987, "dur": 1.080, + "args": { + "External id": 154958, "cbid": 135, "correlation": 290002235 + } + }, + { + "ph": "f", "id": 290002235, "pid": 5714, "tid": 6744, "ts": 6303771935847.987, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771935851.127, "dur": 1.450, + "args": { + "External id": 154958, "cbid": 147, "correlation": 290002239 + } + }, + { + "ph": "s", "id": 290002239, "pid": 5714, "tid": 6744, "ts": 
6303771935851.127, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771935924.587, "dur": 1.050, + "args": { + "External id": 154974, "cbid": 317, "correlation": 290002259 + } + }, + { + "ph": "f", "id": 290002259, "pid": 5714, "tid": 6744, "ts": 6303771935924.587, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935927.657, "dur": 1.510, + "args": { + "External id": 154974, "cbid": 135, "correlation": 290002261 + } + }, + { + "ph": "f", "id": 290002261, "pid": 5714, "tid": 6744, "ts": 6303771935927.657, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771935930.657, "dur": 1.220, + "args": { + "External id": 154974, "cbid": 147, "correlation": 290002265 + } + }, + { + "ph": "s", "id": 290002265, "pid": 5714, "tid": 6744, "ts": 6303771935930.657, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771935946.747, "dur": 0.790, + "args": { + "External id": 154974, "cbid": 409, "correlation": 290002268 + } + }, + { + "ph": "f", "id": 290002268, "pid": 5714, "tid": 6744, "ts": 6303771935946.747, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935951.977, "dur": 0.930, + "args": { + "External id": 154974, "cbid": 135, "correlation": 290002271 + } + }, + { + "ph": "f", "id": 290002271, "pid": 5714, "tid": 6744, "ts": 6303771935951.977, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771935953.107, "dur": 0.910, + "args": { + "External id": 154974, 
"cbid": 147, "correlation": 290002272 + } + }, + { + "ph": "s", "id": 290002272, "pid": 5714, "tid": 6744, "ts": 6303771935953.107, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772049820.941, "dur": 10440.505, + "args": { + "External id": 154974, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 290002274, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 7079424, "Out msg nelems": 1769856, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 290002274, "pid": 0, "tid": 20, "ts": 6303772049820.941, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771935955.217, "dur": 10.450, + "args": { + "External id": 154974, "cbid": 430, "correlation": 290002274 + } + }, + { + "ph": "s", "id": 290002274, "pid": 5714, "tid": 6744, "ts": 6303771935955.217, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935966.827, "dur": 0.520, + "args": { + "External id": 154974, "cbid": 135, "correlation": 290002276 + } + }, + { + "ph": "f", "id": 290002276, "pid": 5714, "tid": 6744, "ts": 6303771935966.827, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771935967.467, "dur": 0.580, + "args": { + "External id": 154974, "cbid": 147, "correlation": 290002277 + } + }, 
+ { + "ph": "s", "id": 290002277, "pid": 5714, "tid": 6744, "ts": 6303771935967.467, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935969.597, "dur": 0.830, + "args": { + "External id": 154974, "cbid": 135, "correlation": 290002280 + } + }, + { + "ph": "f", "id": 290002280, "pid": 5714, "tid": 6744, "ts": 6303771935969.597, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771935978.487, "dur": 0.500, + "args": { + "External id": 154974, "cbid": 135, "correlation": 290002287 + } + }, + { + "ph": "f", "id": 290002287, "pid": 5714, "tid": 6744, "ts": 6303771935978.487, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771936006.177, "dur": 1.060, + "args": { + "External id": 154976, "cbid": 147, "correlation": 290002292 + } + }, + { + "ph": "s", "id": 290002292, "pid": 5714, "tid": 6744, "ts": 6303771936006.177, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771936024.317, "dur": 0.949, + "args": { + "External id": 154958, "cbid": 135, "correlation": 290002307 + } + }, + { + "ph": "f", "id": 290002307, "pid": 5714, "tid": 6744, "ts": 6303771936024.317, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771936220.316, "dur": 1.370, + "args": { + "External id": 154958, "cbid": 135, "correlation": 290002320 + } + }, + { + "ph": "f", "id": 290002320, "pid": 5714, "tid": 6744, "ts": 6303771936220.316, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_0", "pid": 0, "tid": 7, + "ts": 
6303772049727.788, "dur": 95.937, + "args": { + "External id": 154988, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002336, "registers per thread": 16, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 3000.000000, "grid": [48000, 1, 1], "block": [256, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002336, "pid": 0, "tid": 7, "ts": 6303772049727.788, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771936384.746, "dur": 12.640, + "args": { + "External id": 154988, "cbid": 307, "correlation": 290002336 + } + }, + { + "ph": "s", "id": 290002336, "pid": 5714, "tid": 6744, "ts": 6303771936384.746, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_1", "pid": 0, "tid": 7, + "ts": 6303772049824.397, "dur": 863.338, + "args": { + "External id": 154989, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002341, "registers per thread": 32, "shared memory": 0, "blocks per SM": 192.000000, "warps per SM": 1536.000000, "grid": [24576, 1, 1], "block": [256, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002341, "pid": 0, "tid": 7, "ts": 6303772049824.397, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771936421.966, "dur": 6.370, + "args": { + "External id": 154989, "cbid": 307, "correlation": 290002341 + } + }, + { + "ph": "s", "id": 290002341, "pid": 5714, "tid": 6744, "ts": 6303771936421.966, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "triton_poi_fused_embedding_dense_backward_2", "pid": 0, "tid": 7, + "ts": 6303772050732.791, "dur": 807.466, + "args": { + "External id": 154990, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002349, "registers per thread": 20, "shared memory": 0, "blocks per SM": 187.500000, "warps per SM": 750.000000, "grid": [24000, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002349, "pid": 0, "tid": 7, "ts": 6303772050732.791, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_driver", "name": "cuLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771936456.566, "dur": 6.979, + "args": { + "External id": 154990, "cbid": 307, "correlation": 290002349 + } + }, + { + "ph": "s", "id": 290002349, "pid": 5714, "tid": 6744, "ts": 6303771936456.566, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctor_add, std::array >(int, at::native::CUDAFunctor_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772051701.699, "dur": 274.371, + "args": { + "External id": 154991, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002354, "registers per thread": 23, "shared memory": 0, "blocks per SM": 375.000000, "warps per SM": 1500.000000, "grid": [48000, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002354, "pid": 0, "tid": 7, "ts": 6303772051701.699, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771936497.936, "dur": 7.280, + "args": { + "External id": 154991, "cbid": 211, "correlation": 290002354 + } + }, + { + "ph": "s", "id": 290002354, "pid": 5714, "tid": 6744, "ts": 6303771936497.936, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771936873.605, "dur": 3.490, + "args": { + "cbid": 147, "correlation": 290002369 + } + }, + { + "ph": "s", "id": 290002369, "pid": 5714, "tid": 6744, "ts": 6303771936873.605, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 6744, + "ts": 6303771937097.714, "dur": 3.060, + "args": { + "External id": 155004, "cbid": 138, "correlation": 290002384 + } + }, + { + "ph": "f", "id": 290002384, "pid": 5714, "tid": 6744, "ts": 6303771937097.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy HtoD (Pinned -> Device)", "pid": 0, "tid": 7, + "ts": 6303772060267.782, "dur": 9.184, + "args": { + "External id": 155008, "device": 0, "context": 1, "stream": 7, "correlation": 290002395, "bytes": 208504, "memory bandwidth (GB/s)": 22.70296167247387 + } + }, + { + "ph": "f", "id": 290002395, "pid": 0, "tid": 7, "ts": 6303772060267.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 6744, + "ts": 6303771937142.304, "dur": 14.490, + "args": { + "External id": 155008, "cbid": 41, "correlation": 290002395 + } + }, + { + "ph": "s", "id": 290002395, "pid": 5714, "tid": 6744, "ts": 6303771937142.304, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", 
"name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937162.074, "dur": 2.090, + "args": { + "External id": 155003, "cbid": 135, "correlation": 290002399 + } + }, + { + "ph": "f", "id": 290002399, "pid": 5714, "tid": 6744, "ts": 6303771937162.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::detail::chunk_cat_cuda_kernel(c10::BFloat16**, float*, long*, long*, long*, long*, long*, long*, long, long, long)", "pid": 0, "tid": 7, + "ts": 6303772060278.598, "dur": 334.820, + "args": { + "External id": 155003, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002403, "registers per thread": 32, "shared memory": 0, "blocks per SM": 807.281250, "warps per SM": 3229.125000, "grid": [25833, 4, 1], "block": [128, 1, 1], "est. achieved occupancy %": 100 + } + }, + { + "ph": "f", "id": 290002403, "pid": 0, "tid": 7, "ts": 6303772060278.598, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 6744, + "ts": 6303771937167.704, "dur": 12.090, + "args": { + "External id": 155003, "cbid": 211, "correlation": 290002403 + } + }, + { + "ph": "s", "id": 290002403, "pid": 5714, "tid": 6744, "ts": 6303771937167.704, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937248.414, "dur": 1.360, + "args": { + "cbid": 135, "correlation": 290002414 + } + }, + { + "ph": "f", "id": 290002414, "pid": 5714, "tid": 6744, "ts": 6303771937248.414, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771937252.194, "dur": 1.600, + "args": { + "cbid": 147, "correlation": 290002418 + } + }, + { + "ph": "s", "id": 290002418, "pid": 5714, "tid": 6744, "ts": 6303771937252.194, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", 
"cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 6744, + "ts": 6303771937343.874, "dur": 1.370, + "args": { + "External id": 155012, "cbid": 317, "correlation": 290002438 + } + }, + { + "ph": "f", "id": 290002438, "pid": 5714, "tid": 6744, "ts": 6303771937343.874, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937347.544, "dur": 1.700, + "args": { + "External id": 155012, "cbid": 135, "correlation": 290002440 + } + }, + { + "ph": "f", "id": 290002440, "pid": 5714, "tid": 6744, "ts": 6303771937347.544, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771937350.994, "dur": 1.370, + "args": { + "External id": 155012, "cbid": 147, "correlation": 290002444 + } + }, + { + "ph": "s", "id": 290002444, "pid": 5714, "tid": 6744, "ts": 6303771937350.994, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 6744, + "ts": 6303771937369.203, "dur": 0.820, + "args": { + "External id": 155012, "cbid": 409, "correlation": 290002447 + } + }, + { + "ph": "f", "id": 290002447, "pid": 5714, "tid": 6744, "ts": 6303771937369.203, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937374.934, "dur": 1.109, + "args": { + "External id": 155012, "cbid": 135, "correlation": 290002450 + } + }, + { + "ph": "f", "id": 290002450, "pid": 5714, "tid": 6744, "ts": 6303771937374.934, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771937376.263, "dur": 0.991, + "args": { + "External id": 155012, "cbid": 147, "correlation": 290002451 + } + }, + { + "ph": "s", "id": 
290002451, "pid": 5714, "tid": 6744, "ts": 6303771937376.263, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_ReduceScatter_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772060616.074, "dur": 72130.469, + "args": { + "External id": 155012, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 290002453, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.031250, "warps per SM": 0.531250, "grid": [4, 1, 1], "block": [544, 1, 1], "est. achieved occupancy %": 0, "Collective name": "_reduce_scatter_base", "In msg nelems": 52894464, "Out msg nelems": 13223616, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 290002453, "pid": 0, "tid": 20, "ts": 6303772060616.074, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 6744, + "ts": 6303771937378.903, "dur": 12.231, + "args": { + "External id": 155012, "cbid": 430, "correlation": 290002453 + } + }, + { + "ph": "s", "id": 290002453, "pid": 5714, "tid": 6744, "ts": 6303771937378.903, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937392.334, "dur": 0.500, + "args": { + "External id": 155012, "cbid": 135, "correlation": 290002455 + } + }, + { + "ph": "f", "id": 290002455, "pid": 5714, "tid": 6744, "ts": 6303771937392.334, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771937392.963, "dur": 0.591, + "args": { + "External id": 155012, "cbid": 147, "correlation": 290002456 + } + }, + { + "ph": "s", "id": 290002456, "pid": 5714, "tid": 6744, "ts": 
6303771937392.963, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937395.334, "dur": 0.909, + "args": { + "External id": 155012, "cbid": 135, "correlation": 290002459 + } + }, + { + "ph": "f", "id": 290002459, "pid": 5714, "tid": 6744, "ts": 6303771937395.334, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937405.443, "dur": 0.531, + "args": { + "External id": 155012, "cbid": 135, "correlation": 290002466 + } + }, + { + "ph": "f", "id": 290002466, "pid": 5714, "tid": 6744, "ts": 6303771937405.443, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771937435.394, "dur": 1.140, + "args": { + "External id": 155014, "cbid": 147, "correlation": 290002471 + } + }, + { + "ph": "s", "id": 290002471, "pid": 5714, "tid": 6744, "ts": 6303771937435.394, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771937455.723, "dur": 1.000, + "args": { + "cbid": 135, "correlation": 290002486 + } + }, + { + "ph": "f", "id": 290002486, "pid": 5714, "tid": 6744, "ts": 6303771937455.723, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 6744, + "ts": 6303771938186.572, "dur": 1.650, + "args": { + "cbid": 135, "correlation": 290002499 + } + }, + { + "ph": "f", "id": 290002499, "pid": 5714, "tid": 6744, "ts": 6303771938186.572, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938254.642, "dur": 3.160, + "args": { + "cbid": 147, "correlation": 290002506 + } + }, + { + "ph": "s", "id": 290002506, "pid": 
5714, "tid": 6744, "ts": 6303771938254.642, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938279.232, "dur": 1.720, + "args": { + "cbid": 147, "correlation": 290002516 + } + }, + { + "ph": "s", "id": 290002516, "pid": 5714, "tid": 6744, "ts": 6303771938279.232, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938296.201, "dur": 10.680, + "args": { + "cbid": 147, "correlation": 290002526 + } + }, + { + "ph": "s", "id": 290002526, "pid": 5714, "tid": 6744, "ts": 6303771938296.201, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938321.572, "dur": 1.200, + "args": { + "cbid": 147, "correlation": 290002536 + } + }, + { + "ph": "s", "id": 290002536, "pid": 5714, "tid": 6744, "ts": 6303771938321.572, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938335.732, "dur": 1.120, + "args": { + "cbid": 147, "correlation": 290002546 + } + }, + { + "ph": "s", "id": 290002546, "pid": 5714, "tid": 6744, "ts": 6303771938335.732, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938349.312, "dur": 1.180, + "args": { + "cbid": 147, "correlation": 290002556 + } + }, + { + "ph": "s", "id": 290002556, "pid": 5714, "tid": 6744, "ts": 6303771938349.312, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938363.261, "dur": 1.120, + "args": { + "cbid": 147, "correlation": 290002566 + } + }, + { + "ph": "s", "id": 290002566, "pid": 5714, "tid": 6744, "ts": 6303771938363.261, + "cat": "ac2g", "name": 
"ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938376.671, "dur": 1.060, + "args": { + "cbid": 147, "correlation": 290002576 + } + }, + { + "ph": "s", "id": 290002576, "pid": 5714, "tid": 6744, "ts": 6303771938376.671, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938389.991, "dur": 1.180, + "args": { + "cbid": 147, "correlation": 290002586 + } + }, + { + "ph": "s", "id": 290002586, "pid": 5714, "tid": 6744, "ts": 6303771938389.991, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938403.151, "dur": 1.120, + "args": { + "cbid": 147, "correlation": 290002596 + } + }, + { + "ph": "s", "id": 290002596, "pid": 5714, "tid": 6744, "ts": 6303771938403.151, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938416.941, "dur": 1.110, + "args": { + "cbid": 147, "correlation": 290002606 + } + }, + { + "ph": "s", "id": 290002606, "pid": 5714, "tid": 6744, "ts": 6303771938416.941, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 6744, + "ts": 6303771938430.491, "dur": 1.260, + "args": { + "cbid": 147, "correlation": 290002616 + } + }, + { + "ph": "s", "id": 290002616, "pid": 5714, "tid": 6744, "ts": 6303771938430.491, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132747.279, "dur": 1.472, + "args": { + "External id": 151162, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002629, "registers per 
thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290002629, "pid": 0, "tid": 7, "ts": 6303772132747.279, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771938576.701, "dur": 20.670, + "args": { + "External id": 151162, "cbid": 211, "correlation": 290002629 + } + }, + { + "ph": "s", "id": 290002629, "pid": 5714, "tid": 5714, "ts": 6303771938576.701, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BUnaryFunctor >, std::array >(int, at::native::BUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132749.391, "dur": 1.312, + "args": { + "External id": 151163, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002639, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290002639, "pid": 0, "tid": 7, "ts": 6303772132749.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771938632.571, "dur": 31.520, + "args": { + "External id": 151163, "cbid": 211, "correlation": 290002639 + } + }, + { + "ph": "s", "id": 290002639, "pid": 5714, "tid": 5714, "ts": 6303771938632.571, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132751.375, "dur": 1.120, + "args": { + "External id": 151164, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002649, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290002649, "pid": 0, "tid": 7, "ts": 6303772132751.375, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771938696.771, "dur": 9.220, + "args": { + "External id": 151164, "cbid": 211, "correlation": 290002649 + } + }, + { + "ph": "s", "id": 290002649, "pid": 5714, "tid": 5714, "ts": 6303771938696.771, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132753.231, "dur": 1.056, + "args": { + "External id": 151165, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002659, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290002659, "pid": 0, "tid": 7, "ts": 6303772132753.231, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771939099.070, "dur": 10.310, + "args": { + "External id": 151165, "cbid": 211, "correlation": 290002659 + } + }, + { + "ph": "s", "id": 290002659, "pid": 5714, "tid": 5714, "ts": 6303771939099.070, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132754.927, "dur": 1.088, + "args": { + "External id": 151166, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002669, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290002669, "pid": 0, "tid": 7, "ts": 6303772132754.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771939131.090, "dur": 7.250, + "args": { + "External id": 151166, "cbid": 211, "correlation": 290002669 + } + }, + { + "ph": "s", "id": 290002669, "pid": 5714, "tid": 5714, "ts": 6303771939131.090, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::FillFunctor, std::array >(int, at::native::FillFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132756.655, "dur": 0.864, + "args": { + "External id": 151172, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290002684, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.187500, "warps per SM": 0.750000, "grid": [24, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 2 + } + }, + { + "ph": "f", "id": 290002684, "pid": 0, "tid": 7, "ts": 6303772132756.655, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771940615.506, "dur": 19.400, + "args": { + "External id": 151172, "cbid": 211, "correlation": 290002684 + } + }, + { + "ph": "s", "id": 290002684, "pid": 5714, "tid": 5714, "ts": 6303771940615.506, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::LpNormFunctor, float*, int>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::LpNormFunctor, float*, int)", "pid": 0, "tid": 7, + "ts": 6303772132758.191, "dur": 89.569, + "args": { + "External id": 151168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003203, "registers per thread": 28, "shared memory": 2048, "blocks per SM": 2.500000, "warps per SM": 40.000000, "grid": [320, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290003203, "pid": 0, "tid": 7, "ts": 6303772132758.191, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771941120.435, "dur": 17.530, + "args": { + "External id": 151168, "cbid": 211, "correlation": 290003203 + } + }, + { + "ph": "s", "id": 290003203, "pid": 5714, "tid": 5714, "ts": 6303771941120.435, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::LpNormFunctor, float*, int>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::LpNormFunctor, float*, int)", "pid": 0, "tid": 7, + "ts": 6303772132926.353, "dur": 61.057, + "args": { + "External id": 151168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003206, "registers per thread": 28, "shared memory": 2048, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 290003206, "pid": 0, "tid": 7, "ts": 6303772132926.353, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771941143.985, "dur": 9.090, + "args": { + "External id": 151168, "cbid": 211, "correlation": 290003206 + } + }, + { + "ph": "s", "id": 290003206, "pid": 5714, "tid": 5714, "ts": 6303771941143.985, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::lpnorm_cleanup(float const*, at::native::TensorListAddresses, int)", "pid": 0, "tid": 7, + "ts": 6303772132988.082, "dur": 1.728, + "args": { + "External id": 151168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003211, "registers per thread": 16, "shared memory": 2048, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 290003211, "pid": 0, "tid": 7, "ts": 6303772132988.082, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771941170.305, "dur": 9.620, + "args": { + "External id": 151168, "cbid": 211, "correlation": 290003211 + } + }, + { + "ph": "s", "id": 290003211, "pid": 5714, "tid": 5714, "ts": 6303771941170.305, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::CatArrayBatchedCopy_aligned16_contig, unsigned int, 1, 128, 1>(at::native::(anonymous namespace)::OpaqueType<4u>*, at::native::(anonymous namespace)::CatArrInputTensorMetadata, unsigned int, 128, 1>, at::native::(anonymous namespace)::TensorSizeStride, int, unsigned int)", "pid": 0, "tid": 7, + "ts": 6303772132990.450, "dur": 1.600, + "args": { + "External id": 155143, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003224, 
"registers per thread": 30, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 4.000000, "grid": [1, 128, 1], "block": [128, 1, 1], "est. achieved occupancy %": 8 + } + }, + { + "ph": "f", "id": 290003224, "pid": 0, "tid": 7, "ts": 6303772132990.450, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771946732.043, "dur": 74.590, + "args": { + "External id": 155143, "cbid": 211, "correlation": 290003224 + } + }, + { + "ph": "s", "id": 290003224, "pid": 5714, "tid": 5714, "ts": 6303771946732.043, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::reduce_kernel<512, 1, at::native::ReduceOp, unsigned int, float, 4> >(at::native::ReduceOp, unsigned int, float, 4>)", "pid": 0, "tid": 7, + "ts": 6303772132992.754, "dur": 2.336, + "args": { + "External id": 155145, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003238, "registers per thread": 32, "shared memory": 528, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003238, "pid": 0, "tid": 7, "ts": 6303772132992.754, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771947642.891, "dur": 79.049, + "args": { + "External id": 155145, "cbid": 211, "correlation": 290003238 + } + }, + { + "ph": "s", "id": 290003238, "pid": 5714, "tid": 5714, "ts": 6303771947642.891, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::pow_tensor_scalar_kernel_impl(at::TensorIteratorBase&, float)::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303772132995.730, "dur": 1.088, + "args": { + "External id": 155148, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003248, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003248, "pid": 0, "tid": 7, "ts": 6303772132995.730, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771949416.097, "dur": 111.490, + "args": { + "External id": 155148, "cbid": 211, "correlation": 290003248 + } + }, + { + "ph": "s", "id": 290003248, "pid": 5714, "tid": 5714, "ts": 6303771949416.097, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 0, "tid": 7, + "ts": 6303772132997.426, "dur": 0.992, + "args": { + "External id": 155155, "device": 0, "context": 1, "stream": 7, "correlation": 290003260, "bytes": 4, "memory bandwidth (GB/s)": 0.004032258064516129 + } + }, + { + "ph": "f", "id": 290003260, "pid": 0, "tid": 7, "ts": 6303772132997.426, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771949928.466, "dur": 101.549, + "args": { + "External id": 155155, "cbid": 41, "correlation": 290003260 + } + }, + { + "ph": "s", "id": 290003260, "pid": 5714, "tid": 5714, "ts": 6303771949928.466, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303771950239.535, "dur": 7.050, + "args": { + "External id": 155157, "cbid": 317, "correlation": 290003266 + } + }, + { + "ph": "f", "id": 290003266, "pid": 5714, "tid": 5714, "ts": 6303771950239.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771950261.645, "dur": 14.170, + "args": { + "External id": 155157, "cbid": 135, "correlation": 290003268 + } + }, + { + "ph": "f", "id": 290003268, "pid": 5714, "tid": 5714, "ts": 6303771950261.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + 
"ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771950287.185, "dur": 54.700, + "args": { + "External id": 155157, "cbid": 147, "correlation": 290003272 + } + }, + { + "ph": "s", "id": 290003272, "pid": 5714, "tid": 5714, "ts": 6303771950287.185, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamGetCaptureInfo_v2", "pid": 5714, "tid": 5714, + "ts": 6303771950471.705, "dur": 6.749, + "args": { + "External id": 155157, "cbid": 409, "correlation": 290003275 + } + }, + { + "ph": "f", "id": 290003275, "pid": 5714, "tid": 5714, "ts": 6303771950471.705, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771950510.364, "dur": 9.260, + "args": { + "External id": 155157, "cbid": 135, "correlation": 290003278 + } + }, + { + "ph": "f", "id": 290003278, "pid": 5714, "tid": 5714, "ts": 6303771950510.364, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771950521.094, "dur": 9.160, + "args": { + "External id": 155157, "cbid": 147, "correlation": 290003279 + } + }, + { + "ph": "s", "id": 290003279, "pid": 5714, "tid": 5714, "ts": 6303771950521.094, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "ncclDevKernel_AllReduce_Sum_f32_RING_LL(ncclDevComm*, unsigned long, ncclWork*)", "pid": 0, "tid": 20, + "ts": 6303772133001.010, "dur": 606.919, + "args": { + "External id": 155157, "queued": 0, "device": 0, "context": 1, "stream": 20, "correlation": 290003281, "registers per thread": 96, "shared memory": 88416, "blocks per SM": 0.007812, "warps per SM": 0.023438, "grid": [1, 1, 1], "block": [96, 1, 1], "est. 
achieved occupancy %": 0, "Collective name": "allreduce", "In msg nelems": 1, "Out msg nelems": 1, "Group size": 4, "dtype": "Float", "In split size": "[]", "Out split size": "[]", "Process Group Name": "0", "Process Group Description": "default_pg", "Process Group Ranks": "[0, 1, 2, 3]" + } + }, + { + "ph": "f", "id": 290003281, "pid": 0, "tid": 20, "ts": 6303772133001.010, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernelExC", "pid": 5714, "tid": 5714, + "ts": 6303771950538.764, "dur": 69.530, + "args": { + "External id": 155157, "cbid": 430, "correlation": 290003281 + } + }, + { + "ph": "s", "id": 290003281, "pid": 5714, "tid": 5714, "ts": 6303771950538.764, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771950616.584, "dur": 3.790, + "args": { + "External id": 155157, "cbid": 135, "correlation": 290003283 + } + }, + { + "ph": "f", "id": 290003283, "pid": 5714, "tid": 5714, "ts": 6303771950616.584, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771950621.344, "dur": 4.480, + "args": { + "External id": 155157, "cbid": 147, "correlation": 290003284 + } + }, + { + "ph": "s", "id": 290003284, "pid": 5714, "tid": 5714, "ts": 6303771950621.344, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771950639.424, "dur": 5.530, + "args": { + "External id": 155157, "cbid": 135, "correlation": 290003287 + } + }, + { + "ph": "f", "id": 290003287, "pid": 5714, "tid": 5714, "ts": 6303771950639.424, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventRecord", "pid": 5714, "tid": 5714, + "ts": 6303771950709.084, "dur": 3.780, + "args": { + "External id": 155157, "cbid": 
135, "correlation": 290003294 + } + }, + { + "ph": "f", "id": 290003294, "pid": 5714, "tid": 5714, "ts": 6303771950709.084, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamWaitEvent", "pid": 5714, "tid": 5714, + "ts": 6303771951661.812, "dur": 19.250, + "args": { + "External id": 155161, "cbid": 147, "correlation": 290003299 + } + }, + { + "ph": "s", "id": 290003299, "pid": 5714, "tid": 5714, "ts": 6303771951661.812, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::sqrt_kernel_cuda(at::TensorIteratorBase&)::{lambda()#2}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303772133608.569, "dur": 1.088, + "args": { + "External id": 155162, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003315, "registers per thread": 20, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003315, "pid": 0, "tid": 7, "ts": 6303772133608.569, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771952136.191, "dur": 94.199, + "args": { + "External id": 155162, "cbid": 211, "correlation": 290003315 + } + }, + { + "ph": "s", "id": 290003315, "pid": 5714, "tid": 5714, "ts": 6303771952136.191, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::CUDAFunctorOnSelf_add, std::array >(int, at::native::CUDAFunctorOnSelf_add, std::array)", "pid": 0, "tid": 7, + "ts": 6303772133610.265, "dur": 0.992, + "args": { + "External id": 155168, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003325, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003325, "pid": 0, "tid": 7, "ts": 6303772133610.265, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771953851.247, "dur": 85.580, + "args": { + "External id": 155168, "cbid": 211, "correlation": 290003325 + } + }, + { + "ph": "s", "id": 290003325, "pid": 5714, "tid": 5714, "ts": 6303771953851.247, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::reciprocal_kernel_cuda(at::TensorIteratorBase&)::{lambda()#1}::operator()() const::{lambda()#2}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303772133611.993, "dur": 1.120, + "args": { + "External id": 155169, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003335, "registers per thread": 24, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003335, "pid": 0, "tid": 7, "ts": 6303772133611.993, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771954055.817, "dur": 41.769, + "args": { + "External id": 155169, "cbid": 211, "correlation": 290003335 + } + }, + { + "ph": "s", "id": 290003335, "pid": 5714, "tid": 5714, "ts": 6303771954055.817, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303772133613.721, "dur": 0.992, + "args": { + "External id": 155170, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003345, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003345, "pid": 0, "tid": 7, "ts": 6303772133613.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771954203.526, "dur": 37.050, + "args": { + "External id": 155170, "cbid": 211, "correlation": 290003345 + } + }, + { + "ph": "s", "id": 290003345, "pid": 5714, "tid": 5714, "ts": 6303771954203.526, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array >(int, at::native::(anonymous namespace)::launch_clamp_scalar(at::TensorIteratorBase&, c10::Scalar, c10::Scalar, at::native::detail::ClampLimits)::{lambda()#1}::operator()() const::{lambda()#7}::operator()() const::{lambda(float)#1}, std::array)", "pid": 0, "tid": 7, + "ts": 6303772133615.417, "dur": 1.056, + "args": { + "External id": 155171, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003355, "registers per thread": 18, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003355, "pid": 0, "tid": 7, "ts": 6303772133615.417, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771954447.016, "dur": 46.380, + "args": { + "External id": 155171, "cbid": 211, "correlation": 290003355 + } + }, + { + "ph": "s", "id": 290003355, "pid": 5714, "tid": 5714, "ts": 6303771954447.016, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float)", "pid": 0, "tid": 7, + "ts": 6303772133617.113, "dur": 116.481, + "args": { + "External id": 155175, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003361, "registers per thread": 28, "shared memory": 0, "blocks per SM": 2.500000, "warps per SM": 40.000000, "grid": [320, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 83 + } + }, + { + "ph": "f", "id": 290003361, "pid": 0, "tid": 7, "ts": 6303772133617.113, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771957419.669, "dur": 82.570, + "args": { + "External id": 155175, "cbid": 211, "correlation": 290003361 + } + }, + { + "ph": "s", "id": 290003361, "pid": 5714, "tid": 5714, "ts": 6303771957419.669, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarTensorFunctor, std::multiplies, float*, float)", "pid": 0, "tid": 7, + "ts": 6303772133734.202, "dur": 112.066, + "args": { + "External id": 155175, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003364, "registers per thread": 28, "shared memory": 0, "blocks per SM": 1.742188, "warps per SM": 27.875000, "grid": [223, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 58 + } + }, + { + "ph": "f", "id": 290003364, "pid": 0, "tid": 7, "ts": 6303772133734.202, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771957520.089, "dur": 29.800, + "args": { + "External id": 155175, "cbid": 211, "correlation": 290003364 + } + }, + { + "ph": "s", "id": 290003364, "pid": 5714, "tid": 5714, "ts": 6303771957520.089, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::BinaryFunctor >, std::array >(int, at::native::BinaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303772133915.900, "dur": 1.376, + "args": { + "External id": 155177, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003374, "registers per thread": 22, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003374, "pid": 0, "tid": 7, "ts": 6303772133915.900, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303771958011.738, "dur": 50.330, + "args": { + "External id": 155177, "cbid": 211, "correlation": 290003374 + } + }, + { + "ph": "s", "id": 290003374, "pid": 5714, "tid": 5714, "ts": 6303771958011.738, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771958132.077, "dur": 16.340, + "args": { + "External id": 155180, "cbid": 138, "correlation": 290003379 + } + }, + { + "ph": "f", "id": 290003379, "pid": 5714, "tid": 5714, "ts": 6303771958132.077, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771958151.517, "dur": 4.560, + "args": { + "External id": 155180, "cbid": 138, "correlation": 290003380 + } + }, + { + "ph": "f", "id": 290003380, "pid": 5714, "tid": 5714, "ts": 6303771958151.517, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771958157.527, "dur": 3.100, + "args": { + "External id": 155180, "cbid": 138, "correlation": 290003381 + } + }, + { + "ph": "f", "id": 290003381, "pid": 5714, "tid": 5714, "ts": 6303771958157.527, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303771958161.927, "dur": 2.610, + "args": { + "External id": 155180, "cbid": 138, "correlation": 290003382 + } + }, + { + "ph": "f", "id": 290003382, "pid": 5714, "tid": 5714, "ts": 6303771958161.927, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + 
"ts": 6303771958166.157, "dur": 3.680, + "args": { + "External id": 155180, "cbid": 138, "correlation": 290003383 + } + }, + { + "ph": "f", "id": 290003383, "pid": 5714, "tid": 5714, "ts": 6303771958166.157, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303772133923.644, "dur": 0.864, + "args": { + "External id": 155180, "device": 0, "context": 1, "stream": 7, "correlation": 290003386, "bytes": 1, "memory bandwidth (GB/s)": 0.0011574074074074073 + } + }, + { + "ph": "f", "id": 290003386, "pid": 0, "tid": 7, "ts": 6303772133923.644, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303771958181.967, "dur": 65.170, + "args": { + "External id": 155180, "cbid": 41, "correlation": 290003386 + } + }, + { + "ph": "s", "id": 290003386, "pid": 5714, "tid": 5714, "ts": 6303771958181.967, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995065.675, "dur": 43.260, + "args": { + "cbid": 138, "correlation": 290003388 + } + }, + { + "ph": "f", "id": 290003388, "pid": 5714, "tid": 1822426688, "ts": 6303771995065.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995111.525, "dur": 5.150, + "args": { + "cbid": 138, "correlation": 290003389 + } + }, + { + "ph": "f", "id": 290003389, "pid": 5714, "tid": 1822426688, "ts": 6303771995111.525, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995142.475, "dur": 3.710, + "args": { + "cbid": 138, "correlation": 290003390 + } + }, + { + "ph": "f", "id": 290003390, "pid": 5714, "tid": 1822426688, "ts": 
6303771995142.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995171.435, "dur": 5.780, + "args": { + "cbid": 138, "correlation": 290003391 + } + }, + { + "ph": "f", "id": 290003391, "pid": 5714, "tid": 1822426688, "ts": 6303771995171.435, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995178.125, "dur": 3.240, + "args": { + "cbid": 138, "correlation": 290003392 + } + }, + { + "ph": "f", "id": 290003392, "pid": 5714, "tid": 1822426688, "ts": 6303771995178.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995185.865, "dur": 2.800, + "args": { + "cbid": 138, "correlation": 290003393 + } + }, + { + "ph": "f", "id": 290003393, "pid": 5714, "tid": 1822426688, "ts": 6303771995185.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995197.725, "dur": 4.510, + "args": { + "cbid": 138, "correlation": 290003394 + } + }, + { + "ph": "f", "id": 290003394, "pid": 5714, "tid": 1822426688, "ts": 6303771995197.725, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995203.175, "dur": 2.520, + "args": { + "cbid": 138, "correlation": 290003395 + } + }, + { + "ph": "f", "id": 290003395, "pid": 5714, "tid": 1822426688, "ts": 6303771995203.175, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995209.335, "dur": 2.890, + "args": { + "cbid": 138, "correlation": 290003396 + } + }, + { + "ph": "f", "id": 290003396, 
"pid": 5714, "tid": 1822426688, "ts": 6303771995209.335, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995220.875, "dur": 5.030, + "args": { + "cbid": 138, "correlation": 290003397 + } + }, + { + "ph": "f", "id": 290003397, "pid": 5714, "tid": 1822426688, "ts": 6303771995220.875, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995226.815, "dur": 2.600, + "args": { + "cbid": 138, "correlation": 290003398 + } + }, + { + "ph": "f", "id": 290003398, "pid": 5714, "tid": 1822426688, "ts": 6303771995226.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995233.125, "dur": 2.840, + "args": { + "cbid": 138, "correlation": 290003399 + } + }, + { + "ph": "f", "id": 290003399, "pid": 5714, "tid": 1822426688, "ts": 6303771995233.125, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995244.255, "dur": 6.380, + "args": { + "cbid": 138, "correlation": 290003400 + } + }, + { + "ph": "f", "id": 290003400, "pid": 5714, "tid": 1822426688, "ts": 6303771995244.255, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995251.585, "dur": 2.710, + "args": { + "cbid": 138, "correlation": 290003401 + } + }, + { + "ph": "f", "id": 290003401, "pid": 5714, "tid": 1822426688, "ts": 6303771995251.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995258.515, "dur": 2.850, + "args": { + "cbid": 138, "correlation": 290003402 + } + }, 
+ { + "ph": "f", "id": 290003402, "pid": 5714, "tid": 1822426688, "ts": 6303771995258.515, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995269.055, "dur": 4.750, + "args": { + "cbid": 138, "correlation": 290003403 + } + }, + { + "ph": "f", "id": 290003403, "pid": 5714, "tid": 1822426688, "ts": 6303771995269.055, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995274.675, "dur": 2.590, + "args": { + "cbid": 138, "correlation": 290003404 + } + }, + { + "ph": "f", "id": 290003404, "pid": 5714, "tid": 1822426688, "ts": 6303771995274.675, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995281.845, "dur": 2.730, + "args": { + "cbid": 138, "correlation": 290003405 + } + }, + { + "ph": "f", "id": 290003405, "pid": 5714, "tid": 1822426688, "ts": 6303771995281.845, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995292.365, "dur": 4.300, + "args": { + "cbid": 138, "correlation": 290003406 + } + }, + { + "ph": "f", "id": 290003406, "pid": 5714, "tid": 1822426688, "ts": 6303771995292.365, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995339.225, "dur": 3.330, + "args": { + "cbid": 138, "correlation": 290003407 + } + }, + { + "ph": "f", "id": 290003407, "pid": 5714, "tid": 1822426688, "ts": 6303771995339.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995347.105, "dur": 2.910, + "args": { + "cbid": 138, 
"correlation": 290003408 + } + }, + { + "ph": "f", "id": 290003408, "pid": 5714, "tid": 1822426688, "ts": 6303771995347.105, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995358.815, "dur": 4.910, + "args": { + "cbid": 138, "correlation": 290003409 + } + }, + { + "ph": "f", "id": 290003409, "pid": 5714, "tid": 1822426688, "ts": 6303771995358.815, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995364.595, "dur": 2.480, + "args": { + "cbid": 138, "correlation": 290003410 + } + }, + { + "ph": "f", "id": 290003410, "pid": 5714, "tid": 1822426688, "ts": 6303771995364.595, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995371.285, "dur": 2.850, + "args": { + "cbid": 138, "correlation": 290003411 + } + }, + { + "ph": "f", "id": 290003411, "pid": 5714, "tid": 1822426688, "ts": 6303771995371.285, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995382.535, "dur": 4.780, + "args": { + "cbid": 138, "correlation": 290003412 + } + }, + { + "ph": "f", "id": 290003412, "pid": 5714, "tid": 1822426688, "ts": 6303771995382.535, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995388.185, "dur": 2.540, + "args": { + "cbid": 138, "correlation": 290003413 + } + }, + { + "ph": "f", "id": 290003413, "pid": 5714, "tid": 1822426688, "ts": 6303771995388.185, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995394.225, "dur": 
2.949, + "args": { + "cbid": 138, "correlation": 290003414 + } + }, + { + "ph": "f", "id": 290003414, "pid": 5714, "tid": 1822426688, "ts": 6303771995394.225, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995404.865, "dur": 5.609, + "args": { + "cbid": 138, "correlation": 290003415 + } + }, + { + "ph": "f", "id": 290003415, "pid": 5714, "tid": 1822426688, "ts": 6303771995404.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995411.354, "dur": 2.551, + "args": { + "cbid": 138, "correlation": 290003416 + } + }, + { + "ph": "f", "id": 290003416, "pid": 5714, "tid": 1822426688, "ts": 6303771995411.354, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995417.805, "dur": 2.820, + "args": { + "cbid": 138, "correlation": 290003417 + } + }, + { + "ph": "f", "id": 290003417, "pid": 5714, "tid": 1822426688, "ts": 6303771995417.805, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995428.645, "dur": 5.100, + "args": { + "cbid": 138, "correlation": 290003418 + } + }, + { + "ph": "f", "id": 290003418, "pid": 5714, "tid": 1822426688, "ts": 6303771995428.645, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995445.765, "dur": 3.840, + "args": { + "cbid": 138, "correlation": 290003420 + } + }, + { + "ph": "f", "id": 290003420, "pid": 5714, "tid": 1822426688, "ts": 6303771995445.765, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, 
+ "ts": 6303771995456.585, "dur": 4.969, + "args": { + "cbid": 138, "correlation": 290003422 + } + }, + { + "ph": "f", "id": 290003422, "pid": 5714, "tid": 1822426688, "ts": 6303771995456.585, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995468.554, "dur": 4.391, + "args": { + "cbid": 138, "correlation": 290003424 + } + }, + { + "ph": "f", "id": 290003424, "pid": 5714, "tid": 1822426688, "ts": 6303771995468.554, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995479.654, "dur": 3.631, + "args": { + "cbid": 138, "correlation": 290003426 + } + }, + { + "ph": "f", "id": 290003426, "pid": 5714, "tid": 1822426688, "ts": 6303771995479.654, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995490.005, "dur": 3.680, + "args": { + "cbid": 138, "correlation": 290003428 + } + }, + { + "ph": "f", "id": 290003428, "pid": 5714, "tid": 1822426688, "ts": 6303771995490.005, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995500.234, "dur": 3.650, + "args": { + "cbid": 138, "correlation": 290003430 + } + }, + { + "ph": "f", "id": 290003430, "pid": 5714, "tid": 1822426688, "ts": 6303771995500.234, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995510.814, "dur": 3.910, + "args": { + "cbid": 138, "correlation": 290003432 + } + }, + { + "ph": "f", "id": 290003432, "pid": 5714, "tid": 1822426688, "ts": 6303771995510.814, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": 
"cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995520.824, "dur": 3.700, + "args": { + "cbid": 138, "correlation": 290003434 + } + }, + { + "ph": "f", "id": 290003434, "pid": 5714, "tid": 1822426688, "ts": 6303771995520.824, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995531.664, "dur": 3.980, + "args": { + "cbid": 138, "correlation": 290003436 + } + }, + { + "ph": "f", "id": 290003436, "pid": 5714, "tid": 1822426688, "ts": 6303771995531.664, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303771995542.714, "dur": 4.350, + "args": { + "cbid": 138, "correlation": 290003438 + } + }, + { + "ph": "f", "id": 290003438, "pid": 5714, "tid": 1822426688, "ts": 6303771995542.714, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095731.862, "dur": 33.780, + "args": { + "cbid": 138, "correlation": 290003440 + } + }, + { + "ph": "f", "id": 290003440, "pid": 5714, "tid": 1822426688, "ts": 6303772095731.862, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095768.782, "dur": 5.260, + "args": { + "cbid": 138, "correlation": 290003441 + } + }, + { + "ph": "f", "id": 290003441, "pid": 5714, "tid": 1822426688, "ts": 6303772095768.782, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095793.211, "dur": 3.451, + "args": { + "cbid": 138, "correlation": 290003442 + } + }, + { + "ph": "f", "id": 290003442, "pid": 5714, "tid": 1822426688, "ts": 6303772095793.211, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": 
"X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095816.711, "dur": 4.590, + "args": { + "cbid": 138, "correlation": 290003443 + } + }, + { + "ph": "f", "id": 290003443, "pid": 5714, "tid": 1822426688, "ts": 6303772095816.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095928.861, "dur": 2.710, + "args": { + "cbid": 138, "correlation": 290003444 + } + }, + { + "ph": "f", "id": 290003444, "pid": 5714, "tid": 1822426688, "ts": 6303772095928.861, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095936.711, "dur": 2.890, + "args": { + "cbid": 138, "correlation": 290003445 + } + }, + { + "ph": "f", "id": 290003445, "pid": 5714, "tid": 1822426688, "ts": 6303772095936.711, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095947.601, "dur": 4.680, + "args": { + "cbid": 138, "correlation": 290003446 + } + }, + { + "ph": "f", "id": 290003446, "pid": 5714, "tid": 1822426688, "ts": 6303772095947.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095953.201, "dur": 2.670, + "args": { + "cbid": 138, "correlation": 290003447 + } + }, + { + "ph": "f", "id": 290003447, "pid": 5714, "tid": 1822426688, "ts": 6303772095953.201, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095960.281, "dur": 2.990, + "args": { + "cbid": 138, "correlation": 290003448 + } + }, + { + "ph": "f", "id": 290003448, "pid": 5714, "tid": 1822426688, "ts": 6303772095960.281, + "cat": "ac2g", "name": 
"ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095971.381, "dur": 3.850, + "args": { + "cbid": 138, "correlation": 290003449 + } + }, + { + "ph": "f", "id": 290003449, "pid": 5714, "tid": 1822426688, "ts": 6303772095971.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095976.111, "dur": 2.490, + "args": { + "cbid": 138, "correlation": 290003450 + } + }, + { + "ph": "f", "id": 290003450, "pid": 5714, "tid": 1822426688, "ts": 6303772095976.111, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095981.911, "dur": 2.850, + "args": { + "cbid": 138, "correlation": 290003451 + } + }, + { + "ph": "f", "id": 290003451, "pid": 5714, "tid": 1822426688, "ts": 6303772095981.911, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095992.031, "dur": 3.530, + "args": { + "cbid": 138, "correlation": 290003452 + } + }, + { + "ph": "f", "id": 290003452, "pid": 5714, "tid": 1822426688, "ts": 6303772095992.031, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772095996.411, "dur": 2.580, + "args": { + "cbid": 138, "correlation": 290003453 + } + }, + { + "ph": "f", "id": 290003453, "pid": 5714, "tid": 1822426688, "ts": 6303772095996.411, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096002.251, "dur": 2.890, + "args": { + "cbid": 138, "correlation": 290003454 + } + }, + { + "ph": "f", "id": 290003454, "pid": 5714, "tid": 1822426688, "ts": 
6303772096002.251, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096012.561, "dur": 3.680, + "args": { + "cbid": 138, "correlation": 290003455 + } + }, + { + "ph": "f", "id": 290003455, "pid": 5714, "tid": 1822426688, "ts": 6303772096012.561, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096017.091, "dur": 2.540, + "args": { + "cbid": 138, "correlation": 290003456 + } + }, + { + "ph": "f", "id": 290003456, "pid": 5714, "tid": 1822426688, "ts": 6303772096017.091, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096022.731, "dur": 2.760, + "args": { + "cbid": 138, "correlation": 290003457 + } + }, + { + "ph": "f", "id": 290003457, "pid": 5714, "tid": 1822426688, "ts": 6303772096022.731, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096032.261, "dur": 3.490, + "args": { + "cbid": 138, "correlation": 290003458 + } + }, + { + "ph": "f", "id": 290003458, "pid": 5714, "tid": 1822426688, "ts": 6303772096032.261, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096036.621, "dur": 2.500, + "args": { + "cbid": 138, "correlation": 290003459 + } + }, + { + "ph": "f", "id": 290003459, "pid": 5714, "tid": 1822426688, "ts": 6303772096036.621, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096043.601, "dur": 2.980, + "args": { + "cbid": 138, "correlation": 290003460 + } + }, + { + "ph": "f", "id": 290003460, 
"pid": 5714, "tid": 1822426688, "ts": 6303772096043.601, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096053.381, "dur": 4.200, + "args": { + "cbid": 138, "correlation": 290003461 + } + }, + { + "ph": "f", "id": 290003461, "pid": 5714, "tid": 1822426688, "ts": 6303772096053.381, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096058.431, "dur": 2.500, + "args": { + "cbid": 138, "correlation": 290003462 + } + }, + { + "ph": "f", "id": 290003462, "pid": 5714, "tid": 1822426688, "ts": 6303772096058.431, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096064.981, "dur": 2.820, + "args": { + "cbid": 138, "correlation": 290003463 + } + }, + { + "ph": "f", "id": 290003463, "pid": 5714, "tid": 1822426688, "ts": 6303772096064.981, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096074.851, "dur": 3.680, + "args": { + "cbid": 138, "correlation": 290003464 + } + }, + { + "ph": "f", "id": 290003464, "pid": 5714, "tid": 1822426688, "ts": 6303772096074.851, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096079.391, "dur": 2.500, + "args": { + "cbid": 138, "correlation": 290003465 + } + }, + { + "ph": "f", "id": 290003465, "pid": 5714, "tid": 1822426688, "ts": 6303772096079.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096086.391, "dur": 3.010, + "args": { + "cbid": 138, "correlation": 290003466 + } + }, 
+ { + "ph": "f", "id": 290003466, "pid": 5714, "tid": 1822426688, "ts": 6303772096086.391, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096095.961, "dur": 4.520, + "args": { + "cbid": 138, "correlation": 290003467 + } + }, + { + "ph": "f", "id": 290003467, "pid": 5714, "tid": 1822426688, "ts": 6303772096095.961, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 1822426688, + "ts": 6303772096109.721, "dur": 3.250, + "args": { + "cbid": 138, "correlation": 290003469 + } + }, + { + "ph": "f", "id": 290003469, "pid": 5714, "tid": 1822426688, "ts": 6303772096109.721, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303771958249.427, "dur": 175706.490, + "args": { + "External id": 155180, "cbid": 131, "correlation": 290003387 + } + }, + { + "ph": "s", "id": 290003387, "pid": 5714, "tid": 5714, "ts": 6303771958249.427, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AbsFunctor, std::array >(int, at::native::AbsFunctor, std::array)", "pid": 0, "tid": 7, + "ts": 6303772134475.683, "dur": 1.024, + "args": { + "External id": 155184, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003486, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003486, "pid": 0, "tid": 7, "ts": 6303772134475.683, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772134381.926, "dur": 111.179, + "args": { + "External id": 155184, "cbid": 211, "correlation": 290003486 + } + }, + { + "ph": "s", "id": 290003486, "pid": 5714, "tid": 5714, "ts": 6303772134381.926, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::vectorized_elementwise_kernel<4, at::native::AUnaryFunctor >, std::array >(int, at::native::AUnaryFunctor >, std::array)", "pid": 0, "tid": 7, + "ts": 6303772134652.485, "dur": 1.024, + "args": { + "External id": 155186, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003496, "registers per thread": 16, "shared memory": 0, "blocks per SM": 0.007812, "warps per SM": 0.031250, "grid": [1, 1, 1], "block": [128, 1, 1], "est. 
achieved occupancy %": 0 + } + }, + { + "ph": "f", "id": 290003496, "pid": 0, "tid": 7, "ts": 6303772134652.485, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772134616.785, "dur": 45.170, + "args": { + "External id": 155186, "cbid": 211, "correlation": 290003496 + } + }, + { + "ph": "s", "id": 290003496, "pid": 5714, "tid": 5714, "ts": 6303772134616.785, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134748.235, "dur": 14.270, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003501 + } + }, + { + "ph": "f", "id": 290003501, "pid": 5714, "tid": 5714, "ts": 6303772134748.235, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134766.035, "dur": 4.840, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003502 + } + }, + { + "ph": "f", "id": 290003502, "pid": 5714, "tid": 5714, "ts": 6303772134766.035, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134773.465, "dur": 3.980, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003503 + } + }, + { + "ph": "f", "id": 290003503, "pid": 5714, "tid": 5714, "ts": 6303772134773.465, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134778.975, "dur": 3.970, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003504 + } + }, + { + "ph": "f", "id": 290003504, "pid": 5714, "tid": 5714, "ts": 6303772134778.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + 
"ts": 6303772134784.975, "dur": 3.820, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003505 + } + }, + { + "ph": "f", "id": 290003505, "pid": 5714, "tid": 5714, "ts": 6303772134784.975, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134790.245, "dur": 4.030, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003506 + } + }, + { + "ph": "f", "id": 290003506, "pid": 5714, "tid": 5714, "ts": 6303772134790.245, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134798.785, "dur": 3.860, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003507 + } + }, + { + "ph": "f", "id": 290003507, "pid": 5714, "tid": 5714, "ts": 6303772134798.785, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134804.475, "dur": 3.430, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003508 + } + }, + { + "ph": "f", "id": 290003508, "pid": 5714, "tid": 5714, "ts": 6303772134804.475, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134809.735, "dur": 3.250, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003509 + } + }, + { + "ph": "f", "id": 290003509, "pid": 5714, "tid": 5714, "ts": 6303772134809.735, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134814.515, "dur": 2.940, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003510 + } + }, + { + "ph": "f", "id": 290003510, "pid": 5714, "tid": 5714, "ts": 6303772134814.515, + "cat": "ac2g", "name": "ac2g", "bp": 
"e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134818.875, "dur": 3.000, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003511 + } + }, + { + "ph": "f", "id": 290003511, "pid": 5714, "tid": 5714, "ts": 6303772134818.875, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134824.075, "dur": 3.200, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003512 + } + }, + { + "ph": "f", "id": 290003512, "pid": 5714, "tid": 5714, "ts": 6303772134824.075, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134828.865, "dur": 3.260, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003513 + } + }, + { + "ph": "f", "id": 290003513, "pid": 5714, "tid": 5714, "ts": 6303772134828.865, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaEventQuery", "pid": 5714, "tid": 5714, + "ts": 6303772134834.635, "dur": 2.740, + "args": { + "External id": 155189, "cbid": 138, "correlation": 290003514 + } + }, + { + "ph": "f", "id": 290003514, "pid": 5714, "tid": 5714, "ts": 6303772134834.635, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoH (Device -> Pinned)", "pid": 0, "tid": 7, + "ts": 6303772134914.888, "dur": 0.992, + "args": { + "External id": 155189, "device": 0, "context": 1, "stream": 7, "correlation": 290003516, "bytes": 1, "memory bandwidth (GB/s)": 0.0010080645161290322 + } + }, + { + "ph": "f", "id": 290003516, "pid": 0, "tid": 7, "ts": 6303772134914.888, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaMemcpyAsync", "pid": 5714, "tid": 5714, + "ts": 6303772134849.685, "dur": 73.119, + "args": { + 
"External id": 155189, "cbid": 41, "correlation": 290003516 + } + }, + { + "ph": "s", "id": 290003516, "pid": 5714, "tid": 5714, "ts": 6303772134849.685, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303772134924.795, "dur": 13.669, + "args": { + "External id": 155189, "cbid": 131, "correlation": 290003517 + } + }, + { + "ph": "s", "id": 290003517, "pid": 5714, "tid": 5714, "ts": 6303772134924.795, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaStreamIsCapturing", "pid": 5714, "tid": 5714, + "ts": 6303772135499.953, "dur": 8.050, + "args": { + "cbid": 317, "correlation": 290003523 + } + }, + { + "ph": "f", "id": 290003523, "pid": 5714, "tid": 5714, "ts": 6303772135499.953, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float)", "pid": 0, "tid": 7, + "ts": 6303772137714.280, "dur": 1.728, + "args": { + "External id": 155192, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003526, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.859375, "warps per SM": 13.750000, "grid": [110, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 29 + } + }, + { + "ph": "f", "id": 290003526, "pid": 0, "tid": 7, "ts": 6303772137714.280, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772137687.078, "dur": 34.510, + "args": { + "External id": 155192, "cbid": 211, "correlation": 290003526 + } + }, + { + "ph": "s", "id": 290003526, "pid": 5714, "tid": 5714, "ts": 6303772137687.078, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float>(at::native::(anonymous namespace)::TensorListMetadata<1>, at::native::(anonymous namespace)::BinaryOpScalarFunctor, std::plus, float)", "pid": 0, "tid": 7, + "ts": 6303772137736.329, "dur": 1.568, + "args": { + "External id": 155192, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003529, "registers per thread": 30, "shared memory": 0, "blocks per SM": 0.140625, "warps per SM": 2.250000, "grid": [18, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 5 + } + }, + { + "ph": "f", "id": 290003529, "pid": 0, "tid": 7, "ts": 6303772137736.329, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772137725.828, "dur": 12.080, + "args": { + "External id": 155192, "cbid": 211, "correlation": 290003529 + } + }, + { + "ph": "s", "id": 290003529, "pid": 5714, "tid": 5714, "ts": 6303772137725.828, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303772141291.506, "dur": 514.598, + "args": { + "External id": 155322, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003535, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.687500, "warps per SM": 27.000000, "grid": [216, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 56 + } + }, + { + "ph": "f", "id": 290003535, "pid": 0, "tid": 7, "ts": 6303772141291.506, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772141275.800, "dur": 18.260, + "args": { + "External id": 155322, "cbid": 211, "correlation": 290003535 + } + }, + { + "ph": "s", "id": 290003535, "pid": 5714, "tid": 5714, "ts": 6303772141275.800, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303772141807.000, "dur": 260.707, + "args": { + "External id": 155322, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003538, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 290003538, "pid": 0, "tid": 7, "ts": 6303772141807.000, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772141307.020, "dur": 8.110, + "args": { + "External id": 155322, "cbid": 211, "correlation": 290003538 + } + }, + { + "ph": "s", "id": 290003538, "pid": 5714, "tid": 5714, "ts": 6303772141307.020, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303772142068.411, "dur": 259.971, + "args": { + "External id": 155322, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003541, "registers per thread": 64, "shared memory": 0, "blocks per SM": 1.000000, "warps per SM": 16.000000, "grid": [128, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 33 + } + }, + { + "ph": "f", "id": 290003541, "pid": 0, "tid": 7, "ts": 6303772142068.411, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772141319.350, "dur": 5.700, + "args": { + "External id": 155322, "cbid": 211, "correlation": 290003541 + } + }, + { + "ph": "s", "id": 290003541, "pid": 5714, "tid": 5714, "ts": 6303772141319.350, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "kernel", "name": "void at::native::(anonymous namespace)::multi_tensor_apply_kernel, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*>(at::native::(anonymous namespace)::FusedOptimizerTensorListMetadata<4>, at::native::(anonymous namespace)::FusedAdamMathFunctor, float const*, double, double, double, double, double, bool, float const*, float const*)", "pid": 0, "tid": 7, + "ts": 6303772142328.990, "dur": 259.043, + "args": { + "External id": 155322, "queued": 0, "device": 0, "context": 1, "stream": 7, "correlation": 290003544, "registers per thread": 64, "shared memory": 0, "blocks per SM": 0.554688, "warps per SM": 8.875000, "grid": [71, 1, 1], "block": [512, 1, 1], "est. 
achieved occupancy %": 18 + } + }, + { + "ph": "f", "id": 290003544, "pid": 0, "tid": 7, "ts": 6303772142328.990, + "cat": "ac2g", "name": "ac2g", "bp": "e" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaLaunchKernel", "pid": 5714, "tid": 5714, + "ts": 6303772141328.210, "dur": 5.980, + "args": { + "External id": 155322, "cbid": 211, "correlation": 290003544 + } + }, + { + "ph": "s", "id": 290003544, "pid": 5714, "tid": 5714, "ts": 6303772141328.210, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "cuda_runtime", "name": "cudaDeviceSynchronize", "pid": 5714, "tid": 5714, + "ts": 6303772141688.609, "dur": 902.449, + "args": { + "cbid": 165, "correlation": 290003550 + } + }, + { + "ph": "s", "id": 290003550, "pid": 5714, "tid": 5714, "ts": 6303772141688.609, + "cat": "ac2g", "name": "ac2g" + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "Optimizer.step#AdamW.step", "pid": 0, "tid": 7, + "ts": 6303772137714.279, "dur": 4873.755, + "args": { + "External id": 155191 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce", "pid": 0, "tid": 7, + "ts": 6303772060267.781, "dur": 345.638, + "args": { + "External id": 154999 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.0)", "pid": 0, "tid": 7, + "ts": 6303772049675.018, "dur": 52.163, + "args": { + "External id": 154961 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 0, "tid": 7, + "ts": 6303772039921.689, "dur": 15.906, + "args": { + "External id": 154856 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 0, "tid": 7, + "ts": 6303771780846.362, "dur": 445.096, + "args": { + "External id": 148674 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 0, "tid": 7, + "ts": 6303771998054.003, "dur": 
18.178, + "args": { + "External id": 154358 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 0, "tid": 7, + "ts": 6303771776197.380, "dur": 20.483, + "args": { + "External id": 148551 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 0, "tid": 7, + "ts": 6303771771507.534, "dur": 214.212, + "args": { + "External id": 148428 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.2)", "pid": 0, "tid": 7, + "ts": 6303772022268.364, "dur": 58.691, + "args": { + "External id": 154663 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 0, "tid": 7, + "ts": 6303771766568.725, "dur": 314.725, + "args": { + "External id": 148305 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 0, "tid": 7, + "ts": 6303771983647.116, "dur": 16.162, + "args": { + "External id": 154192 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.4)", "pid": 0, "tid": 7, + "ts": 6303771761215.190, "dur": 88.836, + "args": { + "External id": 148182 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "ProfilerStep#9727", "pid": 0, "tid": 7, + "ts": 6303771454965.365, "dur": 679950.516, + "args": { + "External id": 147457 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 0, "tid": 7, + "ts": 6303771744339.475, "dur": 23.395, + "args": { + "External id": 147936 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.5)", "pid": 0, "tid": 7, + "ts": 6303771969138.947, "dur": 16.706, + "args": { + "External id": 154026 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out 
(model.layers.1)", "pid": 0, "tid": 7, + "ts": 6303771739916.768, "dur": 24.898, + "args": { + "External id": 147813 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.9)", "pid": 0, "tid": 7, + "ts": 6303771928822.256, "dur": 38.978, + "args": { + "External id": 153501 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.1)", "pid": 0, "tid": 7, + "ts": 6303772026277.883, "dur": 17.506, + "args": { + "External id": 154690 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.1)", "pid": 0, "tid": 7, + "ts": 6303772036043.372, "dur": 56.098, + "args": { + "External id": 154829 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.0)", "pid": 0, "tid": 7, + "ts": 6303771735729.583, "dur": 17.827, + "args": { + "External id": 147690 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.4)", "pid": 0, "tid": 7, + "ts": 6303771993919.619, "dur": 42.978, + "args": { + "External id": 154331 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.3)", "pid": 0, "tid": 7, + "ts": 6303771748651.749, "dur": 24.546, + "args": { + "External id": 148059 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.3)", "pid": 0, "tid": 7, + "ts": 6303772007477.536, "dur": 57.699, + "args": { + "External id": 154497 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out", "pid": 0, "tid": 7, + "ts": 6303771732501.770, "dur": 1072.014, + "args": { + "External id": 147604 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.9)", "pid": 0, "tid": 7, + "ts": 6303771784888.553, "dur": 26.659, + "args": { + "External id": 148797 + } + }, + { + "ph": 
"X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.7)", "pid": 0, "tid": 7, + "ts": 6303771942348.428, "dur": 14.947, + "args": { + "External id": 153694 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.8)", "pid": 0, "tid": 7, + "ts": 6303771928864.369, "dur": 362.534, + "args": { + "External id": 153528 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.8)", "pid": 0, "tid": 7, + "ts": 6303771938019.930, "dur": 39.427, + "args": { + "External id": 153667 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.7)", "pid": 0, "tid": 7, + "ts": 6303771951498.711, "dur": 45.538, + "args": { + "External id": 153833 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.6)", "pid": 0, "tid": 7, + "ts": 6303771955316.515, "dur": 17.858, + "args": { + "External id": 153860 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.6)", "pid": 0, "tid": 7, + "ts": 6303771964886.706, "dur": 45.219, + "args": { + "External id": 153999 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::post_backward_reduce (model.layers.5)", "pid": 0, "tid": 7, + "ts": 6303771979150.295, "dur": 45.123, + "args": { + "External id": 154165 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather_copy_out (model.layers.2)", "pid": 0, "tid": 7, + "ts": 6303772012484.026, "dur": 17.283, + "args": { + "External id": 154524 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 0, "tid": 17, + "ts": 6303771969189.380, "dur": 49.187, + "args": { + "External id": 154045 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 0, "tid": 17, + "ts": 6303771955367.523, 
"dur": 261.765, + "args": { + "External id": 153879 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 0, "tid": 17, + "ts": 6303771942397.069, "dur": 325.478, + "args": { + "External id": 153713 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather", "pid": 0, "tid": 17, + "ts": 6303771455756.862, "dur": 262.917, + "args": { + "External id": 147513 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 0, "tid": 17, + "ts": 6303771735750.448, "dur": 81.186, + "args": { + "External id": 147903 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 0, "tid": 17, + "ts": 6303771983697.132, "dur": 662.218, + "args": { + "External id": 154211 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 0, "tid": 17, + "ts": 6303771457413.425, "dur": 1160.272, + "args": { + "External id": 147657 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 0, "tid": 17, + "ts": 6303771733628.631, "dur": 439.015, + "args": { + "External id": 147780 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.2)", "pid": 0, "tid": 17, + "ts": 6303771998105.459, "dur": 698.635, + "args": { + "External id": 154377 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.3)", "pid": 0, "tid": 17, + "ts": 6303771739945.344, "dur": 176.772, + "args": { + "External id": 148026 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.4)", "pid": 0, "tid": 17, + "ts": 6303771744365.780, "dur": 516.967, + "args": { + "External id": 148149 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.5)", "pid": 0, "tid": 17, + "ts": 6303771748679.717, "dur": 
73.475, + "args": { + "External id": 148272 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 0, "tid": 17, + "ts": 6303771903618.507, "dur": 281.734, + "args": { + "External id": 153381 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.1)", "pid": 0, "tid": 17, + "ts": 6303772012535.131, "dur": 48.771, + "args": { + "External id": 154543 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.6)", "pid": 0, "tid": 17, + "ts": 6303771761530.042, "dur": 639.658, + "args": { + "External id": 148395 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 0, "tid": 17, + "ts": 6303771767089.915, "dur": 436.935, + "args": { + "External id": 148518 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.8)", "pid": 0, "tid": 17, + "ts": 6303771771905.427, "dur": 544.328, + "args": { + "External id": 148641 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.0)", "pid": 0, "tid": 17, + "ts": 6303772026328.699, "dur": 376.454, + "args": { + "External id": 154709 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.9)", "pid": 0, "tid": 17, + "ts": 6303771776265.157, "dur": 883.405, + "args": { + "External id": 148764 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "FSDP::all_gather (model.layers.7)", "pid": 0, "tid": 17, + "ts": 6303771930076.958, "dur": 593.129, + "args": { + "External id": 153547 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:all_reduce", "pid": 0, "tid": 20, + "ts": 6303772133001.009, "dur": 606.921, + "args": { + "External id": 155158 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303772060616.073, "dur": 72130.471, + 
"args": { + "External id": 155013 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303772034804.061, "dur": 5113.662, + "args": { + "External id": 154740 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771776197.188, "dur": 4644.632, + "args": { + "External id": 148672 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771780844.058, "dur": 4039.441, + "args": { + "External id": 148795 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303771928863.345, "dur": 9149.963, + "args": { + "External id": 153515 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771771504.622, "dur": 4691.448, + "args": { + "External id": 148549 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303772049820.940, "dur": 10440.507, + "args": { + "External id": 154975 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771766565.333, "dur": 4938.107, + "args": { + "External id": 148426 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303772021039.390, "dur": 5234.431, + "args": { + "External id": 154574 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303771955313.219, "dur": 8611.750, + "args": { + "External id": 153847 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771761213.398, "dur": 5350.432, + "args": { + "External id": 148303 + } + }, + { + "ph": "X", "cat": 
"gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771748651.813, "dur": 12559.924, + "args": { + "External id": 148180 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771739913.888, "dur": 4423.509, + "args": { + "External id": 147934 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771732497.546, "dur": 3227.527, + "args": { + "External id": 147688 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771735727.023, "dur": 4184.371, + "args": { + "External id": 147811 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771992821.206, "dur": 5231.967, + "args": { + "External id": 154242 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771744339.475, "dur": 4310.260, + "args": { + "External id": 148057 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303772006846.489, "dur": 5631.043, + "args": { + "External id": 154408 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771456136.738, "dur": 276359.561, + "args": { + "External id": 147602 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303771969139.555, "dur": 8915.114, + "args": { + "External id": 154013 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303771942349.356, "dur": 7842.109, + "args": { + "External id": 153681 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 
6303771903978.543, "dur": 5992.520, + "args": { + "External id": 153412 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303771983646.092, "dur": 9172.844, + "args": { + "External id": 154179 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303772026277.115, "dur": 8405.891, + "args": { + "External id": 154677 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771963926.503, "dur": 5209.982, + "args": { + "External id": 153910 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771938014.362, "dur": 4333.204, + "args": { + "External id": 153578 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303771998054.643, "dur": 8670.854, + "args": { + "External id": 154345 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303772039919.737, "dur": 9082.379, + "args": { + "External id": 154843 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771978057.035, "dur": 5586.339, + "args": { + "External id": 154076 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_all_gather_base", "pid": 0, "tid": 20, + "ts": 6303771950230.280, "dur": 5081.405, + "args": { + "External id": 153744 + } + }, + { + "ph": "X", "cat": "gpu_user_annotation", "name": "nccl:_reduce_scatter_base", "pid": 0, "tid": 20, + "ts": 6303772012479.002, "dur": 8559.174, + "args": { + "External id": 154511 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 
6303771452582.416, "pid": 5714, "tid": 0, + "args": { + "labels": "CPU" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 0, + "args": { + "sort_index": 5714 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 0, + "args": { + "labels": "GPU 0" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 0, + "args": { + "sort_index": 5000000 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 1, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 1, "tid": 0, + "args": { + "labels": "GPU 1" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 1, "tid": 0, + "args": { + "sort_index": 5000001 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 2, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 2, "tid": 0, + "args": { + "labels": "GPU 2" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 2, "tid": 0, + "args": { + "sort_index": 5000002 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 3, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 3, "tid": 0, + "args": { + "labels": "GPU 3" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 3, "tid": 0, + "args": { + "sort_index": 5000003 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 4, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 4, 
"tid": 0, + "args": { + "labels": "GPU 4" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 4, "tid": 0, + "args": { + "sort_index": 5000004 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 5, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 5, "tid": 0, + "args": { + "labels": "GPU 5" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 5, "tid": 0, + "args": { + "sort_index": 5000005 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 6, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 6, "tid": 0, + "args": { + "labels": "GPU 6" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 6, "tid": 0, + "args": { + "sort_index": 5000006 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 7, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 7, "tid": 0, + "args": { + "labels": "GPU 7" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 7, "tid": 0, + "args": { + "sort_index": 5000007 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 8, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 8, "tid": 0, + "args": { + "labels": "GPU 8" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 8, "tid": 0, + "args": { + "sort_index": 5000008 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 9, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 9, "tid": 0, + "args": { + "labels": 
"GPU 9" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 9, "tid": 0, + "args": { + "sort_index": 5000009 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 10, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 10, "tid": 0, + "args": { + "labels": "GPU 10" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 10, "tid": 0, + "args": { + "sort_index": 5000010 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 11, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 11, "tid": 0, + "args": { + "labels": "GPU 11" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 11, "tid": 0, + "args": { + "sort_index": 5000011 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 12, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 12, "tid": 0, + "args": { + "labels": "GPU 12" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 12, "tid": 0, + "args": { + "sort_index": 5000012 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 13, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 13, "tid": 0, + "args": { + "labels": "GPU 13" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 13, "tid": 0, + "args": { + "sort_index": 5000013 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 14, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 14, "tid": 0, + "args": { + "labels": "GPU 14" + } + 
}, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 14, "tid": 0, + "args": { + "sort_index": 5000014 + } + }, + { + "name": "process_name", "ph": "M", "ts": 6303771452582.416, "pid": 15, "tid": 0, + "args": { + "name": "python3" + } + }, + { + "name": "process_labels", "ph": "M", "ts": 6303771452582.416, "pid": 15, "tid": 0, + "args": { + "labels": "GPU 15" + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 15, "tid": 0, + "args": { + "sort_index": 5000015 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 7, + "args": { + "name": "stream 7 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 7, + "args": { + "sort_index": 7 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 17, + "args": { + "name": "stream 17 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 17, + "args": { + "sort_index": 17 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 20, + "args": { + "name": "stream 20 " + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 0, "tid": 20, + "args": { + "sort_index": 20 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 5714, + "args": { + "name": "thread 5714 (python3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 5714, + "args": { + "sort_index": 5714 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 6744, + "args": { + "name": "thread 6744 (python3)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 6744, + "args": { + "sort_index": 6744 + } + }, + { + "name": "thread_name", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 6744, + "args": { + 
"name": "thread 6744 (pt_autograd_0)" + } + }, + { + "name": "thread_sort_index", "ph": "M", "ts": 6303771452582.416, "pid": 5714, "tid": 6744, + "args": { + "sort_index": 6744 + } + }, + { + "ph": "X", "cat": "Trace", "ts": 6303771452497.216, "dur": 690107.966, + "pid": "Spans", "tid": "PyTorch Profiler", + "name": "PyTorch Profiler (0)", + "args": { + "Op count": 0 + } + }, + { + "name": "process_sort_index", "ph": "M", "ts": 6303771452497.216, + "pid": "Spans", "tid": 0, + "args": { + "sort_index": 536870912 + } + }, + { + "name": "Iteration Start: PyTorch Profiler", "ph": "i", "s": "g", + "pid": "Traces", "tid": "Trace PyTorch Profiler", "ts": 6303771452497.216 + }, + { + "name": "Record Window End", "ph": "i", "s": "g", + "pid": "", "tid": "", "ts": 6303772167791.916 + } + ], + "traceName": "exp/mtp.120M.batch8.seqlen2048.context2048.warmup1000.update1.steps15000.nft4.lr5e-4.cosine/profile_trace/iteration_9728/rank0_trace.json", + "displayTimeUnit": "ms", + "baseTimeNanoseconds": 1743521598000000000 +} \ No newline at end of file diff --git a/torchtitan/__init__.py b/torchtitan/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc7e6b933401b00ee27aae1bbd73235f2b49923 --- /dev/null +++ b/torchtitan/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Copyright (c) Meta Platforms, Inc. All Rights Reserved. + +# Import to register Float8Converter. +import torchtitan.components.float8 # noqa: F401 + +# Import the built-in models here so that the corresponding register_model_spec() +# will be called. +import torchtitan.experiments # noqa: F401 +import torchtitan.models # noqa: F401